From 1349c6ee6885140e0da64986fb132f31a09e6327 Mon Sep 17 00:00:00 2001 From: Angelo Paparazzi Date: Mon, 10 Nov 2025 12:42:57 -0600 Subject: [PATCH 1/2] feat(tts): add new voice models --- .../text_to_speech/v1/TextToSpeech.java | 37 ++++++++++--------- .../v1/model/GetPronunciationOptions.java | 18 +++++++++ .../v1/model/GetVoiceOptions.java | 18 +++++++++ .../v1/model/SynthesizeOptions.java | 18 +++++++++ 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/TextToSpeech.java b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/TextToSpeech.java index 2c8a53d343..7697b36132 100644 --- a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/TextToSpeech.java +++ b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/TextToSpeech.java @@ -307,24 +307,25 @@ public ServiceCall getVoice(GetVoiceOptions getVoiceOptions) { * format with the Opus codec (`audio/ogg;codecs=opus`). The service always returns single-channel * audio. * `audio/alaw` - You must specify the `rate` of the audio. * `audio/basic` - The service * returns audio with a sampling rate of 8000 Hz. * `audio/flac` - You can optionally specify the - * `rate` of the audio. The default sampling rate is 22,050 Hz. * `audio/l16` - You must specify - * the `rate` of the audio. You can optionally specify the `endianness` of the audio. The default - * endianness is `little-endian`. * `audio/mp3` - You can optionally specify the `rate` of the - * audio. The default sampling rate is 24,000 Hz for Natural voices and 22,050 Hz for for all - * other voices. * `audio/mpeg` - You can optionally specify the `rate` of the audio. The default - * sampling rate is 22,050 Hz. * `audio/mulaw` - You must specify the `rate` of the audio. * - * `audio/ogg` - The service returns the audio in the `vorbis` codec. You can optionally specify - * the `rate` of the audio. The default sampling rate is 22,050 Hz. 
* `audio/ogg;codecs=opus` - - * You can optionally specify the `rate` of the audio. Only the following values are valid - * sampling rates: `48000`, `24000`, `16000`, `12000`, or `8000`. If you specify a value other - * than one of these, the service returns an error. The default sampling rate is 48,000 Hz. * - * `audio/ogg;codecs=vorbis` - You can optionally specify the `rate` of the audio. The default - * sampling rate is 22,050 Hz. * `audio/wav` - You can optionally specify the `rate` of the audio. - * The default sampling rate is 22,050 Hz. * `audio/webm` - The service returns the audio in the - * `opus` codec. The service returns audio with a sampling rate of 48,000 Hz. * - * `audio/webm;codecs=opus` - The service returns audio with a sampling rate of 48,000 Hz. * - * `audio/webm;codecs=vorbis` - You can optionally specify the `rate` of the audio. The default - * sampling rate is 22,050 Hz. + * `rate` of the audio. The default sampling rate is 24,000 Hz for Natural voices and 22,050 Hz + * for all other voices. * `audio/l16` - You must specify the `rate` of the audio. You can + * optionally specify the `endianness` of the audio. The default endianness is `little-endian`. * + * `audio/mp3` - You can optionally specify the `rate` of the audio. The default sampling rate is + * 24,000 Hz for Natural voices and 22,050 Hz for all other voices. * `audio/mpeg` - You can + * optionally specify the `rate` of the audio. The default sampling rate is 24,000 Hz for Natural + * voices and 22,050 Hz for all other voices. * `audio/mulaw` - You must specify the `rate` of the + * audio. * `audio/ogg` - The service returns the audio in the `vorbis` codec. You can optionally + * specify the `rate` of the audio. The default sampling rate is 48,000 Hz. * + * `audio/ogg;codecs=opus` - You can optionally specify the `rate` of the audio. Only the + * following values are valid sampling rates: `48000`, `24000`, `16000`, `12000`, or `8000`. 
If + * you specify a value other than one of these, the service returns an error. The default sampling + * rate is 48,000 Hz. * `audio/ogg;codecs=vorbis` - You can optionally specify the `rate` of the + * audio. The default sampling rate is 48,000 Hz. * `audio/wav` - You can optionally specify the + * `rate` of the audio. The default sampling rate is 24,000 Hz for Natural voices and 22,050 Hz + * for all other voices. * `audio/webm` - The service returns the audio in the `opus` codec. The + * service returns audio with a sampling rate of 48,000 Hz. * `audio/webm;codecs=opus` - The + * service returns audio with a sampling rate of 48,000 Hz. * `audio/webm;codecs=vorbis` - You can + * optionally specify the `rate` of the audio. The default sampling rate is 48,000 Hz. * *

For more information about specifying an audio format, including additional details about * some of the formats, see [Using audio diff --git a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetPronunciationOptions.java b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetPronunciationOptions.java index a0280c67ff..45edefb78e 100644 --- a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetPronunciationOptions.java +++ b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetPronunciationOptions.java @@ -41,12 +41,18 @@ public interface Voice { String EN_AU_HEIDIEXPRESSIVE = "en-AU_HeidiExpressive"; /** en-AU_JackExpressive. */ String EN_AU_JACKEXPRESSIVE = "en-AU_JackExpressive"; + /** en-CA_HannahNatural. */ + String EN_CA_HANNAHNATURAL = "en-CA_HannahNatural"; /** en-GB_CharlotteV3Voice. */ String EN_GB_CHARLOTTEV3VOICE = "en-GB_CharlotteV3Voice"; + /** en-GB_ChloeNatural. */ + String EN_GB_CHLOENATURAL = "en-GB_ChloeNatural"; /** en-GB_GeorgeExpressive. */ String EN_GB_GEORGEEXPRESSIVE = "en-GB_GeorgeExpressive"; /** en-GB_JamesV3Voice. */ String EN_GB_JAMESV3VOICE = "en-GB_JamesV3Voice"; + /** en-GB_GeorgeNatural. */ + String EN_GB_GEORGENATURAL = "en-GB_GeorgeNatural"; /** en-GB_KateV3Voice. */ String EN_GB_KATEV3VOICE = "en-GB_KateV3Voice"; /** en-US_AllisonExpressive. */ @@ -59,8 +65,14 @@ public interface Voice { String EN_US_EMILYV3VOICE = "en-US_EmilyV3Voice"; /** en-US_EmmaExpressive. */ String EN_US_EMMAEXPRESSIVE = "en-US_EmmaExpressive"; + /** en-US_EmmaNatural. */ + String EN_US_EMMANATURAL = "en-US_EmmaNatural"; + /** en-US_EthanNatural. */ + String EN_US_ETHANNATURAL = "en-US_EthanNatural"; /** en-US_HenryV3Voice. */ String EN_US_HENRYV3VOICE = "en-US_HenryV3Voice"; + /** en-US_JacksonNatural. */ + String EN_US_JACKSONNATURAL = "en-US_JacksonNatural"; /** en-US_KevinV3Voice. */ String EN_US_KEVINV3VOICE = "en-US_KevinV3Voice"; /** en-US_LisaExpressive. 
*/ @@ -73,6 +85,8 @@ public interface Voice { String EN_US_MICHAELV3VOICE = "en-US_MichaelV3Voice"; /** en-US_OliviaV3Voice. */ String EN_US_OLIVIAV3VOICE = "en-US_OliviaV3Voice"; + /** en-US_VictoriaNatural. */ + String EN_US_VICTORIANATURAL = "en-US_VictoriaNatural"; /** es-ES_EnriqueV3Voice. */ String ES_ES_ENRIQUEV3VOICE = "es-ES_EnriqueV3Voice"; /** es-ES_LauraV3Voice. */ @@ -97,10 +111,14 @@ public interface Voice { String KO_KR_JINV3VOICE = "ko-KR_JinV3Voice"; /** nl-NL_MerelV3Voice. */ String NL_NL_MERELV3VOICE = "nl-NL_MerelV3Voice"; + /** pt-BR_CamilaNatural. */ + String PT_BR_CAMILANATURAL = "pt-BR_CamilaNatural"; /** pt-BR_IsabelaV3Voice. */ String PT_BR_ISABELAV3VOICE = "pt-BR_IsabelaV3Voice"; /** pt-BR_LucasExpressive. */ String PT_BR_LUCASEXPRESSIVE = "pt-BR_LucasExpressive"; + /** pt-BR_LucasNatural. */ + String PT_BR_LUCASNATURAL = "pt-BR_LucasNatural"; } /** diff --git a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetVoiceOptions.java b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetVoiceOptions.java index 5f028d6394..2dd8206020 100644 --- a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetVoiceOptions.java +++ b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/GetVoiceOptions.java @@ -30,12 +30,18 @@ public interface Voice { String EN_AU_HEIDIEXPRESSIVE = "en-AU_HeidiExpressive"; /** en-AU_JackExpressive. */ String EN_AU_JACKEXPRESSIVE = "en-AU_JackExpressive"; + /** en-CA_HannahNatural. */ + String EN_CA_HANNAHNATURAL = "en-CA_HannahNatural"; /** en-GB_CharlotteV3Voice. */ String EN_GB_CHARLOTTEV3VOICE = "en-GB_CharlotteV3Voice"; + /** en-GB_ChloeNatural. */ + String EN_GB_CHLOENATURAL = "en-GB_ChloeNatural"; /** en-GB_GeorgeExpressive. */ String EN_GB_GEORGEEXPRESSIVE = "en-GB_GeorgeExpressive"; /** en-GB_JamesV3Voice. */ String EN_GB_JAMESV3VOICE = "en-GB_JamesV3Voice"; + /** en-GB_GeorgeNatural. 
*/ + String EN_GB_GEORGENATURAL = "en-GB_GeorgeNatural"; /** en-GB_KateV3Voice. */ String EN_GB_KATEV3VOICE = "en-GB_KateV3Voice"; /** en-US_AllisonExpressive. */ @@ -48,8 +54,14 @@ public interface Voice { String EN_US_EMILYV3VOICE = "en-US_EmilyV3Voice"; /** en-US_EmmaExpressive. */ String EN_US_EMMAEXPRESSIVE = "en-US_EmmaExpressive"; + /** en-US_EmmaNatural. */ + String EN_US_EMMANATURAL = "en-US_EmmaNatural"; + /** en-US_EthanNatural. */ + String EN_US_ETHANNATURAL = "en-US_EthanNatural"; /** en-US_HenryV3Voice. */ String EN_US_HENRYV3VOICE = "en-US_HenryV3Voice"; + /** en-US_JacksonNatural. */ + String EN_US_JACKSONNATURAL = "en-US_JacksonNatural"; /** en-US_KevinV3Voice. */ String EN_US_KEVINV3VOICE = "en-US_KevinV3Voice"; /** en-US_LisaExpressive. */ @@ -62,6 +74,8 @@ public interface Voice { String EN_US_MICHAELV3VOICE = "en-US_MichaelV3Voice"; /** en-US_OliviaV3Voice. */ String EN_US_OLIVIAV3VOICE = "en-US_OliviaV3Voice"; + /** en-US_VictoriaNatural. */ + String EN_US_VICTORIANATURAL = "en-US_VictoriaNatural"; /** es-ES_EnriqueV3Voice. */ String ES_ES_ENRIQUEV3VOICE = "es-ES_EnriqueV3Voice"; /** es-ES_LauraV3Voice. */ @@ -86,10 +100,14 @@ public interface Voice { String KO_KR_JINV3VOICE = "ko-KR_JinV3Voice"; /** nl-NL_MerelV3Voice. */ String NL_NL_MERELV3VOICE = "nl-NL_MerelV3Voice"; + /** pt-BR_CamilaNatural. */ + String PT_BR_CAMILANATURAL = "pt-BR_CamilaNatural"; /** pt-BR_IsabelaV3Voice. */ String PT_BR_ISABELAV3VOICE = "pt-BR_IsabelaV3Voice"; /** pt-BR_LucasExpressive. */ String PT_BR_LUCASEXPRESSIVE = "pt-BR_LucasExpressive"; + /** pt-BR_LucasNatural. 
*/ + String PT_BR_LUCASNATURAL = "pt-BR_LucasNatural"; } protected String voice; diff --git a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/SynthesizeOptions.java b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/SynthesizeOptions.java index 714beaa5c7..edead2d467 100644 --- a/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/SynthesizeOptions.java +++ b/text-to-speech/src/main/java/com/ibm/watson/text_to_speech/v1/model/SynthesizeOptions.java @@ -43,12 +43,18 @@ public interface Voice { String EN_AU_HEIDIEXPRESSIVE = "en-AU_HeidiExpressive"; /** en-AU_JackExpressive. */ String EN_AU_JACKEXPRESSIVE = "en-AU_JackExpressive"; + /** en-CA_HannahNatural. */ + String EN_CA_HANNAHNATURAL = "en-CA_HannahNatural"; /** en-GB_CharlotteV3Voice. */ String EN_GB_CHARLOTTEV3VOICE = "en-GB_CharlotteV3Voice"; + /** en-GB_ChloeNatural. */ + String EN_GB_CHLOENATURAL = "en-GB_ChloeNatural"; /** en-GB_GeorgeExpressive. */ String EN_GB_GEORGEEXPRESSIVE = "en-GB_GeorgeExpressive"; /** en-GB_JamesV3Voice. */ String EN_GB_JAMESV3VOICE = "en-GB_JamesV3Voice"; + /** en-GB_GeorgeNatural. */ + String EN_GB_GEORGENATURAL = "en-GB_GeorgeNatural"; /** en-GB_KateV3Voice. */ String EN_GB_KATEV3VOICE = "en-GB_KateV3Voice"; /** en-US_AllisonExpressive. */ @@ -61,8 +67,14 @@ public interface Voice { String EN_US_EMILYV3VOICE = "en-US_EmilyV3Voice"; /** en-US_EmmaExpressive. */ String EN_US_EMMAEXPRESSIVE = "en-US_EmmaExpressive"; + /** en-US_EmmaNatural. */ + String EN_US_EMMANATURAL = "en-US_EmmaNatural"; + /** en-US_EthanNatural. */ + String EN_US_ETHANNATURAL = "en-US_EthanNatural"; /** en-US_HenryV3Voice. */ String EN_US_HENRYV3VOICE = "en-US_HenryV3Voice"; + /** en-US_JacksonNatural. */ + String EN_US_JACKSONNATURAL = "en-US_JacksonNatural"; /** en-US_KevinV3Voice. */ String EN_US_KEVINV3VOICE = "en-US_KevinV3Voice"; /** en-US_LisaExpressive. 
*/ @@ -75,6 +87,8 @@ public interface Voice { String EN_US_MICHAELV3VOICE = "en-US_MichaelV3Voice"; /** en-US_OliviaV3Voice. */ String EN_US_OLIVIAV3VOICE = "en-US_OliviaV3Voice"; + /** en-US_VictoriaNatural. */ + String EN_US_VICTORIANATURAL = "en-US_VictoriaNatural"; /** es-ES_EnriqueV3Voice. */ String ES_ES_ENRIQUEV3VOICE = "es-ES_EnriqueV3Voice"; /** es-ES_LauraV3Voice. */ @@ -99,10 +113,14 @@ public interface Voice { String KO_KR_JINV3VOICE = "ko-KR_JinV3Voice"; /** nl-NL_MerelV3Voice. */ String NL_NL_MERELV3VOICE = "nl-NL_MerelV3Voice"; + /** pt-BR_CamilaNatural. */ + String PT_BR_CAMILANATURAL = "pt-BR_CamilaNatural"; /** pt-BR_IsabelaV3Voice. */ String PT_BR_ISABELAV3VOICE = "pt-BR_IsabelaV3Voice"; /** pt-BR_LucasExpressive. */ String PT_BR_LUCASEXPRESSIVE = "pt-BR_LucasExpressive"; + /** pt-BR_LucasNatural. */ + String PT_BR_LUCASNATURAL = "pt-BR_LucasNatural"; } /** From 7b0ab86ed50b6885b3d6180f865f8fe07a0d260d Mon Sep 17 00:00:00 2001 From: Angelo Paparazzi Date: Mon, 10 Nov 2025 12:44:21 -0600 Subject: [PATCH 2/2] feat(stt): add new sad_module param to recognize functions --- .../speech_to_text/v1/SpeechToText.java | 10 ++++-- .../v1/model/CreateJobOptions.java | 32 +++++++++++++++++ .../v1/model/RecognizeOptions.java | 35 ++++++++++++++++++- .../model/RecognizeWithWebsocketsOptions.java | 32 +++++++++++++++++ .../speech_to_text/v1/SpeechToTextTest.java | 6 +++- .../v1/model/CreateJobOptionsTest.java | 4 ++- .../v1/model/RecognizeOptionsTest.java | 4 ++- 7 files changed, 117 insertions(+), 6 deletions(-) diff --git a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java index e4b316cbc9..e26990f3a5 100644 --- a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java +++ b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java @@ -1,5 +1,5 @@ /* - * (C) Copyright IBM Corp. 2016, 2024. 
+ * (C) Copyright IBM Corp. 2016, 2025. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ */ /* - * IBM OpenAPI SDK Code Generator Version: 3.97.0-0e90eab1-20241120-170029 + * IBM OpenAPI SDK Code Generator Version: 3.105.0-3c13b041-20250605-193116 */ package com.ibm.watson.speech_to_text.v1; @@ -520,6 +520,9 @@ public ServiceCall recognize(RecognizeOptions recogniz "speech_detector_sensitivity", String.valueOf(recognizeOptions.speechDetectorSensitivity())); } + if (recognizeOptions.sadModule() != null) { + builder.query("sad_module", String.valueOf(recognizeOptions.sadModule())); + } if (recognizeOptions.backgroundAudioSuppression() != null) { builder.query( "background_audio_suppression", @@ -854,6 +857,9 @@ public ServiceCall createJob(CreateJobOptions createJobOptions) "speech_detector_sensitivity", String.valueOf(createJobOptions.speechDetectorSensitivity())); } + if (createJobOptions.sadModule() != null) { + builder.query("sad_module", String.valueOf(createJobOptions.sadModule())); + } if (createJobOptions.backgroundAudioSuppression() != null) { builder.query( "background_audio_suppression", diff --git a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptions.java b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptions.java index febc7d50a6..08bec5f813 100644 --- a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptions.java +++ b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptions.java @@ -270,6 +270,7 @@ public interface Events { protected Double endOfPhraseSilenceTime; protected Boolean splitTranscriptAtPhraseEnd; protected Float speechDetectorSensitivity; + protected Long sadModule; protected Float backgroundAudioSuppression; protected Boolean lowLatency; protected Float characterInsertionBias; 
@@ -306,6 +307,7 @@ public static class Builder { private Double endOfPhraseSilenceTime; private Boolean splitTranscriptAtPhraseEnd; private Float speechDetectorSensitivity; + private Long sadModule; private Float backgroundAudioSuppression; private Boolean lowLatency; private Float characterInsertionBias; @@ -346,6 +348,7 @@ private Builder(CreateJobOptions createJobOptions) { this.endOfPhraseSilenceTime = createJobOptions.endOfPhraseSilenceTime; this.splitTranscriptAtPhraseEnd = createJobOptions.splitTranscriptAtPhraseEnd; this.speechDetectorSensitivity = createJobOptions.speechDetectorSensitivity; + this.sadModule = createJobOptions.sadModule; this.backgroundAudioSuppression = createJobOptions.backgroundAudioSuppression; this.lowLatency = createJobOptions.lowLatency; this.characterInsertionBias = createJobOptions.characterInsertionBias; @@ -717,6 +720,17 @@ public Builder speechDetectorSensitivity(Float speechDetectorSensitivity) { return this; } + /** + * Set the sadModule. + * + * @param sadModule the sadModule + * @return the CreateJobOptions builder + */ + public Builder sadModule(long sadModule) { + this.sadModule = sadModule; + return this; + } + /** * Set the backgroundAudioSuppression. * @@ -797,6 +811,7 @@ protected CreateJobOptions(Builder builder) { endOfPhraseSilenceTime = builder.endOfPhraseSilenceTime; splitTranscriptAtPhraseEnd = builder.splitTranscriptAtPhraseEnd; speechDetectorSensitivity = builder.speechDetectorSensitivity; + sadModule = builder.sadModule; backgroundAudioSuppression = builder.backgroundAudioSuppression; lowLatency = builder.lowLatency; characterInsertionBias = builder.characterInsertionBias; @@ -1353,6 +1368,23 @@ public Float speechDetectorSensitivity() { return speechDetectorSensitivity; } + /** + * Gets the sadModule. + * + *

<p>Detects speech boundaries within the audio stream with better performance, improved noise + * suppression, faster responsiveness, and increased accuracy. + * + *

<p>Specify `sad_module: 2` + * + *

See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). + * + * @return the sadModule + */ + public Long sadModule() { + return sadModule; + } + /** * Gets the backgroundAudioSuppression. * diff --git a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java index 5ea02904b9..0b7ee243a2 100644 --- a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java +++ b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java @@ -237,6 +237,7 @@ public interface Model { protected Double endOfPhraseSilenceTime; protected Boolean splitTranscriptAtPhraseEnd; protected Float speechDetectorSensitivity; + protected Long sadModule; protected Float backgroundAudioSuppression; protected Boolean lowLatency; protected Float characterInsertionBias; @@ -268,6 +269,7 @@ public static class Builder { private Double endOfPhraseSilenceTime; private Boolean splitTranscriptAtPhraseEnd; private Float speechDetectorSensitivity; + private Long sadModule; private Float backgroundAudioSuppression; private Boolean lowLatency; private Float characterInsertionBias; @@ -303,6 +305,7 @@ private Builder(RecognizeOptions recognizeOptions) { this.endOfPhraseSilenceTime = recognizeOptions.endOfPhraseSilenceTime; this.splitTranscriptAtPhraseEnd = recognizeOptions.splitTranscriptAtPhraseEnd; this.speechDetectorSensitivity = recognizeOptions.speechDetectorSensitivity; + this.sadModule = recognizeOptions.sadModule; this.backgroundAudioSuppression = recognizeOptions.backgroundAudioSuppression; this.lowLatency = recognizeOptions.lowLatency; this.characterInsertionBias = recognizeOptions.characterInsertionBias; @@ -619,6 +622,17 @@ public Builder speechDetectorSensitivity(Float speechDetectorSensitivity) { return this; } + /** + * Set the sadModule. 
+ * + * @param sadModule the sadModule + * @return the RecognizeOptions builder + */ + public Builder sadModule(long sadModule) { + this.sadModule = sadModule; + return this; + } + /** * Set the backgroundAudioSuppression. * @@ -694,6 +708,7 @@ protected RecognizeOptions(Builder builder) { endOfPhraseSilenceTime = builder.endOfPhraseSilenceTime; splitTranscriptAtPhraseEnd = builder.splitTranscriptAtPhraseEnd; speechDetectorSensitivity = builder.speechDetectorSensitivity; + sadModule = builder.sadModule; backgroundAudioSuppression = builder.backgroundAudioSuppression; lowLatency = builder.lowLatency; characterInsertionBias = builder.characterInsertionBias; @@ -759,7 +774,8 @@ public String model() { * when a speech activity is detected in the stream. This can be used both in standard and low * latency mode. This feature enables client applications to know that some words/speech has been * detected and the service is in the process of decoding. This can be used in lieu of interim - * results in standard mode. See [Using speech recognition + * results in standard mode. Use `sad_module: 2` to increase accuracy and performance in detecting + * speech boundaries within the audio stream. See [Using speech recognition * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters). * * @return the speechBeginEvent @@ -1154,6 +1170,23 @@ public Float speechDetectorSensitivity() { return speechDetectorSensitivity; } + /** + * Gets the sadModule. + * + *

<p>Detects speech boundaries within the audio stream with better performance, improved noise + * suppression, faster responsiveness, and increased accuracy. + * + *

<p>Specify `sad_module: 2` + * + *

See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). + * + * @return the sadModule + */ + public Long sadModule() { + return sadModule; + } + /** * Gets the backgroundAudioSuppression. * diff --git a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeWithWebsocketsOptions.java b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeWithWebsocketsOptions.java index 6d6f85a517..fa404c61ee 100644 --- a/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeWithWebsocketsOptions.java +++ b/speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeWithWebsocketsOptions.java @@ -203,6 +203,7 @@ public interface Model { protected Float backgroundAudioSuppression; protected Boolean lowLatency; protected Float characterInsertionBias; + protected Long sadModule; private Boolean interimResults; private Boolean processingMetrics; private Float processingMetricsInterval; @@ -236,6 +237,7 @@ public static class Builder { private Float backgroundAudioSuppression; private Boolean lowLatency; private Float characterInsertionBias; + private Long sadModule; private Boolean interimResults; private Boolean processingMetrics; private Float processingMetricsInterval; @@ -268,6 +270,7 @@ private Builder(RecognizeWithWebsocketsOptions recognizeWithWebsocketsOptions) { this.backgroundAudioSuppression = recognizeWithWebsocketsOptions.backgroundAudioSuppression; this.lowLatency = recognizeWithWebsocketsOptions.lowLatency; this.characterInsertionBias = recognizeWithWebsocketsOptions.characterInsertionBias; + this.sadModule = recognizeWithWebsocketsOptions.sadModule; this.interimResults = recognizeWithWebsocketsOptions.interimResults; this.processingMetrics = recognizeWithWebsocketsOptions.processingMetrics; this.processingMetricsInterval = recognizeWithWebsocketsOptions.processingMetricsInterval; @@ -606,6 +609,17 @@ public Builder 
characterInsertionBias(Float characterInsertionBias) { return this; } + /** + * Set the sadModule. + * + * @param sadModule the sadModule + * @return the RecognizeWithWebsocketsOptions builder + */ + public Builder sadModule(Long sadModule) { + this.sadModule = sadModule; + return this; + } + /** * Set the interimResults. * @@ -687,6 +701,7 @@ protected RecognizeWithWebsocketsOptions(Builder builder) { backgroundAudioSuppression = builder.backgroundAudioSuppression; lowLatency = builder.lowLatency; characterInsertionBias = builder.characterInsertionBias; + sadModule = builder.sadModule; interimResults = builder.interimResults; processingMetrics = builder.processingMetrics; processingMetricsInterval = builder.processingMetricsInterval; @@ -1176,6 +1191,23 @@ public Float characterInsertionBias() { return characterInsertionBias; } + /** + * Gets the sadModule. + * + *

<p>Detects speech boundaries within the audio stream with better performance, improved noise + * suppression, faster responsiveness, and increased accuracy. + * + *

<p>Specify `sad_module: 2` + * + *

See [Speech Activity Detection + * (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). + * + * @return the sadModule + */ + public Long sadModule() { + return sadModule; + } + /** * Gets the interimResults. * diff --git a/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/SpeechToTextTest.java b/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/SpeechToTextTest.java index 64e1dcbd1e..2a32b0a815 100755 --- a/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/SpeechToTextTest.java +++ b/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/SpeechToTextTest.java @@ -1,5 +1,5 @@ /* - * (C) Copyright IBM Corp. 2019, 2024. + * (C) Copyright IBM Corp. 2019, 2025. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at @@ -249,6 +249,7 @@ public void testRecognizeWOptions() throws Throwable { .endOfPhraseSilenceTime(Double.valueOf("0.8")) .splitTranscriptAtPhraseEnd(false) .speechDetectorSensitivity(Float.valueOf("0.5")) + .sadModule(Long.valueOf("1")) .backgroundAudioSuppression(Float.valueOf("0.0")) .lowLatency(false) .characterInsertionBias(Float.valueOf("0.0")) @@ -296,6 +297,7 @@ public void testRecognizeWOptions() throws Throwable { assertEquals( Boolean.valueOf(query.get("split_transcript_at_phrase_end")), Boolean.valueOf(false)); assertEquals(Float.valueOf(query.get("speech_detector_sensitivity")), Float.valueOf("0.5")); + assertEquals(Long.valueOf(query.get("sad_module")), Long.valueOf("1")); assertEquals(Float.valueOf(query.get("background_audio_suppression")), Float.valueOf("0.0")); assertEquals(Boolean.valueOf(query.get("low_latency")), Boolean.valueOf(false)); assertEquals(Float.valueOf(query.get("character_insertion_bias")), Float.valueOf("0.0")); @@ -470,6 +472,7 @@ public void testCreateJobWOptions() throws Throwable { .endOfPhraseSilenceTime(Double.valueOf("0.8")) 
.splitTranscriptAtPhraseEnd(false) .speechDetectorSensitivity(Float.valueOf("0.5")) + .sadModule(Long.valueOf("1")) .backgroundAudioSuppression(Float.valueOf("0.0")) .lowLatency(false) .characterInsertionBias(Float.valueOf("0.0")) @@ -522,6 +525,7 @@ public void testCreateJobWOptions() throws Throwable { assertEquals( Boolean.valueOf(query.get("split_transcript_at_phrase_end")), Boolean.valueOf(false)); assertEquals(Float.valueOf(query.get("speech_detector_sensitivity")), Float.valueOf("0.5")); + assertEquals(Long.valueOf(query.get("sad_module")), Long.valueOf("1")); assertEquals(Float.valueOf(query.get("background_audio_suppression")), Float.valueOf("0.0")); assertEquals(Boolean.valueOf(query.get("low_latency")), Boolean.valueOf(false)); assertEquals(Float.valueOf(query.get("character_insertion_bias")), Float.valueOf("0.0")); diff --git a/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptionsTest.java b/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptionsTest.java index ba87a75647..5ea9b488a5 100644 --- a/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptionsTest.java +++ b/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptionsTest.java @@ -1,5 +1,5 @@ /* - * (C) Copyright IBM Corp. 2020, 2024. + * (C) Copyright IBM Corp. 2020, 2025. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at @@ -63,6 +63,7 @@ public void testCreateJobOptions() throws Throwable { .endOfPhraseSilenceTime(Double.valueOf("0.8")) .splitTranscriptAtPhraseEnd(false) .speechDetectorSensitivity(Float.valueOf("0.5")) + .sadModule(Long.valueOf("1")) .backgroundAudioSuppression(Float.valueOf("0.0")) .lowLatency(false) .characterInsertionBias(Float.valueOf("0.0")) @@ -99,6 +100,7 @@ public void testCreateJobOptions() throws Throwable { assertEquals(createJobOptionsModel.endOfPhraseSilenceTime(), Double.valueOf("0.8")); assertEquals(createJobOptionsModel.splitTranscriptAtPhraseEnd(), Boolean.valueOf(false)); assertEquals(createJobOptionsModel.speechDetectorSensitivity(), Float.valueOf("0.5")); + assertEquals(createJobOptionsModel.sadModule(), Long.valueOf("1")); assertEquals(createJobOptionsModel.backgroundAudioSuppression(), Float.valueOf("0.0")); assertEquals(createJobOptionsModel.lowLatency(), Boolean.valueOf(false)); assertEquals(createJobOptionsModel.characterInsertionBias(), Float.valueOf("0.0")); diff --git a/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptionsTest.java b/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptionsTest.java index fe108d4c26..3495772274 100644 --- a/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptionsTest.java +++ b/speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptionsTest.java @@ -1,5 +1,5 @@ /* - * (C) Copyright IBM Corp. 2020, 2024. + * (C) Copyright IBM Corp. 2020, 2025. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at @@ -58,6 +58,7 @@ public void testRecognizeOptions() throws Throwable { .endOfPhraseSilenceTime(Double.valueOf("0.8")) .splitTranscriptAtPhraseEnd(false) .speechDetectorSensitivity(Float.valueOf("0.5")) + .sadModule(Long.valueOf("1")) .backgroundAudioSuppression(Float.valueOf("0.0")) .lowLatency(false) .characterInsertionBias(Float.valueOf("0.0")) @@ -89,6 +90,7 @@ public void testRecognizeOptions() throws Throwable { assertEquals(recognizeOptionsModel.endOfPhraseSilenceTime(), Double.valueOf("0.8")); assertEquals(recognizeOptionsModel.splitTranscriptAtPhraseEnd(), Boolean.valueOf(false)); assertEquals(recognizeOptionsModel.speechDetectorSensitivity(), Float.valueOf("0.5")); + assertEquals(recognizeOptionsModel.sadModule(), Long.valueOf("1")); assertEquals(recognizeOptionsModel.backgroundAudioSuppression(), Float.valueOf("0.0")); assertEquals(recognizeOptionsModel.lowLatency(), Boolean.valueOf(false)); assertEquals(recognizeOptionsModel.characterInsertionBias(), Float.valueOf("0.0"));