]> git.basschouten.com Git - openhab-addons.git/commitdiff
[watsonstt] use next gen model (#12971)
authorGiviMAD <GiviMAD@users.noreply.github.com>
Wed, 22 Jun 2022 06:54:13 +0000 (08:54 +0200)
committerGitHub <noreply@github.com>
Wed, 22 Jun 2022 06:54:13 +0000 (08:54 +0200)
Signed-off-by: Miguel Álvarez <miguelwork92@gmail.com>
bundles/org.openhab.voice.watsonstt/README.md
bundles/org.openhab.voice.watsonstt/pom.xml
bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java
bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java
bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml
bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties

index 554829e57a7a57ac13d09b7723f79088655033c1..46ba04c50e06a9917ca6ddc05b07f83cdbfcdae9 100644 (file)
@@ -24,6 +24,7 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat
 
 Use your favorite configuration UI to edit **Settings / Other Services - IBM Watson Speech-to-Text**:
 
+* **Prefer Multimedia Model** - Prefer multimedia to telephony [models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng). Multimedia models are intended for audio that has a minimum sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.
 * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
 * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
 * **Single Utterance Mode** - When enabled recognition stops listening after a single utterance.
index 1e0c4d15336890b514aa7ad015e823eed5941435..45cbf5d02ed8af16c642de93a3489cddd417b494 100644 (file)
     <dependency>
       <groupId>com.ibm.watson</groupId>
       <artifactId>speech-to-text</artifactId>
-      <version>9.3.1</version>
+      <version>10.0.1</version>
       <scope>compile</scope>
     </dependency>
     <!-- sdk deps -->
     <dependency>
       <groupId>com.ibm.cloud</groupId>
       <artifactId>sdk-core</artifactId>
-      <version>9.15.4</version>
+      <version>9.15.5</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>com.ibm.watson</groupId>
       <artifactId>common</artifactId>
-      <version>9.3.1</version>
+      <version>10.0.1</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
@@ -51,7 +51,7 @@
     <dependency>
       <groupId>org.jetbrains.kotlin</groupId>
       <artifactId>kotlin-stdlib</artifactId>
-      <version>1.4.10</version>
+      <version>1.4.32</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
index 5358d92d7d72d01f1b11c46c9fe74c3ba80a47e3..0dd2b4f930e39e79442b938a67c17ac6515dc77e 100644 (file)
@@ -30,6 +30,11 @@ public class WatsonSTTConfiguration {
      * Url for Speech-to-Text instance created on IBM Cloud.
      */
     public String instanceUrl = "";
+    /**
+     * Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum sampling rate
+     * of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.
+     */
+    public boolean preferMultimediaModel = true;
     /**
      * Use the parameter to suppress side conversations or background noise.
      */
index 9454a778a4e8fb88cb252cf62fe8f31e3154bd54..5a90252df21d147deaa4fd03417b14c5b5dc2bd8 100644 (file)
@@ -22,6 +22,7 @@ import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import org.eclipse.jdt.annotation.NonNullByDefault;
 import org.eclipse.jdt.annotation.Nullable;
@@ -69,13 +70,18 @@ import okhttp3.WebSocket;
 public class WatsonSTTService implements STTService {
     private final Logger logger = LoggerFactory.getLogger(WatsonSTTService.class);
     private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-watsonstt");
-    private final List<String> models = List.of("ar-AR_BroadbandModel", "de-DE_BroadbandModel", "en-AU_BroadbandModel",
-            "en-GB_BroadbandModel", "en-US_BroadbandModel", "es-AR_BroadbandModel", "es-CL_BroadbandModel",
-            "es-CO_BroadbandModel", "es-ES_BroadbandModel", "es-MX_BroadbandModel", "es-PE_BroadbandModel",
-            "fr-CA_BroadbandModel", "fr-FR_BroadbandModel", "it-IT_BroadbandModel", "ja-JP_BroadbandModel",
-            "ko-KR_BroadbandModel", "nl-NL_BroadbandModel", "pt-BR_BroadbandModel", "zh-CN_BroadbandModel");
-    private final Set<Locale> supportedLocales = models.stream().map(name -> name.split("_")[0])
-            .map(Locale::forLanguageTag).collect(Collectors.toSet());
+    private final List<String> telephonyModels = List.of("ar-MS_Telephony", "zh-CN_Telephony", "nl-BE_Telephony",
+            "nl-NL_Telephony", "en-AU_Telephony", "en-IN_Telephony", "en-GB_Telephony", "en-US_Telephony",
+            "fr-CA_Telephony", "fr-FR_Telephony", "hi-IN_Telephony", "pt-BR_Telephony", "es-ES_Telephony");
+    private final List<String> multimediaModels = List.of("en-AU_Multimedia", "en-GB_Multimedia", "en-US_Multimedia",
+            "fr-FR_Multimedia", "de-DE_Multimedia", "it-IT_Multimedia", "ja-JP_Multimedia", "ko-KR_Multimedia",
+            "pt-BR_Multimedia", "es-ES_Multimedia");
+    // models 'es-LA_Telephony' and 'en-WW_Medical_Telephony' are used as fallbacks for es and en, respectively
+    private final List<Locale> fallbackLocales = List.of(Locale.forLanguageTag("es"), Locale.ENGLISH);
+    private final Set<Locale> supportedLocales = Stream
+            .concat(Stream.concat(telephonyModels.stream(), multimediaModels.stream()).map(name -> name.split("_")[0])
+                    .distinct().map(Locale::forLanguageTag), fallbackLocales.stream())
+            .collect(Collectors.toSet());
     private WatsonSTTConfiguration config = new WatsonSTTConfiguration();
     private @Nullable SpeechToText speechToText = null;
 
@@ -134,7 +140,7 @@ public class WatsonSTTService implements STTService {
         logger.debug("Content-Type: {}", contentType);
         RecognizeWithWebsocketsOptions wsOptions = new RecognizeWithWebsocketsOptions.Builder().audio(audioStream)
                 .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
-                .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
+                .model(getModel(locale)).interimResults(true)
                 .backgroundAudioSuppression(config.backgroundAudioSuppression)
                 .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
                 .build();
@@ -157,6 +163,33 @@ public class WatsonSTTService implements STTService {
         };
     }
 
+    /**
+     * Selects the name of the Watson model to request for the given locale.
+     * <p>
+     * Models whose name starts with the locale's language tag are searched in the order given by the
+     * {@code preferMultimediaModel} setting (multimedia first when enabled, telephony first otherwise). If none
+     * matches, Spanish locales fall back to 'es-LA_Telephony' and English locales to 'en-WW_Medical_Telephony'.
+     *
+     * @param locale the locale of the audio to transcribe
+     * @return the name of the model to use
+     * @throws STTException if neither a matching model nor a fallback exists for the locale
+     */
+    private String getModel(Locale locale) throws STTException {
+        String languageTag = locale.toLanguageTag();
+        Stream<String> allModels;
+        if (config.preferMultimediaModel) {
+            allModels = Stream.concat(multimediaModels.stream(), telephonyModels.stream());
+        } else {
+            allModels = Stream.concat(telephonyModels.stream(), multimediaModels.stream());
+        }
+        var modelOption = allModels.filter(model -> model.startsWith(languageTag)).findFirst();
+        if (modelOption.isEmpty()) {
+            if ("es".equals(locale.getLanguage())) {
+                // fallback for latin american spanish languages
+                var model = "es-LA_Telephony";
+                logger.debug("Falling back to model: {}", model);
+                // the fallback must be returned, otherwise execution reaches the exception below
+                return model;
+            }
+            if ("en".equals(locale.getLanguage())) {
+                // fallback english dialects
+                var model = "en-WW_Medical_Telephony";
+                logger.debug("Falling back to model: {}", model);
+                // the fallback must be returned, otherwise execution reaches the exception below
+                return model;
+            }
+            throw new STTException("No compatible model for language " + languageTag);
+        }
+        var model = modelOption.get();
+        logger.debug("Using model: {}", model);
+        return model;
+    }
+
     private @Nullable String getContentType(AudioStream audioStream) throws STTException {
         AudioFormat format = audioStream.getFormat();
         String container = format.getContainer();
index 3be580499ded1f2f44f4465b016fbb40b13ea157..ed54844ae97fc54c629424a949e1bac711e4976c 100644 (file)
                        <label>Instance Url</label>
                        <description>Url for Speech-to-Text instance created on IBM Cloud.</description>
                </parameter>
+               <parameter name="preferMultimediaModel" type="boolean" groupName="stt">
+                       <label>Prefer Multimedia Model</label>
+                       <description>Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum
+                               sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.</description>
+                       <default>true</default>
+               </parameter>
                <parameter name="backgroundAudioSuppression" type="decimal" min="0" max="1" step="0.1" groupName="stt">
                        <label>Background Audio Suppression</label>
                        <description>Use the parameter to suppress side conversations or background noise.</description>
index 29d5c40563a722b3c10b2965fef28c26ce1c0e2c..6ca306aac5fe4e9fdbec18ef27e761d3a1943792 100644 (file)
@@ -14,6 +14,8 @@ voice.config.watsonstt.noResultsMessage.label = No Results Message
 voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
 voice.config.watsonstt.optOutLogging.label = Opt Out Logging
 voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
+voice.config.watsonstt.preferMultimediaModel.label = Prefer Multimedia Model
+voice.config.watsonstt.preferMultimediaModel.description = Prefer multimedia to telephony models. Multimedia models are intended for audio that has a minimum sampling rate of 16 kHz, while telephony models are intended for audio that has a minimum sampling rate of 8 kHz.
 voice.config.watsonstt.redaction.label = Redaction
 voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
 voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode