]> git.basschouten.com Git - openhab-addons.git/commitdiff
STT service improvements (#12453)
authorGiviMAD <GiviMAD@users.noreply.github.com>
Sat, 12 Mar 2022 22:06:51 +0000 (23:06 +0100)
committerGitHub <noreply@github.com>
Sat, 12 Mar 2022 22:06:51 +0000 (23:06 +0100)
* [googlestt|voskstt] change default maxSilenceSeconds to 3
* [watsonstt] add singleUtterance mode, rename inactivityTimeout to maxSilenceSeconds and minor improvements
* [watsonstt] trim transcription

Signed-off-by: Miguel Álvarez Díez <miguelwork92@gmail.com>
bundles/org.openhab.voice.googlestt/src/main/java/org/openhab/voice/googlestt/internal/GoogleSTTConfiguration.java
bundles/org.openhab.voice.googlestt/src/main/resources/OH-INF/config/config.xml
bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java
bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml
bundles/org.openhab.voice.watsonstt/README.md
bundles/org.openhab.voice.watsonstt/pom.xml
bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTConfiguration.java
bundles/org.openhab.voice.watsonstt/src/main/java/org/openhab/voice/watsonstt/internal/WatsonSTTService.java
bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/config/config.xml
bundles/org.openhab.voice.watsonstt/src/main/resources/OH-INF/i18n/watsonstt.properties

index a844bdb7b7cb7085b61858dce62450c94eb64a94..4811dc1c7d7bffc02590b75ff52cb3f69996c9db 100644 (file)
@@ -49,7 +49,7 @@ public class GoogleSTTConfiguration {
      * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
      * listening.
      */
-    public int maxSilenceSeconds = 5;
+    public int maxSilenceSeconds = 3;
     /**
      * Single phrase mode.
      */
index 1711473833497adc18597fefd7ee309677cdc93f..58a0d9b0b1717830dcb604edb2331a10e53f133d 100644 (file)
@@ -46,7 +46,7 @@
                        <label>Max Silence Seconds</label>
                        <description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
                                listening.</description>
-                       <default>5</default>
+                       <default>3</default>
                </parameter>
                <parameter name="refreshSupportedLocales" type="boolean" groupName="stt">
                        <label>Refresh Supported Locales</label>
index 1f09cf98ddc9c6b72945c3dee2947fc430ab1d35..b4ebc9a739ee3db61902ca090666a84bf2de5612 100644 (file)
@@ -33,7 +33,7 @@ public class VoskSTTConfiguration {
      * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
      * listening.
      */
-    public int maxSilenceSeconds = 5;
+    public int maxSilenceSeconds = 3;
     /**
      * Message to be told when no results.
      */
index 627b4d6970b2027ab3a320980056d0eb3200f2de..1a6e37c1c28e21c7332bfb5324052a4aa6b025e5 100644 (file)
@@ -27,7 +27,7 @@
                        <label>Max Silence Seconds</label>
                        <description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
                                listening.</description>
-                       <default>5</default>
+                       <default>3</default>
                </parameter>
                <parameter name="preloadModel" type="boolean" groupName="stt">
                        <label>Preload Model</label>
index adcfcf970218c8ee1f300d5257b55de01711fa09..554829e57a7a57ac13d09b7723f79088655033c1 100644 (file)
@@ -26,7 +26,8 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat
 
 * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
 * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
-* **Inactivity Timeout** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
+* **Single Utterance Mode** - When enabled recognition stops listening after a single utterance.
+* **Max Silence Seconds** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
 * **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
 * **No Results Message** - Message to be told when no results.
 * **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
@@ -43,7 +44,8 @@ org.openhab.voice.watsonstt:apiKey=******
 org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/*****
 org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5
 org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5
-org.openhab.voice.watsonstt:inactivityTimeout=2
+org.openhab.voice.watsonstt:singleUtteranceMode=true
+org.openhab.voice.watsonstt:maxSilenceSeconds=2
 org.openhab.voice.watsonstt:optOutLogging=false
 org.openhab.voice.watsonstt:smartFormatting=false
 org.openhab.voice.watsonstt:redaction=false
index 30ad6db8be30ef235697074c0d8b717f9207ce9f..1e0c4d15336890b514aa7ad015e823eed5941435 100644 (file)
@@ -27,7 +27,7 @@
     <dependency>
       <groupId>com.ibm.cloud</groupId>
       <artifactId>sdk-core</artifactId>
-      <version>9.15.0</version>
+      <version>9.15.4</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
     <dependency>
       <groupId>com.squareup.okhttp3</groupId>
       <artifactId>okhttp</artifactId>
-      <version>4.9.1</version>
+      <version>4.9.3</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>com.squareup.okhttp3</groupId>
       <artifactId>okhttp-urlconnection</artifactId>
-      <version>4.9.1</version>
+      <version>4.9.3</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
index b7785541e3a0b639caaf382df48640bbab8a6f48..5358d92d7d72d01f1b11c46c9fe74c3ba80a47e3 100644 (file)
@@ -48,9 +48,13 @@ public class WatsonSTTConfiguration {
      */
     public boolean redaction = false;
     /**
-     * The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
+     * Single phrase mode.
      */
-    public int inactivityTimeout = 3;
+    public boolean singleUtteranceMode = true;
+    /**
+     * max seconds without getting new transcriptions to stop listening.
+     */
+    public int maxSilenceSeconds = 3;
     /**
      * Message to be told when no results
      */
index ebd5c0759a069eb3c0e7461185e90d80adaefdf8..311ebb769c1882e98a9ccf18629b28e5aa968118 100644 (file)
@@ -23,8 +23,6 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;
 
-import javax.net.ssl.SSLPeerUnverifiedException;
-
 import org.eclipse.jdt.annotation.NonNullByDefault;
 import org.eclipse.jdt.annotation.Nullable;
 import org.openhab.core.audio.AudioFormat;
@@ -47,6 +45,7 @@ import org.osgi.service.component.annotations.Modified;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.gson.JsonObject;
 import com.ibm.cloud.sdk.core.http.HttpMediaType;
 import com.ibm.cloud.sdk.core.security.IamAuthenticator;
 import com.ibm.watson.speech_to_text.v1.SpeechToText;
@@ -130,31 +129,13 @@ public class WatsonSTTService implements STTService {
                 .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
                 .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
                 .backgroundAudioSuppression(config.backgroundAudioSuppression)
-                .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.inactivityTimeout)
+                .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
                 .build();
         final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>();
         final AtomicBoolean aborted = new AtomicBoolean(false);
         executor.submit(() -> {
-            int retries = 2;
-            while (retries > 0) {
-                try {
-                    socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
-                            new TranscriptionListener(sttListener, config, aborted)));
-                    break;
-                } catch (RuntimeException e) {
-                    var cause = e.getCause();
-                    if (cause instanceof SSLPeerUnverifiedException) {
-                        logger.debug("Retrying on error: {}", cause.getMessage());
-                        retries--;
-                    } else {
-                        var errorMessage = e.getMessage();
-                        logger.warn("Aborting on error: {}", errorMessage);
-                        sttListener.sttEventReceived(
-                                new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
-                        break;
-                    }
-                }
-            }
+            socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
+                    new TranscriptionListener(socketRef, sttListener, config, aborted)));
         });
         return new STTServiceHandle() {
             @Override
@@ -162,12 +143,7 @@ public class WatsonSTTService implements STTService {
                 if (!aborted.getAndSet(true)) {
                     var socket = socketRef.get();
                     if (socket != null) {
-                        socket.close(1000, null);
-                        socket.cancel();
-                        try {
-                            Thread.sleep(100);
-                        } catch (InterruptedException ignored) {
-                        }
+                        sendStopMessage(socket);
                     }
                 }
             }
@@ -224,17 +200,26 @@ public class WatsonSTTService implements STTService {
         return null;
     }
 
+    private static void sendStopMessage(WebSocket ws) {
+        JsonObject stopMessage = new JsonObject();
+        stopMessage.addProperty("action", "stop");
+        ws.send(stopMessage.toString());
+    }
+
     private static class TranscriptionListener implements RecognizeCallback {
         private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class);
         private final StringBuilder transcriptBuilder = new StringBuilder();
         private final STTListener sttListener;
         private final WatsonSTTConfiguration config;
         private final AtomicBoolean aborted;
+        private final AtomicReference<@Nullable WebSocket> socketRef;
         private float confidenceSum = 0f;
         private int responseCount = 0;
         private boolean disconnected = false;
 
-        public TranscriptionListener(STTListener sttListener, WatsonSTTConfiguration config, AtomicBoolean aborted) {
+        public TranscriptionListener(AtomicReference<@Nullable WebSocket> socketRef, STTListener sttListener,
+                WatsonSTTConfiguration config, AtomicBoolean aborted) {
+            this.socketRef = socketRef;
             this.sttListener = sttListener;
             this.config = config;
             this.aborted = aborted;
@@ -256,6 +241,12 @@ public class WatsonSTTService implements STTService {
                 transcriptBuilder.append(alternative.getTranscript());
                 confidenceSum += confidence != null ? confidence.floatValue() : 0f;
                 responseCount++;
+                if (config.singleUtteranceMode) {
+                    var socket = socketRef.get();
+                    if (socket != null) {
+                        sendStopMessage(socket);
+                    }
+                }
             });
         }
 
@@ -272,7 +263,7 @@ public class WatsonSTTService implements STTService {
                 return;
             }
             logger.warn("TranscriptionError: {}", errorMessage);
-            if (!aborted.get()) {
+            if (!aborted.getAndSet(true)) {
                 sttListener.sttEventReceived(
                         new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
             }
@@ -285,7 +276,7 @@ public class WatsonSTTService implements STTService {
             if (!aborted.getAndSet(true)) {
                 sttListener.sttEventReceived(new RecognitionStopEvent());
                 float averageConfidence = confidenceSum / (float) responseCount;
-                String transcript = transcriptBuilder.toString();
+                String transcript = transcriptBuilder.toString().trim();
                 if (!transcript.isBlank()) {
                     sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence));
                 } else {
index 4dbd2a28124f21c91a63b293f2a443092f6c3d93..3be580499ded1f2f44f4465b016fbb40b13ea157 100644 (file)
@@ -32,8 +32,8 @@
                        <description>Use the parameter to suppress word insertions from music, coughing, and other non-speech events.</description>
                        <default>0.5</default>
                </parameter>
-               <parameter name="inactivityTimeout" type="integer" unit="s" groupName="stt">
-                       <label>Inactivity Timeout</label>
+               <parameter name="maxSilenceSeconds" type="integer" unit="s" groupName="stt">
+                       <label>Max Silence Seconds</label>
                        <description>The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is
                                closed.</description>
                        <default>3</default>
                        <description>Message to be told when no transcription is done.</description>
                        <default>No results</default>
                </parameter>
+               <parameter name="singleUtteranceMode" type="boolean" groupName="stt">
+                       <label>Single Utterance Mode</label>
+                       <description>When enabled recognition stops listening after a single utterance.</description>
+                       <default>true</default>
+               </parameter>
                <parameter name="optOutLogging" type="boolean" groupName="stt">
                        <label>Opt Out Logging</label>
                        <description>By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the
index eebbd4792271edd2f9b5eb06905c1e68d90c98de..29d5c40563a722b3c10b2965fef28c26ce1c0e2c 100644 (file)
@@ -6,16 +6,18 @@ voice.config.watsonstt.group.authentication.label = Authentication
 voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance.
 voice.config.watsonstt.group.stt.label = STT Configuration
 voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API.
-voice.config.watsonstt.inactivityTimeout.label = Inactivity Timeout
-voice.config.watsonstt.inactivityTimeout.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
 voice.config.watsonstt.instanceUrl.label = Instance Url
 voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud.
+voice.config.watsonstt.maxSilenceSeconds.label = Max Silence Seconds
+voice.config.watsonstt.maxSilenceSeconds.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
 voice.config.watsonstt.noResultsMessage.label = No Results Message
 voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
 voice.config.watsonstt.optOutLogging.label = Opt Out Logging
 voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
 voice.config.watsonstt.redaction.label = Redaction
 voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
+voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode
+voice.config.watsonstt.singleUtteranceMode.description = When enabled recognition stops listening after a single utterance.
 voice.config.watsonstt.smartFormatting.label = Smart Formatting
 voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
 voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity