/**
 * Copyright (c) 2010-2022 Contributors to the openHAB project
 *
 * See the NOTICE file(s) distributed with this work for additional
 * information.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0
 *
 * SPDX-License-Identifier: EPL-2.0
 */
13 package org.openhab.voice.watsonstt.internal;
import static org.openhab.voice.watsonstt.internal.WatsonSTTConstants.*;

import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.common.ThreadPoolManager;
import org.openhab.core.config.core.ConfigurableService;
import org.openhab.core.config.core.Configuration;
import org.openhab.core.voice.RecognitionStartEvent;
import org.openhab.core.voice.RecognitionStopEvent;
import org.openhab.core.voice.STTException;
import org.openhab.core.voice.STTListener;
import org.openhab.core.voice.STTService;
import org.openhab.core.voice.STTServiceHandle;
import org.openhab.core.voice.SpeechRecognitionErrorEvent;
import org.openhab.core.voice.SpeechRecognitionEvent;
import org.osgi.framework.Constants;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Modified;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.JsonObject;
import com.ibm.cloud.sdk.core.http.HttpMediaType;
import com.ibm.cloud.sdk.core.security.IamAuthenticator;
import com.ibm.watson.speech_to_text.v1.SpeechToText;
import com.ibm.watson.speech_to_text.v1.model.RecognizeWithWebsocketsOptions;
import com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionAlternative;
import com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResult;
import com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults;
import com.ibm.watson.speech_to_text.v1.websocket.RecognizeCallback;

import okhttp3.WebSocket;

61 * The {@link WatsonSTTService} allows to use Watson as Speech-to-Text engine
63 * @author Miguel Álvarez - Initial contribution
66 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
67 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
68 + " Speech-to-Text", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
69 public class WatsonSTTService implements STTService {
70 private final Logger logger = LoggerFactory.getLogger(WatsonSTTService.class);
71 private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-watsonstt");
72 private final List<String> models = List.of("ar-AR_BroadbandModel", "de-DE_BroadbandModel", "en-AU_BroadbandModel",
73 "en-GB_BroadbandModel", "en-US_BroadbandModel", "es-AR_BroadbandModel", "es-CL_BroadbandModel",
74 "es-CO_BroadbandModel", "es-ES_BroadbandModel", "es-MX_BroadbandModel", "es-PE_BroadbandModel",
75 "fr-CA_BroadbandModel", "fr-FR_BroadbandModel", "it-IT_BroadbandModel", "ja-JP_BroadbandModel",
76 "ko-KR_BroadbandModel", "nl-NL_BroadbandModel", "pt-BR_BroadbandModel", "zh-CN_BroadbandModel");
77 private final Set<Locale> supportedLocales = models.stream().map(name -> name.split("_")[0])
78 .map(Locale::forLanguageTag).collect(Collectors.toSet());
79 private WatsonSTTConfiguration config = new WatsonSTTConfiguration();
82 protected void activate(Map<String, Object> config) {
83 this.config = new Configuration(config).as(WatsonSTTConfiguration.class);
87 protected void modified(Map<String, Object> config) {
88 this.config = new Configuration(config).as(WatsonSTTConfiguration.class);
92 public String getId() {
97 public String getLabel(@Nullable Locale locale) {
102 public Set<Locale> getSupportedLocales() {
103 return supportedLocales;
107 public Set<AudioFormat> getSupportedFormats() {
108 return Set.of(AudioFormat.WAV, AudioFormat.OGG, new AudioFormat("OGG", "OPUS", null, null, null, null),
113 public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
114 throws STTException {
115 if (config.apiKey.isBlank() || config.instanceUrl.isBlank()) {
116 throw new STTException("service is not correctly configured");
118 String contentType = getContentType(audioStream);
119 if (contentType == null) {
120 throw new STTException("Unsupported format, unable to resolve audio content type");
122 logger.debug("Content-Type: {}", contentType);
123 var speechToText = new SpeechToText(new IamAuthenticator.Builder().apikey(config.apiKey).build());
124 speechToText.setServiceUrl(config.instanceUrl);
125 if (config.optOutLogging) {
126 speechToText.setDefaultHeaders(Map.of("X-Watson-Learning-Opt-Out", "1"));
128 RecognizeWithWebsocketsOptions wsOptions = new RecognizeWithWebsocketsOptions.Builder().audio(audioStream)
129 .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
130 .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
131 .backgroundAudioSuppression(config.backgroundAudioSuppression)
132 .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
134 final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>();
135 final AtomicBoolean aborted = new AtomicBoolean(false);
136 executor.submit(() -> {
137 socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
138 new TranscriptionListener(socketRef, sttListener, config, aborted)));
140 return new STTServiceHandle() {
142 public void abort() {
143 if (!aborted.getAndSet(true)) {
144 var socket = socketRef.get();
145 if (socket != null) {
146 sendStopMessage(socket);
153 private @Nullable String getContentType(AudioStream audioStream) throws STTException {
154 AudioFormat format = audioStream.getFormat();
155 String container = format.getContainer();
156 String codec = format.getCodec();
157 if (container == null || codec == null) {
158 throw new STTException("Missing audio stream info");
160 Long frequency = format.getFrequency();
161 Integer bitDepth = format.getBitDepth();
163 case AudioFormat.CONTAINER_WAVE:
164 if (AudioFormat.CODEC_PCM_SIGNED.equals(codec)) {
165 if (bitDepth == null || bitDepth != 16) {
168 // rate is a required parameter for this type
169 if (frequency == null) {
172 StringBuilder contentTypeL16 = new StringBuilder(HttpMediaType.AUDIO_PCM).append(";rate=")
174 // // those are optional
175 Integer channels = format.getChannels();
176 if (channels != null) {
177 contentTypeL16.append(";channels=").append(channels);
179 Boolean bigEndian = format.isBigEndian();
180 if (bigEndian != null) {
181 contentTypeL16.append(";")
182 .append(bigEndian ? "endianness=big-endian" : "endianness=little-endian");
184 return contentTypeL16.toString();
186 case AudioFormat.CONTAINER_OGG:
188 case AudioFormat.CODEC_VORBIS:
189 return "audio/ogg;codecs=vorbis";
191 return "audio/ogg;codecs=opus";
194 case AudioFormat.CONTAINER_NONE:
195 if (AudioFormat.CODEC_MP3.equals(codec)) {
203 private static void sendStopMessage(WebSocket ws) {
204 JsonObject stopMessage = new JsonObject();
205 stopMessage.addProperty("action", "stop");
206 ws.send(stopMessage.toString());
209 private static class TranscriptionListener implements RecognizeCallback {
210 private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class);
211 private final StringBuilder transcriptBuilder = new StringBuilder();
212 private final STTListener sttListener;
213 private final WatsonSTTConfiguration config;
214 private final AtomicBoolean aborted;
215 private final AtomicReference<@Nullable WebSocket> socketRef;
216 private float confidenceSum = 0f;
217 private int responseCount = 0;
218 private boolean disconnected = false;
220 public TranscriptionListener(AtomicReference<@Nullable WebSocket> socketRef, STTListener sttListener,
221 WatsonSTTConfiguration config, AtomicBoolean aborted) {
222 this.socketRef = socketRef;
223 this.sttListener = sttListener;
224 this.config = config;
225 this.aborted = aborted;
229 public void onTranscription(@Nullable SpeechRecognitionResults speechRecognitionResults) {
230 logger.debug("onTranscription");
231 if (speechRecognitionResults == null) {
234 speechRecognitionResults.getResults().stream().filter(SpeechRecognitionResult::isXFinal).forEach(result -> {
235 SpeechRecognitionAlternative alternative = result.getAlternatives().stream().findFirst().orElse(null);
236 if (alternative == null) {
239 logger.debug("onTranscription Final");
240 Double confidence = alternative.getConfidence();
241 transcriptBuilder.append(alternative.getTranscript());
242 confidenceSum += confidence != null ? confidence.floatValue() : 0f;
244 if (config.singleUtteranceMode) {
245 var socket = socketRef.get();
246 if (socket != null) {
247 sendStopMessage(socket);
254 public void onConnected() {
255 logger.debug("onConnected");
259 public void onError(@Nullable Exception e) {
260 var errorMessage = e != null ? e.getMessage() : null;
261 if (errorMessage != null && disconnected && errorMessage.contains("Socket closed")) {
262 logger.debug("Error ignored: {}", errorMessage);
265 logger.warn("TranscriptionError: {}", errorMessage);
266 if (!aborted.getAndSet(true)) {
267 sttListener.sttEventReceived(
268 new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
273 public void onDisconnected() {
274 logger.debug("onDisconnected");
276 if (!aborted.getAndSet(true)) {
277 sttListener.sttEventReceived(new RecognitionStopEvent());
278 float averageConfidence = confidenceSum / (float) responseCount;
279 String transcript = transcriptBuilder.toString().trim();
280 if (!transcript.isBlank()) {
281 sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence));
283 if (!config.noResultsMessage.isBlank()) {
284 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage));
286 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results"));
293 public void onInactivityTimeout(@Nullable RuntimeException e) {
295 logger.debug("InactivityTimeout: {}", e.getMessage());
300 public void onListening() {
301 logger.debug("onListening");
302 sttListener.sttEventReceived(new RecognitionStartEvent());
306 public void onTranscriptionComplete() {
307 logger.debug("onTranscriptionComplete");