]> git.basschouten.com Git - openhab-addons.git/blob
e6626b190b881dc49ee23fd759285e0d8d125941
[openhab-addons.git] /
1 /**
2  * Copyright (c) 2010-2022 Contributors to the openHAB project
3  *
4  * See the NOTICE file(s) distributed with this work for additional
5  * information.
6  *
7  * This program and the accompanying materials are made available under the
8  * terms of the Eclipse Public License 2.0 which is available at
9  * http://www.eclipse.org/legal/epl-2.0
10  *
11  * SPDX-License-Identifier: EPL-2.0
12  */
13 package org.openhab.voice.voskstt.internal;
14
15 import static org.openhab.voice.voskstt.internal.VoskSTTConstants.*;
16
17 import java.io.File;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.file.Path;
21 import java.util.Locale;
22 import java.util.Map;
23 import java.util.Set;
24 import java.util.concurrent.Future;
25 import java.util.concurrent.ScheduledExecutorService;
26 import java.util.concurrent.atomic.AtomicBoolean;
27
28 import org.eclipse.jdt.annotation.NonNullByDefault;
29 import org.eclipse.jdt.annotation.Nullable;
30 import org.openhab.core.OpenHAB;
31 import org.openhab.core.audio.AudioFormat;
32 import org.openhab.core.audio.AudioStream;
33 import org.openhab.core.common.ThreadPoolManager;
34 import org.openhab.core.config.core.ConfigurableService;
35 import org.openhab.core.config.core.Configuration;
36 import org.openhab.core.io.rest.LocaleService;
37 import org.openhab.core.voice.RecognitionStartEvent;
38 import org.openhab.core.voice.RecognitionStopEvent;
39 import org.openhab.core.voice.STTException;
40 import org.openhab.core.voice.STTListener;
41 import org.openhab.core.voice.STTService;
42 import org.openhab.core.voice.STTServiceHandle;
43 import org.openhab.core.voice.SpeechRecognitionErrorEvent;
44 import org.openhab.core.voice.SpeechRecognitionEvent;
45 import org.osgi.framework.Constants;
46 import org.osgi.service.component.annotations.Activate;
47 import org.osgi.service.component.annotations.Component;
48 import org.osgi.service.component.annotations.Deactivate;
49 import org.osgi.service.component.annotations.Modified;
50 import org.osgi.service.component.annotations.Reference;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53 import org.vosk.LibVosk;
54 import org.vosk.LogLevel;
55 import org.vosk.Model;
56 import org.vosk.Recognizer;
57
58 import com.fasterxml.jackson.databind.ObjectMapper;
59
60 /**
61  * The {@link VoskSTTService} class is a service implementation to use Vosk-API for Speech-to-Text.
62  *
63  * @author Miguel Álvarez - Initial contribution
64  */
65 @NonNullByDefault
66 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
67 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
68         + " Speech-to-Text", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
69 public class VoskSTTService implements STTService {
70     private static final String VOSK_FOLDER = Path.of(OpenHAB.getUserDataFolder(), "vosk").toString();
71     private static final String MODEL_PATH = Path.of(VOSK_FOLDER, "model").toString();
72     static {
73         Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
74         File directory = new File(VOSK_FOLDER);
75         if (!directory.exists()) {
76             if (directory.mkdir()) {
77                 logger.info("vosk dir created {}", VOSK_FOLDER);
78             }
79         }
80         try {
81             LibVosk.setLogLevel(LogLevel.WARNINGS);
82         } catch (UnsatisfiedLinkError e) {
83             logger.warn("UnsatisfiedLinkError: {}", e.getMessage());
84         }
85     }
86     private final Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
87     private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-voskstt");
88     private final LocaleService localeService;
89     private VoskSTTConfiguration config = new VoskSTTConfiguration();
90     private @Nullable Model model;
91
92     @Activate
93     public VoskSTTService(@Reference LocaleService localeService) {
94         this.localeService = localeService;
95     }
96
97     @Activate
98     protected void activate(Map<String, Object> config) {
99         configChange(config);
100     }
101
102     @Modified
103     protected void modified(Map<String, Object> config) {
104         configChange(config);
105     }
106
107     @Deactivate
108     protected void deactivate(Map<String, Object> config) {
109         try {
110             unloadModel();
111         } catch (IOException e) {
112             logger.warn("IOException unloading model: {}", e.getMessage());
113         }
114     }
115
116     private void configChange(Map<String, Object> config) {
117         this.config = new Configuration(config).as(VoskSTTConfiguration.class);
118         if (this.config.preloadModel) {
119             try {
120                 loadModel();
121             } catch (IOException e) {
122                 logger.warn("IOException loading model: {}", e.getMessage());
123             } catch (UnsatisfiedLinkError e) {
124                 logger.warn("Missing native dependency: {}", e.getMessage());
125             }
126         } else {
127             try {
128                 unloadModel();
129             } catch (IOException e) {
130                 logger.warn("IOException unloading model: {}", e.getMessage());
131             }
132         }
133     }
134
135     @Override
136     public String getId() {
137         return SERVICE_ID;
138     }
139
140     @Override
141     public String getLabel(@Nullable Locale locale) {
142         return SERVICE_NAME;
143     }
144
145     @Override
146     public Set<Locale> getSupportedLocales() {
147         // as it is not possible to determine the language of the model that was downloaded and setup by the user, it is
148         // assumed the language of the model is matching the locale of the openHAB server
149         return Set.of(localeService.getLocale(null));
150     }
151
152     @Override
153     public Set<AudioFormat> getSupportedFormats() {
154         return Set.of(
155                 new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, null, null, 16000L));
156     }
157
158     @Override
159     public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
160             throws STTException {
161         AtomicBoolean aborted = new AtomicBoolean(false);
162         try {
163             var frequency = audioStream.getFormat().getFrequency();
164             if (frequency == null) {
165                 throw new IOException("missing audio stream frequency");
166             }
167             backgroundRecognize(sttListener, audioStream, frequency, aborted);
168         } catch (IOException e) {
169             throw new STTException(e);
170         }
171         return () -> {
172             aborted.set(true);
173         };
174     }
175
176     private Model getModel() throws IOException, UnsatisfiedLinkError {
177         var model = this.model;
178         if (model != null) {
179             return model;
180         }
181         return loadModel();
182     }
183
184     private Model loadModel() throws IOException, UnsatisfiedLinkError {
185         unloadModel();
186         var modelFile = new File(MODEL_PATH);
187         if (!modelFile.exists() || !modelFile.isDirectory()) {
188             throw new IOException("missing model dir: " + MODEL_PATH);
189         }
190         logger.debug("loading model");
191         var model = new Model(MODEL_PATH);
192         if (config.preloadModel) {
193             this.model = model;
194         }
195         return model;
196     }
197
198     private void unloadModel() throws IOException {
199         var model = this.model;
200         if (model != null) {
201             logger.debug("unloading model");
202             model.close();
203             this.model = null;
204         }
205     }
206
207     private Future<?> backgroundRecognize(STTListener sttListener, InputStream audioStream, long frequency,
208             AtomicBoolean aborted) {
209         StringBuilder transcriptBuilder = new StringBuilder();
210         long maxTranscriptionMillis = (config.maxTranscriptionSeconds * 1000L);
211         long maxSilenceMillis = (config.maxSilenceSeconds * 1000L);
212         long startTime = System.currentTimeMillis();
213         return executor.submit(() -> {
214             Recognizer recognizer = null;
215             Model model = null;
216             try {
217                 model = getModel();
218                 recognizer = new Recognizer(model, frequency);
219                 long lastInputTime = System.currentTimeMillis();
220                 int nbytes;
221                 byte[] b = new byte[4096];
222                 sttListener.sttEventReceived(new RecognitionStartEvent());
223                 while (!aborted.get()) {
224                     nbytes = audioStream.read(b);
225                     if (aborted.get()) {
226                         break;
227                     }
228                     if (isExpiredInterval(maxTranscriptionMillis, startTime)) {
229                         logger.debug("Stops listening, max transcription time reached");
230                         break;
231                     }
232                     if (!config.singleUtteranceMode && isExpiredInterval(maxSilenceMillis, lastInputTime)) {
233                         logger.debug("Stops listening, max silence time reached");
234                         break;
235                     }
236                     if (nbytes == 0) {
237                         trySleep(100);
238                         continue;
239                     }
240                     if (recognizer.acceptWaveForm(b, nbytes)) {
241                         lastInputTime = System.currentTimeMillis();
242                         var result = recognizer.getResult();
243                         logger.debug("Result: {}", result);
244                         ObjectMapper mapper = new ObjectMapper();
245                         var json = mapper.readTree(result);
246                         transcriptBuilder.append(json.get("text").asText()).append(" ");
247                         if (config.singleUtteranceMode) {
248                             break;
249                         }
250                     } else {
251                         logger.debug("Partial: {}", recognizer.getPartialResult());
252                     }
253                 }
254                 if (!aborted.get()) {
255                     sttListener.sttEventReceived(new RecognitionStopEvent());
256                     var transcript = transcriptBuilder.toString().trim();
257                     logger.debug("Final: {}", transcript);
258                     if (!transcript.isBlank()) {
259                         sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, 1F));
260                     } else {
261                         if (!config.noResultsMessage.isBlank()) {
262                             sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage));
263                         } else {
264                             sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results"));
265                         }
266                     }
267                 }
268             } catch (IOException e) {
269                 logger.warn("Error running speech to text: {}", e.getMessage());
270                 if (config.errorMessage.isBlank()) {
271                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
272                 } else {
273                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
274                 }
275             } catch (UnsatisfiedLinkError e) {
276                 logger.warn("Missing native dependency: {}", e.getMessage());
277                 if (config.errorMessage.isBlank()) {
278                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
279                 } else {
280                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
281                 }
282             } finally {
283                 if (recognizer != null) {
284                     recognizer.close();
285                 }
286                 if (!config.preloadModel && model != null) {
287                     model.close();
288                 }
289             }
290             try {
291                 audioStream.close();
292             } catch (IOException e) {
293                 logger.warn("IOException on close: {}", e.getMessage());
294             }
295         });
296     }
297
298     private void trySleep(long ms) {
299         try {
300             Thread.sleep(ms);
301         } catch (InterruptedException ignored) {
302         }
303     }
304
305     private boolean isExpiredInterval(long interval, long referenceTime) {
306         return System.currentTimeMillis() - referenceTime > interval;
307     }
308 }