2 * Copyright (c) 2010-2022 Contributors to the openHAB project
4 * See the NOTICE file(s) distributed with this work for additional
7 * This program and the accompanying materials are made available under the
8 * terms of the Eclipse Public License 2.0 which is available at
9 * http://www.eclipse.org/legal/epl-2.0
11 * SPDX-License-Identifier: EPL-2.0
13 package org.openhab.voice.voskstt.internal;
15 import static org.openhab.voice.voskstt.internal.VoskSTTConstants.*;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.file.Path;
21 import java.util.Locale;
24 import java.util.concurrent.Future;
25 import java.util.concurrent.ScheduledExecutorService;
26 import java.util.concurrent.atomic.AtomicBoolean;
28 import org.eclipse.jdt.annotation.NonNullByDefault;
29 import org.eclipse.jdt.annotation.Nullable;
30 import org.openhab.core.OpenHAB;
31 import org.openhab.core.audio.AudioFormat;
32 import org.openhab.core.audio.AudioStream;
33 import org.openhab.core.common.ThreadPoolManager;
34 import org.openhab.core.config.core.ConfigurableService;
35 import org.openhab.core.config.core.Configuration;
36 import org.openhab.core.io.rest.LocaleService;
37 import org.openhab.core.voice.RecognitionStartEvent;
38 import org.openhab.core.voice.RecognitionStopEvent;
39 import org.openhab.core.voice.STTException;
40 import org.openhab.core.voice.STTListener;
41 import org.openhab.core.voice.STTService;
42 import org.openhab.core.voice.STTServiceHandle;
43 import org.openhab.core.voice.SpeechRecognitionErrorEvent;
44 import org.openhab.core.voice.SpeechRecognitionEvent;
45 import org.osgi.framework.Constants;
46 import org.osgi.service.component.annotations.Activate;
47 import org.osgi.service.component.annotations.Component;
48 import org.osgi.service.component.annotations.Deactivate;
49 import org.osgi.service.component.annotations.Modified;
50 import org.osgi.service.component.annotations.Reference;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53 import org.vosk.LibVosk;
54 import org.vosk.LogLevel;
55 import org.vosk.Model;
56 import org.vosk.Recognizer;
58 import com.fasterxml.jackson.databind.ObjectMapper;
61 * The {@link VoskSTTService} class is a service implementation to use Vosk-API for Speech-to-Text.
63 * @author Miguel Álvarez - Initial contribution
66 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
67 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
68 + " Speech-to-Text", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
69 public class VoskSTTService implements STTService {
// Folder named "vosk" inside the openHAB user data folder.
70 private static final String VOSK_FOLDER = Path.of(OpenHAB.getUserDataFolder(), "vosk").toString();
// Entry named "model" inside VOSK_FOLDER; loadModel() requires it to exist and to be a directory.
71 private static final String MODEL_PATH = Path.of(VOSK_FOLDER, "model").toString();
// One-time setup: creates the vosk folder when it is missing and sets the Vosk log level to WARNINGS.
// NOTE(review): the header of the enclosing block is not shown in this excerpt — presumably a static
// initializer; confirm.
73 Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
74 File directory = new File(VOSK_FOLDER);
75 if (!directory.exists()) {
// The info message is only written when mkdir() reports that the folder was created.
76 if (directory.mkdir()) {
77 logger.info("vosk dir created {}", VOSK_FOLDER);
// An UnsatisfiedLinkError raised while setting the log level is only logged as a warning.
81 LibVosk.setLogLevel(LogLevel.WARNINGS);
82 } catch (UnsatisfiedLinkError e) {
83 logger.warn("UnsatisfiedLinkError: {}", e.getMessage());
86 private final Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
// Scheduled pool "OH-voice-voskstt"; backgroundRecognize() submits its recognition task to it.
87 private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-voskstt");
// Queried by getSupportedLocales() for the locale reported by this service.
88 private final LocaleService localeService;
// Current settings; replaced as a whole by configChange().
89 private VoskSTTConfiguration config = new VoskSTTConfiguration();
// Vosk model held by the service, if any; read by getModel() and unloadModel().
90 private @Nullable Model model;
/**
 * Creates the service and stores the injected locale service.
 *
 * @param localeService locale service reference, kept for {@code getSupportedLocales()}
 */
93 public VoskSTTService(@Reference LocaleService localeService) {
94 this.localeService = localeService;
/**
 * Receives the service configuration on activation.
 * NOTE(review): the body is not shown in this excerpt — presumably it applies the configuration the same way
 * as modified(); confirm.
 *
 * @param config configuration properties
 */
98 protected void activate(Map<String, Object> config) {
/**
 * Applies an updated configuration by delegating to configChange().
 *
 * @param config new configuration properties
 */
103 protected void modified(Map<String, Object> config) {
104 configChange(config);
/**
 * Deactivation callback.
 * NOTE(review): the guarded call is not shown in this excerpt — judging by the log message it is presumably
 * unloadModel(); confirm.
 *
 * @param config configuration properties; not referenced in the visible lines
 */
108 protected void deactivate(Map<String, Object> config) {
// An IOException is logged as a warning.
111 } catch (IOException e) {
112 logger.warn("IOException unloading model: {}", e.getMessage());
/**
 * Replaces the current configuration with one built from the given properties and reacts to the
 * {@code preloadModel} setting. Failures are written to the log as warnings.
 *
 * @param config raw configuration properties
 */
116 private void configChange(Map<String, Object> config) {
// Map the raw properties onto a fresh VoskSTTConfiguration instance.
117 this.config = new Configuration(config).as(VoskSTTConfiguration.class);
118 if (this.config.preloadModel) {
// NOTE(review): the guarded call is not shown in this excerpt — judging by the log messages it presumably
// loads the model; confirm.
121 } catch (IOException e) {
122 logger.warn("IOException loading model: {}", e.getMessage());
123 } catch (UnsatisfiedLinkError e) {
124 logger.warn("Missing native dependency: {}", e.getMessage());
// NOTE(review): the lines in between are not shown — this handler presumably belongs to the branch taken
// when preloadModel is off, which unloads the model; confirm.
129 } catch (IOException e) {
130 logger.warn("IOException unloading model: {}", e.getMessage());
/**
 * Returns the identifier of this service.
 * NOTE(review): the return statement is not shown in this excerpt — confirm the returned value.
 */
136 public String getId() {
/**
 * Returns the label of this service.
 * NOTE(review): the return statement is not shown in this excerpt — confirm whether the locale is used.
 *
 * @param locale requested locale, may be {@code null}
 */
141 public String getLabel(@Nullable Locale locale) {
/**
 * Reports exactly one supported locale: the one returned by {@code localeService.getLocale(null)}.
 *
 * @return an immutable single-element set
 */
146 public Set<Locale> getSupportedLocales() {
147 // As the language of the model that was downloaded and set up by the user cannot be determined, the
148 // language of the model is assumed to match the locale of the openHAB server.
149 return Set.of(localeService.getLocale(null));
/**
 * Declares the audio format accepted by this service: WAVE container with signed PCM codec.
 * NOTE(review): the line that opens the return expression is not shown in this excerpt.
 */
153 public Set<AudioFormat> getSupportedFormats() {
// NOTE(review): 16000L is presumably the sample frequency in Hz, and "false, null, null" presumably mean
// little-endian with unspecified bit depth and bit rate — confirm against the AudioFormat constructor.
155 new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, null, null, 16000L));
/**
 * Starts the recognition of the given audio stream as a background task.
 *
 * @param sttListener receiver of the recognition events
 * @param audioStream audio to transcribe; its format has to report a frequency
 * @param locale not referenced in the visible lines
 * @param set not referenced in the visible lines
 * @return NOTE(review): the return statement is not shown in this excerpt — presumably a handle that sets
 *         the abort flag; confirm
 * @throws STTException wrapping the IOException raised when the stream format reports no frequency
 */
159 public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
160 throws STTException {
// Flag handed to the background task; its read loop runs only while the flag is unset.
161 AtomicBoolean aborted = new AtomicBoolean(false);
163 var frequency = audioStream.getFormat().getFrequency();
164 if (frequency == null) {
// The IOException only carries the message; it is wrapped into an STTException below.
165 throw new IOException("missing audio stream frequency");
167 backgroundRecognize(sttListener, audioStream, frequency, aborted);
168 } catch (IOException e) {
169 throw new STTException(e);
/**
 * Provides the model used for a recognition, starting from the model currently held in the field.
 * NOTE(review): the remaining lines are not shown in this excerpt — presumably the held model is returned
 * when present and loadModel() is called otherwise; confirm.
 *
 * @throws IOException declared; cause not visible in this excerpt
 * @throws UnsatisfiedLinkError declared; cause not visible in this excerpt
 */
176 private Model getModel() throws IOException, UnsatisfiedLinkError {
177 var model = this.model;
/**
 * Creates a Vosk model from the directory at MODEL_PATH.
 *
 * @return NOTE(review): the return statement is not shown in this excerpt — presumably the created model
 * @throws IOException when MODEL_PATH does not exist or is not a directory
 * @throws UnsatisfiedLinkError declared; presumably raised when the native Vosk library is missing — confirm
 */
184 private Model loadModel() throws IOException, UnsatisfiedLinkError {
// NOTE(review): the first statement of the body is not shown in this excerpt.
186 var modelFile = new File(MODEL_PATH);
187 if (!modelFile.exists() || !modelFile.isDirectory()) {
188 throw new IOException("missing model dir: " + MODEL_PATH);
190 logger.debug("loading model");
191 var model = new Model(MODEL_PATH);
// NOTE(review): the body of this condition is not shown — presumably the model is kept in the field only
// when preloading is enabled; confirm.
192 if (config.preloadModel) {
/**
 * Releases the model currently held in the field.
 * NOTE(review): only the debug message is shown in this excerpt — presumably the model is closed and the
 * field cleared when a model is present; confirm.
 *
 * @throws IOException declared; cause not visible in this excerpt
 */
198 private void unloadModel() throws IOException {
199 var model = this.model;
201 logger.debug("unloading model");
/**
 * Submits a recognition task to the executor and returns its future.
 * <p>
 * The task emits a RecognitionStartEvent, then reads the audio in chunks of up to 4096 bytes and feeds them to a
 * Vosk Recognizer while the abort flag is unset. Unless aborted, it afterwards emits a RecognitionStopEvent
 * followed by either a SpeechRecognitionEvent carrying the collected transcript or a SpeechRecognitionErrorEvent.
 * An IOException or UnsatisfiedLinkError is logged as a warning and reported to the listener as an error event.
 *
 * @param sttListener receiver of the recognition events
 * @param audioStream source of the audio bytes
 * @param frequency value passed to the Recognizer constructor together with the model
 * @param aborted flag checked by the read loop and again before the final events are sent
 * @return the future of the submitted task
 */
207 private Future<?> backgroundRecognize(STTListener sttListener, InputStream audioStream, long frequency,
208 AtomicBoolean aborted) {
209 StringBuilder transcriptBuilder = new StringBuilder();
// The configured limits are in seconds; the checks below compare milliseconds.
210 long maxTranscriptionMillis = (config.maxTranscriptionSeconds * 1000L);
211 long maxSilenceMillis = (config.maxSilenceSeconds * 1000L);
// Taken before the task is submitted, so the transcription limit counts from this call.
212 long startTime = System.currentTimeMillis();
213 return executor.submit(() -> {
214 Recognizer recognizer = null;
// NOTE(review): the lines that obtain "model" are not shown in this excerpt — presumably getModel(); confirm.
218 recognizer = new Recognizer(model, frequency);
// Refreshed whenever the recognizer reports a completed result; reference point for the silence limit.
219 long lastInputTime = System.currentTimeMillis();
221 byte[] b = new byte[4096];
222 sttListener.sttEventReceived(new RecognitionStartEvent());
223 while (!aborted.get()) {
224 nbytes = audioStream.read(b);
// NOTE(review): the statements following the two "Stops listening" messages are not shown in this
// excerpt — presumably they leave the loop; confirm.
228 if (isExpiredInterval(maxTranscriptionMillis, startTime)) {
229 logger.debug("Stops listening, max transcription time reached");
// The silence limit is not applied in single utterance mode.
232 if (!config.singleUtteranceMode && isExpiredInterval(maxSilenceMillis, lastInputTime)) {
233 logger.debug("Stops listening, max silence time reached");
// A true return is handled as a completed result.
240 if (recognizer.acceptWaveForm(b, nbytes)) {
241 lastInputTime = System.currentTimeMillis();
242 var result = recognizer.getResult();
243 logger.debug("Result: {}", result);
// NOTE(review): a new ObjectMapper is built for every result; a single instance could be reused.
244 ObjectMapper mapper = new ObjectMapper();
245 var json = mapper.readTree(result);
// Appends the "text" field of the JSON result plus a separating space.
// NOTE(review): get("text") would yield null if the field were absent — presumably Vosk always
// includes it; confirm.
246 transcriptBuilder.append(json.get("text").asText()).append(" ");
247 if (config.singleUtteranceMode) {
// NOTE(review): the branch header is not shown — presumably the partial result is logged when no
// completed result is available; confirm.
251 logger.debug("Partial: {}", recognizer.getPartialResult());
// The stop event and the outcome are only sent when the recognition was not aborted.
254 if (!aborted.get()) {
255 sttListener.sttEventReceived(new RecognitionStopEvent());
256 var transcript = transcriptBuilder.toString().trim();
257 logger.debug("Final: {}", transcript);
258 if (!transcript.isBlank()) {
// The confidence is reported as a fixed 1F.
259 sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, 1F));
// Blank transcript: report the configured no-results text, or "No results" when that is blank.
261 if (!config.noResultsMessage.isBlank()) {
262 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage));
264 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results"));
// Both failure kinds are logged and reported with config.errorMessage, or "Error" when that is blank.
268 } catch (IOException e) {
269 logger.warn("Error running speech to text: {}", e.getMessage());
270 if (config.errorMessage.isBlank()) {
271 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
273 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
275 } catch (UnsatisfiedLinkError e) {
276 logger.warn("Missing native dependency: {}", e.getMessage());
277 if (config.errorMessage.isBlank()) {
278 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
280 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
// NOTE(review): cleanup section; the calls inside the two conditions are not shown in this excerpt —
// presumably the recognizer is closed, and the model as well when it is not preloaded; confirm.
283 if (recognizer != null) {
286 if (!config.preloadModel && model != null) {
292 } catch (IOException e) {
293 logger.warn("IOException on close: {}", e.getMessage());
/**
 * Pauses the calling thread.
 * NOTE(review): the sleeping call is not shown in this excerpt — presumably Thread.sleep(ms). The
 * InterruptedException is caught as "ignored"; presumably the interrupt flag is not restored — confirm.
 *
 * @param ms presumably the pause length in milliseconds
 */
298 private void trySleep(long ms) {
301 } catch (InterruptedException ignored) {
/**
 * Tells whether more than the given interval has passed since the reference time.
 *
 * @param interval allowed duration in milliseconds
 * @param referenceTime start of the interval, as a System.currentTimeMillis() value
 * @return true when the time elapsed since referenceTime is strictly greater than interval
 */
305 private boolean isExpiredInterval(long interval, long referenceTime) {
306 return System.currentTimeMillis() - referenceTime > interval;