2 * Copyright (c) 2010-2022 Contributors to the openHAB project
4 * See the NOTICE file(s) distributed with this work for additional
7 * This program and the accompanying materials are made available under the
8 * terms of the Eclipse Public License 2.0 which is available at
9 * http://www.eclipse.org/legal/epl-2.0
11 * SPDX-License-Identifier: EPL-2.0
13 package org.openhab.voice.voskstt.internal;
15 import static org.openhab.voice.voskstt.internal.VoskSTTConstants.*;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.nio.file.Path;
21 import java.util.Locale;
24 import java.util.concurrent.Future;
25 import java.util.concurrent.ScheduledExecutorService;
26 import java.util.concurrent.atomic.AtomicBoolean;
28 import org.eclipse.jdt.annotation.NonNullByDefault;
29 import org.eclipse.jdt.annotation.Nullable;
30 import org.openhab.core.OpenHAB;
31 import org.openhab.core.audio.AudioFormat;
32 import org.openhab.core.audio.AudioStream;
33 import org.openhab.core.common.ThreadPoolManager;
34 import org.openhab.core.config.core.ConfigurableService;
35 import org.openhab.core.config.core.Configuration;
36 import org.openhab.core.io.rest.LocaleService;
37 import org.openhab.core.voice.RecognitionStartEvent;
38 import org.openhab.core.voice.RecognitionStopEvent;
39 import org.openhab.core.voice.STTException;
40 import org.openhab.core.voice.STTListener;
41 import org.openhab.core.voice.STTService;
42 import org.openhab.core.voice.STTServiceHandle;
43 import org.openhab.core.voice.SpeechRecognitionErrorEvent;
44 import org.openhab.core.voice.SpeechRecognitionEvent;
45 import org.osgi.framework.Constants;
46 import org.osgi.service.component.annotations.Activate;
47 import org.osgi.service.component.annotations.Component;
48 import org.osgi.service.component.annotations.Deactivate;
49 import org.osgi.service.component.annotations.Modified;
50 import org.osgi.service.component.annotations.Reference;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53 import org.vosk.Model;
54 import org.vosk.Recognizer;
56 import com.fasterxml.jackson.databind.ObjectMapper;
59 * The {@link VoskSTTService} class is a service implementation to use Vosk-API for Speech-to-Text.
61 * @author Miguel Álvarez - Initial contribution
// OSGi declarative-services component that is also exposed as a configurable service.
// SERVICE_PID, SERVICE_CATEGORY, SERVICE_NAME and SERVICE_ID are not declared in this file; presumably
// they come from the static wildcard import of VoskSTTConstants. Constants.SERVICE_PID is the OSGi constant.
64 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
65 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
66 + " Speech-to-Text", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
67 public class VoskSTTService implements STTService {
// Folder named "vosk" directly under the openHAB user data folder.
68 private static final String VOSK_FOLDER = Path.of(OpenHAB.getUserDataFolder(), "vosk").toString();
// Sub-folder "model" of VOSK_FOLDER; loadModel() refuses to proceed unless it exists and is a directory.
69 private static final String MODEL_PATH = Path.of(VOSK_FOLDER, "model").toString();
// Creates the VOSK_FOLDER directory when it does not exist yet and logs at info level on success.
// NOTE(review): the enclosing line 70 (presumably "static {") and the closing lines 76-78 are missing from
// this listing, so this is presumably a static initializer run at class-load time — confirm.
// NOTE(review): no visible branch handles mkdir() returning false, and File.mkdir() does not create
// missing parent directories; a failure here would only surface later as "missing model dir".
// A local logger is used because the instance logger field is not available in a static context.
71 Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
72 File directory = new File(VOSK_FOLDER);
73 if (!directory.exists()) {
74 if (directory.mkdir()) {
75 logger.info("vosk dir created {}", VOSK_FOLDER);
// Per-instance logger used by all methods below.
79 private final Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
// Scheduled pool obtained from ThreadPoolManager under the name "OH-voice-voskstt";
// backgroundRecognize() submits its recognition task to it.
80 private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-voskstt");
// Injected through the constructor; queried by getSupportedLocales().
81 private final LocaleService localeService;
// Current configuration; starts as a default-constructed instance and is replaced in configChange().
82 private VoskSTTConfiguration config = new VoskSTTConfiguration();
// Vosk model reference, may be null. It is read by getModel() and unloadModel().
// NOTE(review): the assignments to this field are on lines missing from this listing; presumably it is
// set in loadModel() when preloadModel is enabled and cleared in unloadModel() — confirm.
83 private @Nullable Model model;
// Constructor: receives the LocaleService as an OSGi @Reference and stores it in the localeService field.
// No other work is visible here.
86 public VoskSTTService(@Reference LocaleService localeService) {
87 this.localeService = localeService;
// Lifecycle callback taking the component configuration as a key/value map.
// NOTE(review): the annotation line (90) and the body (line 92) are missing from this listing;
// presumably this is the @Activate hook and simply delegates to configChange(config) — confirm.
91 protected void activate(Map<String, Object> config) {
// Lifecycle callback taking the updated component configuration as a key/value map.
// NOTE(review): the annotation line (95) and the body (line 97) are missing from this listing;
// presumably this is the @Modified hook and simply delegates to configChange(config) — confirm.
96 protected void modified(Map<String, Object> config) {
// Lifecycle callback for component shutdown. The config parameter is not used in any visible line.
// NOTE(review): lines 102-103 are missing from this listing; the catch below implies a try block, and
// the log message suggests it calls unloadModel() — confirm.
101 protected void deactivate(Map<String, Object> config) {
104 } catch (IOException e) {
// Failure is only logged (message text, no stack trace); nothing is rethrown from the visible code.
105 logger.warn("IOException unloading model: {}", e.getMessage());
// Applies a new configuration map.
// The map is converted to a VoskSTTConfiguration and stored in this.config, then the preloadModel flag
// decides what happens to the model. All failures are logged at warn level and not propagated.
// NOTE(review): lines 112-113 and 118-121 are missing from this listing. Judging by the log messages, the
// preloadModel branch presumably calls loadModel() inside a try, and an else branch presumably calls
// unloadModel() inside a second try — confirm against the full file.
109 private void configChange(Map<String, Object> config) {
110 this.config = new Configuration(config).as(VoskSTTConfiguration.class);
111 if (this.config.preloadModel) {
114 } catch (IOException e) {
115 logger.warn("IOException loading model: {}", e.getMessage());
// UnsatisfiedLinkError is an Error, not an Exception; it is caught explicitly so that a missing
// native library is reported as a warning instead of escaping this method.
116 } catch (UnsatisfiedLinkError e) {
117 logger.warn("Missing native dependency: {}", e.getMessage());
122 } catch (IOException e) {
123 logger.warn("IOException unloading model: {}", e.getMessage());
// Accessor with no parameters that returns a String.
// NOTE(review): the body (line 130) is missing from this listing; presumably it returns the SERVICE_ID
// constant as the identifier of this service — confirm.
129 public String getId() {
// Accessor that returns a String; the locale argument may be null.
// NOTE(review): the body (line 135) is missing from this listing, so it cannot be told from here whether
// the locale is used; presumably a fixed service name is returned — confirm.
134 public String getLabel(@Nullable Locale locale) {
// Returns an immutable single-element set holding the locale reported by localeService.getLocale(null).
// The set therefore reflects the server setting, not the language of the model actually installed.
139 public Set<Locale> getSupportedLocales() {
140 // as it is not possible to determine the language of the model that was downloaded and setup by the user, it is
141 // assumed the language of the model is matching the locale of the openHAB server
142 return Set.of(localeService.getLocale(null));
// Declares the audio format this service accepts: WAVE container, signed PCM codec.
// NOTE(review): line 147 is missing from this listing; the trailing "));" suggests the AudioFormat is
// wrapped in a set factory call such as "return Set.of(" — confirm.
// NOTE(review): assuming the openHAB AudioFormat constructor order (container, codec, bigEndian, bitDepth,
// bitRate, frequency), this means little-endian, unspecified bit depth and bit rate, 16000 Hz — confirm.
146 public Set<AudioFormat> getSupportedFormats() {
148 new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, null, null, 16000L));
// Starts a recognition run on audioStream and reports events to sttListener.
// The locale and set parameters are not referenced in any visible line of this method.
// Throws STTException when the stream's format has no frequency (wrapped IOException).
// NOTE(review): lines 155, 159 and 163-168 are missing from this listing. The returned STTServiceHandle is
// therefore not visible; presumably its abort action sets the `aborted` flag — confirm.
152 public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
153 throws STTException {
// Shared cancellation flag; the background task polls it in its read loop.
154 AtomicBoolean aborted = new AtomicBoolean(false);
156 var frequency = audioStream.getFormat().getFrequency();
157 if (frequency == null) {
// NOTE(review): this IOException is thrown only to be caught a few lines below and rewrapped;
// throwing STTException directly would avoid using an exception for local control flow.
158 throw new IOException("missing audio stream frequency");
// The Future returned by backgroundRecognize() is discarded; cancellation relies on `aborted` alone.
160 backgroundRecognize(sttListener, audioStream, frequency, aborted);
161 } catch (IOException e) {
162 throw new STTException(e);
// Provides a Model, declaring IOException and UnsatisfiedLinkError as possible failures.
// The field is copied into a local first, which lets the null analysis work on a stable value.
// NOTE(review): lines 171-175 are missing from this listing; presumably the cached model is returned when
// non-null and loadModel() is called otherwise — confirm.
169 private Model getModel() throws IOException, UnsatisfiedLinkError {
170 var model = this.model;
// Loads a Vosk Model from MODEL_PATH.
// Throws IOException when MODEL_PATH does not exist or is not a directory. UnsatisfiedLinkError is
// declared for the case where the native Vosk library cannot be linked.
// NOTE(review): lines 178 and 186-189 are missing from this listing. Presumably line 178 unloads any
// previously loaded model, the preloadModel branch stores the new model in this.model, and the model is
// returned — confirm.
177 private Model loadModel() throws IOException, UnsatisfiedLinkError {
179 var modelFile = new File(MODEL_PATH);
180 if (!modelFile.exists() || !modelFile.isDirectory()) {
181 throw new IOException("missing model dir: " + MODEL_PATH);
183 logger.debug("loading model");
// This local deliberately shadows the field of the same name.
184 var model = new Model(MODEL_PATH);
185 if (config.preloadModel) {
// Releases the currently referenced model, if any; declared to throw IOException.
// NOTE(review): lines 193 and 195-198 are missing from this listing; presumably the debug message and the
// release happen only when the local copy is non-null, followed by closing the model and clearing the
// field — confirm.
191 private void unloadModel() throws IOException {
192 var model = this.model;
194 logger.debug("unloading model");
// Submits the recognition loop to the shared executor and returns the Future of that task.
// The task creates a Vosk Recognizer for the given frequency, reads audioStream in 4096-byte chunks and
// collects the recognized text. It reports start, stop, result and error events to sttListener, and
// polls `aborted` to stop early.
// NOTE(review): this listing omits many short lines (braces, try/else/finally, break/continue and short
// statements). The nesting described in the comments below is therefore partly inferred; anything marked
// NOTE(review) must be confirmed against the full file.
200 private Future<?> backgroundRecognize(STTListener sttListener, InputStream audioStream, long frequency,
201 AtomicBoolean aborted) {
// Accumulates the "text" field of every recognizer result, each followed by a space.
202 StringBuilder transcriptBuilder = new StringBuilder();
// Configured limits converted from seconds to milliseconds (1000L keeps the multiplication in long).
203 long maxTranscriptionMillis = (config.maxTranscriptionSeconds * 1000L);
204 long maxSilenceMillis = (config.maxSilenceSeconds * 1000L);
// Captured before submission, so time the task spends queued also counts toward the transcription limit.
205 long startTime = System.currentTimeMillis();
206 return executor.submit(() -> {
// Starts as null so the cleanup check further down can tell whether construction succeeded.
207 Recognizer recognizer = null;
// NOTE(review): lines 208-210 are missing; the `model` passed below is presumably obtained there
// (e.g. through getModel()) inside a try block — confirm.
211 recognizer = new Recognizer(model, frequency);
// Reference point for the silence timeout; refreshed whenever the recognizer yields a result.
212 long lastInputTime = System.currentTimeMillis();
// NOTE(review): the declaration of nbytes (line 213) is missing from this listing.
214 byte[] b = new byte[4096];
215 sttListener.sttEventReceived(new RecognitionStartEvent());
216 while (!aborted.get()) {
217 nbytes = audioStream.read(b);
// NOTE(review): lines 218-220 are missing; presumably a second abort check after the read — confirm.
// Overall time budget, measured from startTime.
221 if (isExpiredInterval(maxTranscriptionMillis, startTime)) {
222 logger.debug("Stops listening, max transcription time reached");
// The silence timeout is only evaluated when single-utterance mode is off.
225 if (!config.singleUtteranceMode && isExpiredInterval(maxSilenceMillis, lastInputTime)) {
226 logger.debug("Stops listening, max silence time reached");
// NOTE(review): lines 223-224 and 227-232 are missing; presumably each timeout branch leaves the loop and
// an empty read leads to a short trySleep() before retrying — confirm.
// When acceptWaveForm() returns true the code fetches a full result; otherwise (line 244) it only logs
// the partial result.
233 if (recognizer.acceptWaveForm(b, nbytes)) {
234 lastInputTime = System.currentTimeMillis();
235 var result = recognizer.getResult();
236 logger.debug("Result: {}", result);
// NOTE(review): a new ObjectMapper is created for every result; a single shared instance would avoid
// the repeated construction.
237 ObjectMapper mapper = new ObjectMapper();
238 var json = mapper.readTree(result);
// NOTE(review): json.get("text") yields null when the field is absent, which would throw a
// NullPointerException here; presumably Vosk results always carry "text" — confirm.
239 transcriptBuilder.append(json.get("text").asText()).append(" ");
// NOTE(review): line 241 is missing; presumably single-utterance mode leaves the loop after the first
// result — confirm.
240 if (config.singleUtteranceMode) {
244 logger.debug("Partial: {}", recognizer.getPartialResult());
// Final reporting is skipped entirely when the run was aborted.
247 if (!aborted.get()) {
248 sttListener.sttEventReceived(new RecognitionStopEvent());
// trim() drops the trailing separator appended after the last result.
249 var transcript = transcriptBuilder.toString().trim();
250 logger.debug("Final: {}", transcript);
251 if (!transcript.isBlank()) {
// The second argument is fixed at 1F; presumably the confidence value — confirm.
252 sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, 1F));
// Empty transcript: report the configured message when one is set, else the literal "No results".
254 if (!config.noResultsMessage.isBlank()) {
255 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage));
257 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results"));
// Both catch blocks below log a warning and send an error event with the configured message or "Error".
// NOTE(review): the two handlers are identical apart from the log text and could share a helper.
261 } catch (IOException e) {
262 logger.warn("Error running speech to text: {}", e.getMessage());
263 if (config.errorMessage.isBlank()) {
264 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
266 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
268 } catch (UnsatisfiedLinkError e) {
269 logger.warn("Missing native dependency: {}", e.getMessage());
270 if (config.errorMessage.isBlank()) {
271 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
273 sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
// NOTE(review): lines 275, 277-278 and 280-284 are missing; presumably this is a finally block that
// closes the recognizer, closes the model when it is not kept preloaded, and closes the audio stream
// inside a try — confirm.
276 if (recognizer != null) {
279 if (!config.preloadModel && model != null) {
285 } catch (IOException e) {
286 logger.warn("IOException on close: {}", e.getMessage());
// Helper taking a duration in milliseconds; catches InterruptedException instead of propagating it.
// NOTE(review): lines 292-293 and 295-296 are missing from this listing; presumably the try block calls
// Thread.sleep(ms) and the catch body is empty — confirm.
// NOTE(review): if the catch body is indeed empty, the thread's interrupt status is lost; restoring it
// with Thread.currentThread().interrupt() is the usual practice.
291 private void trySleep(long ms) {
294 } catch (InterruptedException ignored) {
// Returns true when strictly more than `interval` milliseconds have passed since `referenceTime`.
// Both values are on the System.currentTimeMillis() scale, so callers must pass a reference time taken
// from that same clock.
// NOTE(review): currentTimeMillis() is wall-clock time and can jump when the system clock is adjusted.
298 private boolean isExpiredInterval(long interval, long referenceTime) {
299 return System.currentTimeMillis() - referenceTime > interval;