git.basschouten.com Git - openhab-addons.git/blob

   1 /**
   2  * Copyright (c) 2010-2022 Contributors to the openHAB project
   3  *
   4  * See the NOTICE file(s) distributed with this work for additional
   5  * information.
   6  *
   7  * This program and the accompanying materials are made available under the
   8  * terms of the Eclipse Public License 2.0 which is available at
   9  * http://www.eclipse.org/legal/epl-2.0
  10  *
  11  * SPDX-License-Identifier: EPL-2.0
  12  */
  13 package org.openhab.voice.voskstt.internal;
  14
  15 import static org.openhab.voice.voskstt.internal.VoskSTTConstants.*;
  16
  17 import java.io.File;
  18 import java.io.IOException;
  19 import java.io.InputStream;
  20 import java.nio.file.Path;
  21 import java.util.Locale;
  22 import java.util.Map;
  23 import java.util.Set;
  24 import java.util.concurrent.Future;
  25 import java.util.concurrent.ScheduledExecutorService;
  26 import java.util.concurrent.atomic.AtomicBoolean;
  27
  28 import org.eclipse.jdt.annotation.NonNullByDefault;
  29 import org.eclipse.jdt.annotation.Nullable;
  30 import org.openhab.core.OpenHAB;
  31 import org.openhab.core.audio.AudioFormat;
  32 import org.openhab.core.audio.AudioStream;
  33 import org.openhab.core.common.ThreadPoolManager;
  34 import org.openhab.core.config.core.ConfigurableService;
  35 import org.openhab.core.config.core.Configuration;
  36 import org.openhab.core.io.rest.LocaleService;
  37 import org.openhab.core.voice.RecognitionStartEvent;
  38 import org.openhab.core.voice.RecognitionStopEvent;
  39 import org.openhab.core.voice.STTException;
  40 import org.openhab.core.voice.STTListener;
  41 import org.openhab.core.voice.STTService;
  42 import org.openhab.core.voice.STTServiceHandle;
  43 import org.openhab.core.voice.SpeechRecognitionErrorEvent;
  44 import org.openhab.core.voice.SpeechRecognitionEvent;
  45 import org.osgi.framework.Constants;
  46 import org.osgi.service.component.annotations.Activate;
  47 import org.osgi.service.component.annotations.Component;
  48 import org.osgi.service.component.annotations.Deactivate;
  49 import org.osgi.service.component.annotations.Modified;
  50 import org.osgi.service.component.annotations.Reference;
  51 import org.slf4j.Logger;
  52 import org.slf4j.LoggerFactory;
  53 import org.vosk.Model;
  54 import org.vosk.Recognizer;
  55
  56 import com.fasterxml.jackson.databind.ObjectMapper;
  57
  58 /**
  59  * The {@link VoskSTTService} class is a service implementation to use Vosk-API for Speech-to-Text.
  60  *
  61  * @author Miguel Álvarez - Initial contribution
  62  */
  63 @NonNullByDefault
  64 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
  65 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
  66         + " Speech-to-Text", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
  67 public class VoskSTTService implements STTService {
  68     private static final String VOSK_FOLDER = Path.of(OpenHAB.getUserDataFolder(), "vosk").toString();
  69     private static final String MODEL_PATH = Path.of(VOSK_FOLDER, "model").toString();
  70     static {
  71         Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
  72         File directory = new File(VOSK_FOLDER);
  73         if (!directory.exists()) {
  74             if (directory.mkdir()) {
  75                 logger.info("vosk dir created {}", VOSK_FOLDER);
  76             }
  77         }
  78     }
  79     private final Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
  80     private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-voskstt");
  81     private final LocaleService localeService;
  82     private VoskSTTConfiguration config = new VoskSTTConfiguration();
  83     private @Nullable Model model;
  84
  85     @Activate
  86     public VoskSTTService(@Reference LocaleService localeService) {
  87         this.localeService = localeService;
  88     }
  89
  90     @Activate
  91     protected void activate(Map<String, Object> config) {
  92         configChange(config);
  93     }
  94
  95     @Modified
  96     protected void modified(Map<String, Object> config) {
  97         configChange(config);
  98     }
  99
 100     @Deactivate
 101     protected void deactivate(Map<String, Object> config) {
 102         try {
 103             unloadModel();
 104         } catch (IOException e) {
 105             logger.warn("IOException unloading model: {}", e.getMessage());
 106         }
 107     }
 108
 109     private void configChange(Map<String, Object> config) {
 110         this.config = new Configuration(config).as(VoskSTTConfiguration.class);
 111         if (this.config.preloadModel) {
 112             try {
 113                 loadModel();
 114             } catch (IOException e) {
 115                 logger.warn("IOException loading model: {}", e.getMessage());
 116             } catch (UnsatisfiedLinkError e) {
 117                 logger.warn("Missing native dependency: {}", e.getMessage());
 118             }
 119         } else {
 120             try {
 121                 unloadModel();
 122             } catch (IOException e) {
 123                 logger.warn("IOException unloading model: {}", e.getMessage());
 124             }
 125         }
 126     }
 127
 128     @Override
 129     public String getId() {
 130         return SERVICE_ID;
 131     }
 132
 133     @Override
 134     public String getLabel(@Nullable Locale locale) {
 135         return SERVICE_NAME;
 136     }
 137
 138     @Override
 139     public Set<Locale> getSupportedLocales() {
 140         // as it is not possible to determine the language of the model that was downloaded and setup by the user, it is
 141         // assumed the language of the model is matching the locale of the openHAB server
 142         return Set.of(localeService.getLocale(null));
 143     }
 144
 145     @Override
 146     public Set<AudioFormat> getSupportedFormats() {
 147         return Set.of(
 148                 new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, null, null, 16000L));
 149     }
 150
 151     @Override
 152     public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
 153             throws STTException {
 154         AtomicBoolean aborted = new AtomicBoolean(false);
 155         try {
 156             var frequency = audioStream.getFormat().getFrequency();
 157             if (frequency == null) {
 158                 throw new IOException("missing audio stream frequency");
 159             }
 160             backgroundRecognize(sttListener, audioStream, frequency, aborted);
 161         } catch (IOException e) {
 162             throw new STTException(e);
 163         }
 164         return () -> {
 165             aborted.set(true);
 166         };
 167     }
 168
 169     private Model getModel() throws IOException, UnsatisfiedLinkError {
 170         var model = this.model;
 171         if (model != null) {
 172             return model;
 173         }
 174         return loadModel();
 175     }
 176
 177     private Model loadModel() throws IOException, UnsatisfiedLinkError {
 178         unloadModel();
 179         var modelFile = new File(MODEL_PATH);
 180         if (!modelFile.exists() || !modelFile.isDirectory()) {
 181             throw new IOException("missing model dir: " + MODEL_PATH);
 182         }
 183         logger.debug("loading model");
 184         var model = new Model(MODEL_PATH);
 185         if (config.preloadModel) {
 186             this.model = model;
 187         }
 188         return model;
 189     }
 190
 191     private void unloadModel() throws IOException {
 192         var model = this.model;
 193         if (model != null) {
 194             logger.debug("unloading model");
 195             model.close();
 196             this.model = null;
 197         }
 198     }
 199
 200     private Future<?> backgroundRecognize(STTListener sttListener, InputStream audioStream, long frequency,
 201             AtomicBoolean aborted) {
 202         StringBuilder transcriptBuilder = new StringBuilder();
 203         long maxTranscriptionMillis = (config.maxTranscriptionSeconds * 1000L);
 204         long maxSilenceMillis = (config.maxSilenceSeconds * 1000L);
 205         long startTime = System.currentTimeMillis();
 206         return executor.submit(() -> {
 207             Recognizer recognizer = null;
 208             Model model = null;
 209             try {
 210                 model = getModel();
 211                 recognizer = new Recognizer(model, frequency);
 212                 long lastInputTime = System.currentTimeMillis();
 213                 int nbytes;
 214                 byte[] b = new byte[4096];
 215                 sttListener.sttEventReceived(new RecognitionStartEvent());
 216                 while (!aborted.get()) {
 217                     nbytes = audioStream.read(b);
 218                     if (aborted.get()) {
 219                         break;
 220                     }
 221                     if (isExpiredInterval(maxTranscriptionMillis, startTime)) {
 222                         logger.debug("Stops listening, max transcription time reached");
 223                         break;
 224                     }
 225                     if (!config.singleUtteranceMode && isExpiredInterval(maxSilenceMillis, lastInputTime)) {
 226                         logger.debug("Stops listening, max silence time reached");
 227                         break;
 228                     }
 229                     if (nbytes == 0) {
 230                         trySleep(100);
 231                         continue;
 232                     }
 233                     if (recognizer.acceptWaveForm(b, nbytes)) {
 234                         lastInputTime = System.currentTimeMillis();
 235                         var result = recognizer.getResult();
 236                         logger.debug("Result: {}", result);
 237                         ObjectMapper mapper = new ObjectMapper();
 238                         var json = mapper.readTree(result);
 239                         transcriptBuilder.append(json.get("text").asText()).append(" ");
 240                         if (config.singleUtteranceMode) {
 241                             break;
 242                         }
 243                     } else {
 244                         logger.debug("Partial: {}", recognizer.getPartialResult());
 245                     }
 246                 }
 247                 if (!aborted.get()) {
 248                     sttListener.sttEventReceived(new RecognitionStopEvent());
 249                     var transcript = transcriptBuilder.toString().trim();
 250                     logger.debug("Final: {}", transcript);
 251                     if (!transcript.isBlank()) {
 252                         sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, 1F));
 253                     } else {
 254                         if (!config.noResultsMessage.isBlank()) {
 255                             sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage));
 256                         } else {
 257                             sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results"));
 258                         }
 259                     }
 260                 }
 261             } catch (IOException e) {
 262                 logger.warn("Error running speech to text: {}", e.getMessage());
 263                 if (config.errorMessage.isBlank()) {
 264                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
 265                 } else {
 266                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
 267                 }
 268             } catch (UnsatisfiedLinkError e) {
 269                 logger.warn("Missing native dependency: {}", e.getMessage());
 270                 if (config.errorMessage.isBlank()) {
 271                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
 272                 } else {
 273                     sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
 274                 }
 275             } finally {
 276                 if (recognizer != null) {
 277                     recognizer.close();
 278                 }
 279                 if (!config.preloadModel && model != null) {
 280                     model.close();
 281                 }
 282             }
 283             try {
 284                 audioStream.close();
 285             } catch (IOException e) {
 286                 logger.warn("IOException on close: {}", e.getMessage());
 287             }
 288         });
 289     }
 290
 291     private void trySleep(long ms) {
 292         try {
 293             Thread.sleep(ms);
 294         } catch (InterruptedException ignored) {
 295         }
 296     }
 297
 298     private boolean isExpiredInterval(long interval, long referenceTime) {
 299         return System.currentTimeMillis() - referenceTime > interval;
 300     }
 301 }