]> git.basschouten.com Git - openhab-addons.git/commitdiff
[mimictts] Fix ssml and playing from audiosinks using the audio servlet (#14120)
authorGwendal Roulleau <dalgwen@users.noreply.github.com>
Sat, 14 Jan 2023 08:39:59 +0000 (09:39 +0100)
committerGitHub <noreply@github.com>
Sat, 14 Jan 2023 08:39:59 +0000 (09:39 +0100)
* [mimictts] Fix ssml and playing from an audiosink using the audio servlet

Fix :
- ssml not working
- add an option to store the audio in a file before sending it to openHAB. This enables audio sinks based on the audio servlet to play the sound (the servlet requires the getClonedStream method, which is unavailable with a pure streaming approach). The files are stored in the user data directory and deleted as soon as possible (stream close detection).
- fix error caused by the voice name not being URL-encoded

Signed-off-by: Gwendal Roulleau <gwendal.roulleau@gmail.com>
bundles/org.openhab.voice.mimictts/README.md
bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java [new file with mode: 0644]
bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java
bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java
bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/config/config.xml
bundles/org.openhab.voice.mimictts/src/main/resources/OH-INF/i18n/mimictts.properties

index 265890c8de60457e8b5865dc3ee0cdeeb89aa70d..b5d572a25e7da0dd03ba87bc2a6a286d0f1feeb3 100644 (file)
@@ -17,6 +17,7 @@ It supports a subset of SSML, and if you want to use it, be sure to start your t
 Using your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
 
 * **url** - Mimic URL. Defaults to `http://localhost:59125`
+* **workaroundServletSink** - A boolean activating a workaround for audio sinks that use the openHAB audio servlet. It stores the audio file temporarily on disk, allowing the servlet to get a cloned stream as needed. Defaults to `false`.
 * **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
 * **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
 * **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
diff --git a/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java b/bundles/org.openhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/AutoDeleteFileAudioStream.java
new file mode 100644 (file)
index 0000000..465a2b2
--- /dev/null
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) 2010-2023 Contributors to the openHAB project
+ *
+ * See the NOTICE file(s) distributed with this work for additional
+ * information.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ */
+package org.openhab.voice.mimic.internal;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.eclipse.jdt.annotation.NonNullByDefault;
+import org.openhab.core.audio.AudioException;
+import org.openhab.core.audio.AudioFormat;
+import org.openhab.core.audio.FileAudioStream;
+
+/**
+ * A FileAudioStream that deletes its backing file once it and all of its cloned streams are closed.
+ * Useful to avoid congesting the temporary directory.
+ *
+ * @author Gwendal Roulleau - Initial contribution
+ */
+@NonNullByDefault
+public class AutoDeleteFileAudioStream extends FileAudioStream {
+
+    private final File file;
+    private final AudioFormat audioFormat;
+    private final List<ClonedFileInputStream> clonedAudioStreams = new ArrayList<>(1);
+    private boolean isOpen = true;
+
+    public AutoDeleteFileAudioStream(File file, AudioFormat format) throws AudioException {
+        super(file, format);
+        this.file = file;
+        this.audioFormat = format;
+    }
+
+    @Override
+    public void close() throws IOException {
+        super.close();
+        this.isOpen = false;
+        deleteIfPossible();
+    }
+
+    protected void deleteIfPossible() {
+        boolean aClonedStreamIsOpen = clonedAudioStreams.stream().anyMatch(as -> as.isOpen);
+        if (!isOpen && !aClonedStreamIsOpen) {
+            file.delete();
+        }
+    }
+
+    @Override
+    public InputStream getClonedStream() throws AudioException {
+        ClonedFileInputStream clonedInputStream = new ClonedFileInputStream(this, file, audioFormat);
+        clonedAudioStreams.add(clonedInputStream);
+        return clonedInputStream;
+    }
+
+    private static class ClonedFileInputStream extends FileAudioStream {
+        protected boolean isOpen = true;
+        private final AutoDeleteFileAudioStream parent;
+
+        public ClonedFileInputStream(AutoDeleteFileAudioStream parent, File file, AudioFormat audioFormat)
+                throws AudioException {
+            super(file, audioFormat);
+            this.parent = parent;
+        }
+
+        @Override
+        public void close() throws IOException {
+            super.close();
+            this.isOpen = false;
+            parent.deleteIfPossible();
+        }
+    }
+}
index e8c56c14635d76e542928789d5075ee58fce815c..e35064e39558db5b29cb15caf486735ef4466c42 100644 (file)
@@ -25,4 +25,5 @@ public class MimicConfiguration {
     public Double speakingRate = 1.0;
     public Double audioVolatility = 0.667;
     public Double phonemeVolatility = 0.8;
+    public Boolean workaroundServletSink = false;
 }
index abde4cb0feb6797aafefc5917a461ce88c7920a8..39364035c8fd44d97f77629354e93f2e4882dc29 100644 (file)
  */
 package org.openhab.voice.mimic.internal;
 
+import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
+import java.util.UUID;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
@@ -31,6 +38,8 @@ import org.eclipse.jetty.client.util.InputStreamResponseListener;
 import org.eclipse.jetty.client.util.StringContentProvider;
 import org.eclipse.jetty.http.HttpHeader;
 import org.eclipse.jetty.http.HttpStatus;
+import org.openhab.core.OpenHAB;
+import org.openhab.core.audio.AudioException;
 import org.openhab.core.audio.AudioFormat;
 import org.openhab.core.audio.AudioStream;
 import org.openhab.core.config.core.ConfigurableService;
@@ -75,6 +84,7 @@ public class MimicTTSService implements TTSService {
      * Configuration parameters
      */
     private static final String PARAM_URL = "url";
+    private static final String PARAM_WORKAROUNDSERVLETSINK = "workaroundServletSink";
     private static final String PARAM_SPEAKINGRATE = "speakingRate";
     private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility";
     private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility";
@@ -120,6 +130,12 @@ public class MimicTTSService implements TTSService {
             config.url = param.toString();
         }
 
+        // workaround
+        param = newConfig.get(PARAM_WORKAROUNDSERVLETSINK);
+        if (param != null) {
+            config.workaroundServletSink = Boolean.parseBoolean(param.toString());
+        }
+
         // audio volatility
         try {
             param = newConfig.get(PARAM_AUDIOVOLATITLITY);
@@ -232,22 +248,29 @@ public class MimicTTSService implements TTSService {
             throw new TTSException("The passed AudioFormat is unsupported");
         }
 
-        String ssml = "";
-        if (text.startsWith("<speak>")) {
-            ssml = "&ssml=true";
+        String encodedVoice;
+        try {
+            encodedVoice = URLEncoder.encode(((MimicVoice) voice).getTechnicalName(),
+                    StandardCharsets.UTF_8.toString());
+        } catch (UnsupportedEncodingException e) {
+            throw new IllegalArgumentException("Cannot encode voice in URL " + ((MimicVoice) voice).getTechnicalName());
         }
 
         // create the url for given locale, format
-        String urlTTS = config.url + SYNTHETIZE_URL + "?voice=" + ((MimicVoice) voice).getTechnicalName() + ssml
-                + "&noiseScale=" + config.audioVolatility + "&noiseW=" + config.phonemeVolatility + "&lengthScale="
-                + config.speakingRate + "&audioTarget=client";
+        String urlTTS = config.url + SYNTHETIZE_URL + "?voice=" + encodedVoice + "&noiseScale=" + config.audioVolatility
+                + "&noiseW=" + config.phonemeVolatility + "&lengthScale=" + config.speakingRate + "&audioTarget=client";
         logger.debug("Querying mimic with URL {}", urlTTS);
 
         // prepare the response as an inputstream
         InputStreamResponseListener inputStreamResponseListener = new InputStreamResponseListener();
         // we will use a POST method for the text
         StringContentProvider textContentProvider = new StringContentProvider(text, StandardCharsets.UTF_8);
-        httpClient.POST(urlTTS).content(textContentProvider).accept("audio/wav").send(inputStreamResponseListener);
+        if (text.startsWith("<speak>")) {
+            httpClient.POST(urlTTS).header("Content-Type", "application/ssml+xml").content(textContentProvider)
+                    .accept("audio/wav").send(inputStreamResponseListener);
+        } else {
+            httpClient.POST(urlTTS).content(textContentProvider).accept("audio/wav").send(inputStreamResponseListener);
+        }
 
         // compute the estimated timeout using a "stupid" method based on text length, as the response time depends on
         // the requested text. Average speaker speed estimated to 10/second.
@@ -269,7 +292,26 @@ public class MimicTTSService implements TTSService {
                             "Cannot get Content-Length header from mimic response. Are you sure to query a mimic TTS server at "
                                     + urlTTS + " ?");
                 }
-                return new InputStreamAudioStream(inputStreamResponseListener.getInputStream(), AUDIO_FORMAT, length);
+
+                InputStream inputStreamFromMimic = inputStreamResponseListener.getInputStream();
+                try {
+                    if (!config.workaroundServletSink) {
+                        return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length);
+                    } else {
+                        // Some audio sinks use the openHAB servlet to get audio. This servlet require the
+                        // getClonedStream()
+                        // method
+                        // So we cache the file on disk, thus implementing the method thanks to FileAudioStream.
+                        return createTemporaryFile(inputStreamFromMimic, AUDIO_FORMAT);
+                    }
+                } catch (TTSException e) {
+                    try {
+                        inputStreamFromMimic.close();
+                    } catch (IOException e1) {
+                    }
+                    throw e;
+                }
+
             } else {
                 String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code "
                         + response.getStatus() + " for reason " + response.getReason();
@@ -282,4 +324,17 @@ public class MimicTTSService implements TTSService {
             throw new TTSException(errorMessage, e);
         }
     }
+
+    private AudioStream createTemporaryFile(InputStream inputStream, AudioFormat audioFormat) throws TTSException {
+        File mimicDirectory = new File(OpenHAB.getUserDataFolder(), "mimic");
+        mimicDirectory.mkdir();
+        try {
+            File tempFile = File.createTempFile(UUID.randomUUID().toString(), ".wav", mimicDirectory);
+            tempFile.deleteOnExit();
+            Files.copy(inputStream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+            return new AutoDeleteFileAudioStream(tempFile, audioFormat);
+        } catch (AudioException | IOException e) {
+            throw new TTSException("Cannot create temporary audio file", e);
+        }
+    }
 }
index ab8619a39e25825714a0dce115eb93b7fafa8284..2107070d73499ed4368e38a396aa9cb8e07a72fd 100644 (file)
                        <description>Mimic 3 URL.</description>
                        <default>http://localhost:59125</default>
                </parameter>
+               <parameter name="workaroundServletSink" type="boolean" required="false">
+                       <label>Workaround For Servlet-Based Audiosink</label>
+                       <description>Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on
+                               the openHAB audio servlet.</description>
+                       <default>false</default>
+               </parameter>
                <parameter name="speakingRate" min="0" max="1" type="decimal" required="false">
                        <label>Speaking Rate</label>
                        <description>Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less
index f34268e20aa1eebb41745216221b0456ce28852c..2472d5b8916cb5f5405a2e165a7cb1571b30c94e 100644 (file)
@@ -4,6 +4,8 @@ voice.config.mimictts.phonemeVolatility.label = Phoneme Volatility
 voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
 voice.config.mimictts.speakingRate.label = Speaking Rate
 voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
+voice.config.mimictts.workaroundServletSink.label= Workaround For Servlet-Based Audiosink
+voice.config.mimictts.workaroundServletSink.description= Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on the openHAB audio servlet.
 voice.config.mimictts.url.label = URL
 voice.config.mimictts.url.description = Mimic 3 URL.