git.basschouten.com Git - openhab-addons.git/blob

   1 /**
   2  * Copyright (c) 2010-2023 Contributors to the openHAB project
   3  *
   4  * See the NOTICE file(s) distributed with this work for additional
   5  * information.
   6  *
   7  * This program and the accompanying materials are made available under the
   8  * terms of the Eclipse Public License 2.0 which is available at
   9  * http://www.eclipse.org/legal/epl-2.0
  10  *
  11  * SPDX-License-Identifier: EPL-2.0
  12  */
  13 package org.openhab.voice.googletts.internal;
  14
  15 import static org.openhab.voice.googletts.internal.GoogleTTSService.*;
  16
  17 import java.io.ByteArrayInputStream;
  18 import java.io.File;
  19 import java.io.IOException;
  20 import java.io.InputStream;
  21 import java.util.Collections;
  22 import java.util.HashSet;
  23 import java.util.Locale;
  24 import java.util.Map;
  25 import java.util.Set;
  26
  27 import org.eclipse.jdt.annotation.NonNullByDefault;
  28 import org.eclipse.jdt.annotation.Nullable;
  29 import org.openhab.core.OpenHAB;
  30 import org.openhab.core.audio.AudioFormat;
  31 import org.openhab.core.audio.AudioStream;
  32 import org.openhab.core.audio.ByteArrayAudioStream;
  33 import org.openhab.core.audio.utils.AudioWaveUtils;
  34 import org.openhab.core.auth.client.oauth2.OAuthFactory;
  35 import org.openhab.core.config.core.ConfigurableService;
  36 import org.openhab.core.voice.TTSException;
  37 import org.openhab.core.voice.TTSService;
  38 import org.openhab.core.voice.Voice;
  39 import org.openhab.voice.googletts.internal.dto.AudioEncoding;
  40 import org.osgi.framework.Constants;
  41 import org.osgi.service.cm.ConfigurationAdmin;
  42 import org.osgi.service.component.annotations.Activate;
  43 import org.osgi.service.component.annotations.Component;
  44 import org.osgi.service.component.annotations.Deactivate;
  45 import org.osgi.service.component.annotations.Modified;
  46 import org.osgi.service.component.annotations.Reference;
  47 import org.slf4j.Logger;
  48 import org.slf4j.LoggerFactory;
  49
  50 /**
  51  * Voice service implementation.
  52  *
  53  * @author Gabor Bicskei - Initial contribution
  54  */
  55 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
  56 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
  57         + " Text-to-Speech", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
  58 public class GoogleTTSService implements TTSService {
  59     /**
  60      * Service name
  61      */
  62     static final String SERVICE_NAME = "Google Cloud";
  63
  64     /**
  65      * Service id
  66      */
  67     static final String SERVICE_ID = "googletts";
  68
  69     /**
  70      * Service category
  71      */
  72     static final String SERVICE_CATEGORY = "voice";
  73
  74     /**
  75      * Service pid
  76      */
  77     static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID;
  78
  79     /**
  80      * Cache folder under $userdata
  81      */
  82     private static final String CACHE_FOLDER_NAME = "cache";
  83
  84     /**
  85      * Configuration parameters
  86      */
  87     private static final String PARAM_CLIENT_ID = "clientId";
  88     private static final String PARAM_CLIEND_SECRET = "clientSecret";
  89     static final String PARAM_AUTHCODE = "authcode";
  90     private static final String PARAM_PITCH = "pitch";
  91     private static final String PARAM_SPEAKING_RATE = "speakingRate";
  92     private static final String PARAM_VOLUME_GAIN_DB = "volumeGainDb";
  93     private static final String PARAM_PURGE_CACHE = "purgeCache";
  94
  95     /**
  96      * Logger.
  97      */
  98     private final Logger logger = LoggerFactory.getLogger(GoogleTTSService.class);
  99
 100     /**
 101      * Set of supported audio formats
 102      */
 103     private Set<AudioFormat> audioFormats = new HashSet<>();
 104
 105     /**
 106      * Google Cloud TTS API implementation
 107      */
 108     private @NonNullByDefault({}) GoogleCloudAPI apiImpl;
 109     private final ConfigurationAdmin configAdmin;
 110     private final OAuthFactory oAuthFactory;
 111
 112     /**
 113      * All voices for all supported locales
 114      */
 115     private Set<Voice> allVoices = new HashSet<>();
 116
 117     private final GoogleTTSConfig config = new GoogleTTSConfig();
 118
 119     @Activate
 120     public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin,
 121             final @Reference OAuthFactory oAuthFactory) {
 122         this.configAdmin = configAdmin;
 123         this.oAuthFactory = oAuthFactory;
 124     }
 125
 126     /**
 127      * DS activate, with access to ConfigAdmin
 128      */
 129     @Activate
 130     protected void activate(Map<String, Object> config) {
 131         // create cache folder
 132         File userData = new File(OpenHAB.getUserDataFolder());
 133         File cacheFolder = new File(new File(userData, CACHE_FOLDER_NAME), SERVICE_PID);
 134         if (!cacheFolder.exists()) {
 135             cacheFolder.mkdirs();
 136         }
 137         logger.debug("Using cache folder {}", cacheFolder.getAbsolutePath());
 138
 139         apiImpl = new GoogleCloudAPI(configAdmin, oAuthFactory, cacheFolder);
 140         updateConfig(config);
 141     }
 142
 143     @Deactivate
 144     protected void dispose() {
 145         apiImpl.dispose();
 146         audioFormats.clear();
 147         allVoices.clear();
 148     }
 149
 150     /**
 151      * Initializing audio formats. Google supports 3 formats:
 152      * LINEAR16
 153      * Uncompressed 16-bit signed little-endian samples (Linear PCM). Audio content returned as LINEAR16
 154      * also contains a WAV header.
 155      * MP3
 156      * MP3 audio.
 157      * OGG_OPUS
 158      * Opus encoded audio wrapped in an ogg container. This is not supported by openHAB.
 159      *
 160      * @return Set of supported AudioFormats
 161      */
 162     private Set<AudioFormat> initAudioFormats() {
 163         logger.trace("Initializing audio formats");
 164         Set<AudioFormat> result = new HashSet<>();
 165         for (String format : apiImpl.getSupportedAudioFormats()) {
 166             AudioFormat audioFormat = getAudioFormat(format);
 167             if (audioFormat != null) {
 168                 result.add(audioFormat);
 169                 logger.trace("Audio format supported: {}", format);
 170             } else {
 171                 logger.trace("Audio format not supported: {}", format);
 172             }
 173         }
 174         return Collections.unmodifiableSet(result);
 175     }
 176
 177     /**
 178      * Loads available voices from Google API
 179      *
 180      * @return Set of available voices.
 181      */
 182     private Set<Voice> initVoices() {
 183         logger.trace("Initializing voices");
 184         Set<Voice> result = new HashSet<>();
 185         for (Locale locale : apiImpl.getSupportedLocales()) {
 186             result.addAll(apiImpl.getVoicesForLocale(locale));
 187         }
 188         if (logger.isTraceEnabled()) {
 189             for (Voice voice : result) {
 190                 logger.trace("Google Cloud TTS voice: {}", voice.getLabel());
 191             }
 192         }
 193         return Collections.unmodifiableSet(result);
 194     }
 195
 196     /**
 197      * Called by the framework when the configuration was updated.
 198      *
 199      * @param newConfig Updated configuration
 200      */
 201     @Modified
 202     private void updateConfig(Map<String, Object> newConfig) {
 203         logger.debug("Updating configuration");
 204         if (newConfig != null) {
 205             // client id
 206             String param = newConfig.containsKey(PARAM_CLIENT_ID) ? newConfig.get(PARAM_CLIENT_ID).toString() : null;
 207             config.clientId = param;
 208             if (param == null) {
 209                 logger.warn("Missing client id configuration to access Google Cloud TTS API.");
 210             }
 211             // client secret
 212             param = newConfig.containsKey(PARAM_CLIEND_SECRET) ? newConfig.get(PARAM_CLIEND_SECRET).toString() : null;
 213             config.clientSecret = param;
 214             if (param == null) {
 215                 logger.warn("Missing client secret configuration to access Google Cloud TTS API.");
 216             }
 217             // authcode
 218             param = newConfig.containsKey(PARAM_AUTHCODE) ? newConfig.get(PARAM_AUTHCODE).toString() : null;
 219             config.authcode = param;
 220
 221             // pitch
 222             param = newConfig.containsKey(PARAM_PITCH) ? newConfig.get(PARAM_PITCH).toString() : null;
 223             if (param != null) {
 224                 config.pitch = Double.parseDouble(param);
 225             }
 226
 227             // speakingRate
 228             param = newConfig.containsKey(PARAM_SPEAKING_RATE) ? newConfig.get(PARAM_SPEAKING_RATE).toString() : null;
 229             if (param != null) {
 230                 config.speakingRate = Double.parseDouble(param);
 231             }
 232
 233             // volumeGainDb
 234             param = newConfig.containsKey(PARAM_VOLUME_GAIN_DB) ? newConfig.get(PARAM_VOLUME_GAIN_DB).toString() : null;
 235             if (param != null) {
 236                 config.volumeGainDb = Double.parseDouble(param);
 237             }
 238
 239             // purgeCache
 240             param = newConfig.containsKey(PARAM_PURGE_CACHE) ? newConfig.get(PARAM_PURGE_CACHE).toString() : null;
 241             if (param != null) {
 242                 config.purgeCache = Boolean.parseBoolean(param);
 243             }
 244             logger.trace("New configuration: {}", config.toString());
 245
 246             if (config.clientId != null && !config.clientId.isEmpty() && config.clientSecret != null
 247                     && !config.clientSecret.isEmpty()) {
 248                 apiImpl.setConfig(config);
 249                 if (apiImpl.isInitialized()) {
 250                     allVoices = initVoices();
 251                     audioFormats = initAudioFormats();
 252                 }
 253             }
 254         } else {
 255             logger.warn("Missing Google Cloud TTS configuration.");
 256         }
 257     }
 258
 259     @Override
 260     public String getId() {
 261         return SERVICE_ID;
 262     }
 263
 264     @Override
 265     public String getLabel(@Nullable Locale locale) {
 266         return SERVICE_NAME;
 267     }
 268
 269     @Override
 270     public Set<Voice> getAvailableVoices() {
 271         return allVoices;
 272     }
 273
 274     @Override
 275     public Set<AudioFormat> getSupportedFormats() {
 276         return audioFormats;
 277     }
 278
 279     /**
 280      * Helper to create AudioFormat objects from Google names.
 281      *
 282      * @param format Google audio format.
 283      * @return Audio format object.
 284      */
 285     private @Nullable AudioFormat getAudioFormat(String format) {
 286         Integer bitDepth = 16;
 287         Long frequency = 44100L;
 288
 289         AudioEncoding encoding = AudioEncoding.valueOf(format);
 290
 291         switch (encoding) {
 292             case MP3:
 293                 // we use by default: MP3, 44khz_16bit_mono with bitrate 64 kbps
 294                 return new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_MP3, null, bitDepth, 64000,
 295                         frequency);
 296             case LINEAR16:
 297                 // we use by default: wav, 44khz_16bit_mono
 298                 return new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, null, bitDepth, null,
 299                         frequency);
 300             default:
 301                 logger.warn("Audio format {} is not yet supported.", format);
 302                 return null;
 303         }
 304     }
 305
 306     /**
 307      * Checks parameters and calls the API to synthesize voice.
 308      *
 309      * @param text Input text.
 310      * @param voice Selected voice.
 311      * @param requestedFormat Format that is supported by the target sink as well.
 312      * @return Output audio stream
 313      * @throws TTSException in case the service is unavailable or a parameter is invalid.
 314      */
 315     @Override
 316     public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
 317         logger.debug("Synthesize '{}' for voice '{}' in format {}", text, voice.getUID(), requestedFormat);
 318         // Validate known api key
 319         if (!apiImpl.isInitialized()) {
 320             throw new TTSException("Missing service configuration.");
 321         }
 322         // Validate arguments
 323         // trim text
 324         String trimmedText = text.trim();
 325         if (trimmedText.isEmpty()) {
 326             throw new TTSException("The passed text is null or empty");
 327         }
 328         if (!this.allVoices.contains(voice)) {
 329             throw new TTSException("The passed voice is unsupported");
 330         }
 331         boolean isAudioFormatSupported = false;
 332         for (AudioFormat currentAudioFormat : this.audioFormats) {
 333             if (currentAudioFormat.isCompatible(requestedFormat)) {
 334                 isAudioFormatSupported = true;
 335                 break;
 336             }
 337         }
 338         if (!isAudioFormatSupported) {
 339             throw new TTSException("The passed AudioFormat is unsupported");
 340         }
 341
 342         // create the audio byte array for given text, locale, format
 343         byte[] audio = apiImpl.synthesizeSpeech(trimmedText, (GoogleTTSVoice) voice, requestedFormat.getCodec());
 344         if (audio == null) {
 345             throw new TTSException("Could not synthesize text via Google Cloud TTS Service");
 346         }
 347
 348         // compute the real format returned by google if wave file
 349         AudioFormat finalFormat = requestedFormat;
 350         if (AudioFormat.CONTAINER_WAVE.equals(requestedFormat.getContainer())) {
 351             finalFormat = parseAudioFormat(audio);
 352         }
 353
 354         return new ByteArrayAudioStream(audio, finalFormat);
 355     }
 356
 357     private AudioFormat parseAudioFormat(byte[] audio) throws TTSException {
 358         try (InputStream inputStream = new ByteArrayInputStream(audio)) {
 359             return AudioWaveUtils.parseWavFormat(inputStream);
 360         } catch (IOException e) {
 361             throw new TTSException("Cannot parse WAV format", e);
 362         }
 363     }
 364 }