git.basschouten.com Git - openhab-addons.git/blob

   1 /**
   2  * Copyright (c) 2010-2023 Contributors to the openHAB project
   3  *
   4  * See the NOTICE file(s) distributed with this work for additional
   5  * information.
   6  *
   7  * This program and the accompanying materials are made available under the
   8  * terms of the Eclipse Public License 2.0 which is available at
   9  * http://www.eclipse.org/legal/epl-2.0
  10  *
  11  * SPDX-License-Identifier: EPL-2.0
  12  */
  13 package org.openhab.voice.googletts.internal;
  14
  15 import static org.openhab.voice.googletts.internal.GoogleTTSService.*;
  16
  17 import java.io.ByteArrayInputStream;
  18 import java.io.IOException;
  19 import java.io.InputStream;
  20 import java.math.BigInteger;
  21 import java.nio.charset.StandardCharsets;
  22 import java.security.MessageDigest;
  23 import java.security.NoSuchAlgorithmException;
  24 import java.util.Collections;
  25 import java.util.HashSet;
  26 import java.util.Locale;
  27 import java.util.Map;
  28 import java.util.Set;
  29
  30 import org.eclipse.jdt.annotation.NonNull;
  31 import org.eclipse.jdt.annotation.NonNullByDefault;
  32 import org.eclipse.jdt.annotation.Nullable;
  33 import org.openhab.core.audio.AudioFormat;
  34 import org.openhab.core.audio.AudioStream;
  35 import org.openhab.core.audio.ByteArrayAudioStream;
  36 import org.openhab.core.audio.utils.AudioWaveUtils;
  37 import org.openhab.core.auth.client.oauth2.OAuthFactory;
  38 import org.openhab.core.config.core.ConfigurableService;
  39 import org.openhab.core.voice.AbstractCachedTTSService;
  40 import org.openhab.core.voice.TTSCache;
  41 import org.openhab.core.voice.TTSException;
  42 import org.openhab.core.voice.TTSService;
  43 import org.openhab.core.voice.Voice;
  44 import org.openhab.voice.googletts.internal.dto.AudioEncoding;
  45 import org.osgi.framework.Constants;
  46 import org.osgi.service.cm.ConfigurationAdmin;
  47 import org.osgi.service.component.annotations.Activate;
  48 import org.osgi.service.component.annotations.Component;
  49 import org.osgi.service.component.annotations.Deactivate;
  50 import org.osgi.service.component.annotations.Modified;
  51 import org.osgi.service.component.annotations.Reference;
  52 import org.slf4j.Logger;
  53 import org.slf4j.LoggerFactory;
  54
  55 /**
  56  * Voice service implementation.
  57  *
  58  * @author Gabor Bicskei - Initial contribution
  59  */
  60 @Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "="
  61         + SERVICE_PID, service = TTSService.class)
  62 @ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
  63         + " Text-to-Speech", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
  64 public class GoogleTTSService extends AbstractCachedTTSService {
  65     /**
  66      * Service name
  67      */
  68     static final String SERVICE_NAME = "Google Cloud";
  69
  70     /**
  71      * Service id
  72      */
  73     static final String SERVICE_ID = "googletts";
  74
  75     /**
  76      * Service category
  77      */
  78     static final String SERVICE_CATEGORY = "voice";
  79
  80     /**
  81      * Service pid
  82      */
  83     static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID;
  84
  85     /**
  86      * Configuration parameters
  87      */
  88     private static final String PARAM_CLIENT_ID = "clientId";
  89     private static final String PARAM_CLIEND_SECRET = "clientSecret";
  90     static final String PARAM_AUTHCODE = "authcode";
  91     private static final String PARAM_PITCH = "pitch";
  92     private static final String PARAM_SPEAKING_RATE = "speakingRate";
  93     private static final String PARAM_VOLUME_GAIN_DB = "volumeGainDb";
  94
  95     /**
  96      * Logger.
  97      */
  98     private final Logger logger = LoggerFactory.getLogger(GoogleTTSService.class);
  99
 100     /**
 101      * Set of supported audio formats
 102      */
 103     private Set<AudioFormat> audioFormats = new HashSet<>();
 104
 105     /**
 106      * Google Cloud TTS API implementation
 107      */
 108     private @NonNullByDefault({}) GoogleCloudAPI apiImpl;
 109     private final ConfigurationAdmin configAdmin;
 110     private final OAuthFactory oAuthFactory;
 111
 112     /**
 113      * All voices for all supported locales
 114      */
 115     private Set<Voice> allVoices = new HashSet<>();
 116
 117     private final GoogleTTSConfig config = new GoogleTTSConfig();
 118
 119     @Activate
 120     public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin, final @Reference OAuthFactory oAuthFactory,
 121             @Reference TTSCache ttsCache, Map<String, Object> config) {
 122         super(ttsCache);
 123         this.configAdmin = configAdmin;
 124         this.oAuthFactory = oAuthFactory;
 125     }
 126
 127     /**
 128      * DS activate, with access to ConfigAdmin
 129      */
 130     @Activate
 131     protected void activate(Map<String, Object> config) {
 132         apiImpl = new GoogleCloudAPI(configAdmin, oAuthFactory);
 133         updateConfig(config);
 134     }
 135
 136     @Deactivate
 137     protected void dispose() {
 138         apiImpl.dispose();
 139         audioFormats = new HashSet<>();
 140         allVoices = new HashSet<>();
 141     }
 142
 143     /**
 144      * Initializing audio formats. Google supports 3 formats:
 145      * LINEAR16
 146      * Uncompressed 16-bit signed little-endian samples (Linear PCM). Audio content returned as LINEAR16
 147      * also contains a WAV header.
 148      * MP3
 149      * MP3 audio.
 150      * OGG_OPUS
 151      * Opus encoded audio wrapped in an ogg container. This is not supported by openHAB.
 152      *
 153      * @return Set of supported AudioFormats
 154      */
 155     private Set<AudioFormat> initAudioFormats() {
 156         logger.trace("Initializing audio formats");
 157         Set<AudioFormat> result = new HashSet<>();
 158         for (String format : apiImpl.getSupportedAudioFormats()) {
 159             AudioFormat audioFormat = getAudioFormat(format);
 160             if (audioFormat != null) {
 161                 result.add(audioFormat);
 162                 logger.trace("Audio format supported: {}", format);
 163             } else {
 164                 logger.trace("Audio format not supported: {}", format);
 165             }
 166         }
 167         return Collections.unmodifiableSet(result);
 168     }
 169
 170     /**
 171      * Loads available voices from Google API
 172      *
 173      * @return Set of available voices.
 174      */
 175     private Set<Voice> initVoices() {
 176         logger.trace("Initializing voices");
 177         Set<Voice> result = new HashSet<>();
 178         for (Locale locale : apiImpl.getSupportedLocales()) {
 179             result.addAll(apiImpl.getVoicesForLocale(locale));
 180         }
 181         if (logger.isTraceEnabled()) {
 182             for (Voice voice : result) {
 183                 logger.trace("Google Cloud TTS voice: {}", voice.getLabel());
 184             }
 185         }
 186         return Collections.unmodifiableSet(result);
 187     }
 188
 189     /**
 190      * Called by the framework when the configuration was updated.
 191      *
 192      * @param newConfig Updated configuration
 193      */
 194     @Modified
 195     private void updateConfig(Map<String, Object> newConfig) {
 196         logger.debug("Updating configuration");
 197         if (newConfig != null) {
 198             // client id
 199             String param = newConfig.containsKey(PARAM_CLIENT_ID) ? newConfig.get(PARAM_CLIENT_ID).toString() : null;
 200             config.clientId = param;
 201             if (param == null) {
 202                 logger.warn("Missing client id configuration to access Google Cloud TTS API.");
 203             }
 204             // client secret
 205             param = newConfig.containsKey(PARAM_CLIEND_SECRET) ? newConfig.get(PARAM_CLIEND_SECRET).toString() : null;
 206             config.clientSecret = param;
 207             if (param == null) {
 208                 logger.warn("Missing client secret configuration to access Google Cloud TTS API.");
 209             }
 210             // authcode
 211             param = newConfig.containsKey(PARAM_AUTHCODE) ? newConfig.get(PARAM_AUTHCODE).toString() : null;
 212             config.authcode = param;
 213
 214             // pitch
 215             param = newConfig.containsKey(PARAM_PITCH) ? newConfig.get(PARAM_PITCH).toString() : null;
 216             if (param != null) {
 217                 config.pitch = Double.parseDouble(param);
 218             }
 219
 220             // speakingRate
 221             param = newConfig.containsKey(PARAM_SPEAKING_RATE) ? newConfig.get(PARAM_SPEAKING_RATE).toString() : null;
 222             if (param != null) {
 223                 config.speakingRate = Double.parseDouble(param);
 224             }
 225
 226             // volumeGainDb
 227             param = newConfig.containsKey(PARAM_VOLUME_GAIN_DB) ? newConfig.get(PARAM_VOLUME_GAIN_DB).toString() : null;
 228             if (param != null) {
 229                 config.volumeGainDb = Double.parseDouble(param);
 230             }
 231
 232             if (config.clientId != null && !config.clientId.isEmpty() && config.clientSecret != null
 233                     && !config.clientSecret.isEmpty()) {
 234                 apiImpl.setConfig(config);
 235                 if (apiImpl.isInitialized()) {
 236                     allVoices = initVoices();
 237                     audioFormats = initAudioFormats();
 238                 }
 239             }
 240         } else {
 241             logger.warn("Missing Google Cloud TTS configuration.");
 242         }
 243     }
 244
 245     @Override
 246     public String getId() {
 247         return SERVICE_ID;
 248     }
 249
 250     @Override
 251     public String getLabel(@Nullable Locale locale) {
 252         return SERVICE_NAME;
 253     }
 254
 255     @Override
 256     public Set<Voice> getAvailableVoices() {
 257         return allVoices;
 258     }
 259
 260     @Override
 261     public Set<AudioFormat> getSupportedFormats() {
 262         return audioFormats;
 263     }
 264
 265     /**
 266      * Helper to create AudioFormat objects from Google names.
 267      *
 268      * @param format Google audio format.
 269      * @return Audio format object.
 270      */
 271     private @Nullable AudioFormat getAudioFormat(String format) {
 272         Integer bitDepth = 16;
 273         Long frequency = 44100L;
 274
 275         AudioEncoding encoding = AudioEncoding.valueOf(format);
 276
 277         switch (encoding) {
 278             case MP3:
 279                 // we use by default: MP3, 44khz_16bit_mono with bitrate 64 kbps
 280                 return new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_MP3, null, bitDepth, 64000,
 281                         frequency);
 282             case LINEAR16:
 283                 // we use by default: wav, 44khz_16bit_mono
 284                 return new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, null, bitDepth, null,
 285                         frequency);
 286             default:
 287                 logger.warn("Audio format {} is not yet supported.", format);
 288                 return null;
 289         }
 290     }
 291
 292     /**
 293      * Checks parameters and calls the API to synthesize voice.
 294      *
 295      * @param text Input text.
 296      * @param voice Selected voice.
 297      * @param requestedFormat Format that is supported by the target sink as well.
 298      * @return Output audio stream
 299      * @throws TTSException in case the service is unavailable or a parameter is invalid.
 300      */
 301     @Override
 302     public AudioStream synthesizeForCache(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
 303         logger.debug("Synthesize '{}' for voice '{}' in format {}", text, voice.getUID(), requestedFormat);
 304         // Validate known api key
 305         if (!apiImpl.isInitialized()) {
 306             throw new TTSException("Missing service configuration.");
 307         }
 308         // Validate arguments
 309         // trim text
 310         String trimmedText = text.trim();
 311         if (trimmedText.isEmpty()) {
 312             throw new TTSException("The passed text is null or empty");
 313         }
 314         if (!this.allVoices.contains(voice)) {
 315             throw new TTSException("The passed voice is unsupported");
 316         }
 317         boolean isAudioFormatSupported = false;
 318         for (AudioFormat currentAudioFormat : this.audioFormats) {
 319             if (currentAudioFormat.isCompatible(requestedFormat)) {
 320                 isAudioFormatSupported = true;
 321                 break;
 322             }
 323         }
 324         if (!isAudioFormatSupported) {
 325             throw new TTSException("The passed AudioFormat is unsupported");
 326         }
 327
 328         // create the audio byte array for given text, locale, format
 329         byte[] audio = apiImpl.synthesizeSpeech(trimmedText, (GoogleTTSVoice) voice, requestedFormat.getCodec());
 330         if (audio == null) {
 331             throw new TTSException("Could not synthesize text via Google Cloud TTS Service");
 332         }
 333
 334         // compute the real format returned by google if wave file
 335         AudioFormat finalFormat = requestedFormat;
 336         if (AudioFormat.CONTAINER_WAVE.equals(requestedFormat.getContainer())) {
 337             finalFormat = parseAudioFormat(audio);
 338         }
 339
 340         return new ByteArrayAudioStream(audio, finalFormat);
 341     }
 342
 343     private AudioFormat parseAudioFormat(byte[] audio) throws TTSException {
 344         try (InputStream inputStream = new ByteArrayInputStream(audio)) {
 345             return AudioWaveUtils.parseWavFormat(inputStream);
 346         } catch (IOException e) {
 347             throw new TTSException("Cannot parse WAV format", e);
 348         }
 349     }
 350
 351     @Override
 352     public @NonNull String getCacheKey(@NonNull String text, @NonNull Voice voice,
 353             @NonNull AudioFormat requestedFormat) {
 354         try {
 355             MessageDigest md = MessageDigest.getInstance("MD5");
 356             byte[] bytesOfMessage = (config.toConfigString() + text + requestedFormat).getBytes(StandardCharsets.UTF_8);
 357             String hash = String.format("%032x", new BigInteger(1, md.digest(bytesOfMessage)));
 358             return ((GoogleTTSVoice) voice).getTechnicalName() + "_" + hash;
 359         } catch (NoSuchAlgorithmException e) {
 360             // should not happen
 361             logger.warn("Could not create MD5 hash for '{}'", text, e);
 362             return "nomd5algorithm";
 363         }
 364     }
 365 }