Add Google Cloud Chirp 3 TTS provider with service account support

2026-05-13 02:42:54 +02:00
parent 6e9a26557f
commit f05e57493c
7 changed files with 992 additions and 13 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
    "prepublishOnly": "npm run build"
  },
  "dependencies": {
+    "@google-cloud/text-to-speech": "^6.4.1",
    "@google/generative-ai": "^0.24.0",
    "axios": "^1.6.2",
    "dotenv": "^16.3.1",
@@ -51,4 +52,4 @@
  ],
  "author": "",
  "license": "MIT"
-}
+}
--- a/src/config/config.ts
+++ b/src/config/config.ts
@@ -86,6 +86,12 @@ export function getDefaultConfig(): Config {
        apiKey: process.env.ELEVENLABS_API_KEY,
        model: "eleven_multilingual_v2",
        voice: "JBFqnCBsd6RMkjVDRZzb"
+      },
+      google: {
+        apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
+        keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
+        model: "chirp-hd",
+        voice: "en-US-Chirp-HD-F"
      }
    },
    
--- a/src/interfaces/index.ts
+++ b/src/interfaces/index.ts
@@ -46,6 +46,7 @@ export interface TTSProviderConfig {
  apiKey?: string;
  model: string;
  voice?: string;
+  keyFilename?: string;
 }

 export interface TTSProvider {
--- a/src/providers/tts/googleCloudTTSProvider.ts
+++ b/src/providers/tts/googleCloudTTSProvider.ts
@@ -0,0 +1,94 @@
+import fs from 'fs';
+import { execSync } from 'child_process';
+import { TextToSpeechClient } from '@google-cloud/text-to-speech';
+import { google } from '@google-cloud/text-to-speech/build/protos/protos';
+import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
+import { getAudioDuration } from '../../utils/mediaUtils';
+
+/**
+ * Text-to-speech provider backed by the Google Cloud Text-to-Speech client.
+ *
+ * Requests MP3 audio for a piece of text, writes it to `outputPath`, and
+ * reports the duration of the written file. Speed changes are applied
+ * locally with ffmpeg after synthesis.
+ */
+export class GoogleCloudTTSProvider implements TTSProvider {
+  private readonly config: TTSProviderConfig;
+  private readonly client: TextToSpeechClient;
+
+  /**
+   * Builds the underlying client from the provider configuration.
+   *
+   * @param config Provider settings. `apiKey` is always forwarded to the
+   *   client (possibly undefined); `keyFilename` is forwarded only when set.
+   */
+  constructor(config: TTSProviderConfig) {
+    this.config = config;
+
+    const clientConfig: { apiKey?: string; fallback: boolean; keyFilename?: string } = {
+      apiKey: config.apiKey,
+      fallback: true
+    };
+
+    if (config.keyFilename) {
+      clientConfig.keyFilename = config.keyFilename;
+    }
+
+    this.client = new TextToSpeechClient(clientConfig);
+  }
+
+  /**
+   * Synthesizes `text` to an MP3 file at `outputPath`.
+   *
+   * Failures are not propagated: on any error the message is logged, a
+   * one-second silent MP3 is written to `outputPath` instead, and a result
+   * with `duration: 1` and `cost: 0` is returned.
+   *
+   * @param text Text to synthesize.
+   * @param outputPath Destination path of the MP3 file.
+   * @param options Per-call overrides. `voice` falls back to the configured
+   *   voice and then to 'en-US-Chirp-HD-F'; `speedFactor` falls back to 1.0
+   *   (a value of 0 is treated as unset).
+   * @returns Duration of the written file as reported by `getAudioDuration`,
+   *   and a cost equal to the number of characters in `text`.
+   */
+  async textToSpeech(
+    text: string,
+    outputPath: string,
+    options: TTSOptions = {}
+  ): Promise<TTSResult> {
+    try {
+      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';
+      const speedFactor = options.speedFactor || 1.0;
+
+      // The speed factor is deliberately NOT sent as `speakingRate`: it is
+      // applied below with ffmpeg, and sending it here as well could apply
+      // the speed change twice.
+      // NOTE(review): the configured `model` is not part of this request;
+      // only the voice name is sent. Confirm that is intended.
+      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
+        input: { text },
+        voice: {
+          languageCode: this.extractLanguageCode(voice),
+          name: voice
+        },
+        audioConfig: {
+          audioEncoding: 'MP3'
+        }
+      };
+
+      const [response] = await this.client.synthesizeSpeech(request);
+
+      if (!response.audioContent) {
+        throw new Error('No audio content returned from Google Cloud TTS');
+      }
+
+      // Binary payloads are copied as-is. A string payload is decoded as
+      // base64 (presumably how bytes are serialized as text - TODO confirm
+      // against the client's REST fallback behavior).
+      const audioBuffer = typeof response.audioContent === 'string'
+        ? Buffer.from(response.audioContent, 'base64')
+        : Buffer.from(response.audioContent);
+
+      if (speedFactor === 1.0) {
+        fs.writeFileSync(outputPath, audioBuffer);
+      } else {
+        const tempOutputPath = this.buildTempPath(outputPath);
+        try {
+          fs.writeFileSync(tempOutputPath, audioBuffer);
+          // NOTE(review): the paths are interpolated into a shell command;
+          // a path containing quotes or shell metacharacters would break it.
+          execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
+        } finally {
+          // Remove the intermediate file even when ffmpeg fails.
+          if (fs.existsSync(tempOutputPath)) {
+            fs.unlinkSync(tempOutputPath);
+          }
+        }
+      }
+
+      return {
+        duration: getAudioDuration(outputPath),
+        cost: text.length
+      };
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.error('Google Cloud TTS error:', message);
+      // Best-effort fallback: write one second of silence to outputPath.
+      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
+      return {
+        duration: 1,
+        cost: 0
+      };
+    }
+  }
+
+  /**
+   * Derives the temporary file path used before the ffmpeg speed pass.
+   *
+   * Inserts `_temp` before the file extension; when the path has no
+   * extension, `_temp` is appended so the result never equals `outputPath`.
+   */
+  private buildTempPath(outputPath: string): string {
+    const tempPath = outputPath.replace(/\.\w+$/, '_temp$&');
+    return tempPath === outputPath ? `${outputPath}_temp` : tempPath;
+  }
+
+  /**
+   * Returns the first two dash-separated parts of a voice name joined by a
+   * dash (e.g. 'en-US-Chirp-HD-F' -> 'en-US'), or 'en-US' when the name has
+   * fewer than two parts.
+   */
+  private extractLanguageCode(voiceName: string): string {
+    const parts = voiceName.split('-');
+    if (parts.length >= 2) {
+      return `${parts[0]}-${parts[1]}`;
+    }
+    return 'en-US';
+  }
+}
--- a/src/providers/tts/index.ts
+++ b/src/providers/tts/index.ts
@@ -1,3 +1,4 @@
 export * from './ttsProviderFactory';
 export * from './openAITTSProvider';
-export * from './elevenLabsTTSProvider';
+export * from './elevenLabsTTSProvider';
+export * from './googleCloudTTSProvider';
--- a/src/providers/tts/ttsProviderFactory.ts
+++ b/src/providers/tts/ttsProviderFactory.ts
@@ -2,6 +2,7 @@ import { TTSProvider } from '../../interfaces';
 import { Config } from '../../config/config';
 import { OpenAITTSProvider } from './openAITTSProvider';
 import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
+import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';

 /**
 * Factory for creating TTS providers
@@ -20,6 +21,8 @@ export class TTSProviderFactory {
        return new OpenAITTSProvider(providerConfig);
      case 'elevenlabs':
        return new ElevenLabsTTSProvider(providerConfig);
+      case 'google':
+        return new GoogleCloudTTSProvider(providerConfig);
      // Add other providers here
      default:
        throw new Error(`TTS provider "${providerName}" not implemented.`);