// chat-with-gpt/app/src/tts/elevenlabs.tsx

import { Button } from "@mantine/core";
import EventEmitter from "events";
import { useCallback, useEffect, useRef, useState } from "react";
import { split } from 'sentence-splitter';
import { cloneArrayBuffer, md5, sleep } from "../utils";
import * as idb from '../idb';
import { useAppDispatch, useAppSelector } from "../store";
import { selectElevenLabsApiKey } from "../store/api-keys";
import { selectVoice } from "../store/voices";
import { openElevenLabsApiKeyPanel } from "../store/settings-ui";
import { defaultElevenLabsVoiceID } from "./defaults";
import { FormattedMessage, useIntl } from "react-intl";
// Base URL for all ElevenLabs API requests.
const endpoint = 'https://api.elevenlabs.io';
// The reader instance currently playing, if any; starting a new reader stops this one first.
let currentReader: ElevenLabsReader | null = null;
// In-memory cache of synthesized audio, keyed by `audio:<voiceID>:<md5(text)>`.
const cache = new Map<string, ArrayBuffer>();
/**
 * Builds the HTTP headers required by the ElevenLabs API: the caller's API
 * key (falling back to the one persisted in localStorage) and a JSON
 * content type.
 */
export function createHeaders(apiKey = localStorage.getItem('elevenlabs-api-key') || '') {
    return {
        'content-type': 'application/json',
        'xi-api-key': apiKey,
    };
}
/**
 * Fetches the list of available voices from the ElevenLabs API and returns
 * the parsed JSON body as-is. Note: no `response.ok` check is performed, so
 * an API error is returned as the error payload rather than thrown.
 */
export async function getVoices() {
    const response = await fetch(`${endpoint}/v1/voices`, { headers: createHeaders() });
    return response.json();
}
// Shared AudioContext used for both decoding and playback. Created at module
// load, so it may begin in the 'suspended' state; the play button calls
// audioContext.resume() from the click handler (a user gesture) before playing.
const audioContext = new AudioContext();
export default class ElevenLabsReader extends EventEmitter {
private apiKey: string;
private initialized = false;
private cancelled = false;
private textSegments: string[] = [];
private currentTrack: number = -1;
private nextTrack: number = 0;
private audios: (AudioBuffer | null)[] = [];
private element: HTMLElement | undefined | null;
private voiceID = defaultElevenLabsVoiceID;
currentSource: AudioBufferSourceNode | undefined;
constructor() {
super();
this.apiKey = localStorage.getItem('elevenlabs-api-key') || '';
}
private async createAudio() {
if (this.initialized) {
return;
}
this.initialized = true;
const chunkSize = 3;
for (let i = 0; i < this.textSegments.length && !this.cancelled; i += chunkSize) {
const chunk = this.textSegments.slice(i, i + chunkSize);
await Promise.all(chunk.map((_, index) => this.createAudioForTextSegment(i + index)));
}
}
private async createAudioForTextSegment(index: number) {
if (this.audios[index] || this.cancelled) {
return;
}
const hash = await md5(this.textSegments[index]);
const cacheKey = `audio:${this.voiceID}:${hash}`;
let buffer = cache.get(cacheKey);
if (!buffer) {
buffer = await idb.get(cacheKey);
}
if (!buffer) {
const url = endpoint + '/v1/text-to-speech/' + this.voiceID;
const maxAttempts = 3;
for (let i = 0; i < maxAttempts && !this.cancelled; i++) {
try {
const response = await fetch(url, {
headers: createHeaders(this.apiKey),
method: 'POST',
body: JSON.stringify({
text: this.textSegments[index],
}),
});
if (response.ok) {
buffer = await response.arrayBuffer();
cache.set(cacheKey, cloneArrayBuffer(buffer));
idb.set(cacheKey, cloneArrayBuffer(buffer));
break;
}
} catch (e) {
console.error(e);
}
await sleep(2000 + i * 5000); // increasing backoff time
}
}
if (buffer) {
const data = await audioContext.decodeAudioData(buffer);
this.audios[index] = data;
}
}
private async waitForAudio(index: number, timeoutSeconds = 30) {
if (!this.initialized) {
this.createAudio().then(() => { });
}
const timeoutAt = Date.now() + timeoutSeconds * 1000;
while (Date.now() < timeoutAt && !this.cancelled) {
if (this.audios[index]) {
return;
}
this.emit('buffering');
await sleep(100);
}
this.cancelled = true;
this.emit('error', new Error('Timed out waiting for audio'));
}
public async play(element: HTMLElement, voiceID: string = defaultElevenLabsVoiceID, apiKey = this.apiKey) {
this.element = element;
this.voiceID = voiceID;
this.apiKey = apiKey;
if (!this.element || !this.voiceID) {
return;
}
this.emit('init');
if (currentReader != null) {
await currentReader.stop();
}
currentReader = this;
this.cancelled = false;
if (!this.textSegments?.length) {
this.textSegments = this.extractTextSegments();
}
await this.next(true);
}
private async next(play = false) {
if (this.cancelled) {
return;
}
if (!play && this.nextTrack === 0) {
this.emit('done');
return;
}
const currentTrack = this.nextTrack;
this.currentTrack = currentTrack;
const nextTrack = (this.nextTrack + 1) % this.textSegments.length;
this.nextTrack = nextTrack;
await this.waitForAudio(currentTrack);
if (this.cancelled) {
return;
}
this.emit('playing');
try {
this.currentSource = audioContext.createBufferSource();
this.currentSource.buffer = this.audios[currentTrack];
this.currentSource.connect(audioContext.destination);
this.currentSource.onended = () => {
this.next();
};
this.currentSource.start();
} catch (e) {
console.error('failed to play', e);
this.emit('done');
}
}
public stop() {
if (this.currentSource) {
this.currentSource.stop();
}
this.audios = [];
this.textSegments = [];
this.nextTrack = 0;
this.cancelled = true;
this.initialized = false;
this.emit('done');
}
private extractTextSegments() {
const selector = 'p, li, th, td, blockquote, pre code, h1, h2, h3, h3, h5, h6';
const nodes = Array.from(this.element?.querySelectorAll(selector) || []);
const lines: string[] = [];
const blocks = nodes.filter(node => !node.parentElement?.closest(selector) && node.textContent);
for (const block of blocks) {
const tagName = block.tagName.toLowerCase();
if (tagName === 'p' || tagName === 'li' || tagName === 'blockquote') {
const sentences = split(block.textContent!);
for (const sentence of sentences) {
lines.push(sentence.raw.trim());
}
} else {
lines.push(block.textContent!.trim());
}
}
return lines.filter(line => line.length);
}
}
/**
 * Play/stop button that reads a message aloud via ElevenLabs text-to-speech.
 * `props.selector` is a CSS selector for the DOM element whose text is read.
 * If no API key is configured, clicking opens the API-key settings panel.
 */
export function ElevenLabsReaderButton(props: { selector: string }) {
    const elevenLabsApiKey = useAppSelector(selectElevenLabsApiKey);
    const dispatch = useAppDispatch();
    // NOTE(review): `intl` appears unused here — localization below is handled
    // entirely by <FormattedMessage>. Confirm before removing.
    const intl = useIntl();
    const voice = useAppSelector(selectVoice);
    // UI state machine: idle -> init (starting) -> buffering/playing -> idle.
    const [status, setStatus] = useState<'idle' | 'init' | 'playing' | 'buffering'>('idle');
    // const [error, setError] = useState(false);
    // One reader instance per button, stable across renders.
    const reader = useRef(new ElevenLabsReader());
    useEffect(() => {
        // Mirror reader events into component state; tear everything down when
        // the selector changes or the component unmounts.
        const currentReader = reader.current;
        currentReader.on('init', () => setStatus('init'));
        currentReader.on('playing', () => setStatus('playing'));
        currentReader.on('buffering', () => setStatus('buffering'));
        currentReader.on('error', () => {
            setStatus('idle');
            // setError(true);
        });
        currentReader.on('done', () => setStatus('idle'));
        return () => {
            currentReader.removeAllListeners();
            currentReader.stop();
        };
    }, [props.selector]);
    const onClick = useCallback(() => {
        if (status === 'idle') {
            if (!elevenLabsApiKey?.length) {
                // No key configured: send the user to settings instead of playing.
                dispatch(openElevenLabsApiKeyPanel());
                return;
            }
            // Resume inside the click handler: AudioContexts created before a
            // user gesture start suspended.
            audioContext.resume();
            reader.current.play(document.querySelector(props.selector)!, voice, elevenLabsApiKey);
        } else {
            // Any non-idle state: the button acts as "stop".
            reader.current.stop();
        }
    }, [dispatch, status, props.selector, elevenLabsApiKey, voice]);
    return (
        <Button variant="subtle" size="sm" compact onClickCapture={onClick} loading={status === 'init'}>
            {status !== 'init' && <i className="fa fa-headphones" />}
            {status === 'idle' && <span>
                <FormattedMessage defaultMessage="Play" description="Label for the button that starts text-to-speech playback" />
            </span>}
            {status === 'buffering' && <span>
                <FormattedMessage defaultMessage="Loading audio..." description="Message indicating that text-to-speech audio is buffering" />
            </span>}
            {status !== 'idle' && status !== 'buffering' && <span>
                <FormattedMessage defaultMessage="Stop" description="Label for the button that stops text-to-speech playback" />
            </span>}
        </Button>
    );
}