286 lines
9.2 KiB
TypeScript
286 lines
9.2 KiB
TypeScript
import { Button } from "@mantine/core";
|
|
import EventEmitter from "events";
|
|
import { useCallback, useEffect, useRef, useState } from "react";
|
|
import { split } from 'sentence-splitter';
|
|
import { cloneArrayBuffer, md5, sleep } from "../utils";
|
|
import * as idb from '../idb';
|
|
import { useAppDispatch, useAppSelector } from "../store";
|
|
import { selectElevenLabsApiKey } from "../store/api-keys";
|
|
import { selectVoice } from "../store/voices";
|
|
import { openElevenLabsApiKeyPanel } from "../store/settings-ui";
|
|
import { defaultElevenLabsVoiceID } from "./defaults";
|
|
import { FormattedMessage, useIntl } from "react-intl";
|
|
|
|
// Base URL of the ElevenLabs API; request paths such as `/v1/voices` are appended to it.
const endpoint = 'https://api.elevenlabs.io';
|
|
|
|
// The reader that most recently started playback. ElevenLabsReader.play() stops
// this reader before registering itself here.
let currentReader: ElevenLabsReader | null = null;
|
|
|
|
// In-memory cache of fetched audio data, keyed by `audio:<voiceID>:<md5 of the text segment>`.
const cache = new Map<string, ArrayBuffer>();
|
|
|
|
/**
 * Build the HTTP headers sent with an ElevenLabs API request.
 *
 * @param apiKey Key to send as `xi-api-key`. When omitted it is read from the
 *   `elevenlabs-api-key` localStorage entry, falling back to an empty string.
 * @returns An object carrying the API key and a JSON content type.
 */
export function createHeaders(apiKey = localStorage.getItem('elevenlabs-api-key') || '') {
    const headers = {
        'xi-api-key': apiKey,
        'content-type': 'application/json',
    };
    return headers;
}
|
|
|
|
/**
 * Request `/v1/voices` from the ElevenLabs API using the default headers.
 *
 * @returns The parsed JSON body of the response, returned as-is. The HTTP status
 *   is not inspected, so an error payload is returned just like a success payload.
 */
export async function getVoices() {
    const url = `${endpoint}/v1/voices`;
    const headers = createHeaders();
    const response = await fetch(url, { headers });
    return response.json();
}
|
|
|
|
// Shared AudioContext used to decode fetched audio and to play it back.
// It is resumed from the reader button's click handler.
const audioContext = new AudioContext();
|
|
|
|
/**
 * Reads the text content of a DOM element aloud using ElevenLabs text-to-speech.
 *
 * The element's text is split into segments, audio is fetched for each segment
 * (consulting the in-memory `cache` and the `idb` store first), and the decoded
 * segments are played one after another through the shared `audioContext`.
 *
 * Events emitted:
 * - `init`      – play() was called with a usable element and voice.
 * - `buffering` – playback is waiting for a segment's audio.
 * - `playing`   – a segment is about to start playing.
 * - `done`      – playback finished, failed to start, or was stopped.
 * - `error`     – a segment's audio did not become available in time.
 */
export default class ElevenLabsReader extends EventEmitter {
    private apiKey: string;
    private initialized = false;
    private cancelled = false;
    private textSegments: string[] = [];
    private currentTrack: number = -1;
    private nextTrack: number = 0;
    private audios: (AudioBuffer | null)[] = [];
    private element: HTMLElement | undefined | null;
    private voiceID = defaultElevenLabsVoiceID;
    // Bumped on every reset so that async work started by an earlier playback
    // session can tell it has been superseded, even after `cancelled` is cleared
    // again by a subsequent play().
    private generation = 0;
    currentSource: AudioBufferSourceNode | undefined;

    constructor() {
        super();
        this.apiKey = localStorage.getItem('elevenlabs-api-key') || '';
    }

    /** True when work started under `generation` should no longer touch this reader's state. */
    private isStale(generation: number) {
        return this.cancelled || generation !== this.generation;
    }

    /**
     * Generate audio for every text segment, three segments at a time.
     * Runs at most once per playback session (guarded by `initialized`).
     */
    private async createAudio() {
        if (this.initialized) {
            return;
        }
        this.initialized = true;

        const generation = this.generation;
        const chunkSize = 3;
        for (let i = 0; i < this.textSegments.length && !this.isStale(generation); i += chunkSize) {
            const chunk = this.textSegments.slice(i, i + chunkSize);
            await Promise.all(chunk.map((_, index) => this.createAudioForTextSegment(i + index, generation)));
        }
    }

    /**
     * Obtain and decode the audio for one text segment and store it in `audios[index]`.
     * Looks in the in-memory cache, then the `idb` store, then requests it from the API
     * with up to three attempts. Failures are logged and leave the slot empty.
     *
     * @param index Position of the segment in `textSegments`.
     * @param generation Playback session this request belongs to.
     */
    private async createAudioForTextSegment(index: number, generation = this.generation) {
        if (this.audios[index] || this.isStale(generation)) {
            return;
        }

        const text = this.textSegments[index];
        const hash = await md5(text);
        const cacheKey = `audio:${this.voiceID}:${hash}`;

        // decodeAudioData() detaches the ArrayBuffer it is given, so always hand it
        // a copy and keep the cached buffer intact for later plays.
        const cached = cache.get(cacheKey);
        let buffer = cached ? cloneArrayBuffer(cached) : undefined;

        if (!buffer) {
            buffer = await idb.get(cacheKey);
        }

        if (!buffer) {
            const url = endpoint + '/v1/text-to-speech/' + this.voiceID;
            const maxAttempts = 3;

            for (let i = 0; i < maxAttempts && !this.isStale(generation); i++) {
                try {
                    const response = await fetch(url, {
                        headers: createHeaders(this.apiKey),
                        method: 'POST',
                        body: JSON.stringify({
                            text,
                        }),
                    });

                    if (response.ok) {
                        buffer = await response.arrayBuffer();
                        cache.set(cacheKey, cloneArrayBuffer(buffer));
                        idb.set(cacheKey, cloneArrayBuffer(buffer));
                        break;
                    }
                } catch (e) {
                    console.error(e);
                }

                // Back off before the next attempt; there is nothing to wait for after the last one.
                if (i < maxAttempts - 1) {
                    await sleep(2000 + i * 5000); // increasing backoff time
                }
            }
        }

        if (buffer && !this.isStale(generation)) {
            try {
                const data = await audioContext.decodeAudioData(buffer);
                // Re-check after the await: stop() may have run while decoding.
                if (!this.isStale(generation)) {
                    this.audios[index] = data;
                }
            } catch (e) {
                console.error('failed to decode audio', e);
            }
        }
    }

    /**
     * Wait until the audio for `index` is available, emitting `buffering` while waiting.
     * Starts audio generation if it has not been started yet. If the wait times out,
     * the reader is reset and `error` is emitted; if playback is stopped while waiting,
     * the method returns silently.
     *
     * @param index Segment to wait for.
     * @param timeoutSeconds Maximum time to wait.
     * @param generation Playback session this wait belongs to.
     */
    private async waitForAudio(index: number, timeoutSeconds = 30, generation = this.generation) {
        if (!this.initialized) {
            this.createAudio().catch(e => console.error(e));
        }

        const timeoutAt = Date.now() + timeoutSeconds * 1000;
        while (Date.now() < timeoutAt && !this.isStale(generation)) {
            if (this.audios[index]) {
                return;
            }
            this.emit('buffering');
            await sleep(100);
        }

        if (this.isStale(generation)) {
            // Stopped or superseded while waiting: this is not a timeout, and the
            // listeners may already have been removed, so do not emit `error`.
            return;
        }

        // Genuine timeout: clear the session so a later play() starts from scratch.
        this.reset();
        this.emit('error', new Error('Timed out waiting for audio'));
    }

    /**
     * Start reading `element` aloud. Any reader that is currently registered as
     * playing is stopped first. Does nothing if the element or voice is missing.
     *
     * @param element Element whose text content should be read.
     * @param voiceID ElevenLabs voice to use.
     * @param apiKey ElevenLabs API key to use for requests.
     */
    public async play(element: HTMLElement, voiceID: string = defaultElevenLabsVoiceID, apiKey = this.apiKey) {
        this.element = element;
        this.voiceID = voiceID;
        this.apiKey = apiKey;

        if (!this.element || !this.voiceID) {
            return;
        }

        this.emit('init');

        if (currentReader != null) {
            await currentReader.stop();
        }
        currentReader = this;

        this.cancelled = false;

        if (!this.textSegments?.length) {
            this.textSegments = this.extractTextSegments();
        }

        if (!this.textSegments.length) {
            // Nothing to read: finish immediately instead of buffering until the timeout.
            this.emit('done');
            return;
        }

        await this.next(true);
    }

    /**
     * Play the next segment. When called without `play` and the track index has
     * wrapped back to the first segment, playback is complete and `done` is emitted.
     *
     * @param play True for the first segment of a session, which must always be played.
     */
    private async next(play = false) {
        if (this.cancelled) {
            return;
        }

        if (!play && this.nextTrack === 0) {
            this.emit('done');
            return;
        }

        const generation = this.generation;
        const currentTrack = this.nextTrack;
        this.currentTrack = currentTrack;
        this.nextTrack = (currentTrack + 1) % this.textSegments.length;

        await this.waitForAudio(currentTrack, 30, generation);

        if (this.isStale(generation)) {
            return;
        }

        this.emit('playing');

        try {
            const source = audioContext.createBufferSource();
            this.currentSource = source;
            source.buffer = this.audios[currentTrack];
            source.connect(audioContext.destination);
            source.onended = () => {
                this.next().catch(e => console.error(e));
            };
            source.start();
        } catch (e) {
            console.error('failed to play', e);
            this.emit('done');
        }
    }

    /** Stop playback, discard the current session's state, and emit `done`. */
    public stop() {
        this.reset();
        this.emit('done');
    }

    /** Silence the current source and clear all per-session state without emitting events. */
    private reset() {
        const source = this.currentSource;
        this.currentSource = undefined;
        if (source) {
            // Detach the handler first so the stopped source cannot advance a later session.
            source.onended = null;
            try {
                source.stop();
            } catch (e) {
                // stop() throws if the source was never started; nothing to silence then.
            }
        }
        this.audios = [];
        this.textSegments = [];
        this.nextTrack = 0;
        this.cancelled = true;
        this.initialized = false;
        this.generation++;
    }

    /**
     * Collect the text to read from the current element. Outermost matching blocks
     * are used; paragraphs, list items and blockquotes are split into sentences,
     * other blocks are read as a single segment. Empty segments are dropped.
     */
    private extractTextSegments() {
        const selector = 'p, li, th, td, blockquote, pre code, h1, h2, h3, h4, h5, h6';
        const nodes = Array.from(this.element?.querySelectorAll(selector) || []);
        const lines: string[] = [];
        // Skip blocks nested inside another matching block so their text is not read twice.
        const blocks = nodes.filter(node => !node.parentElement?.closest(selector) && node.textContent);
        for (const block of blocks) {
            const text = block.textContent ?? '';
            const tagName = block.tagName.toLowerCase();
            if (tagName === 'p' || tagName === 'li' || tagName === 'blockquote') {
                for (const sentence of split(text)) {
                    lines.push(sentence.raw.trim());
                }
            } else {
                lines.push(text.trim());
            }
        }
        return lines.filter(line => line.length);
    }
}
|
|
|
|
/**
 * Button that starts or stops text-to-speech playback of the element matched by
 * `props.selector`. If no ElevenLabs API key is configured, clicking it opens the
 * API key panel instead.
 *
 * @param props.selector CSS selector of the element whose text should be read.
 */
export function ElevenLabsReaderButton(props: { selector: string }) {
    const elevenLabsApiKey = useAppSelector(selectElevenLabsApiKey);
    const dispatch = useAppDispatch();
    const intl = useIntl();

    const voice = useAppSelector(selectVoice);

    const [status, setStatus] = useState<'idle' | 'init' | 'playing' | 'buffering'>('idle');
    // const [error, setError] = useState(false);

    // Create the reader once; `useRef(new ElevenLabsReader())` would construct
    // (and throw away) a new reader on every render.
    const readerRef = useRef<ElevenLabsReader | null>(null);
    if (readerRef.current === null) {
        readerRef.current = new ElevenLabsReader();
    }
    const reader = readerRef.current;

    useEffect(() => {
        // Mirror the reader's lifecycle events into the button status.
        reader.on('init', () => setStatus('init'));
        reader.on('playing', () => setStatus('playing'));
        reader.on('buffering', () => setStatus('buffering'));
        reader.on('error', () => {
            setStatus('idle');
            // setError(true);
        });
        reader.on('done', () => setStatus('idle'));

        return () => {
            reader.removeAllListeners();
            reader.stop();
        };
    }, [reader, props.selector]);

    const onClick = useCallback(() => {
        if (status !== 'idle') {
            reader.stop();
            return;
        }

        if (!elevenLabsApiKey?.length) {
            dispatch(openElevenLabsApiKeyPanel());
            return;
        }

        const element = document.querySelector<HTMLElement>(props.selector);
        if (!element) {
            // Nothing matches the selector, so there is nothing to read.
            return;
        }

        // Resume from within the click handler, then start playback; log failures
        // rather than leaving the promises unhandled.
        audioContext.resume().catch(e => console.error(e));
        reader.play(element, voice, elevenLabsApiKey).catch(e => console.error(e));
    }, [dispatch, reader, status, props.selector, elevenLabsApiKey, voice]);

    return (
        <Button variant="subtle" size="sm" compact onClickCapture={onClick} loading={status === 'init'}>
            {status !== 'init' && <i className="fa fa-headphones" />}
            {status === 'idle' && <span>
                <FormattedMessage defaultMessage="Play" description="Label for the button that starts text-to-speech playback" />
            </span>}
            {status === 'buffering' && <span>
                <FormattedMessage defaultMessage="Loading audio..." description="Message indicating that text-to-speech audio is buffering" />
            </span>}
            {status !== 'idle' && status !== 'buffering' && <span>
                <FormattedMessage defaultMessage="Stop" description="Label for the button that stops text-to-speech playback" />
            </span>}
        </Button>
    );
}
|