Merge pull request #58 from tluyben/whisper-stt-api

Whisper stt api
Cogent Apps 2023-03-20 06:06:18 -07:00 committed by GitHub
commit 95bf0aef6c
11 changed files with 300 additions and 18 deletions

File: package.json

@@ -4,6 +4,7 @@
     "dependencies": {
         "@auth0/auth0-spa-js": "^2.0.4",
         "@emotion/css": "^11.10.6",
+        "@emotion/react": "^11.10.6",
         "@emotion/styled": "^11.10.6",
         "@mantine/core": "^5.10.5",
         "@mantine/hooks": "^5.10.5",
@@ -20,6 +21,7 @@
         "jshashes": "^1.0.8",
         "localforage": "^1.10.0",
         "match-sorter": "^6.3.1",
+        "mic-recorder-to-mp3": "^2.2.2",
         "minisearch": "^6.0.1",
         "natural": "^6.2.0",
         "openai": "^3.2.1",

File: MessageInput component (app)

@@ -1,7 +1,7 @@
 import styled from '@emotion/styled';
 import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
 import { useMediaQuery } from '@mantine/hooks';
-import { useCallback, useMemo } from 'react';
+import { useCallback, useMemo, useState } from 'react';
 import { FormattedMessage, useIntl } from 'react-intl';
 import { useLocation } from 'react-router-dom';
 import { useAppContext } from '../context';
@@ -9,6 +9,10 @@ import { useAppDispatch, useAppSelector } from '../store';
 import { selectMessage, setMessage } from '../store/message';
 import { selectTemperature } from '../store/parameters';
 import { openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
+import { speechRecognition } from '../speech-recognition-types.d'
+import MicRecorder from 'mic-recorder-to-mp3';
+import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
+import { Mp3Encoder } from 'lamejs';

 const Container = styled.div`
     background: #292933;
@@ -34,11 +38,53 @@ export interface MessageInputProps {
     disabled?: boolean;
 }

+async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
+    const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
+    const audioContext = new AudioContext();
+    const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
+    const duration = audioBuffer.duration;
+    const sampleRate = audioBuffer.sampleRate;
+    const numChannels = audioBuffer.numberOfChannels;
+    const bytesPerSample = 2; // 16-bit audio
+    const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
+    const totalSamples = Math.floor(duration * sampleRate);
+    const numChunks = Math.ceil(totalSamples / samplesPerChunk);
+    const chunks: Array<File> = [];
+    for (let i = 0; i < numChunks; i++) {
+        const startSample = i * samplesPerChunk;
+        const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
+        const chunkDuration = (endSample - startSample) / sampleRate;
+        const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
+        for (let c = 0; c < numChannels; c++) {
+            const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
+            chunkBuffer.copyToChannel(channelData, c);
+        }
+        const chunkBlob = await new Promise<Blob>((resolve) => {
+            const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
+            const leftData = chunkBuffer.getChannelData(0);
+            const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
+            const mp3Data = encoder.encodeBuffer(leftData, rightData);
+            const blob = new Blob([mp3Data], { type: 'audio/mp3' });
+            resolve(blob);
+        });
+        chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
+    }
+    return chunks;
+}
+
 export default function MessageInput(props: MessageInputProps) {
     const temperature = useAppSelector(selectTemperature);
     const message = useAppSelector(selectMessage);
+    const [recording, setRecording] = useState(false);
     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
+    const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
+    const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
+    const openAIApiKey = useAppSelector(selectOpenAIApiKey);

     const context = useAppContext();
     const dispatch = useAppDispatch();
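
Note on `chunkAndEncodeMP3File` above: lamejs' `Mp3Encoder.encodeBuffer` expects 16-bit integer samples, while `AudioBuffer.getChannelData` returns a `Float32Array` of samples in the -1..1 range, and the encoder is never `flush()`ed, so the last frames of each chunk would be dropped. A corrected encoding step might look like the sketch below (a sketch only, assuming the lamejs API as documented; `floatTo16BitPCM` and `encodeChunkToMp3` are helpers introduced here, not part of the PR):

```typescript
import { Mp3Encoder } from 'lamejs';

// Convert Web Audio float samples (-1..1) into the 16-bit PCM lamejs expects.
function floatTo16BitPCM(input: Float32Array): Int16Array {
    const output = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
        const s = Math.max(-1, Math.min(1, input[i]));
        output[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    return output;
}

// Encode one chunk, remembering to flush the encoder so the final frames are written.
function encodeChunkToMp3(left: Float32Array, right: Float32Array,
                          numChannels: number, sampleRate: number): Blob {
    const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
    const parts: Int8Array[] = [];
    const frames = encoder.encodeBuffer(floatTo16BitPCM(left), floatTo16BitPCM(right));
    if (frames.length > 0) parts.push(frames);
    const tail = encoder.flush(); // without this, the tail of the MP3 is lost
    if (tail.length > 0) parts.push(tail);
    return new Blob(parts, { type: 'audio/mp3' });
}
```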
@@ -58,6 +104,69 @@ export default function MessageInput(props: MessageInputProps) {
         }
     }, [context, message, dispatch]);

+    const onSpeechStart = useCallback(() => {
+        if (!recording) {
+            setRecording(true);
+
+            // if we are using whisper, then we will just record with the browser and send to the API when done
+            if (useOpenAIWhisper) {
+                recorder.start().catch((e: any) => console.error(e));
+            } else {
+                speechRecognition.continuous = true;
+                speechRecognition.interimResults = true;
+
+                speechRecognition.onresult = (event) => {
+                    const transcript = event.results[event.results.length - 1][0].transcript;
+                    dispatch(setMessage(transcript));
+                };
+
+                speechRecognition.start();
+            }
+        } else {
+            setRecording(false);
+            if (useOpenAIWhisper) {
+                const mp3 = recorder.stop().getMp3();
+                mp3.then(async ([buffer, blob]) => {
+                    const file = new File(buffer, 'chat.mp3', {
+                        type: blob.type,
+                        lastModified: Date.now()
+                    });
+
+                    // TODO: cut in chunks
+
+                    var data = new FormData()
+                    data.append('file', file);
+                    data.append('model', 'whisper-1')
+
+                    try {
+                        const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+                            method: "POST",
+                            headers: {
+                                'Authorization': `Bearer ${openAIApiKey}`,
+                            },
+                            body: data,
+                        });
+                        const json = await response.json()
+
+                        if (json.text) {
+                            dispatch(setMessage(json.text));
+                        }
+                    } catch (e) {
+                        console.log(e)
+                    }
+                }).catch((e: any) => console.error(e));
+            } else {
+                speechRecognition.stop();
+            }
+        }
+    }, [recording, message, dispatch]);

     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
             e.preventDefault();
@@ -66,6 +175,7 @@ export default function MessageInput(props: MessageInputProps) {
     }, [onSubmit, props.disabled]);

     const rightSection = useMemo(() => {
         return (
             <div style={{
                 opacity: '0.8',
@@ -84,14 +194,20 @@ export default function MessageInput(props: MessageInputProps) {
                 <Loader size="xs" style={{ padding: '0 0.8rem 0 0.5rem' }} />
             </>)}
             {!context.generating && (
+            <>
+                <ActionIcon size="xl"
+                    onClick={onSpeechStart}>
+                    <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />
+                </ActionIcon>
                 <ActionIcon size="xl"
                     onClick={onSubmit}>
                     <i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
                 </ActionIcon>
+            </>
             )}
         </div>
         );
-    }, [onSubmit, props.disabled, context.generating]);
+    }, [recording, onSubmit, props.disabled, context.generating]);

     const disabled = context.generating;
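
Two observations on `onSpeechStart` above: the `useCallback` dependency list omits `useOpenAIWhisper`, `recorder` and `openAIApiKey`, so the callback can capture stale values, and the `TODO: cut in chunks` means recordings over Whisper's 25 MB upload limit will fail even though `chunkAndEncodeMP3File` is already defined. A sketch of how the chunking could be wired in (`transcribeInChunks` is a hypothetical helper name; the endpoint and form fields are exactly those used in the PR):

```typescript
// Hypothetical wiring for the "TODO: cut in chunks": transcribe each ~25 MB
// chunk separately and join the partial transcripts in order.
async function transcribeInChunks(recording: Blob, apiKey: string): Promise<string> {
    const chunks = await chunkAndEncodeMP3File(recording);
    const texts: string[] = [];
    for (const chunk of chunks) {
        const data = new FormData();
        data.append('file', chunk);
        data.append('model', 'whisper-1');
        const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
            method: 'POST',
            headers: { 'Authorization': `Bearer ${apiKey}` },
            body: data,
        });
        const json = await response.json();
        if (json.text) {
            texts.push(json.text);
        }
    }
    return texts.join(' ');
}
```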

File: UserOptionsTab settings component (app)

@@ -1,19 +1,21 @@
 import SettingsTab from "./tab";
 import SettingsOption from "./option";
-import { TextInput } from "@mantine/core";
+import { Checkbox, TextInput } from "@mantine/core";
 import { useCallback, useMemo } from "react";
 import { useAppDispatch, useAppSelector } from "../../store";
-import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent } from "../../store/api-keys";
+import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent, selectUseOpenAIWhisper, setUseOpenAIWhisperFromEvent } from "../../store/api-keys";
 import { selectSettingsOption } from "../../store/settings-ui";
 import { FormattedMessage, useIntl } from "react-intl";

 export default function UserOptionsTab(props: any) {
     const option = useAppSelector(selectSettingsOption);
     const openaiApiKey = useAppSelector(selectOpenAIApiKey);
+    const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
     const intl = useIntl()

     const dispatch = useAppDispatch();

     const onOpenAIApiKeyChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setOpenAIApiKeyFromEvent(event)), [dispatch]);
+    const onUseOpenAIWhisperChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setUseOpenAIWhisperFromEvent(event)), [dispatch]);

     const elem = useMemo(() => (
         <SettingsTab name="user">
@@ -28,6 +30,13 @@ export default function UserOptionsTab(props: any) {
                     <FormattedMessage defaultMessage="Find your API key here." description="Label for the link that takes the user to the page on the OpenAI website where they can find their API key." />
                 </a>
             </p>
+            <Checkbox
+                style={{ marginTop: '1rem' }}
+                id="use-openai-whisper-api" checked={useOpenAIWhisper!} onChange={onUseOpenAIWhisperChange}
+                label="Use the OpenAI Whisper API for speech recognition."
+            />
             <p>
                 <FormattedMessage defaultMessage="Your API key is stored only on this device and never transmitted to anyone except OpenAI." />
             </p>
@@ -36,7 +45,7 @@ export default function UserOptionsTab(props: any) {
             </p>
         </SettingsOption>
     </SettingsTab>
-    ), [option, openaiApiKey, onOpenAIApiKeyChange]);
+    ), [option, openaiApiKey, useOpenAIWhisper, onOpenAIApiKeyChange]);

     return elem;
 }

File: application bootstrap (IntlProvider setup)

@@ -72,7 +72,7 @@ async function bootstrapApplication() {
     root.render(
         <React.StrictMode>
-            <IntlProvider locale={navigator.language} messages={messages}>
+            <IntlProvider locale={navigator.language} defaultLocale="en-GB" messages={messages}>
                 <MantineProvider theme={{ colorScheme: "dark" }}>
                     <Provider store={store}>
                         <PersistGate loading={null} persistor={persistor}>

File: OpenAI client (createStreamingChatCompletion)

@@ -131,6 +131,7 @@ export async function createStreamingChatCompletion(messages: OpenAIMessage[], p
     });

     eventSource.addEventListener('message', async (event: any) => {
         if (event.data === '[DONE]') {
             emitter.emit('done');
             return;

File: speech-recognition-types.d.ts (new file)

@@ -0,0 +1,133 @@
+declare global {
+    interface Window {
+        SpeechRecognition: SpeechRecognition
+    }
+
+    interface SpeechGrammar {
+        src: string
+        weight: number
+    }
+
+    const SpeechGrammar: {
+        prototype: SpeechGrammar
+        new(): SpeechGrammar
+    }
+
+    interface SpeechGrammarList {
+        readonly length: number
+        addFromString(string: string, weight?: number): void
+        addFromURI(src: string, weight?: number): void
+        item(index: number): SpeechGrammar
+        [index: number]: SpeechGrammar
+    }
+
+    const SpeechGrammarList: {
+        prototype: SpeechGrammarList
+        new(): SpeechGrammarList
+    }
+
+    interface SpeechRecognitionEventMap {
+        audioend: Event
+        audiostart: Event
+        end: Event
+        error: SpeechRecognitionError
+        nomatch: SpeechRecognitionEvent
+        result: SpeechRecognitionEvent
+        soundend: Event
+        soundstart: Event
+        speechend: Event
+        speechstart: Event
+        start: Event
+    }
+
+    interface SpeechRecognition {
+        continuous: boolean
+        grammars: SpeechGrammarList
+        interimResults: boolean
+        lang: string
+        maxAlternatives: number
+        onaudioend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onaudiostart: ((this: SpeechRecognition, ev: Event) => any) | null
+        onend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onerror: ((this: SpeechRecognition, ev: SpeechRecognitionError) => any) | null
+        onnomatch: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
+        onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
+        onsoundend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onsoundstart: ((this: SpeechRecognition, ev: Event) => any) | null
+        onspeechend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onspeechstart: ((this: SpeechRecognition, ev: Event) => any) | null
+        onstart: ((this: SpeechRecognition, ev: Event) => any) | null
+        serviceURI: string
+        abort(): void
+        start(): void
+        stop(): void
+        addEventListener<K extends keyof SpeechRecognitionEventMap>(
+            type: K,
+            listener: (this: SpeechRecognition, ev: SpeechRecognitionEventMap[K]) => any,
+            options?: boolean | AddEventListenerOptions
+        ): void
+        addEventListener(
+            type: string,
+            listener: EventListenerOrEventListenerObject,
+            options?: boolean | AddEventListenerOptions
+        ): void
+        removeEventListener<K extends keyof SpeechRecognitionEventMap>(
+            type: K,
+            listener: (this: SpeechRecognition, ev: SpeechRecognitionEventMap[K]) => any,
+            options?: boolean | EventListenerOptions
+        ): void
+        removeEventListener(
+            type: string,
+            listener: EventListenerOrEventListenerObject,
+            options?: boolean | EventListenerOptions
+        ): void
+    }
+
+    const SpeechRecognition: {
+        prototype: SpeechRecognition
+        new(): SpeechRecognition
+    }
+
+    interface SpeechRecognitionError extends Event {
+        // readonly error: SpeechRecognitionErrorCode;
+        readonly message: string
+    }
+
+    const SpeechRecognitionError: {
+        prototype: SpeechRecognitionError
+        new(): SpeechRecognitionError
+    }
+
+    interface SpeechRecognitionEvent extends Event {
+        readonly emma: Document | null
+        readonly interpretation: any
+        readonly resultIndex: number
+        readonly results: SpeechRecognitionResultList
+    }
+
+    const SpeechRecognitionEvent: {
+        prototype: SpeechRecognitionEvent
+        new(): SpeechRecognitionEvent
+    }
+}
+
+let speechRecognition: SpeechRecognition
+if (window.SpeechRecognition) {
+    speechRecognition = new SpeechRecognition()
+} else {
+    speechRecognition = new webkitSpeechRecognition()
+}
+
+export { speechRecognition }
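
Note: this module instantiates a recogniser at import time, falling back to the prefixed `webkitSpeechRecognition` (Chrome); in a browser that implements neither (e.g. Firefox), the import itself will throw. Minimal usage of the exported singleton, mirroring how the message-input component drives it:

```typescript
import { speechRecognition } from './speech-recognition-types.d';

speechRecognition.continuous = true;
speechRecognition.interimResults = true;

speechRecognition.onresult = (event) => {
    // The most recent result holds the running transcript.
    const transcript = event.results[event.results.length - 1][0].transcript;
    console.log(transcript);
};

speechRecognition.start();
// ...later:
speechRecognition.stop();
```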

File: store/api-keys.ts

@@ -3,9 +3,12 @@ import type { RootState } from '.';

 const initialState: {
     openAIApiKey?: string | null | undefined;
+    useOpenAIWhisper: boolean;
     elevenLabsApiKey?: string | null | undefined;
 } = {
     openAIApiKey: localStorage.getItem('openai-api-key'),
+    useOpenAIWhisper: false,
     elevenLabsApiKey: localStorage.getItem('elevenlabs-api-key'),
 };

@@ -18,7 +21,11 @@
         },
         setElevenLabsApiKey: (state, action: PayloadAction<string>) => {
             state.elevenLabsApiKey = action.payload;
+        },
+        setUseOpenAIWhisper: (state, action: PayloadAction<boolean>) => {
+            state.useOpenAIWhisper = action.payload;
         }
     },
 })

@@ -26,8 +33,10 @@ export const { setOpenAIApiKey, setElevenLabsApiKey } = apiKeysSlice.actions;

 export const setOpenAIApiKeyFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setOpenAIApiKey(event.target.value);
 export const setElevenLabsApiKeyFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setElevenLabsApiKey(event.target.value);
+export const setUseOpenAIWhisperFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setUseOpenAIWhisper(event.target.checked);

 export const selectOpenAIApiKey = (state: RootState) => state.apiKeys.openAIApiKey;
 export const selectElevenLabsApiKey = (state: RootState) => state.apiKeys.elevenLabsApiKey;
+export const selectUseOpenAIWhisper = (state: RootState) => state.apiKeys.useOpenAIWhisper;

 export default apiKeysSlice.reducer;
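
Note: unlike the two API keys, which are initialised from `localStorage`, `useOpenAIWhisper` starts as a hard-coded `false`, so the checkbox will reset on reload unless this slice is persisted elsewhere in the store. If that is unintended, the same `localStorage` pattern would work; a self-contained sketch (hypothetical key `'use-openai-whisper'`, not part of this PR):

```typescript
import { createSlice, PayloadAction } from '@reduxjs/toolkit';

// Hypothetical variant (not in this PR): persist the Whisper toggle the same
// way the API keys are persisted, so it survives a page reload.
const whisperFlagSlice = createSlice({
    name: 'whisperFlag',
    initialState: {
        useOpenAIWhisper: localStorage.getItem('use-openai-whisper') === 'true',
    },
    reducers: {
        setUseOpenAIWhisper: (state, action: PayloadAction<boolean>) => {
            state.useOpenAIWhisper = action.payload;
            localStorage.setItem('use-openai-whisper', String(action.payload));
        },
    },
});
```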

File: store index (configureStore)

@@ -25,6 +25,8 @@ const persistMessageConfig = {
     storage,
 }

 const store = configureStore({
     reducer: {
         // auth: authReducer,

File: server/endpoints/whisper.ts (new file)

@@ -0,0 +1,8 @@
+import express from 'express';
+import RequestHandler from "./base";
+
+export default class WhisperRequestHandler extends RequestHandler {
+    handler(req: express.Request, res: express.Response): any {
+        res.json({ status: 'ok' });
+    }
+}
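
The handler above is a stub: it replies `{ status: 'ok' }` and does not yet accept audio or call OpenAI. One possible server-side proxy, sketched with `multer` for multipart parsing plus `form-data` and `node-fetch` for the upstream request (none of these are added by this PR; all names below are hypothetical):

```typescript
import express from 'express';
import multer from 'multer';        // hypothetical dependency, not in this PR
import FormData from 'form-data';   // hypothetical dependency
import fetch from 'node-fetch';     // hypothetical dependency

const app = express();
const upload = multer({ storage: multer.memoryStorage() });

// Accept an uploaded MP3 and forward it to the Whisper transcription API.
app.post('/chatapi/whisper', upload.single('file'), async (req, res) => {
    const form = new FormData();
    form.append('file', req.file!.buffer, { filename: 'chat.mp3', contentType: 'audio/mp3' });
    form.append('model', 'whisper-1');

    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
        method: 'POST',
        headers: {
            'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
            ...form.getHeaders(),
        },
        body: form,
    });
    res.json(await response.json());
});
```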

File: server ChatServer (route registration)

@@ -18,6 +18,7 @@ import BasicCompletionRequestHandler from './endpoints/completion/basic';
 import StreamingCompletionRequestHandler from './endpoints/completion/streaming';
 import SessionRequestHandler from './endpoints/session';
 import GetShareRequestHandler from './endpoints/get-share';
+import WhisperRequestHandler from './endpoints/whisper';
 import { configurePassport } from './passport';
 import { configureAuth0 } from './auth0';
 import DeleteChatRequestHandler from './endpoints/delete-chat';
@@ -82,6 +83,7 @@ export default class ChatServer {
         this.app.post('/chatapi/sync', (req, res) => new SyncRequestHandler(this, req, res));
         this.app.get('/chatapi/share/:id', (req, res) => new GetShareRequestHandler(this, req, res));
         this.app.post('/chatapi/share', (req, res) => new ShareRequestHandler(this, req, res));
+        this.app.post('/chatapi/whisper', (req, res) => new WhisperRequestHandler(this, req, res));

         if (process.env.ENABLE_SERVER_COMPLETION) {
             this.app.post('/chatapi/completion', (req, res) => new BasicCompletionRequestHandler(this, req, res));