Merge pull request #58 from tluyben/whisper-stt-api

Whisper stt api
Cogent Apps 2023-03-20 06:06:18 -07:00 committed by GitHub
commit 95bf0aef6c
11 changed files with 300 additions and 18 deletions

File: package.json

@@ -4,6 +4,7 @@
     "dependencies": {
         "@auth0/auth0-spa-js": "^2.0.4",
         "@emotion/css": "^11.10.6",
+        "@emotion/react": "^11.10.6",
         "@emotion/styled": "^11.10.6",
         "@mantine/core": "^5.10.5",
         "@mantine/hooks": "^5.10.5",
@@ -20,6 +21,7 @@
         "jshashes": "^1.0.8",
         "localforage": "^1.10.0",
         "match-sorter": "^6.3.1",
+        "mic-recorder-to-mp3": "^2.2.2",
         "minisearch": "^6.0.1",
         "natural": "^6.2.0",
         "openai": "^3.2.1",

File: MessageInput component (app)

@@ -1,7 +1,7 @@
 import styled from '@emotion/styled';
 import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
 import { useMediaQuery } from '@mantine/hooks';
-import { useCallback, useMemo } from 'react';
+import { useCallback, useMemo, useState } from 'react';
 import { FormattedMessage, useIntl } from 'react-intl';
 import { useLocation } from 'react-router-dom';
 import { useAppContext } from '../context';
@@ -9,6 +9,10 @@ import { useAppDispatch, useAppSelector } from '../store';
 import { selectMessage, setMessage } from '../store/message';
 import { selectTemperature } from '../store/parameters';
 import { openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
+import { speechRecognition } from '../speech-recognition-types.d'
+import MicRecorder from 'mic-recorder-to-mp3';
+import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
+import { Mp3Encoder } from 'lamejs';

 const Container = styled.div`
     background: #292933;
@@ -34,11 +38,53 @@ export interface MessageInputProps {
     disabled?: boolean;
 }

+async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
+    const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
+    const audioContext = new AudioContext();
+    const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
+    const duration = audioBuffer.duration;
+    const sampleRate = audioBuffer.sampleRate;
+    const numChannels = audioBuffer.numberOfChannels;
+    const bytesPerSample = 2; // 16-bit audio
+    const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
+    const totalSamples = Math.floor(duration * sampleRate);
+    const numChunks = Math.ceil(totalSamples / samplesPerChunk);
+    const chunks: Array<File> = [];
+    for (let i = 0; i < numChunks; i++) {
+        const startSample = i * samplesPerChunk;
+        const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
+        const chunkDuration = (endSample - startSample) / sampleRate;
+        const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
+        for (let c = 0; c < numChannels; c++) {
+            const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
+            chunkBuffer.copyToChannel(channelData, c);
+        }
+        const chunkBlob = await new Promise<Blob>((resolve) => {
+            const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
+            const leftData = chunkBuffer.getChannelData(0);
+            const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
+            const mp3Data = encoder.encodeBuffer(leftData, rightData);
+            const blob = new Blob([mp3Data], { type: 'audio/mp3' });
+            resolve(blob);
+        });
+        chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
+    }
+    return chunks;
+}
+
 export default function MessageInput(props: MessageInputProps) {
     const temperature = useAppSelector(selectTemperature);
     const message = useAppSelector(selectMessage);
+    const [recording, setRecording] = useState(false);
     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
+    const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
+    const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
+    const openAIApiKey = useAppSelector(selectOpenAIApiKey);

     const context = useAppContext();
     const dispatch = useAppDispatch();
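
Note on `chunkAndEncodeMP3File` above: lamejs' `Mp3Encoder.encodeBuffer` expects 16-bit integer samples, while `AudioBuffer.getChannelData` returns a `Float32Array` of samples in the -1..1 range, and the encoder is never `flush()`ed, so the last frames of each chunk would be dropped. A corrected encoding step might look like the sketch below (a sketch only, assuming the lamejs API as documented; `floatTo16BitPCM` and `encodeChunkToMp3` are helpers introduced here, not part of the PR):

```typescript
import { Mp3Encoder } from 'lamejs';

// Convert Web Audio float samples (-1..1) into the 16-bit PCM lamejs expects.
function floatTo16BitPCM(input: Float32Array): Int16Array {
    const output = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
        const s = Math.max(-1, Math.min(1, input[i]));
        output[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    return output;
}

// Encode one chunk, remembering to flush the encoder so the final frames are written.
function encodeChunkToMp3(left: Float32Array, right: Float32Array,
                          numChannels: number, sampleRate: number): Blob {
    const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
    const parts: Int8Array[] = [];
    const frames = encoder.encodeBuffer(floatTo16BitPCM(left), floatTo16BitPCM(right));
    if (frames.length > 0) parts.push(frames);
    const tail = encoder.flush(); // without this, the tail of the MP3 is lost
    if (tail.length > 0) parts.push(tail);
    return new Blob(parts, { type: 'audio/mp3' });
}
```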
@@ -58,6 +104,69 @@ export default function MessageInput(props: MessageInputProps) {
         }
     }, [context, message, dispatch]);

+    const onSpeechStart = useCallback(() => {
+        if (!recording) {
+            setRecording(true);
+
+            // if we are using whisper, then we will just record with the browser and send to the API when done
+            if (useOpenAIWhisper) {
+                recorder.start().catch((e: any) => console.error(e));
+            } else {
+                speechRecognition.continuous = true;
+                speechRecognition.interimResults = true;
+
+                speechRecognition.onresult = (event) => {
+                    const transcript = event.results[event.results.length - 1][0].transcript;
+                    dispatch(setMessage(transcript));
+                };
+
+                speechRecognition.start();
+            }
+        } else {
+            setRecording(false);
+            if (useOpenAIWhisper) {
+                const mp3 = recorder.stop().getMp3();
+                mp3.then(async ([buffer, blob]) => {
+                    const file = new File(buffer, 'chat.mp3', {
+                        type: blob.type,
+                        lastModified: Date.now()
+                    });
+
+                    // TODO: cut in chunks
+
+                    var data = new FormData()
+                    data.append('file', file);
+                    data.append('model', 'whisper-1')
+
+                    try {
+                        const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+                            method: "POST",
+                            headers: {
+                                'Authorization': `Bearer ${openAIApiKey}`,
+                            },
+                            body: data,
+                        });
+                        const json = await response.json()
+
+                        if (json.text) {
+                            dispatch(setMessage(json.text));
+                        }
+                    } catch (e) {
+                        console.log(e)
+                    }
+                }).catch((e: any) => console.error(e));
+            } else {
+                speechRecognition.stop();
+            }
+        }
+    }, [recording, message, dispatch]);

     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
             e.preventDefault();
@@ -66,6 +175,7 @@ export default function MessageInput(props: MessageInputProps) {
     }, [onSubmit, props.disabled]);

     const rightSection = useMemo(() => {
         return (
             <div style={{
                 opacity: '0.8',
@@ -84,14 +194,20 @@ export default function MessageInput(props: MessageInputProps) {
                 <Loader size="xs" style={{ padding: '0 0.8rem 0 0.5rem' }} />
             </>)}
             {!context.generating && (
+            <>
+                <ActionIcon size="xl"
+                    onClick={onSpeechStart}>
+                    <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />
+                </ActionIcon>
                 <ActionIcon size="xl"
                     onClick={onSubmit}>
                     <i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
                 </ActionIcon>
+            </>
             )}
         </div>
         );
-    }, [onSubmit, props.disabled, context.generating]);
+    }, [recording, onSubmit, props.disabled, context.generating]);

     const disabled = context.generating;
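
Two observations on `onSpeechStart` above: the `useCallback` dependency list omits `useOpenAIWhisper`, `recorder` and `openAIApiKey`, so the callback can capture stale values, and the `TODO: cut in chunks` means recordings over Whisper's 25 MB upload limit will fail even though `chunkAndEncodeMP3File` is already defined. A sketch of how the chunking could be wired in (`transcribeInChunks` is a hypothetical helper name; the endpoint and form fields are exactly those used in the PR):

```typescript
// Hypothetical wiring for the "TODO: cut in chunks": transcribe each ~25 MB
// chunk separately and join the partial transcripts in order.
async function transcribeInChunks(recording: Blob, apiKey: string): Promise<string> {
    const chunks = await chunkAndEncodeMP3File(recording);
    const texts: string[] = [];
    for (const chunk of chunks) {
        const data = new FormData();
        data.append('file', chunk);
        data.append('model', 'whisper-1');
        const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
            method: 'POST',
            headers: { 'Authorization': `Bearer ${apiKey}` },
            body: data,
        });
        const json = await response.json();
        if (json.text) {
            texts.push(json.text);
        }
    }
    return texts.join(' ');
}
```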

File: UserOptionsTab settings component (app)

@@ -1,19 +1,21 @@
 import SettingsTab from "./tab";
 import SettingsOption from "./option";
-import { TextInput } from "@mantine/core";
+import { Checkbox, TextInput } from "@mantine/core";
 import { useCallback, useMemo } from "react";
 import { useAppDispatch, useAppSelector } from "../../store";
-import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent } from "../../store/api-keys";
+import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent, selectUseOpenAIWhisper, setUseOpenAIWhisperFromEvent } from "../../store/api-keys";
 import { selectSettingsOption } from "../../store/settings-ui";
 import { FormattedMessage, useIntl } from "react-intl";

 export default function UserOptionsTab(props: any) {
     const option = useAppSelector(selectSettingsOption);
     const openaiApiKey = useAppSelector(selectOpenAIApiKey);
+    const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
     const intl = useIntl()

     const dispatch = useAppDispatch();

     const onOpenAIApiKeyChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setOpenAIApiKeyFromEvent(event)), [dispatch]);
+    const onUseOpenAIWhisperChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setUseOpenAIWhisperFromEvent(event)), [dispatch]);

     const elem = useMemo(() => (
         <SettingsTab name="user">
@@ -28,6 +30,13 @@ export default function UserOptionsTab(props: any) {
                     <FormattedMessage defaultMessage="Find your API key here." description="Label for the link that takes the user to the page on the OpenAI website where they can find their API key." />
                 </a>
             </p>
+            <Checkbox
+                style={{ marginTop: '1rem' }}
+                id="use-openai-whisper-api" checked={useOpenAIWhisper!} onChange={onUseOpenAIWhisperChange}
+                label="Use the OpenAI Whisper API for speech recognition."
+            />
             <p>
                 <FormattedMessage defaultMessage="Your API key is stored only on this device and never transmitted to anyone except OpenAI." />
             </p>
@@ -36,7 +45,7 @@ export default function UserOptionsTab(props: any) {
             </p>
         </SettingsOption>
     </SettingsTab>
-    ), [option, openaiApiKey, onOpenAIApiKeyChange]);
+    ), [option, openaiApiKey, useOpenAIWhisper, onOpenAIApiKeyChange]);

     return elem;
 }

File: application bootstrap (IntlProvider setup)

@@ -72,7 +72,7 @@ async function bootstrapApplication() {
     root.render(
         <React.StrictMode>
-            <IntlProvider locale={navigator.language} messages={messages}>
+            <IntlProvider locale={navigator.language} defaultLocale="en-GB" messages={messages}>
                 <MantineProvider theme={{ colorScheme: "dark" }}>
                     <Provider store={store}>
                         <PersistGate loading={null} persistor={persistor}>

File: OpenAI client (createStreamingChatCompletion)

@@ -131,6 +131,7 @@ export async function createStreamingChatCompletion(messages: OpenAIMessage[], p
     });

     eventSource.addEventListener('message', async (event: any) => {
         if (event.data === '[DONE]') {
             emitter.emit('done');
             return;

File: speech-recognition-types.d.ts (new file)

@@ -0,0 +1,133 @@
+declare global {
+    interface Window {
+        SpeechRecognition: SpeechRecognition
+    }
+
+    interface SpeechGrammar {
+        src: string
+        weight: number
+    }
+
+    const SpeechGrammar: {
+        prototype: SpeechGrammar
+        new(): SpeechGrammar
+    }
+
+    interface SpeechGrammarList {
+        readonly length: number
+        addFromString(string: string, weight?: number): void
+        addFromURI(src: string, weight?: number): void
+        item(index: number): SpeechGrammar
+        [index: number]: SpeechGrammar
+    }
+
+    const SpeechGrammarList: {
+        prototype: SpeechGrammarList
+        new(): SpeechGrammarList
+    }
+
+    interface SpeechRecognitionEventMap {
+        audioend: Event
+        audiostart: Event
+        end: Event
+        error: SpeechRecognitionError
+        nomatch: SpeechRecognitionEvent
+        result: SpeechRecognitionEvent
+        soundend: Event
+        soundstart: Event
+        speechend: Event
+        speechstart: Event
+        start: Event
+    }
+
+    interface SpeechRecognition {
+        continuous: boolean
+        grammars: SpeechGrammarList
+        interimResults: boolean
+        lang: string
+        maxAlternatives: number
+        onaudioend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onaudiostart: ((this: SpeechRecognition, ev: Event) => any) | null
+        onend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onerror: ((this: SpeechRecognition, ev: SpeechRecognitionError) => any) | null
+        onnomatch: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
+        onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
+        onsoundend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onsoundstart: ((this: SpeechRecognition, ev: Event) => any) | null
+        onspeechend: ((this: SpeechRecognition, ev: Event) => any) | null
+        onspeechstart: ((this: SpeechRecognition, ev: Event) => any) | null
+        onstart: ((this: SpeechRecognition, ev: Event) => any) | null
+        serviceURI: string
+        abort(): void
+        start(): void
+        stop(): void
+        addEventListener<K extends keyof SpeechRecognitionEventMap>(
+            type: K,
+            listener: (this: SpeechRecognition, ev: SpeechRecognitionEventMap[K]) => any,
+            options?: boolean | AddEventListenerOptions
+        ): void
+        addEventListener(
+            type: string,
+            listener: EventListenerOrEventListenerObject,
+            options?: boolean | AddEventListenerOptions
+        ): void
+        removeEventListener<K extends keyof SpeechRecognitionEventMap>(
+            type: K,
+            listener: (this: SpeechRecognition, ev: SpeechRecognitionEventMap[K]) => any,
+            options?: boolean | EventListenerOptions
+        ): void
+        removeEventListener(
+            type: string,
+            listener: EventListenerOrEventListenerObject,
+            options?: boolean | EventListenerOptions
+        ): void
+    }
+
+    const SpeechRecognition: {
+        prototype: SpeechRecognition
+        new(): SpeechRecognition
+    }
+
+    interface SpeechRecognitionError extends Event {
+        // readonly error: SpeechRecognitionErrorCode;
+        readonly message: string
+    }
+
+    const SpeechRecognitionError: {
+        prototype: SpeechRecognitionError
+        new(): SpeechRecognitionError
+    }
+
+    interface SpeechRecognitionEvent extends Event {
+        readonly emma: Document | null
+        readonly interpretation: any
+        readonly resultIndex: number
+        readonly results: SpeechRecognitionResultList
+    }
+
+    const SpeechRecognitionEvent: {
+        prototype: SpeechRecognitionEvent
+        new(): SpeechRecognitionEvent
+    }
+}
+
+let speechRecognition: SpeechRecognition
+if (window.SpeechRecognition) {
+    speechRecognition = new SpeechRecognition()
+} else {
+    speechRecognition = new webkitSpeechRecognition()
+}
+
+export { speechRecognition }
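
Note: this module instantiates a recogniser at import time, falling back to the prefixed `webkitSpeechRecognition` (Chrome); in a browser that implements neither (e.g. Firefox), the import itself will throw. Minimal usage of the exported singleton, mirroring how the message-input component drives it:

```typescript
import { speechRecognition } from './speech-recognition-types.d';

speechRecognition.continuous = true;
speechRecognition.interimResults = true;

speechRecognition.onresult = (event) => {
    // The most recent result holds the running transcript.
    const transcript = event.results[event.results.length - 1][0].transcript;
    console.log(transcript);
};

speechRecognition.start();
// ...later:
speechRecognition.stop();
```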

File: store/api-keys.ts

@@ -3,9 +3,12 @@ import type { RootState } from '.';

 const initialState: {
     openAIApiKey?: string | null | undefined;
+    useOpenAIWhisper: boolean;
     elevenLabsApiKey?: string | null | undefined;
 } = {
     openAIApiKey: localStorage.getItem('openai-api-key'),
+    useOpenAIWhisper: false,
     elevenLabsApiKey: localStorage.getItem('elevenlabs-api-key'),
 };

@@ -18,7 +21,11 @@
         },
         setElevenLabsApiKey: (state, action: PayloadAction<string>) => {
             state.elevenLabsApiKey = action.payload;
+        },
+        setUseOpenAIWhisper: (state, action: PayloadAction<boolean>) => {
+            state.useOpenAIWhisper = action.payload;
         }
     },
 })

@@ -26,8 +33,10 @@ export const { setOpenAIApiKey, setElevenLabsApiKey } = apiKeysSlice.actions;

 export const setOpenAIApiKeyFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setOpenAIApiKey(event.target.value);
 export const setElevenLabsApiKeyFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setElevenLabsApiKey(event.target.value);
+export const setUseOpenAIWhisperFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setUseOpenAIWhisper(event.target.checked);

 export const selectOpenAIApiKey = (state: RootState) => state.apiKeys.openAIApiKey;
 export const selectElevenLabsApiKey = (state: RootState) => state.apiKeys.elevenLabsApiKey;
+export const selectUseOpenAIWhisper = (state: RootState) => state.apiKeys.useOpenAIWhisper;

 export default apiKeysSlice.reducer;
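
Note: unlike the two API keys, which are initialised from `localStorage`, `useOpenAIWhisper` starts as a hard-coded `false`, so the checkbox will reset on reload unless this slice is persisted elsewhere in the store. If that is unintended, the same `localStorage` pattern would work; a self-contained sketch (hypothetical key `'use-openai-whisper'`, not part of this PR):

```typescript
import { createSlice, PayloadAction } from '@reduxjs/toolkit';

// Hypothetical variant (not in this PR): persist the Whisper toggle the same
// way the API keys are persisted, so it survives a page reload.
const whisperFlagSlice = createSlice({
    name: 'whisperFlag',
    initialState: {
        useOpenAIWhisper: localStorage.getItem('use-openai-whisper') === 'true',
    },
    reducers: {
        setUseOpenAIWhisper: (state, action: PayloadAction<boolean>) => {
            state.useOpenAIWhisper = action.payload;
            localStorage.setItem('use-openai-whisper', String(action.payload));
        },
    },
});
```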

File: store index (configureStore)

@@ -25,6 +25,8 @@ const persistMessageConfig = {
     storage,
 }

 const store = configureStore({
     reducer: {
         // auth: authReducer,

File: server/endpoints/whisper.ts (new file)

@@ -0,0 +1,8 @@
+import express from 'express';
+import RequestHandler from "./base";
+
+export default class WhisperRequestHandler extends RequestHandler {
+    handler(req: express.Request, res: express.Response): any {
+        res.json({ status: 'ok' });
+    }
+}
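
The handler above is a stub: it replies `{ status: 'ok' }` and does not yet accept audio or call OpenAI. One possible server-side proxy, sketched with `multer` for multipart parsing plus `form-data` and `node-fetch` for the upstream request (none of these are added by this PR; all names below are hypothetical):

```typescript
import express from 'express';
import multer from 'multer';        // hypothetical dependency, not in this PR
import FormData from 'form-data';   // hypothetical dependency
import fetch from 'node-fetch';     // hypothetical dependency

const app = express();
const upload = multer({ storage: multer.memoryStorage() });

// Accept an uploaded MP3 and forward it to the Whisper transcription API.
app.post('/chatapi/whisper', upload.single('file'), async (req, res) => {
    const form = new FormData();
    form.append('file', req.file!.buffer, { filename: 'chat.mp3', contentType: 'audio/mp3' });
    form.append('model', 'whisper-1');

    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
        method: 'POST',
        headers: {
            'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
            ...form.getHeaders(),
        },
        body: form,
    });
    res.json(await response.json());
});
```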

File: server ChatServer (route registration)

@@ -18,6 +18,7 @@ import BasicCompletionRequestHandler from './endpoints/completion/basic';
 import StreamingCompletionRequestHandler from './endpoints/completion/streaming';
 import SessionRequestHandler from './endpoints/session';
 import GetShareRequestHandler from './endpoints/get-share';
+import WhisperRequestHandler from './endpoints/whisper';
 import { configurePassport } from './passport';
 import { configureAuth0 } from './auth0';
 import DeleteChatRequestHandler from './endpoints/delete-chat';
@@ -82,6 +83,7 @@ export default class ChatServer {
         this.app.post('/chatapi/sync', (req, res) => new SyncRequestHandler(this, req, res));
         this.app.get('/chatapi/share/:id', (req, res) => new GetShareRequestHandler(this, req, res));
         this.app.post('/chatapi/share', (req, res) => new ShareRequestHandler(this, req, res));
+        this.app.post('/chatapi/whisper', (req, res) => new WhisperRequestHandler(this, req, res));

         if (process.env.ENABLE_SERVER_COMPLETION) {
             this.app.post('/chatapi/completion', (req, res) => new BasicCompletionRequestHandler(this, req, res));