speech recognition error handling & browser fixes
parent 9e10796b5a
commit d7251e34da
@@ -3,6 +3,7 @@
   "version": "0.2.1",
   "dependencies": {
     "@auth0/auth0-spa-js": "^2.0.4",
+    "@chengsokdara/use-whisper": "^0.2.0",
     "@emotion/css": "^11.10.6",
     "@emotion/react": "^11.10.6",
     "@emotion/styled": "^11.10.6",

@@ -1,7 +1,7 @@
 import styled from '@emotion/styled';
-import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
+import { Button, ActionIcon, Textarea, Loader, Popover } from '@mantine/core';
 import { useMediaQuery } from '@mantine/hooks';
-import { useCallback, useMemo, useState } from 'react';
+import { useCallback, useEffect, useMemo, useState } from 'react';
 import { FormattedMessage, useIntl } from 'react-intl';
 import { useLocation } from 'react-router-dom';
 import { useAppContext } from '../context';
@@ -10,9 +10,8 @@ import { selectMessage, setMessage } from '../store/message';
 import { selectTemperature } from '../store/parameters';
 import { openOpenAIApiKeyPanel, openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
 import { speechRecognition, supportsSpeechRecognition } from '../speech-recognition-types'
-import MicRecorder from 'mic-recorder-to-mp3';
+import { useWhisper } from '@chengsokdara/use-whisper';
 import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
-import { Mp3Encoder } from 'lamejs';
 
 const Container = styled.div`
     background: #292933;
@@ -38,55 +37,26 @@ export interface MessageInputProps {
     disabled?: boolean;
 }
 
-
-
-async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
-    const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
-    const audioContext = new AudioContext();
-    const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
-    const duration = audioBuffer.duration;
-    const sampleRate = audioBuffer.sampleRate;
-    const numChannels = audioBuffer.numberOfChannels;
-    const bytesPerSample = 2; // 16-bit audio
-    const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
-    const totalSamples = Math.floor(duration * sampleRate);
-    const numChunks = Math.ceil(totalSamples / samplesPerChunk);
-
-    const chunks: Array<File> = [];
-    for (let i = 0; i < numChunks; i++) {
-        const startSample = i * samplesPerChunk;
-        const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
-        const chunkDuration = (endSample - startSample) / sampleRate;
-        const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
-        for (let c = 0; c < numChannels; c++) {
-            const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
-            chunkBuffer.copyToChannel(channelData, c);
-        }
-        const chunkBlob = await new Promise<Blob>((resolve) => {
-            const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
-            const leftData = chunkBuffer.getChannelData(0);
-            const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
-            const mp3Data = encoder.encodeBuffer(leftData, rightData);
-            const blob = new Blob([mp3Data], { type: 'audio/mp3' });
-            resolve(blob);
-        });
-        chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
-    }
-
-    return chunks;
-}
-
-
 export default function MessageInput(props: MessageInputProps) {
     const temperature = useAppSelector(selectTemperature);
     const message = useAppSelector(selectMessage);
     const [recording, setRecording] = useState(false);
-    const [transcribing, setTranscribing] = useState(false);
+    const [speechError, setSpeechError] = useState<string | null>(null);
     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
-    const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
     const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
     const openAIApiKey = useAppSelector(selectOpenAIApiKey);
 
+    const [initialMessage, setInitialMessage] = useState('');
+    const {
+        transcribing,
+        transcript,
+        startRecording,
+        stopRecording,
+    } = useWhisper({
+        apiKey: openAIApiKey || ' ',
+        streaming: false,
+    });
+
     const context = useAppContext();
     const dispatch = useAppDispatch();
     const intl = useIntl();
@@ -100,6 +70,8 @@ export default function MessageInput(props: MessageInputProps) {
     const pathname = useLocation().pathname;
 
     const onSubmit = useCallback(async () => {
+        setSpeechError(null);
+
         if (await context.onNewMessage(message)) {
             dispatch(setMessage(''));
         }
@@ -107,6 +79,7 @@ export default function MessageInput(props: MessageInputProps) {
 
     const onSpeechError = useCallback((e: any) => {
         console.error('speech recognition error', e);
+        setSpeechError(e.message);
 
         try {
             speechRecognition?.stop();
@@ -114,26 +87,54 @@ export default function MessageInput(props: MessageInputProps) {
         }
 
         try {
-            recorder.stop();
+            stopRecording();
         } catch (e) { }
 
         setRecording(false);
-        setTranscribing(false);
-    }, [recorder]);
+    }, [stopRecording]);
 
-    const onSpeechStart = useCallback(() => {
-        if (!openAIApiKey) {
-            dispatch(openOpenAIApiKeyPanel());
-            return false;
+    const onHideSpeechError = useCallback(() => setSpeechError(null), []);
+
+    const onSpeechStart = useCallback(async () => {
+        let granted = false;
+        let denied = false;
+
+        try {
+            const result = await navigator.permissions.query({ name: 'microphone' as any });
+            if (result.state == 'granted') {
+                granted = true;
+            } else if (result.state == 'denied') {
+                denied = true;
+            }
+        } catch (e) {}
+
+        if (!granted && !denied) {
+            try {
+                const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
+                stream.getTracks().forEach(track => track.stop());
+                granted = true;
+            } catch (e) {
+                denied = true;
+            }
+        }
+
+        if (denied) {
+            onSpeechError(new Error('speech permission was not granted'));
+            return;
         }
 
         try {
             if (!recording) {
                 setRecording(true);
 
-                // if we are using whisper, the we will just record with the browser and send the api when done
                 if (useOpenAIWhisper || !supportsSpeechRecognition) {
-                    recorder.start().catch(onSpeechError);
+                    if (!openAIApiKey) {
+                        dispatch(openOpenAIApiKeyPanel());
+                        return false;
+                    }
+                    // recorder.start().catch(onSpeechError);
+                    setInitialMessage(message);
+                    await startRecording();
                 } else if (speechRecognition) {
                     const initialMessage = message;
 
@@ -155,45 +156,12 @@ export default function MessageInput(props: MessageInputProps) {
                     onSpeechError(new Error('not supported'));
                 }
             } else {
-                setRecording(false);
                 if (useOpenAIWhisper || !supportsSpeechRecognition) {
-                    setTranscribing(true);
-                    const mp3 = recorder.stop().getMp3();
-
-                    mp3.then(async ([buffer, blob]) => {
-                        const file = new File(buffer, 'chat.mp3', {
-                            type: blob.type,
-                            lastModified: Date.now()
-                        });
-
-                        // TODO: cut in chunks
-
-                        var data = new FormData()
-                        data.append('file', file);
-                        data.append('model', 'whisper-1')
-
-                        try {
-                            const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
-                                method: "POST",
-                                headers: {
-                                    'Authorization': `Bearer ${openAIApiKey}`,
-                                },
-                                body: data,
-                            });
-
-                            const json = await response.json()
-
-                            if (json.text) {
-                                dispatch(setMessage(message + ' ' + json.text));
-                                setTranscribing(false);
-                            }
-                        } catch (e) {
-                            onSpeechError(e);
-                        }
-
-                    }).catch(onSpeechError);
+                    await stopRecording();
+                    setTimeout(() => setRecording(false), 500);
                 } else if (speechRecognition) {
                     speechRecognition.stop();
+                    setRecording(false);
                 } else {
                     onSpeechError(new Error('not supported'));
                 }
@@ -201,8 +169,15 @@ export default function MessageInput(props: MessageInputProps) {
         } catch (e) {
             onSpeechError(e);
         }
-    }, [recording, message, dispatch, onSpeechError, openAIApiKey]);
+    }, [recording, message, dispatch, onSpeechError, setInitialMessage, openAIApiKey]);
+
+    useEffect(() => {
+        if (useOpenAIWhisper || !supportsSpeechRecognition) {
+            if (!transcribing && !recording && transcript?.text) {
+                dispatch(setMessage(initialMessage + ' ' + transcript.text));
+            }
+        }
+    }, [initialMessage, transcript, recording, transcribing, useOpenAIWhisper, dispatch]);
 
     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
@@ -212,7 +187,6 @@ export default function MessageInput(props: MessageInputProps) {
     }, [onSubmit, props.disabled]);
 
     const rightSection = useMemo(() => {
-
        return (
            <div style={{
                opacity: '0.8',
@@ -232,11 +206,34 @@ export default function MessageInput(props: MessageInputProps) {
                 </>)}
                 {!context.generating && (
                     <>
-                        <ActionIcon size="xl"
-                            onClick={onSpeechStart}>
-                            {transcribing && <Loader size="xs" />}
-                            {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
-                        </ActionIcon>
+                        <Popover width={200} position="bottom" withArrow shadow="md" opened={speechError !== null}>
+                            <Popover.Target>
+                                <ActionIcon size="xl"
+                                    onClick={onSpeechStart}>
+                                    {transcribing && <Loader size="xs" />}
+                                    {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
+                                </ActionIcon>
+                            </Popover.Target>
+                            <Popover.Dropdown>
+                                <div style={{
+                                    display: 'flex',
+                                    flexDirection: 'column',
+                                    alignItems: 'flex-start',
+                                }}>
+                                    <p style={{
+                                        fontFamily: `"Work Sans", sans-serif`,
+                                        fontSize: '0.9rem',
+                                        textAlign: 'center',
+                                        marginBottom: '0.5rem',
+                                    }}>
+                                        Sorry, an error occured trying to record audio.
+                                    </p>
+                                    <Button variant="light" size="xs" fullWidth onClick={onHideSpeechError}>
+                                        Close
+                                    </Button>
+                                </div>
+                            </Popover.Dropdown>
+                        </Popover>
                         <ActionIcon size="xl"
                             onClick={onSubmit}>
                             <i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
@@ -245,7 +242,7 @@ export default function MessageInput(props: MessageInputProps) {
                 )}
             </div>
         );
-    }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating]);
+    }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating, speechError, onHideSpeechError]);
 
     const disabled = context.generating;
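
Note: the new onSpeechStart probes microphone permission before recording. It first asks the Permissions API, and when that is unavailable or still in the 'prompt' state, it falls back to requesting and immediately releasing a stream via getUserMedia. A minimal standalone sketch of the same probe follows; the helper name probeMicrophonePermission is illustrative and not part of the diff:

async function probeMicrophonePermission(): Promise<'granted' | 'denied'> {
    try {
        // Not every browser supports querying 'microphone' (hence the try/catch).
        const status = await navigator.permissions.query({ name: 'microphone' as PermissionName });
        if (status.state === 'granted') return 'granted';
        if (status.state === 'denied') return 'denied';
        // 'prompt' falls through to the getUserMedia round-trip below.
    } catch (e) {
        // Permissions API unavailable; fall through to the prompt.
    }
    try {
        // Requesting a stream forces the browser's permission prompt; stop the
        // tracks immediately so the tab does not hold the microphone open.
        const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
        stream.getTracks().forEach(track => track.stop());
        return 'granted';
    } catch (e) {
        return 'denied';
    }
}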
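
Note: the hand-rolled upload path (MicRecorder, the lamejs 25 MB chunking helper, and the manual fetch) is replaced by the use-whisper hook, which records and posts the audio itself. For reference, the removed code's direct call to the transcription endpoint, reduced to a sketch; the response-status check is an addition for safety, not part of the old code:

async function transcribe(file: File, apiKey: string): Promise<string> {
    const data = new FormData();
    data.append('file', file);          // audio upload, capped at 25 MB per request
    data.append('model', 'whisper-1');  // OpenAI's Whisper transcription model
    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
        method: 'POST',
        headers: { 'Authorization': `Bearer ${apiKey}` },
        body: data,
    });
    const json = await response.json();
    if (!response.ok) throw new Error(json.error?.message ?? 'transcription failed');
    return json.text;
}

The transcript is then merged back in the new useEffect: once transcribing and recording are both false, transcript.text is appended to the message captured when recording started.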
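
Note: speech errors are now surfaced in a controlled Mantine Popover instead of only being logged: onSpeechError stores e.message in state, opened={speechError !== null} shows the dropdown, and onHideSpeechError clears it. A self-contained sketch of the same pattern, assuming the Popover.Target/Popover.Dropdown API the diff uses; MicButton and onStart are illustrative names, not from the diff:

import { useCallback, useState } from 'react';
import { ActionIcon, Button, Popover } from '@mantine/core';

export function MicButton({ onStart }: { onStart: () => Promise<void> }) {
    const [error, setError] = useState<string | null>(null);

    const handleClick = useCallback(() => {
        // Surface failures in the popover instead of swallowing them.
        onStart().catch(e => setError(e.message));
    }, [onStart]);

    return (
        <Popover width={200} position="bottom" withArrow shadow="md" opened={error !== null}>
            <Popover.Target>
                <ActionIcon size="xl" onClick={handleClick} aria-label="record speech">
                    <i className="fa fa-microphone" />
                </ActionIcon>
            </Popover.Target>
            <Popover.Dropdown>
                <p>Sorry, an error occurred trying to record audio.</p>
                <Button variant="light" size="xs" fullWidth onClick={() => setError(null)}>
                    Close
                </Button>
            </Popover.Dropdown>
        </Popover>
    );
}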