From d7251e34dad16fc3480c3c15c70ec461ae6a3938 Mon Sep 17 00:00:00 2001
From: Cogent Apps
Date: Tue, 21 Mar 2023 00:11:05 +0000
Subject: [PATCH] speech recognition error handling & browser fixes
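
Replace the hand-rolled MicRecorder + lamejs recording pipeline (and its
manual multipart upload to the Whisper API) with the @chengsokdara/use-whisper
hook, surface recording and transcription errors to the user in a popover
instead of failing silently, and check microphone permission before starting
a recording.

The new flow, roughly (see the input.tsx hunks below for the full code):

    const { transcribing, transcript, startRecording, stopRecording } =
        useWhisper({ apiKey: openAIApiKey || ' ', streaming: false });
    // startRecording() on mic press; stopRecording() hands the audio to the
    // hook, which sends it to the Whisper API and exposes transcript.text.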
---
 app/package.json             |   1 +
 app/src/components/input.tsx | 193 +++++++++++++++++------------------
 2 files changed, 96 insertions(+), 98 deletions(-)

diff --git a/app/package.json b/app/package.json
index 2b4850f..cb61878 100644
--- a/app/package.json
+++ b/app/package.json
@@ -3,6 +3,7 @@
     "version": "0.2.1",
     "dependencies": {
         "@auth0/auth0-spa-js": "^2.0.4",
+        "@chengsokdara/use-whisper": "^0.2.0",
         "@emotion/css": "^11.10.6",
         "@emotion/react": "^11.10.6",
         "@emotion/styled": "^11.10.6",
diff --git a/app/src/components/input.tsx b/app/src/components/input.tsx
index e509ea3..d014811 100644
--- a/app/src/components/input.tsx
+++ b/app/src/components/input.tsx
@@ -1,7 +1,7 @@
 import styled from '@emotion/styled';
-import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
+import { Button, ActionIcon, Textarea, Loader, Popover } from '@mantine/core';
 import { useMediaQuery } from '@mantine/hooks';
-import { useCallback, useMemo, useState } from 'react';
+import { useCallback, useEffect, useMemo, useState } from 'react';
 import { FormattedMessage, useIntl } from 'react-intl';
 import { useLocation } from 'react-router-dom';
 import { useAppContext } from '../context';
@@ -10,9 +10,8 @@ import { selectMessage, setMessage } from '../store/message';
 import { selectTemperature } from '../store/parameters';
 import { openOpenAIApiKeyPanel, openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
 import { speechRecognition, supportsSpeechRecognition } from '../speech-recognition-types'
-import MicRecorder from 'mic-recorder-to-mp3';
+import { useWhisper } from '@chengsokdara/use-whisper';
 import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
-import { Mp3Encoder } from 'lamejs';
 
 const Container = styled.div`
     background: #292933;
@@ -38,55 +37,26 @@ export interface MessageInputProps {
     disabled?: boolean;
 }
-
-
-async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
-    const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
-    const audioContext = new AudioContext();
-    const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
-    const duration = audioBuffer.duration;
-    const sampleRate = audioBuffer.sampleRate;
-    const numChannels = audioBuffer.numberOfChannels;
-    const bytesPerSample = 2; // 16-bit audio
-    const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
-    const totalSamples = Math.floor(duration * sampleRate);
-    const numChunks = Math.ceil(totalSamples / samplesPerChunk);
-
-    const chunks: Array<File> = [];
-    for (let i = 0; i < numChunks; i++) {
-        const startSample = i * samplesPerChunk;
-        const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
-        const chunkDuration = (endSample - startSample) / sampleRate;
-        const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
-        for (let c = 0; c < numChannels; c++) {
-            const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
-            chunkBuffer.copyToChannel(channelData, c);
-        }
-        const chunkBlob = await new Promise<Blob>((resolve) => {
-            const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
-            const leftData = chunkBuffer.getChannelData(0);
-            const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
-            const mp3Data = encoder.encodeBuffer(leftData, rightData);
-            const blob = new Blob([mp3Data], { type: 'audio/mp3' });
-            resolve(blob);
-        });
-        chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
-    }
-
-    return chunks;
-}
-
-
 export default function MessageInput(props: MessageInputProps) {
     const temperature = useAppSelector(selectTemperature);
     const message = useAppSelector(selectMessage);
     const [recording, setRecording] = useState(false);
-    const [transcribing, setTranscribing] = useState(false);
+    const [speechError, setSpeechError] = useState<string | null>(null);
     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
-    const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
     const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
     const openAIApiKey = useAppSelector(selectOpenAIApiKey);
 
+    const [initialMessage, setInitialMessage] = useState('');
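+    // use-whisper records in the browser and, once recording stops, sends the
+    // audio to the OpenAI transcription API. The ' ' fallback lets the hook
+    // initialize before a key has been configured; onSpeechStart below checks
+    // for a real key before any recording actually starts.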
+    const {
+        transcribing,
+        transcript,
+        startRecording,
+        stopRecording,
+    } = useWhisper({
+        apiKey: openAIApiKey || ' ',
+        streaming: false,
+    });
+
     const context = useAppContext();
     const dispatch = useAppDispatch();
     const intl = useIntl();
@@ -100,6 +70,8 @@ export default function MessageInput(props: MessageInputProps) {
     const pathname = useLocation().pathname;
 
     const onSubmit = useCallback(async () => {
+        setSpeechError(null);
+
         if (await context.onNewMessage(message)) {
             dispatch(setMessage(''));
         }
@@ -107,6 +79,7 @@ export default function MessageInput(props: MessageInputProps) {
 
     const onSpeechError = useCallback((e: any) => {
         console.error('speech recognition error', e);
+        setSpeechError(e.message);
 
         try {
             speechRecognition?.stop();
@@ -114,26 +87,54 @@ export default function MessageInput(props: MessageInputProps) {
         }
 
         try {
-            recorder.stop();
+            stopRecording();
         } catch (e) { }
 
         setRecording(false);
-        setTranscribing(false);
-    }, [recorder]);
+    }, [stopRecording]);
 
-    const onSpeechStart = useCallback(() => {
-        if (!openAIApiKey) {
-            dispatch(openOpenAIApiKeyPanel());
-            return false;
+    const onHideSpeechError = useCallback(() => setSpeechError(null), []);
+
+    const onSpeechStart = useCallback(async () => {
+        let granted = false;
+        let denied = false;
+
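+        // Probe the Permissions API first: it can report whether mic access
+        // was already granted or denied without prompting the user. Not every
+        // browser supports the 'microphone' permission name, hence the cast
+        // and the empty catch.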
+        try {
+            const result = await navigator.permissions.query({ name: 'microphone' as any });
+            if (result.state == 'granted') {
+                granted = true;
+            } else if (result.state == 'denied') {
+                denied = true;
+            }
+        } catch (e) {}
+
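+        // If the Permissions API was inconclusive, request an audio stream and
+        // release it immediately: this triggers the browser's permission
+        // prompt where needed, and a rejection is treated as a denial.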
+        if (!granted && !denied) {
+            try {
+                const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
+                stream.getTracks().forEach(track => track.stop());
+                granted = true;
+            } catch (e) {
+                denied = true;
+            }
+        }
+
+        if (denied) {
+            onSpeechError(new Error('speech permission was not granted'));
+            return;
         }
 
         try {
             if (!recording) {
                 setRecording(true);
                 // if we are using whisper, then we will just record with the browser and send to the API when done
                 if (useOpenAIWhisper || !supportsSpeechRecognition) {
-                    recorder.start().catch(onSpeechError);
+                    if (!openAIApiKey) {
+                        dispatch(openOpenAIApiKeyPanel());
+                        return false;
+                    }
+                    // recorder.start().catch(onSpeechError);
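+                    // Remember what was already typed so the transcript can be
+                    // appended to it once transcription finishes (see the
+                    // useEffect below).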
+                    setInitialMessage(message);
+                    await startRecording();
                 } else if (speechRecognition) {
                     const initialMessage = message;
@@ -155,45 +156,12 @@ export default function MessageInput(props: MessageInputProps) {
                     onSpeechError(new Error('not supported'));
                 }
             } else {
-                setRecording(false);
                 if (useOpenAIWhisper || !supportsSpeechRecognition) {
-                    setTranscribing(true);
-                    const mp3 = recorder.stop().getMp3();
-
-                    mp3.then(async ([buffer, blob]) => {
-                        const file = new File(buffer, 'chat.mp3', {
-                            type: blob.type,
-                            lastModified: Date.now()
-                        });
-
-                        // TODO: cut in chunks
-
-                        var data = new FormData()
-                        data.append('file', file);
-                        data.append('model', 'whisper-1')
-
-                        try {
-                            const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
-                                method: "POST",
-                                headers: {
-                                    'Authorization': `Bearer ${openAIApiKey}`,
-                                },
-                                body: data,
-                            });
-
-                            const json = await response.json()
-
-                            if (json.text) {
-                                dispatch(setMessage(message + ' ' + json.text));
-                                setTranscribing(false);
-                            }
-                        } catch (e) {
-                            onSpeechError(e);
-                        }
-                    }).catch(onSpeechError);
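+                    // stopRecording() hands the audio to the use-whisper hook,
+                    // which then transcribes it; the recording flag is cleared
+                    // shortly afterwards.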
+                    await stopRecording();
+                    setTimeout(() => setRecording(false), 500);
                 } else if (speechRecognition) {
                     speechRecognition.stop();
+                    setRecording(false);
                 } else {
                     onSpeechError(new Error('not supported'));
                 }
             }
@@ -201,8 +169,15 @@ export default function MessageInput(props: MessageInputProps) {
         } catch (e) {
             onSpeechError(e);
         }
-    }, [recording, message, dispatch, onSpeechError, openAIApiKey]);
+    }, [recording, message, dispatch, onSpeechError, setInitialMessage, openAIApiKey]);
 
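+    // Whisper transcripts arrive asynchronously: once the hook reports that
+    // recording and transcription have both finished, append the new text to
+    // whatever was typed before recording started.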
+    useEffect(() => {
+        if (useOpenAIWhisper || !supportsSpeechRecognition) {
+            if (!transcribing && !recording && transcript?.text) {
+                dispatch(setMessage(initialMessage + ' ' + transcript.text));
+            }
+        }
+    }, [initialMessage, transcript, recording, transcribing, useOpenAIWhisper, dispatch]);
 
     const onKeyDown = useCallback((e: React.KeyboardEvent) => {
         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
@@ -212,7 +187,6 @@ export default function MessageInput(props: MessageInputProps) {
     }, [onSubmit, props.disabled]);
 
     const rightSection = useMemo(() => {
-
         return (
@@ ... @@ export default function MessageInput(props: MessageInputProps) {
                 )}
                 {!context.generating && (
                     <>
-                        <ActionIcon size="xl" onClick={onSpeechStart}>
-                            {transcribing && <Loader size="xs" />}
-                            {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
-                        </ActionIcon>
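+                        {/* Speech errors are surfaced in a popover anchored to
+                            the mic button instead of only being logged. */}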
+                        <Popover opened={speechError !== null} withArrow>
+                            <Popover.Target>
+                                <ActionIcon size="xl" onClick={onSpeechStart}>
+                                    {transcribing && <Loader size="xs" />}
+                                    {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
+                                </ActionIcon>
+                            </Popover.Target>
+                            <Popover.Dropdown>
+                                <div style={{ textAlign: 'center' }}>
+                                    <p style={{ fontSize: '90%' }}>
+                                        Sorry, an error occurred trying to record audio.
+                                    </p>
+                                    <Button variant="light" size="xs" onClick={onHideSpeechError}>
+                                        Close
+                                    </Button>
+                                </div>
+                            </Popover.Dropdown>
@@ -245,7 +242,7 @@ export default function MessageInput(props: MessageInputProps) { )}
             </div>
         );
-    }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating]);
+    }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating, speechError, onHideSpeechError]);
 
     const disabled = context.generating;