speech recognition error handling & browser fixes
This commit is contained in:
		| @@ -3,6 +3,7 @@ | |||||||
|   "version": "0.2.1", |   "version": "0.2.1", | ||||||
|   "dependencies": { |   "dependencies": { | ||||||
|     "@auth0/auth0-spa-js": "^2.0.4", |     "@auth0/auth0-spa-js": "^2.0.4", | ||||||
|  |     "@chengsokdara/use-whisper": "^0.2.0", | ||||||
|     "@emotion/css": "^11.10.6", |     "@emotion/css": "^11.10.6", | ||||||
|     "@emotion/react": "^11.10.6", |     "@emotion/react": "^11.10.6", | ||||||
|     "@emotion/styled": "^11.10.6", |     "@emotion/styled": "^11.10.6", | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| import styled from '@emotion/styled'; | import styled from '@emotion/styled'; | ||||||
| import { Button, ActionIcon, Textarea, Loader } from '@mantine/core'; | import { Button, ActionIcon, Textarea, Loader, Popover } from '@mantine/core'; | ||||||
| import { useMediaQuery } from '@mantine/hooks'; | import { useMediaQuery } from '@mantine/hooks'; | ||||||
| import { useCallback, useMemo, useState } from 'react'; | import { useCallback, useEffect, useMemo, useState } from 'react'; | ||||||
| import { FormattedMessage, useIntl } from 'react-intl'; | import { FormattedMessage, useIntl } from 'react-intl'; | ||||||
| import { useLocation } from 'react-router-dom'; | import { useLocation } from 'react-router-dom'; | ||||||
| import { useAppContext } from '../context'; | import { useAppContext } from '../context'; | ||||||
| @@ -10,9 +10,8 @@ import { selectMessage, setMessage } from '../store/message'; | |||||||
| import { selectTemperature } from '../store/parameters'; | import { selectTemperature } from '../store/parameters'; | ||||||
| import { openOpenAIApiKeyPanel, openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui'; | import { openOpenAIApiKeyPanel, openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui'; | ||||||
| import { speechRecognition, supportsSpeechRecognition } from '../speech-recognition-types' | import { speechRecognition, supportsSpeechRecognition } from '../speech-recognition-types' | ||||||
| import MicRecorder from 'mic-recorder-to-mp3'; | import { useWhisper } from '@chengsokdara/use-whisper'; | ||||||
| import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys'; | import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys'; | ||||||
| import { Mp3Encoder } from 'lamejs'; |  | ||||||
|  |  | ||||||
| const Container = styled.div` | const Container = styled.div` | ||||||
|     background: #292933; |     background: #292933; | ||||||
| @@ -38,55 +37,26 @@ export interface MessageInputProps { | |||||||
|     disabled?: boolean; |     disabled?: boolean; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> { |  | ||||||
|     const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB |  | ||||||
|     const audioContext = new AudioContext(); |  | ||||||
|     const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer()); |  | ||||||
|     const duration = audioBuffer.duration; |  | ||||||
|     const sampleRate = audioBuffer.sampleRate; |  | ||||||
|     const numChannels = audioBuffer.numberOfChannels; |  | ||||||
|     const bytesPerSample = 2; // 16-bit audio |  | ||||||
|     const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels); |  | ||||||
|     const totalSamples = Math.floor(duration * sampleRate); |  | ||||||
|     const numChunks = Math.ceil(totalSamples / samplesPerChunk); |  | ||||||
|  |  | ||||||
|     const chunks: Array<File> = []; |  | ||||||
|     for (let i = 0; i < numChunks; i++) { |  | ||||||
|         const startSample = i * samplesPerChunk; |  | ||||||
|         const endSample = Math.min(startSample + samplesPerChunk, totalSamples); |  | ||||||
|         const chunkDuration = (endSample - startSample) / sampleRate; |  | ||||||
|         const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate); |  | ||||||
|         for (let c = 0; c < numChannels; c++) { |  | ||||||
|             const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample); |  | ||||||
|             chunkBuffer.copyToChannel(channelData, c); |  | ||||||
|         } |  | ||||||
|         const chunkBlob = await new Promise<Blob>((resolve) => { |  | ||||||
|             const encoder = new Mp3Encoder(numChannels, sampleRate, 128); |  | ||||||
|             const leftData = chunkBuffer.getChannelData(0); |  | ||||||
|             const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1); |  | ||||||
|             const mp3Data = encoder.encodeBuffer(leftData, rightData); |  | ||||||
|             const blob = new Blob([mp3Data], { type: 'audio/mp3' }); |  | ||||||
|             resolve(blob); |  | ||||||
|         }); |  | ||||||
|         chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' })); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     return chunks; |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| export default function MessageInput(props: MessageInputProps) { | export default function MessageInput(props: MessageInputProps) { | ||||||
|     const temperature = useAppSelector(selectTemperature); |     const temperature = useAppSelector(selectTemperature); | ||||||
|     const message = useAppSelector(selectMessage); |     const message = useAppSelector(selectMessage); | ||||||
|     const [recording, setRecording] = useState(false); |     const [recording, setRecording] = useState(false); | ||||||
|     const [transcribing, setTranscribing] = useState(false); |     const [speechError, setSpeechError] = useState<string | null>(null); | ||||||
|     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)'); |     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)'); | ||||||
|     const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []); |  | ||||||
|     const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper); |     const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper); | ||||||
|     const openAIApiKey = useAppSelector(selectOpenAIApiKey); |     const openAIApiKey = useAppSelector(selectOpenAIApiKey); | ||||||
|  |  | ||||||
|  |     const [initialMessage, setInitialMessage] = useState(''); | ||||||
|  |     const { | ||||||
|  |         transcribing, | ||||||
|  |         transcript, | ||||||
|  |         startRecording, | ||||||
|  |         stopRecording, | ||||||
|  |     } = useWhisper({ | ||||||
|  |         apiKey: openAIApiKey || ' ', | ||||||
|  |         streaming: false, | ||||||
|  |     }); | ||||||
|  |  | ||||||
|     const context = useAppContext(); |     const context = useAppContext(); | ||||||
|     const dispatch = useAppDispatch(); |     const dispatch = useAppDispatch(); | ||||||
|     const intl = useIntl(); |     const intl = useIntl(); | ||||||
| @@ -100,6 +70,8 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|     const pathname = useLocation().pathname; |     const pathname = useLocation().pathname; | ||||||
|  |  | ||||||
|     const onSubmit = useCallback(async () => { |     const onSubmit = useCallback(async () => { | ||||||
|  |         setSpeechError(null); | ||||||
|  |  | ||||||
|         if (await context.onNewMessage(message)) { |         if (await context.onNewMessage(message)) { | ||||||
|             dispatch(setMessage('')); |             dispatch(setMessage('')); | ||||||
|         } |         } | ||||||
| @@ -107,6 +79,7 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|  |  | ||||||
|     const onSpeechError = useCallback((e: any) => { |     const onSpeechError = useCallback((e: any) => { | ||||||
|         console.error('speech recognition error', e); |         console.error('speech recognition error', e); | ||||||
|  |         setSpeechError(e.message); | ||||||
|  |  | ||||||
|         try { |         try { | ||||||
|             speechRecognition?.stop(); |             speechRecognition?.stop(); | ||||||
| @@ -114,26 +87,54 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         try { |         try { | ||||||
|             recorder.stop(); |             stopRecording(); | ||||||
|         } catch (e) { } |         } catch (e) { } | ||||||
|  |  | ||||||
|         setRecording(false); |         setRecording(false); | ||||||
|         setTranscribing(false); |     }, [stopRecording]); | ||||||
|     }, [recorder]); |  | ||||||
|  |  | ||||||
|     const onSpeechStart = useCallback(() => { |     const onHideSpeechError = useCallback(() => setSpeechError(null), []); | ||||||
|         if (!openAIApiKey) { |  | ||||||
|             dispatch(openOpenAIApiKeyPanel()); |     const onSpeechStart = useCallback(async () => { | ||||||
|             return false; |         let granted = false; | ||||||
|  |         let denied = false; | ||||||
|  |  | ||||||
|  |         try { | ||||||
|  |             const result = await navigator.permissions.query({ name: 'microphone' as any }); | ||||||
|  |             if (result.state == 'granted') { | ||||||
|  |                 granted = true; | ||||||
|  |             } else if (result.state == 'denied') { | ||||||
|  |                 denied = true; | ||||||
|  |             } | ||||||
|  |         } catch (e) {} | ||||||
|  |  | ||||||
|  |         if (!granted && !denied) { | ||||||
|  |             try { | ||||||
|  |                 const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true }); | ||||||
|  |                 stream.getTracks().forEach(track => track.stop()); | ||||||
|  |                 granted = true; | ||||||
|  |             } catch (e) { | ||||||
|  |                 denied = true; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if (denied) { | ||||||
|  |             onSpeechError(new Error('speech permission was not granted')); | ||||||
|  |             return; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         try { |         try { | ||||||
|             if (!recording) { |             if (!recording) { | ||||||
|                 setRecording(true); |                 setRecording(true); | ||||||
|  |  | ||||||
|                 // if we are using whisper, then we will just record with the browser and send it to the API when done  |  | ||||||
|                 if (useOpenAIWhisper || !supportsSpeechRecognition) { |                 if (useOpenAIWhisper || !supportsSpeechRecognition) { | ||||||
|                     recorder.start().catch(onSpeechError); |                     if (!openAIApiKey) { | ||||||
|  |                         dispatch(openOpenAIApiKeyPanel()); | ||||||
|  |                         return false; | ||||||
|  |                     } | ||||||
|  |                     // recorder.start().catch(onSpeechError); | ||||||
|  |                     setInitialMessage(message); | ||||||
|  |                     await startRecording(); | ||||||
|                 } else if (speechRecognition) { |                 } else if (speechRecognition) { | ||||||
|                     const initialMessage = message; |                     const initialMessage = message; | ||||||
|  |  | ||||||
| @@ -155,45 +156,12 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|                     onSpeechError(new Error('not supported')); |                     onSpeechError(new Error('not supported')); | ||||||
|                 } |                 } | ||||||
|             } else { |             } else { | ||||||
|                 setRecording(false); |  | ||||||
|                 if (useOpenAIWhisper || !supportsSpeechRecognition) { |                 if (useOpenAIWhisper || !supportsSpeechRecognition) { | ||||||
|                     setTranscribing(true); |                     await stopRecording(); | ||||||
|                     const mp3 = recorder.stop().getMp3(); |                     setTimeout(() => setRecording(false), 500); | ||||||
|  |  | ||||||
|                     mp3.then(async ([buffer, blob]) => { |  | ||||||
|                         const file = new File(buffer, 'chat.mp3', { |  | ||||||
|                             type: blob.type, |  | ||||||
|                             lastModified: Date.now() |  | ||||||
|                         }); |  | ||||||
|  |  | ||||||
|                         // TODO: cut in chunks |  | ||||||
|  |  | ||||||
|                         var data = new FormData() |  | ||||||
|                         data.append('file', file); |  | ||||||
|                         data.append('model', 'whisper-1') |  | ||||||
|  |  | ||||||
|                         try { |  | ||||||
|                             const response = await fetch("https://api.openai.com/v1/audio/transcriptions", { |  | ||||||
|                                 method: "POST", |  | ||||||
|                                 headers: { |  | ||||||
|                                     'Authorization': `Bearer ${openAIApiKey}`, |  | ||||||
|                                 }, |  | ||||||
|                                 body: data, |  | ||||||
|                             }); |  | ||||||
|  |  | ||||||
|                             const json = await response.json() |  | ||||||
|  |  | ||||||
|                             if (json.text) { |  | ||||||
|                                 dispatch(setMessage(message + ' ' + json.text)); |  | ||||||
|                                 setTranscribing(false); |  | ||||||
|                             } |  | ||||||
|                         } catch (e) { |  | ||||||
|                             onSpeechError(e); |  | ||||||
|                         } |  | ||||||
|  |  | ||||||
|                     }).catch(onSpeechError); |  | ||||||
|                 } else if (speechRecognition) { |                 } else if (speechRecognition) { | ||||||
|                     speechRecognition.stop(); |                     speechRecognition.stop(); | ||||||
|  |                     setRecording(false); | ||||||
|                 } else { |                 } else { | ||||||
|                     onSpeechError(new Error('not supported')); |                     onSpeechError(new Error('not supported')); | ||||||
|                 } |                 } | ||||||
| @@ -201,8 +169,15 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|         } catch (e) { |         } catch (e) { | ||||||
|             onSpeechError(e); |             onSpeechError(e); | ||||||
|         } |         } | ||||||
|     }, [recording, message, dispatch, onSpeechError, openAIApiKey]); |     }, [recording, message, dispatch, onSpeechError, setInitialMessage, openAIApiKey]); | ||||||
|  |  | ||||||
|  |     useEffect(() => { | ||||||
|  |         if (useOpenAIWhisper || !supportsSpeechRecognition) { | ||||||
|  |             if (!transcribing && !recording && transcript?.text) { | ||||||
|  |                 dispatch(setMessage(initialMessage + ' ' + transcript.text)); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     }, [initialMessage, transcript, recording, transcribing, useOpenAIWhisper, dispatch]); | ||||||
|  |  | ||||||
|     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => { |     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => { | ||||||
|         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) { |         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) { | ||||||
| @@ -212,7 +187,6 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|     }, [onSubmit, props.disabled]); |     }, [onSubmit, props.disabled]); | ||||||
|  |  | ||||||
|     const rightSection = useMemo(() => { |     const rightSection = useMemo(() => { | ||||||
|  |  | ||||||
|         return ( |         return ( | ||||||
|             <div style={{ |             <div style={{ | ||||||
|                 opacity: '0.8', |                 opacity: '0.8', | ||||||
| @@ -232,11 +206,34 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|                 </>)} |                 </>)} | ||||||
|                 {!context.generating && ( |                 {!context.generating && ( | ||||||
|                     <> |                     <> | ||||||
|  |                         <Popover width={200} position="bottom" withArrow shadow="md" opened={speechError !== null}> | ||||||
|  |                             <Popover.Target> | ||||||
|                                 <ActionIcon size="xl" |                                 <ActionIcon size="xl" | ||||||
|                                     onClick={onSpeechStart}> |                                     onClick={onSpeechStart}> | ||||||
|                                     {transcribing && <Loader size="xs" />} |                                     {transcribing && <Loader size="xs" />} | ||||||
|                                     {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />} |                                     {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />} | ||||||
|                                 </ActionIcon> |                                 </ActionIcon> | ||||||
|  |                             </Popover.Target> | ||||||
|  |                             <Popover.Dropdown> | ||||||
|  |                                 <div style={{ | ||||||
|  |                                     display: 'flex', | ||||||
|  |                                     flexDirection: 'column', | ||||||
|  |                                     alignItems: 'flex-start', | ||||||
|  |                                 }}> | ||||||
|  |                                     <p style={{ | ||||||
|  |                                         fontFamily: `"Work Sans", sans-serif`, | ||||||
|  |                                         fontSize: '0.9rem', | ||||||
|  |                                         textAlign: 'center', | ||||||
|  |                                         marginBottom: '0.5rem', | ||||||
|  |                                     }}> | ||||||
|  |                                         Sorry, an error occured trying to record audio. | ||||||
|  |                                     </p> | ||||||
|  |                                     <Button variant="light" size="xs" fullWidth onClick={onHideSpeechError}> | ||||||
|  |                                         Close | ||||||
|  |                                     </Button> | ||||||
|  |                                 </div> | ||||||
|  |                             </Popover.Dropdown> | ||||||
|  |                         </Popover> | ||||||
|                         <ActionIcon size="xl" |                         <ActionIcon size="xl" | ||||||
|                             onClick={onSubmit}> |                             onClick={onSubmit}> | ||||||
|                             <i className="fa fa-paper-plane" style={{ fontSize: '90%' }} /> |                             <i className="fa fa-paper-plane" style={{ fontSize: '90%' }} /> | ||||||
| @@ -245,7 +242,7 @@ export default function MessageInput(props: MessageInputProps) { | |||||||
|                 )} |                 )} | ||||||
|             </div> |             </div> | ||||||
|         ); |         ); | ||||||
|     }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating]); |     }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating, speechError, onHideSpeechError]); | ||||||
|  |  | ||||||
|     const disabled = context.generating; |     const disabled = context.generating; | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user