speech recognition error handling & browser fixes
parent 9e10796b5a
commit d7251e34da
@@ -3,6 +3,7 @@
   "version": "0.2.1",
   "dependencies": {
     "@auth0/auth0-spa-js": "^2.0.4",
+    "@chengsokdara/use-whisper": "^0.2.0",
     "@emotion/css": "^11.10.6",
     "@emotion/react": "^11.10.6",
     "@emotion/styled": "^11.10.6",

@@ -1,7 +1,7 @@
 import styled from '@emotion/styled';
-import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
+import { Button, ActionIcon, Textarea, Loader, Popover } from '@mantine/core';
 import { useMediaQuery } from '@mantine/hooks';
-import { useCallback, useMemo, useState } from 'react';
+import { useCallback, useEffect, useMemo, useState } from 'react';
 import { FormattedMessage, useIntl } from 'react-intl';
 import { useLocation } from 'react-router-dom';
 import { useAppContext } from '../context';
@@ -10,9 +10,8 @@ import { selectMessage, setMessage } from '../store/message';
 import { selectTemperature } from '../store/parameters';
 import { openOpenAIApiKeyPanel, openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
 import { speechRecognition, supportsSpeechRecognition } from '../speech-recognition-types'
-import MicRecorder from 'mic-recorder-to-mp3';
+import { useWhisper } from '@chengsokdara/use-whisper';
 import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
-import { Mp3Encoder } from 'lamejs';
 
 const Container = styled.div`
     background: #292933;
@@ -38,55 +37,26 @@ export interface MessageInputProps {
     disabled?: boolean;
 }
 
-
-
-async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
-    const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
-    const audioContext = new AudioContext();
-    const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
-    const duration = audioBuffer.duration;
-    const sampleRate = audioBuffer.sampleRate;
-    const numChannels = audioBuffer.numberOfChannels;
-    const bytesPerSample = 2; // 16-bit audio
-    const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
-    const totalSamples = Math.floor(duration * sampleRate);
-    const numChunks = Math.ceil(totalSamples / samplesPerChunk);
-
-    const chunks: Array<File> = [];
-    for (let i = 0; i < numChunks; i++) {
-        const startSample = i * samplesPerChunk;
-        const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
-        const chunkDuration = (endSample - startSample) / sampleRate;
-        const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
-        for (let c = 0; c < numChannels; c++) {
-            const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
-            chunkBuffer.copyToChannel(channelData, c);
-        }
-        const chunkBlob = await new Promise<Blob>((resolve) => {
-            const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
-            const leftData = chunkBuffer.getChannelData(0);
-            const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
-            const mp3Data = encoder.encodeBuffer(leftData, rightData);
-            const blob = new Blob([mp3Data], { type: 'audio/mp3' });
-            resolve(blob);
-        });
-        chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
-    }
-
-    return chunks;
-}
-
-
 export default function MessageInput(props: MessageInputProps) {
     const temperature = useAppSelector(selectTemperature);
     const message = useAppSelector(selectMessage);
     const [recording, setRecording] = useState(false);
-    const [transcribing, setTranscribing] = useState(false);
+    const [speechError, setSpeechError] = useState<string | null>(null);
     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
-    const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
     const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
     const openAIApiKey = useAppSelector(selectOpenAIApiKey);
 
+    const [initialMessage, setInitialMessage] = useState('');
+    const {
+        transcribing,
+        transcript,
+        startRecording,
+        stopRecording,
+    } = useWhisper({
+        apiKey: openAIApiKey || ' ',
+        streaming: false,
+    });
+
     const context = useAppContext();
     const dispatch = useAppDispatch();
     const intl = useIntl();
@@ -100,6 +70,8 @@ export default function MessageInput(props: MessageInputProps) {
     const pathname = useLocation().pathname;
 
     const onSubmit = useCallback(async () => {
+        setSpeechError(null);
+
         if (await context.onNewMessage(message)) {
             dispatch(setMessage(''));
         }
@@ -107,6 +79,7 @@ export default function MessageInput(props: MessageInputProps) {
 
     const onSpeechError = useCallback((e: any) => {
         console.error('speech recognition error', e);
+        setSpeechError(e.message);
 
         try {
             speechRecognition?.stop();
@@ -114,26 +87,54 @@ export default function MessageInput(props: MessageInputProps) {
         }
 
         try {
-            recorder.stop();
+            stopRecording();
         } catch (e) { }
 
         setRecording(false);
-        setTranscribing(false);
-    }, [recorder]);
+    }, [stopRecording]);
 
-    const onSpeechStart = useCallback(() => {
-        if (!openAIApiKey) {
-            dispatch(openOpenAIApiKeyPanel());
-            return false;
+    const onHideSpeechError = useCallback(() => setSpeechError(null), []);
+
+    const onSpeechStart = useCallback(async () => {
+        let granted = false;
+        let denied = false;
+
+        try {
+            const result = await navigator.permissions.query({ name: 'microphone' as any });
+            if (result.state == 'granted') {
+                granted = true;
+            } else if (result.state == 'denied') {
+                denied = true;
+            }
+        } catch (e) {}
+
+        if (!granted && !denied) {
+            try {
+                const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
+                stream.getTracks().forEach(track => track.stop());
+                granted = true;
+            } catch (e) {
+                denied = true;
+            }
+        }
+
+        if (denied) {
+            onSpeechError(new Error('speech permission was not granted'));
+            return;
         }
 
         try {
             if (!recording) {
                 setRecording(true);
 
-                // if we are using whisper, the we will just record with the browser and send the api when done
                 if (useOpenAIWhisper || !supportsSpeechRecognition) {
-                    recorder.start().catch(onSpeechError);
+                    if (!openAIApiKey) {
+                        dispatch(openOpenAIApiKeyPanel());
+                        return false;
+                    }
+                    // recorder.start().catch(onSpeechError);
+                    setInitialMessage(message);
+                    await startRecording();
                 } else if (speechRecognition) {
                     const initialMessage = message;
 
@@ -155,45 +156,12 @@ export default function MessageInput(props: MessageInputProps) {
                     onSpeechError(new Error('not supported'));
                 }
             } else {
-                setRecording(false);
                 if (useOpenAIWhisper || !supportsSpeechRecognition) {
-                    setTranscribing(true);
-                    const mp3 = recorder.stop().getMp3();
-
-                    mp3.then(async ([buffer, blob]) => {
-                        const file = new File(buffer, 'chat.mp3', {
-                            type: blob.type,
-                            lastModified: Date.now()
-                        });
-
-                        // TODO: cut in chunks
-
-                        var data = new FormData()
-                        data.append('file', file);
-                        data.append('model', 'whisper-1')
-
-                        try {
-                            const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
-                                method: "POST",
-                                headers: {
-                                    'Authorization': `Bearer ${openAIApiKey}`,
-                                },
-                                body: data,
-                            });
-
-                            const json = await response.json()
-
-                            if (json.text) {
-                                dispatch(setMessage(message + ' ' + json.text));
-                                setTranscribing(false);
-                            }
-                        } catch (e) {
-                            onSpeechError(e);
-                        }
-
-                    }).catch(onSpeechError);
+                    await stopRecording();
+                    setTimeout(() => setRecording(false), 500);
                 } else if (speechRecognition) {
                     speechRecognition.stop();
+                    setRecording(false);
                 } else {
                     onSpeechError(new Error('not supported'));
                 }
@@ -201,8 +169,15 @@ export default function MessageInput(props: MessageInputProps) {
         } catch (e) {
             onSpeechError(e);
         }
-    }, [recording, message, dispatch, onSpeechError, openAIApiKey]);
+    }, [recording, message, dispatch, onSpeechError, setInitialMessage, openAIApiKey]);
+
+    useEffect(() => {
+        if (useOpenAIWhisper || !supportsSpeechRecognition) {
+            if (!transcribing && !recording && transcript?.text) {
+                dispatch(setMessage(initialMessage + ' ' + transcript.text));
+            }
+        }
+    }, [initialMessage, transcript, recording, transcribing, useOpenAIWhisper, dispatch]);
 
     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
@@ -212,7 +187,6 @@ export default function MessageInput(props: MessageInputProps) {
     }, [onSubmit, props.disabled]);
 
     const rightSection = useMemo(() => {
-
        return (
            <div style={{
                opacity: '0.8',
@@ -232,11 +206,34 @@ export default function MessageInput(props: MessageInputProps) {
                 </>)}
                 {!context.generating && (
                     <>
-                        <ActionIcon size="xl"
-                            onClick={onSpeechStart}>
-                            {transcribing && <Loader size="xs" />}
-                            {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
-                        </ActionIcon>
+                        <Popover width={200} position="bottom" withArrow shadow="md" opened={speechError !== null}>
+                            <Popover.Target>
+                                <ActionIcon size="xl"
+                                    onClick={onSpeechStart}>
+                                    {transcribing && <Loader size="xs" />}
+                                    {!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
+                                </ActionIcon>
+                            </Popover.Target>
+                            <Popover.Dropdown>
+                                <div style={{
+                                    display: 'flex',
+                                    flexDirection: 'column',
+                                    alignItems: 'flex-start',
+                                }}>
+                                    <p style={{
+                                        fontFamily: `"Work Sans", sans-serif`,
+                                        fontSize: '0.9rem',
+                                        textAlign: 'center',
+                                        marginBottom: '0.5rem',
+                                    }}>
+                                        Sorry, an error occured trying to record audio.
+                                    </p>
+                                    <Button variant="light" size="xs" fullWidth onClick={onHideSpeechError}>
+                                        Close
+                                    </Button>
+                                </div>
+                            </Popover.Dropdown>
+                        </Popover>
                         <ActionIcon size="xl"
                             onClick={onSubmit}>
                             <i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
@@ -245,7 +242,7 @@ export default function MessageInput(props: MessageInputProps) {
                 )}
             </div>
         );
-    }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating]);
+    }, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating, speechError, onHideSpeechError]);
 
     const disabled = context.generating;
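
Note: the new onSpeechStart probes microphone permission before recording. It first asks the Permissions API, and when that is unavailable or still in the 'prompt' state, it falls back to requesting and immediately releasing a stream via getUserMedia. A minimal standalone sketch of the same probe follows; the helper name probeMicrophonePermission is illustrative and not part of the diff:

async function probeMicrophonePermission(): Promise<'granted' | 'denied'> {
    try {
        // Not every browser supports querying 'microphone' (hence the try/catch).
        const status = await navigator.permissions.query({ name: 'microphone' as PermissionName });
        if (status.state === 'granted') return 'granted';
        if (status.state === 'denied') return 'denied';
        // 'prompt' falls through to the getUserMedia round-trip below.
    } catch (e) {
        // Permissions API unavailable; fall through to the prompt.
    }
    try {
        // Requesting a stream forces the browser's permission prompt; stop the
        // tracks immediately so the tab does not hold the microphone open.
        const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
        stream.getTracks().forEach(track => track.stop());
        return 'granted';
    } catch (e) {
        return 'denied';
    }
}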
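
Note: the hand-rolled upload path (MicRecorder, the lamejs 25 MB chunking helper, and the manual fetch) is replaced by the use-whisper hook, which records and posts the audio itself. For reference, the removed code's direct call to the transcription endpoint, reduced to a sketch; the response-status check is an addition for safety, not part of the old code:

async function transcribe(file: File, apiKey: string): Promise<string> {
    const data = new FormData();
    data.append('file', file);          // audio upload, capped at 25 MB per request
    data.append('model', 'whisper-1');  // OpenAI's Whisper transcription model
    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
        method: 'POST',
        headers: { 'Authorization': `Bearer ${apiKey}` },
        body: data,
    });
    const json = await response.json();
    if (!response.ok) throw new Error(json.error?.message ?? 'transcription failed');
    return json.text;
}

The transcript is then merged back in the new useEffect: once transcribing and recording are both false, transcript.text is appended to the message captured when recording started.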
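
Note: speech errors are now surfaced in a controlled Mantine Popover instead of only being logged: onSpeechError stores e.message in state, opened={speechError !== null} shows the dropdown, and onHideSpeechError clears it. A self-contained sketch of the same pattern, assuming the Popover.Target/Popover.Dropdown API the diff uses; MicButton and onStart are illustrative names, not from the diff:

import { useCallback, useState } from 'react';
import { ActionIcon, Button, Popover } from '@mantine/core';

export function MicButton({ onStart }: { onStart: () => Promise<void> }) {
    const [error, setError] = useState<string | null>(null);

    const handleClick = useCallback(() => {
        // Surface failures in the popover instead of swallowing them.
        onStart().catch(e => setError(e.message));
    }, [onStart]);

    return (
        <Popover width={200} position="bottom" withArrow shadow="md" opened={error !== null}>
            <Popover.Target>
                <ActionIcon size="xl" onClick={handleClick} aria-label="record speech">
                    <i className="fa fa-microphone" />
                </ActionIcon>
            </Popover.Target>
            <Popover.Dropdown>
                <p>Sorry, an error occurred trying to record audio.</p>
                <Button variant="light" size="xs" fullWidth onClick={() => setError(null)}>
                    Close
                </Button>
            </Popover.Dropdown>
        </Popover>
    );
}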