Text to speech example (#216)

* feat: working on open ai example * feat: cleanup and renaming
software-mansion-labs · Dec 5, 2024 · d1038f3 · d1038f3
1 parent bdad8eb
commit d1038f3
Show file tree

Hide file tree

Showing 8 changed files with 160 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -83,4 +83,6 @@ react-native-audio-api*.tgz
 # Android
 .kotlin
 
+
+# Envs
 .env
diff --git a/apps/common-app/src/examples/TextToSpeech/TextToSpeech.tsx b/apps/common-app/src/examples/TextToSpeech/TextToSpeech.tsx
@@ -0,0 +1,125 @@
+import React, { useState, FC } from 'react';
+import { AudioBuffer, AudioContext } from 'react-native-audio-api';
+import { ActivityIndicator, TextInput, StyleSheet } from 'react-native';
+
+import { Container, Button, Spacer } from '../../components';
+import Env from '../../utils/env';
+import { colors } from '../../styles';
+
+/**
+ * Requests synthesized speech for `input` from the OpenAI speech endpoint.
+ *
+ * The request asks for `response_format: 'pcm'` and the response body is
+ * returned untouched as an ArrayBuffer.
+ *
+ * @param input Text to be spoken.
+ * @param voice Voice identifier forwarded to the API.
+ * @returns The raw response body.
+ * @throws Error when the API answers with a non-2xx status, so an error
+ *   payload is never handed to the caller as if it were audio.
+ */
+async function getOpenAIResponse(
+  input: string,
+  voice: string = 'alloy'
+): Promise<ArrayBuffer> {
+  const response = await fetch('https://api.openai.com/v1/audio/speech', {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${Env.openAiToken}`,
+    },
+    body: JSON.stringify({
+      model: 'tts-1-hd',
+      voice: voice,
+      input: input,
+      response_format: 'pcm',
+    }),
+  });
+
+  if (!response.ok) {
+    // A non-2xx body is an error payload, not audio; surface it to the caller.
+    throw new Error(
+      `OpenAI speech request failed with status ${response.status}`
+    );
+  }
+
+  return response.arrayBuffer();
+}
+
+const openAISampleRate = 24000; // Hz, assumed rate of the PCM reply (confirm)
+const maxInputValue = 32768.0; // divisor mapping Int16 samples onto [-1, 1)
+
+// TODO: this should ideally be done using native code through .decodeAudioData
+/**
+ * Converts mono 16-bit samples recorded at `openAISampleRate` into a stereo
+ * AudioBuffer at the sample rate of `audioContext`.
+ *
+ * Samples are linearly interpolated at the actual rate ratio, so the result
+ * keeps the right pitch and duration for any context rate. (Previously the
+ * data was always doubled while the buffer length followed the real ratio,
+ * which only agreed for a 48 kHz context.) No low-pass filtering is applied,
+ * so a context rate below the source rate may alias.
+ *
+ * @param audioContext Context used to allocate the buffer; its `sampleRate`
+ *   is the target rate.
+ * @param input Signed 16-bit samples at `openAISampleRate`.
+ * @returns A two-channel buffer with the same data in both channels.
+ */
+function goofyResample(
+  audioContext: AudioContext,
+  input: Int16Array
+): AudioBuffer {
+  const ratio = audioContext.sampleRate / openAISampleRate;
+  // Buffer lengths must be whole numbers of frames.
+  const outputLength = Math.round(input.length * ratio);
+
+  const outputBuffer = audioContext.createBuffer(
+    2,
+    outputLength,
+    audioContext.sampleRate
+  );
+
+  const resampled: Array<number> = new Array(outputLength);
+  const lastIndex = input.length - 1;
+
+  for (let i = 0; i < outputLength; i += 1) {
+    // Fractional position of output frame `i` on the input timeline.
+    const position = i / ratio;
+    const left = Math.min(Math.floor(position), lastIndex);
+    // The final input sample has no successor; hold its value instead.
+    const right = Math.min(left + 1, lastIndex);
+    const fraction = position - left;
+
+    const interpolated = input[left] + (input[right] - input[left]) * fraction;
+    resampled[i] = interpolated / maxInputValue;
+  }
+
+  outputBuffer.copyToChannel(resampled, 0);
+  outputBuffer.copyToChannel(resampled, 1);
+
+  return outputBuffer;
+}
+
+/**
+ * Example screen: sends the typed text to `getOpenAIResponse`, converts the
+ * returned samples with `goofyResample` and plays them through an
+ * AudioContext.
+ */
+const TextToSpeech: FC = () => {
+  const [isLoading, setIsLoading] = useState(false);
+  const [textToRead, setTextToRead] = useState('');
+  // One context is reused across presses instead of allocating a new one for
+  // every request.
+  const audioContextRef = React.useRef<AudioContext | null>(null);
+
+  const onReadText = async () => {
+    // Ignore presses while a request is in flight or when there is no text.
+    if (isLoading || textToRead.trim().length === 0) {
+      return;
+    }
+
+    setIsLoading(true);
+
+    try {
+      const results = await getOpenAIResponse(textToRead, 'alloy');
+
+      if (audioContextRef.current === null) {
+        audioContextRef.current = new AudioContext();
+      }
+      const aCtx = audioContextRef.current;
+
+      // Int16Array needs a whole number of 2-byte samples; an odd trailing
+      // byte would otherwise make the constructor throw.
+      const samples = new Int16Array(
+        results,
+        0,
+        Math.floor(results.byteLength / 2)
+      );
+
+      const audioBuffer = goofyResample(aCtx, samples);
+      const sourceNode = aCtx.createBufferSource();
+      const duration = audioBuffer.duration;
+      const now = aCtx.currentTime;
+
+      sourceNode.buffer = audioBuffer;
+
+      sourceNode.connect(aCtx.destination);
+
+      sourceNode.start(now);
+      sourceNode.stop(now + duration);
+    } catch (error: unknown) {
+      // Report the failure instead of leaving an unhandled rejection.
+      console.error('Text to speech request failed', error);
+    } finally {
+      // Always clear the flag, otherwise a failed request would leave the
+      // spinner up and block every later press.
+      setIsLoading(false);
+    }
+  };
+
+  return (
+    <Container style={styles.container}>
+      <Spacer.Vertical size={60} />
+      <TextInput
+        value={textToRead}
+        onChangeText={setTextToRead}
+        style={styles.textInput}
+        multiline
+      />
+      <Spacer.Vertical size={24} />
+      <Button onPress={onReadText} title="Read Text" />
+      <Spacer.Vertical size={24} />
+      {isLoading && <ActivityIndicator />}
+    </Container>
+  );
+};
+
+export default TextToSpeech;
+
+// Static styles for the TextToSpeech screen.
+const styles = StyleSheet.create({
+  container: { alignItems: 'center' },
+  textInput: {
+    // Box geometry
+    width: 280,
+    height: 200,
+    padding: 16,
+    // Border and background
+    borderWidth: 1,
+    borderRadius: 6,
+    borderColor: colors.border,
+    backgroundColor: 'transparent',
+    // Text
+    fontSize: 16,
+    color: colors.white,
+  },
+});
diff --git a/apps/common-app/src/examples/TextToSpeech/index.ts b/apps/common-app/src/examples/TextToSpeech/index.ts
@@ -0,0 +1 @@
+export { default } from './TextToSpeech';
diff --git a/apps/common-app/src/examples/index.ts b/apps/common-app/src/examples/index.ts
@@ -1,6 +1,7 @@
 import { StackNavigationProp } from '@react-navigation/stack';
 
 import Piano from './Piano';
+import TextToSpeech from './TextToSpeech';
 import Metronome from './Metronome';
 import Oscillator from './Oscillator';
 import DrumMachine from './DrumMachine';
@@ -11,6 +12,7 @@ type NavigationParamList = {
   Metronome: undefined;
   DrumMachine: undefined;
   Piano: undefined;
+  TextToSpeech: undefined;
   AudioFile: undefined;
 };
 
@@ -37,6 +39,12 @@ export const Examples: Example[] = [
     subtitle: 'Play some notes',
     screen: Piano,
   },
+  {
+    key: 'TextToSpeech',
+    title: 'Text to Speech',
+    subtitle: 'type some text and hear it spoken',
+    screen: TextToSpeech,
+  },
   {
     key: 'Metronome',
     title: 'Metronome',

diff --git a/apps/common-app/src/utils/env.ts b/apps/common-app/src/utils/env.ts
@@ -0,0 +1,3 @@
+/**
+ * Environment-derived settings for the example app.
+ *
+ * `openAiToken` is read from `process.env.OPENAI_API_TOKEN` and is undefined
+ * when that variable is not set. Presumably it is supplied from a local
+ * `.env` file through the react-native-dotenv Babel plugin — confirm against
+ * the app's Babel config.
+ */
+const env = {
+  openAiToken: process.env.OPENAI_API_TOKEN,
+};
+
+export default env;
diff --git a/apps/fabric-example/babel.config.js b/apps/fabric-example/babel.config.js
@@ -1,4 +1,4 @@
 module.exports = {
   presets: ['module:@react-native/babel-preset'],
-  plugins: ['react-native-reanimated/plugin'],
+  plugins: ['react-native-reanimated/plugin', 'module:react-native-dotenv'],
 };
diff --git a/apps/fabric-example/package.json b/apps/fabric-example/package.json
@@ -42,6 +42,7 @@
     "eslint": "^8.19.0",
     "jest": "^29.6.3",
     "prettier": "2.8.8",
+    "react-native-dotenv": "^3.4.11",
     "react-test-renderer": "18.3.1",
     "typescript": "5.0.4"
   },

diff --git a/yarn.lock b/yarn.lock
@@ -5418,6 +5418,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"dotenv@npm:^16.4.5":
+  version: 16.4.5
+  resolution: "dotenv@npm:16.4.5"
+  checksum: 10/55a3134601115194ae0f924e54473459ed0d9fc340ae610b676e248cca45aa7c680d86365318ea964e6da4e2ea80c4514c1adab5adb43d6867fb57ff068f95c8
+  languageName: node
+  linkType: hard
+
 "eastasianwidth@npm:^0.2.0":
   version: 0.2.0
   resolution: "eastasianwidth@npm:0.2.0"
@@ -6150,6 +6157,7 @@ __metadata:
     react-dom: "npm:18.2.0"
     react-native: "npm:0.76.0"
     react-native-audio-api: "workspace:*"
+    react-native-dotenv: "npm:^3.4.11"
     react-native-gesture-handler: "npm:^2.20.2"
     react-native-reanimated: "npm:^3.16.1"
     react-native-safe-area-context: "npm:^4.12.0"
@@ -10219,6 +10227,17 @@ __metadata:
   languageName: node
   linkType: hard
 
+"react-native-dotenv@npm:^3.4.11":
+  version: 3.4.11
+  resolution: "react-native-dotenv@npm:3.4.11"
+  dependencies:
+    dotenv: "npm:^16.4.5"
+  peerDependencies:
+    "@babel/runtime": ^7.20.6
+  checksum: 10/09e8a7310fcb01ac021e71db9328e9d342d1e117bf68026b12de0392bfe17292ac6a071f03b88e7fb42c82a8f2fdf03bc520c7dedd2f80a1448cb3de5e03d4fb
+  languageName: node
+  linkType: hard
+
 "react-native-gesture-handler@npm:^2.20.2":
   version: 2.21.1
   resolution: "react-native-gesture-handler@npm:2.21.1"
-Original file line number
+Diff line change
@@ Expand Up / @@ -83,4 +83,6 @@ react-native-audio-api*.tgz @@
     # Android
     .kotlin
+    # Envs
     .env