feat: volume change event support (#37)
* feat: volume change event support

* chore: update readme

* update events doc

* refactor: rmsDB -> value, use max power in buffer

* chore: autoformat

* chore: update VolumeMeteringAvatar and docs

* update volume metering link
jamsch authored Oct 11, 2024
1 parent 1da7819 commit 2d2e73d
Showing 16 changed files with 865 additions and 214 deletions.
69 changes: 58 additions & 11 deletions README.md
@@ -19,6 +19,8 @@ expo-speech-recognition implements the iOS [`SFSpeechRecognizer`](https://develo
- [Transcribing audio files](#transcribing-audio-files)
- [Supported input audio formats](#supported-input-audio-formats)
- [File transcription example](#file-transcription-example)
- [Volume metering](#volume-metering)
- [Volume metering example](#volume-metering-example)
- [Polyfilling the Web SpeechRecognition API](#polyfilling-the-web-speechrecognition-api)
- [Muting the beep sound on Android](#muting-the-beep-sound-on-android)
- [Improving accuracy of single-word prompts](#improving-accuracy-of-single-word-prompts)
@@ -299,6 +301,13 @@ ExpoSpeechRecognitionModule.start({
// Default: 50ms for network-based recognition, 15ms for on-device recognition
chunkDelayMillis: undefined,
},
// Settings for volume change events.
volumeChangeEventOptions: {
// [Default: false] Whether to emit `volumechange` events when the input volume changes.
enabled: false,
// [Default: 100ms on iOS] The interval (in milliseconds) to emit `volumechange` events.
intervalMillis: 300,
},
});

// Stop capturing audio (and emit a final result if there is one)
@@ -312,17 +321,18 @@ ExpoSpeechRecognitionModule.abort();

Events are largely based on the [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition). The following events are supported:

| Event Name | Description | Notes |
| -------------- | ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `audiostart` | Audio capturing has started | Includes the `uri` if `recordingOptions.persist` is enabled. |
| `audioend` | Audio capturing has ended | Includes the `uri` if `recordingOptions.persist` is enabled. |
| `end` | Speech recognition service has disconnected. | This should always be the last event dispatched, including after errors. |
| `error` | Fired when a speech recognition error occurs. | You'll also receive an `error` event (with code "aborted") when calling `.abort()` |
| `nomatch` | Speech recognition service returns a final result with no significant recognition. | You may have non-final results recognized. This may get emitted after cancellation. |
| `result`       | Speech recognition service returns a result: a word or phrase has been positively recognized.  | On Android, continuous mode runs as a segmented session, meaning that when a final result is reached, additional partial and final results will cover a new segment separate from the previous final result. On iOS, you should expect one final result before speech recognition has stopped. |
| `speechstart` | Fired when any sound — recognizable speech or not — has been detected | On iOS, this will fire once in the session after a result has occurred |
| `speechend` | Fired when speech recognized by the speech recognition service has stopped being detected. | Not supported yet on iOS |
| `start` | Speech recognition has started | Use this event to indicate to the user when to speak. |
| `volumechange` | Fired when the input volume changes. | Returns a value between -2 and 10 indicating the volume of the input audio. Consider anything below 0 to be inaudible. |

## Handling Errors

@@ -530,6 +540,43 @@ function TranscribeAudioFile() {
}
```

## Volume metering

You can use the `volumeChangeEventOptions.enabled` option to enable volume metering. When enabled, a `volumechange` event is emitted with the current volume level as a value between -2 and 10. You can use this value to animate a visualization of the user's voice, or to give the user feedback about their input volume.
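
For animation purposes it is often handier to work with a 0 to 1 scale than with the raw -2 to 10 reading. A minimal sketch of such a conversion (a hypothetical helper, not part of this library, assuming a simple linear mapping over the documented range):

```typescript
// Map a `volumechange` value (documented range -2..10, where anything
// at or below 0 is effectively inaudible) onto a 0..1 animation scale.
function normalizeVolume(value: number): number {
  const MIN_VALUE = -2;
  const MAX_VALUE = 10;
  // Clamp out-of-range readings before scaling.
  const clamped = Math.min(Math.max(value, MIN_VALUE), MAX_VALUE);
  return (clamped - MIN_VALUE) / (MAX_VALUE - MIN_VALUE);
}
```

You could feed the normalized value into an animated scale or opacity; whether a linear mapping looks right for your UI is a design choice worth tuning.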

### Volume metering example

![Volume metering example](./images/volume-metering.gif)

See: [VolumeMeteringAvatar.tsx](https://github.com/jamsch/expo-speech-recognition/tree/main/example/components/VolumeMeteringAvatar.tsx) for a complete example that involves using `react-native-reanimated` to animate the volume metering.

```tsx
import { useState } from "react";
import { Button, Text, View } from "react-native";
import {
  ExpoSpeechRecognitionModule,
  useSpeechRecognitionEvent,
} from "expo-speech-recognition";

function VolumeMeteringAvatar() {
  const [volume, setVolume] = useState(0);

  useSpeechRecognitionEvent("volumechange", (event) => {
    setVolume(event.value);
  });

  const handleStart = () => {
    ExpoSpeechRecognitionModule.start({
      lang: "en-US",
      volumeChangeEventOptions: {
        enabled: true,
        intervalMillis: 300,
      },
    });
  };

  return (
    <View>
      <Button title="Start" onPress={handleStart} />
      <Text>Volume: {volume}</Text>
    </View>
  );
}
```
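
The Android side of this commit rate-limits `volumechange` emissions: an event is only sent when at least `intervalMillis` has elapsed since the last one. The same throttling pattern can be sketched in TypeScript (a hypothetical standalone helper, for illustration only; the injectable `now` parameter exists purely to make the sketch testable):

```typescript
// Emit at most one reading per `intervalMillis`, mirroring the
// lastVolumeChangeEventTime check in the Android implementation.
function createVolumeThrottle(
  intervalMillis: number,
  emit: (value: number) => void,
  now: () => number = Date.now,
): (value: number) => void {
  let lastEmitTime = 0;
  return (value: number) => {
    const currentTime = now();
    if (currentTime - lastEmitTime >= intervalMillis) {
      emit(value);
      lastEmitTime = currentTime;
    }
  };
}
```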

## Polyfilling the Web SpeechRecognition API

> [!IMPORTANT]
@@ -86,6 +86,8 @@ class ExpoSpeechRecognitionModule : Module() {
"start",
// Called when there's results (as a string array, not API compliant)
"results",
// Fired when the input volume changes
"volumechange",
)

Function("getDefaultRecognitionService") {
@@ -325,26 +327,32 @@
promise: Promise,
) {
if (Build.VERSION.SDK_INT < Build.VERSION_CODES.TIRAMISU) {
promise.resolve(
mapOf(
"locales" to mutableListOf<String>(),
"installedLocales" to mutableListOf<String>(),
),
)
return
}

if (options.androidRecognitionServicePackage == null && !SpeechRecognizer.isOnDeviceRecognitionAvailable(appContext)) {
promise.resolve(
mapOf(
"locales" to mutableListOf<String>(),
"installedLocales" to mutableListOf<String>(),
),
)
return
}

if (options.androidRecognitionServicePackage != null && !SpeechRecognizer.isRecognitionAvailable(appContext)) {
promise.resolve(
mapOf(
"locales" to mutableListOf<String>(),
"installedLocales" to mutableListOf<String>(),
),
)
return
}

@@ -50,6 +50,17 @@ class SpeechRecognitionOptions : Record {

@Field
val iosCategory: Map<String, Any>? = null

@Field
val volumeChangeEventOptions: VolumeChangeEventOptions? = null
}

class VolumeChangeEventOptions : Record {
@Field
val enabled: Boolean? = false

@Field
val intervalMillis: Int? = null
}

class RecordingOptions : Record {
@@ -50,6 +50,9 @@ class ExpoSpeechService(
private var speech: SpeechRecognizer? = null
private val mainHandler = Handler(Looper.getMainLooper())

private lateinit var options: SpeechRecognitionOptions
private var lastVolumeChangeEventTime: Long = 0L

/** Audio recorder for persisting audio */
private var audioRecorder: ExpoAudioRecorder? = null

@@ -108,6 +111,7 @@

/** Starts speech recognition */
fun start(options: SpeechRecognitionOptions) {
this.options = options
mainHandler.post {
log("Start recognition.")

@@ -119,6 +123,7 @@
delayedFileStreamer = null
recognitionState = RecognitionState.STARTING
soundState = SoundState.INACTIVE
lastVolumeChangeEventTime = 0L
try {
val intent = createSpeechIntent(options)
speech = createSpeechRecognizer(options)
@@ -454,6 +459,21 @@
}

override fun onRmsChanged(rmsdB: Float) {
if (options.volumeChangeEventOptions?.enabled != true) {
return
}

val intervalMs = options.volumeChangeEventOptions?.intervalMillis

if (intervalMs == null) {
sendEvent("volumechange", mapOf("value" to rmsdB))
} else {
val currentTime = System.currentTimeMillis()
if (currentTime - lastVolumeChangeEventTime >= intervalMs) {
sendEvent("volumechange", mapOf("value" to rmsdB))
lastVolumeChangeEventTime = currentTime
}
}
/*
val isSilent = rmsdB <= 0
29 changes: 27 additions & 2 deletions example/App.tsx
@@ -47,6 +47,7 @@ import {
AndroidOutputFormat,
IOSOutputFormat,
} from "expo-av/build/Audio";
import { VolumeMeteringAvatar } from "./components/VolumeMeteringAvatar";

const speechRecognitionServices = getSpeechRecognitionServices();

@@ -71,7 +72,16 @@ export default function App() {
continuous: true,
requiresOnDeviceRecognition: false,
addsPunctuation: true,
contextualStrings: [
"expo-speech-recognition",
"Carlsen",
"Ian Nepomniachtchi",
"Praggnanandhaa",
],
volumeChangeEventOptions: {
enabled: false,
intervalMillis: 300,
},
});

useSpeechRecognitionEvent("result", (ev) => {
@@ -140,6 +150,10 @@
<SafeAreaView style={styles.container}>
<StatusBar style="dark" translucent={false} />

{settings.volumeChangeEventOptions?.enabled ? (
<VolumeMeteringAvatar />
) : null}

<View style={styles.card}>
<Text style={styles.text}>
{error ? JSON.stringify(error) : "Error messages go here"}
@@ -510,6 +524,17 @@ function GeneralSettings(props: {
checked={Boolean(settings.continuous)}
onPress={() => handleChange("continuous", !settings.continuous)}
/>

<CheckboxButton
title="Volume events"
checked={Boolean(settings.volumeChangeEventOptions?.enabled)}
onPress={() =>
handleChange("volumeChangeEventOptions", {
enabled: !settings.volumeChangeEventOptions?.enabled,
intervalMillis: settings.volumeChangeEventOptions?.intervalMillis,
})
}
/>
</View>

<View style={styles.textOptionContainer}>
@@ -714,7 +739,7 @@ function AndroidSettings(props: {
onPress={() =>
handleChange("androidIntentOptions", {
...settings.androidIntentOptions,
[key]: !settings.androidIntentOptions?.[key],
})
}
/>
Binary file added example/assets/avatar.png
3 changes: 2 additions & 1 deletion example/babel.config.js
@@ -1,9 +1,10 @@
const path = require("path");
module.exports = (api) => {
api.cache(true);
return {
presets: ["babel-preset-expo"],
plugins: [
"react-native-reanimated/plugin",
[
"module-resolver",
{
