diff --git a/ios/ExpoSpeechRecognizer.swift b/ios/ExpoSpeechRecognizer.swift
index 4853a07..650b010 100644
--- a/ios/ExpoSpeechRecognizer.swift
+++ b/ios/ExpoSpeechRecognizer.swift
@@ -286,8 +286,8 @@ actor ExpoSpeechRecognizer: ObservableObject {
       )
     }
 
-    // Don't run any timers if the audio source is from a file
-    let continuous = options.continuous || isSourcedFromFile
+    // Run timers on non-continuous mode, as long as the audio source is the mic
+    let shouldRunTimers = !options.continuous && !isSourcedFromFile
 
     let audioEngine = self.audioEngine
     self.task = recognizer.recognitionTask(
@@ -300,18 +300,20 @@
         }
       }
 
-      // Result handler
+      // Handle the result
       self?.recognitionHandler(
         audioEngine: audioEngine,
         result: result,
         error: error,
         resultHandler: resultHandler,
         errorHandler: errorHandler,
-        continuous: continuous
+        continuous: options.continuous,
+        shouldRunTimers: shouldRunTimers,
+        canEmitInterimResults: options.interimResults
       )
     })
 
-    if !continuous {
+    if shouldRunTimers {
       invalidateAndScheduleTimer()
     }
 
@@ -449,7 +451,10 @@
       request = SFSpeechAudioBufferRecognitionRequest()
     }
 
-    request.shouldReportPartialResults = options.interimResults
+    // We also force-enable partial results on non-continuous mode,
+    // which will allow us to re-schedule timers when text is detected.
+    // These won't get emitted to the user, however.
+    request.shouldReportPartialResults = options.interimResults || options.continuous
 
     if recognizer.supportsOnDeviceRecognition {
       request.requiresOnDeviceRecognition = options.requiresOnDeviceRecognition
@@ -613,12 +618,25 @@
     error: Error?,
     resultHandler: @escaping (SFSpeechRecognitionResult) -> Void,
     errorHandler: @escaping (Error) -> Void,
-    continuous: Bool
+    continuous: Bool,
+    shouldRunTimers: Bool,
+    canEmitInterimResults: Bool
   ) {
+    // When a final result is returned, we should expect the task to be idle or stopping
     let receivedFinalResult = result?.isFinal ?? false
     let receivedError = error != nil
 
-    if let result: SFSpeechRecognitionResult {
+    // Hack for iOS 18 to detect final results
+    // See: https://forums.developer.apple.com/forums/thread/762952 for more info
+    // This can be emitted multiple times during a continuous session, unlike `result.isFinal` which is only emitted once
+    var receivedFinalLikeResult: Bool = receivedFinalResult
+    if #available(iOS 18.0, *), !receivedFinalLikeResult {
+      receivedFinalLikeResult = result?.speechRecognitionMetadata?.speechDuration ?? 0 > 0
+    }
+
+    let shouldEmitResult = receivedFinalResult || canEmitInterimResults || receivedFinalLikeResult
+
+    if let result: SFSpeechRecognitionResult, shouldEmitResult {
       Task { @MainActor in
         let taskState = await task?.state
         // Make sure the task is running before emitting the result
@@ -638,15 +656,16 @@
       }
     }
 
-    if receivedFinalResult || receivedError {
+    if (receivedFinalLikeResult && !continuous) || receivedError || receivedFinalResult {
       Task { @MainActor in
         await reset()
       }
+      return
     }
 
     // Non-continuous speech recognition
     // Stop the speech recognizer if the timer fires after not receiving a result for 3 seconds
-    if !continuous && !receivedError {
+    if shouldRunTimers && !receivedError {
       invalidateAndScheduleTimer()
     }
   }
diff --git a/package.json b/package.json
index 6c23422..dabd50a 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "expo-speech-recognition",
-  "version": "0.2.22",
+  "version": "0.2.23",
   "description": "Speech Recognition for React Native Expo projects",
   "main": "build/index.js",
   "types": "build/index.d.ts",
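
Note on the iOS 18 workaround in the recognitionHandler hunk above: since `result.isFinal` stopped firing reliably on iOS 18 (see the linked Apple forums thread), the handler treats any result whose `speechRecognitionMetadata.speechDuration` is non-zero as "final-like". A minimal standalone sketch of that heuristic follows; the helper name `isFinalLikeResult` is illustrative only and is not part of this library's API.

import Speech

// Hypothetical helper mirroring the heuristic in recognitionHandler above:
// on iOS 18, a result carrying metadata with a non-zero speech duration is
// treated as "final-like" even when `isFinal` is false.
func isFinalLikeResult(_ result: SFSpeechRecognitionResult?) -> Bool {
  if result?.isFinal ?? false {
    return true
  }
  if #available(iOS 18.0, *) {
    // `??` binds tighter than `>` in Swift, so this parses as
    // `(result?.speechRecognitionMetadata?.speechDuration ?? 0) > 0`
    return result?.speechRecognitionMetadata?.speechDuration ?? 0 > 0
  }
  return false
}

Because the metadata-based check can fire more than once during a continuous session (unlike `isFinal`), the handler above only resets the recognizer on a final-like result when `continuous` is false.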