From 4e324c245bc695deb620b50fbf105d5bcbc37375 Mon Sep 17 00:00:00 2001 From: Gilmm27 Date: Sun, 19 Jan 2025 15:02:39 -0800 Subject: [PATCH 1/6] mulithread hear service implementation --- hri/packages/speech/config/hear.yaml | 4 +- hri/packages/speech/scripts/hear.py | 69 ++++++++++++++++++--- hri/packages/speech/scripts/useful_audio.py | 18 +++--- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/hri/packages/speech/config/hear.yaml b/hri/packages/speech/config/hear.yaml index 82a3720..db0bafb 100644 --- a/hri/packages/speech/config/hear.yaml +++ b/hri/packages/speech/config/hear.yaml @@ -1,3 +1,5 @@ hear: ros__parameters: - STT_SERVER_IP: "127.0.0.1:50051" \ No newline at end of file + STT_SERVER_IP: "127.0.0.1:50051" + START_SERVICE: True + detection_publish_topic: "/keyword_detected" \ No newline at end of file diff --git a/hri/packages/speech/scripts/hear.py b/hri/packages/speech/scripts/hear.py index 303d211..820076a 100755 --- a/hri/packages/speech/scripts/hear.py +++ b/hri/packages/speech/scripts/hear.py @@ -2,10 +2,12 @@ import rclpy from rclpy.node import Node -from rclpy.executors import ExternalShutdownException +from rclpy.executors import ExternalShutdownException, MultiThreadedExecutor +from rclpy.callback_groups import ReentrantCallbackGroup import grpc from frida_interfaces.msg import AudioData -from std_msgs.msg import String +from frida_interfaces.srv import STT +from std_msgs.msg import Bool, String from speech.speech_api_utils import SpeechApiUtils import sys @@ -37,12 +39,18 @@ def __init__(self): super().__init__("hear_node") self.get_logger().info("*Starting Hear Node*") - # Get the gRPC server address from parameters server_ip = ( self.declare_parameter("STT_SERVER_IP", "127.0.0.1:50051") .get_parameter_value() .string_value ) + start_service = ( + self.declare_parameter("START_SERVICE", False) + .get_parameter_value() + .bool_value + ) + + # Initialize the Whisper gRPC client self.client = WhisperClient(server_ip) # Create a publisher for the transcriptions @@ -50,11 +58,37 @@ def __init__(self): String, "/speech/raw_command", 10 ) + # Create groups for the subscription and service + multithread_group = ReentrantCallbackGroup() + # Subscribe to audio data self.audio_subscription = self.create_subscription( - AudioData, "UsefulAudio", self.callback_audio, 10 + AudioData, + "UsefulAudio", + self.callback_audio, + 10, + callback_group=multithread_group, ) + # Create a service + self.service_active = False + if start_service: + self.service_text = "" + detection_publish_topic = ( + self.declare_parameter("detection_publish_topic", "/keyword_detected") + .get_parameter_value() + .string_value + ) + self.KWS_publisher_mock = self.create_publisher( + Bool, detection_publish_topic, 10 + ) + self.stt_service = self.create_service( + STT, + "stt_service", + self.stt_service_callback, + callback_group=multithread_group, + ) + self.get_logger().info("*Hear Node is ready*") def callback_audio(self, data): @@ -75,21 +109,42 @@ def callback_audio(self, data): # Publish the transcription msg = String() msg.data = transcription - self.transcription_publisher.publish(msg) - self.get_logger().info("Transcription published to ROS topic.") + + if self.service_active: + # If the service is active, store the transcription + self.service_text = transcription + self.service_active = False + else: + # If the service is not active, publish the transcription + self.transcription_publisher.publish(msg) + self.get_logger().info("Transcription published to ROS topic.") except grpc.RpcError as e: self.get_logger().error(f"gRPC error: {e.code()}, {e.details()}") except Exception as ex: self.get_logger().error(f"Error during transcription: {str(ex)}") + def stt_service_callback(self, request, response): + self.get_logger().info("STT service requested") + self.service_active = True + self.KWS_publisher_mock.publish(Bool(data=True)) + while self.service_active: + pass + response.text_heard = self.service_text + # self.get_logger().info(f"STT service response: {response.text_heard}") + return response + def main(args=None): rclpy.init(args=args) + node = HearNode() + executor = MultiThreadedExecutor() + executor.add_node(node) try: - rclpy.spin(HearNode()) + executor.spin() except (ExternalShutdownException, KeyboardInterrupt): pass finally: + node.destroy_node() rclpy.shutdown() diff --git a/hri/packages/speech/scripts/useful_audio.py b/hri/packages/speech/scripts/useful_audio.py index a7bba09..06dafeb 100755 --- a/hri/packages/speech/scripts/useful_audio.py +++ b/hri/packages/speech/scripts/useful_audio.py @@ -76,7 +76,6 @@ def __init__(self): self.timer = None self.is_saying = False self.audio_state = "None" - self.service_active = False self.publisher = self.create_publisher(AudioData, "UsefulAudio", 20) self.audio_state_publisher = self.create_publisher(String, "AudioState", 10) @@ -89,7 +88,7 @@ def __init__(self): AudioData, "rawAudioChunk", self.callback_raw_audio, 10 ) self.create_subscription(Bool, "saying", self.callback_saying, 10) - self.create_subscription(Bool, "keyword_detected", self.callback_keyword, 10) + self.create_subscription(Bool, "/keyword_detected", self.callback_keyword, 10) if not self.use_silero_vad: self.vad = webrtcvad.Vad() @@ -126,10 +125,9 @@ def build_audio(self, data): self.chunk_count += 1 def discard_audio(self): - if not self.service_active: - self.ring_buffer.clear() - self.voiced_frames = None - self.chunk_count = 0 + self.ring_buffer.clear() + self.voiced_frames = None + self.chunk_count = 0 def publish_audio(self): if self.chunk_count > MIN_CHUNKS_AUDIO_LENGTH: @@ -207,8 +205,7 @@ def vad_collector(self, chunk): ): self.triggered = False self.compute_audio_state() - if not self.service_active: - self.publish_audio() + self.publish_audio() self.timer = None def callback_raw_audio(self, msg): @@ -225,9 +222,8 @@ def callback_saying(self, msg): def callback_keyword(self, msg): self.triggered = True - if not self.service_active: - self.discard_audio() - self.compute_audio_state() + self.discard_audio() + self.compute_audio_state() def compute_audio_state(self): new_state = ( From 3f9ea734842deeb2feecf7e50800c44b1a1c9063 Mon Sep 17 00:00:00 2001 From: Gilmm27 Date: Sun, 19 Jan 2025 16:15:30 -0800 Subject: [PATCH 2/6] changed to mutually exclusive --- hri/packages/speech/scripts/hear.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hri/packages/speech/scripts/hear.py b/hri/packages/speech/scripts/hear.py index 820076a..987b9e5 100755 --- a/hri/packages/speech/scripts/hear.py +++ b/hri/packages/speech/scripts/hear.py @@ -3,7 +3,7 @@ import rclpy from rclpy.node import Node from rclpy.executors import ExternalShutdownException, MultiThreadedExecutor -from rclpy.callback_groups import ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup import grpc from frida_interfaces.msg import AudioData from frida_interfaces.srv import STT @@ -59,7 +59,8 @@ def __init__(self): ) # Create groups for the subscription and service - multithread_group = ReentrantCallbackGroup() + subscription_group = MutuallyExclusiveCallbackGroup() + service_group = MutuallyExclusiveCallbackGroup() # Subscribe to audio data self.audio_subscription = self.create_subscription( @@ -67,7 +68,7 @@ def __init__(self): "UsefulAudio", self.callback_audio, 10, - callback_group=multithread_group, + callback_group=subscription_group, ) # Create a service @@ -86,7 +87,7 @@ def __init__(self): STT, "stt_service", self.stt_service_callback, - callback_group=multithread_group, + callback_group=service_group, ) self.get_logger().info("*Hear Node is ready*") @@ -124,13 +125,12 @@ def callback_audio(self, data): self.get_logger().error(f"Error during transcription: {str(ex)}") def stt_service_callback(self, request, response): - self.get_logger().info("STT service requested") + self.get_logger().info("Keyword mock service activated, recording audio...") self.service_active = True self.KWS_publisher_mock.publish(Bool(data=True)) while self.service_active: pass response.text_heard = self.service_text - # self.get_logger().info(f"STT service response: {response.text_heard}") return response From fefc2d0db40a7bd0d49526bee7d9c30fa75b7e18 Mon Sep 17 00:00:00 2001 From: Gilmm27 Date: Mon, 20 Jan 2025 20:34:40 -0800 Subject: [PATCH 3/6] Fixed keyword erasing audio --- hri/packages/speech/scripts/useful_audio.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hri/packages/speech/scripts/useful_audio.py b/hri/packages/speech/scripts/useful_audio.py index 06dafeb..27e2b65 100755 --- a/hri/packages/speech/scripts/useful_audio.py +++ b/hri/packages/speech/scripts/useful_audio.py @@ -75,7 +75,7 @@ def __init__(self): self.timer = None self.is_saying = False - self.audio_state = "None" + self.audio_state = "idle" self.publisher = self.create_publisher(AudioData, "UsefulAudio", 20) self.audio_state_publisher = self.create_publisher(String, "AudioState", 10) @@ -221,9 +221,10 @@ def callback_saying(self, msg): self.compute_audio_state() def callback_keyword(self, msg): - self.triggered = True - self.discard_audio() - self.compute_audio_state() + if self.audio_state == "idle": + self.triggered = True + self.discard_audio() + self.compute_audio_state() def compute_audio_state(self): new_state = ( From 680d6c0c7b3e02bb78c1e5667d0e172f14808661 Mon Sep 17 00:00:00 2001 From: Gilmm27 Date: Sun, 26 Jan 2025 21:43:45 -0800 Subject: [PATCH 4/6] added docker compose for stt --- hri/.gitignore | 1 + hri/README.md | 2 +- hri/packages/speech/scripts/stt/Dockerfile | 5 -- hri/packages/speech/scripts/stt/README.md | 16 +---- hri/packages/speech/scripts/stt/Whisper.py | 62 ------------------- .../speech/scripts/stt/docker-compose.yaml | 12 ++++ .../speech/scripts/stt/requirements.txt | 1 - 7 files changed, 16 insertions(+), 83 deletions(-) delete mode 100755 hri/packages/speech/scripts/stt/Whisper.py create mode 100644 hri/packages/speech/scripts/stt/docker-compose.yaml diff --git a/hri/.gitignore b/hri/.gitignore index 079e92f..3132a86 100644 --- a/hri/.gitignore +++ b/hri/.gitignore @@ -6,3 +6,4 @@ **/.vscode/ **/__pychache__/ **/models +**.pyc \ No newline at end of file diff --git a/hri/README.md b/hri/README.md index ab150fe..b6621dc 100644 --- a/hri/README.md +++ b/hri/README.md @@ -70,7 +70,7 @@ Most of the final commands will be executed using the docker compose file. However, some testing commands are the following: ```bash -# Speech (Remember to start the whisper docker before) +# Speech (Remember to start the stt docker before) ros2 launch speech devices_launch.py ros2 topic pub /speech/speak_now --once std_msgs/msg/String "data: 'Go to the kitchen and grab cookies'" diff --git a/hri/packages/speech/scripts/stt/Dockerfile b/hri/packages/speech/scripts/stt/Dockerfile index f7fdb0e..44c42d7 100644 --- a/hri/packages/speech/scripts/stt/Dockerfile +++ b/hri/packages/speech/scripts/stt/Dockerfile @@ -14,8 +14,3 @@ RUN apt-get update && apt-get install -y \ RUN pip install --no-cache-dir -r requirements.txt RUN apt-get update && apt-get install -y ffmpeg - -# Expose the port for the gRPC server -EXPOSE 50051 - -CMD ["bash", "-c", "python -u $SCRIPT_NAME --port $PORT --model_size $MODEL_SIZE"] diff --git a/hri/packages/speech/scripts/stt/README.md b/hri/packages/speech/scripts/stt/README.md index 9a6c6ca..7a17035 100644 --- a/hri/packages/speech/scripts/stt/README.md +++ b/hri/packages/speech/scripts/stt/README.md @@ -24,23 +24,11 @@ python3 Whisper.py ## Running on CPU -If testing or running on a PC/laptop, use the Dockerfile to run the scripts. +If testing or running on a PC/laptop, use docker compose to run the scripts. ```bash # pwd -> /speech/scripts/stt -docker build -t roborregos/home2:stt . - -# To run whisper for the first time: -# pwd -> /speech/scripts/stt -docker run -e SCRIPT_NAME=Whisper.py -e PORT=50051 -e MODEL_SIZE=base.en -p 50051:50051 --name whisper -v .:/app roborregos/home2:stt -# In the future you can use: -docker start -ai whisper - -# To run faster-whisper for the first time: -# pwd -> /speech/scripts/stt -docker run -e SCRIPT_NAME=Faster-whisper.py -e PORT=50051 -e MODEL_SIZE=base.en -p 50051:50051 --name faster-whisper -v .:/app roborregos/home2:stt -# In the future you can use: -docker start -ai faster-whisper +docker compose up ``` ## gRPC implementation diff --git a/hri/packages/speech/scripts/stt/Whisper.py b/hri/packages/speech/scripts/stt/Whisper.py deleted file mode 100755 index 182aec9..0000000 --- a/hri/packages/speech/scripts/stt/Whisper.py +++ /dev/null @@ -1,62 +0,0 @@ -import grpc -from concurrent import futures -import speech_pb2 -import speech_pb2_grpc -import whisper -import torch -import argparse -from wav_utils import WavUtils -import os - - -class WhisperServicer(speech_pb2_grpc.SpeechServiceServicer): - def __init__(self, model_size): - self.model_size = model_size - self.audio_model = self.load_model() - - def load_model(self): - model_directory = os.path.join(os.path.dirname(__file__), "models") - whisper._download(whisper._MODELS[self.model_size], model_directory, False) - return whisper.load_model(self.model_size, download_root=model_directory) - - def Transcribe(self, request, context): - # Generate a temporary WAV file from received audio data - temp_file = WavUtils.generate_temp_wav(1, 2, 16000, request.audio_data) - - # Perform transcription - result = self.audio_model.transcribe(temp_file, fp16=torch.cuda.is_available()) - WavUtils.discard_wav(temp_file) - - # Return the transcribed text - return speech_pb2.TextResponse(text=result["text"].strip()) - - -def serve(port, model_size): - # Create the gRPC server - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - speech_pb2_grpc.add_SpeechServiceServicer_to_server( - WhisperServicer(model_size), server - ) - - # Bind to a port - server.add_insecure_port(f"0.0.0.0:{port}") - print( - f"Whisper gRPC server is running on port {port} with model size {model_size}..." - ) - server.start() - server.wait_for_termination() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Whisper gRPC server") - parser.add_argument( - "--port", type=int, default=50051, help="Port to run the gRPC server on" - ) - parser.add_argument( - "--model_size", - type=str, - default="base.en", - help="Model size to use (e.g., base.en, small.en, medium.en, large.en)", - ) - args = parser.parse_args() - serve(args.port, args.model_size) diff --git a/hri/packages/speech/scripts/stt/docker-compose.yaml b/hri/packages/speech/scripts/stt/docker-compose.yaml new file mode 100644 index 0000000..355a1e2 --- /dev/null +++ b/hri/packages/speech/scripts/stt/docker-compose.yaml @@ -0,0 +1,12 @@ +services: + stt: + container_name: home2-hri-stt + image: roborregos/home2:hri-stt + build: + context: . + dockerfile: Dockerfile + ports: + - "50051:50051" + volumes: + - .:/app + command: ["bash", "-c", "python -u Faster-whisper.py --port 50051 --model_size base.en"] diff --git a/hri/packages/speech/scripts/stt/requirements.txt b/hri/packages/speech/scripts/stt/requirements.txt index f60b09d..9e9c6bd 100644 --- a/hri/packages/speech/scripts/stt/requirements.txt +++ b/hri/packages/speech/scripts/stt/requirements.txt @@ -2,7 +2,6 @@ grpcio grpcio-tools torch torchaudio -whisper faster-whisper pydub soundfile From 856f89200da1058e098c01f6af9e9ccc76dfcb48 Mon Sep 17 00:00:00 2001 From: Gilmm27 Date: Mon, 27 Jan 2025 23:08:41 -0800 Subject: [PATCH 5/6] moved docker to correct directory --- .../Dockerfile => docker/hri/Dockerfile.stt | 4 +- .../hri/stt.yaml | 7 +- .../speech/scripts/stt/Faster-whisper.py | 7 +- hri/packages/speech/scripts/stt/README.md | 4 +- hri/packages/speech/scripts/stt/wav_utils.py | 71 ------------------- 5 files changed, 14 insertions(+), 79 deletions(-) rename hri/packages/speech/scripts/stt/Dockerfile => docker/hri/Dockerfile.stt (69%) rename hri/packages/speech/scripts/stt/docker-compose.yaml => docker/hri/stt.yaml (58%) delete mode 100644 hri/packages/speech/scripts/stt/wav_utils.py diff --git a/hri/packages/speech/scripts/stt/Dockerfile b/docker/hri/Dockerfile.stt similarity index 69% rename from hri/packages/speech/scripts/stt/Dockerfile rename to docker/hri/Dockerfile.stt index 44c42d7..fb57156 100644 --- a/hri/packages/speech/scripts/stt/Dockerfile +++ b/docker/hri/Dockerfile.stt @@ -2,7 +2,7 @@ FROM python:3.9-slim WORKDIR /app -COPY requirements.txt . +COPY ../../hri/packages/speech/scripts/stt/requirements.txt . # Install dependencies RUN apt-get update && apt-get install -y \ @@ -13,4 +13,4 @@ RUN apt-get update && apt-get install -y \ RUN pip install --no-cache-dir -r requirements.txt -RUN apt-get update && apt-get install -y ffmpeg +RUN apt-get update && apt-get install -y ffmpeg \ No newline at end of file diff --git a/hri/packages/speech/scripts/stt/docker-compose.yaml b/docker/hri/stt.yaml similarity index 58% rename from hri/packages/speech/scripts/stt/docker-compose.yaml rename to docker/hri/stt.yaml index 355a1e2..54dc135 100644 --- a/hri/packages/speech/scripts/stt/docker-compose.yaml +++ b/docker/hri/stt.yaml @@ -3,10 +3,11 @@ services: container_name: home2-hri-stt image: roborregos/home2:hri-stt build: - context: . - dockerfile: Dockerfile + context: ../.. + dockerfile: docker/hri/Dockerfile.stt ports: - "50051:50051" volumes: - - .:/app + - ../../hri/packages/speech/scripts/stt/:/app + - ../../hri/packages/speech/speech/:/app/speech command: ["bash", "-c", "python -u Faster-whisper.py --port 50051 --model_size base.en"] diff --git a/hri/packages/speech/scripts/stt/Faster-whisper.py b/hri/packages/speech/scripts/stt/Faster-whisper.py index f999751..5499655 100644 --- a/hri/packages/speech/scripts/stt/Faster-whisper.py +++ b/hri/packages/speech/scripts/stt/Faster-whisper.py @@ -5,8 +5,13 @@ from faster_whisper import WhisperModel import os import torch -from wav_utils import WavUtils import argparse +import sys + +# Add the directory containing the protos to the Python path +sys.path.append(os.path.join(os.path.dirname(__file__), "speech")) + +from wav_utils import WavUtils class WhisperServicer(speech_pb2_grpc.SpeechServiceServicer): diff --git a/hri/packages/speech/scripts/stt/README.md b/hri/packages/speech/scripts/stt/README.md index 7a17035..f2bd7a3 100644 --- a/hri/packages/speech/scripts/stt/README.md +++ b/hri/packages/speech/scripts/stt/README.md @@ -27,8 +27,8 @@ python3 Whisper.py If testing or running on a PC/laptop, use docker compose to run the scripts. ```bash -# pwd -> /speech/scripts/stt -docker compose up +# pwd -> /docker/hri +docker compose -f stt.yaml up ``` ## gRPC implementation diff --git a/hri/packages/speech/scripts/stt/wav_utils.py b/hri/packages/speech/scripts/stt/wav_utils.py deleted file mode 100644 index 4716f73..0000000 --- a/hri/packages/speech/scripts/stt/wav_utils.py +++ /dev/null @@ -1,71 +0,0 @@ -import tempfile -import wave -import os -from pydub import AudioSegment -import soundfile as sf -import sounddevice as sd - - -class WavUtils: - @staticmethod - # Convert incoming data (AudioData) to wav file - def generate_temp_wav(n_channels, sample_width, sample_rate, data): - # data = bytes(data) - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: - with wave.open(temp_file, "w") as wav_file: - wav_file.setnchannels(n_channels) - wav_file.setsampwidth(sample_width) - wav_file.setframerate(sample_rate) - - wav_file.writeframes(data) - # Return the temporary file name - return temp_file.name - - @staticmethod - def discard_wav(file_path): - if ( - os.path.exists(file_path) - and os.path.isfile(file_path) - and file_path.endswith(".wav") - ): - os.remove(file_path) - - @staticmethod - # Return if the audio is over the minumum threshold - def within_time_frame(file_path, min_time, max_time): - time = WavUtils.get_file_time(file_path) - if max_time > time and time > min_time: - return True - return False - - @staticmethod - def get_file_time(file_path): - with wave.open(file_path, "r") as f: - frames = f.getnframes() - rate = f.getframerate() - duration = frames / float(rate * f.getnchannels()) - return duration - - @staticmethod - def play_wav(file_path, device_index): - # playsound(file_path,block=True) - # return - data, samplerate = sf.read(file_path) - sd.default.device = device_index - sd.play(data, samplerate) - sd.wait() - - @staticmethod - def mp3_to_wav(mp3_path, wav_path, sample_rate=44100): - # Load the MP3 file - audio = AudioSegment.from_mp3(mp3_path) - - # Export the audio to WAV format - audio.export(wav_path, format="wav", parameters=["-ar", str(sample_rate)]) - - @staticmethod - def play_mp3(mp3_path, device_index=11): - wav_name = os.path.splitext(mp3_path)[0] + "_temp" + ".wav" - WavUtils.mp3_to_wav(mp3_path, wav_name) - WavUtils.play_wav(wav_name, device_index) - WavUtils.discard_wav(wav_name) From 388c041ac59a1f6bd503dc9b11898cf614a770e5 Mon Sep 17 00:00:00 2001 From: Gilmm27 Date: Wed, 29 Jan 2025 17:57:14 -0800 Subject: [PATCH 6/6] added service name parameter --- hri/packages/speech/config/hear.yaml | 1 + hri/packages/speech/scripts/hear.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hri/packages/speech/config/hear.yaml b/hri/packages/speech/config/hear.yaml index db0bafb..1708a6f 100644 --- a/hri/packages/speech/config/hear.yaml +++ b/hri/packages/speech/config/hear.yaml @@ -2,4 +2,5 @@ hear: ros__parameters: STT_SERVER_IP: "127.0.0.1:50051" START_SERVICE: True + STT_SERVICE_NAME: "stt_service" detection_publish_topic: "/keyword_detected" \ No newline at end of file diff --git a/hri/packages/speech/scripts/hear.py b/hri/packages/speech/scripts/hear.py index 987b9e5..6056e43 100755 --- a/hri/packages/speech/scripts/hear.py +++ b/hri/packages/speech/scripts/hear.py @@ -49,6 +49,11 @@ def __init__(self): .get_parameter_value() .bool_value ) + service_name = ( + self.declare_parameter("STT_SERVICE_NAME", "stt_service") + .get_parameter_value() + .string_value + ) # Initialize the Whisper gRPC client self.client = WhisperClient(server_ip) @@ -85,7 +90,7 @@ def __init__(self): ) self.stt_service = self.create_service( STT, - "stt_service", + service_name, self.stt_service_callback, callback_group=service_group, )