RoBorregos · GilMM27 · Jan 30, 2025 · Jan 19, 2025 · Jan 20, 2025 · Jan 21, 2025
@@ -2,7 +2,7 @@ FROM python:3.9-slim
 
 WORKDIR /app
 
-COPY requirements.txt .
+# COPY sources are resolved against the build context (the compose file sets
+# the context to ../.., which should be the repository root), so the path is
+# written relative to that root; "../" cannot reach outside the context.
+COPY hri/packages/speech/scripts/stt/requirements.txt .
 
 # Install dependencies
 RUN apt-get update && apt-get install -y \
@@ -13,9 +13,4 @@ RUN apt-get update && apt-get install -y \
 
 RUN pip install --no-cache-dir -r requirements.txt
 
-RUN apt-get update && apt-get install -y ffmpeg
-
-# Expose the port for the gRPC server
-EXPOSE 50051
-
-CMD ["bash", "-c", "python -u $SCRIPT_NAME --port $PORT --model_size $MODEL_SIZE"]
+# Skip recommended packages and drop the apt package lists in the same layer
+# that created them, so they do not persist in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
@@ -0,0 +1,13 @@
+# Compose definition for the speech-to-text (STT) gRPC server container.
+services:
+  stt:
+    container_name: home2-hri-stt
+    image: roborregos/home2:hri-stt
+    build:
+      # Paths are relative to this compose file; the dockerfile path below is
+      # given relative to the context (presumably the repository root).
+      context: ../..
+      dockerfile: docker/hri/Dockerfile.stt
+    ports:
+      # host:container; must match the --port passed in `command` below.
+      - "50051:50051"
+    volumes:
+      # The STT scripts are bind-mounted over /app, and the speech package is
+      # mounted at /app/speech.
+      - ../../hri/packages/speech/scripts/stt/:/app
+      - ../../hri/packages/speech/speech/:/app/speech
+    # `python -u` runs the server with unbuffered output.
+    command: ["bash", "-c", "python -u Faster-whisper.py --port 50051 --model_size base.en"]
@@ -6,3 +6,4 @@
 **/.vscode/
 **/__pycache__/
 **/models
+**.pyc
@@ -70,7 +70,7 @@ Most of the final commands will be executed using the docker compose file.
 However, some testing commands are the following:
 
 ```bash
-# Speech (Remember to start the whisper docker before)
+# Speech (remember to start the STT Docker container first)
 ros2 launch speech devices_launch.py
 
 ros2 topic pub /speech/speak_now --once std_msgs/msg/String "data: 'Go to the kitchen and grab cookies'"

@@ -1,3 +1,6 @@
 hear:
   ros__parameters:
-    STT_SERVER_IP: "127.0.0.1:50051"
+    # Address (host:port) of the gRPC speech-to-text server the node connects to.
+    STT_SERVER_IP: "127.0.0.1:50051"
+    # When true, the node also creates the STT service and the keyword-detection
+    # mock publisher (the node's own default for this parameter is False).
+    START_SERVICE: True
+    # Name under which the STT service is created.
+    STT_SERVICE_NAME: "stt_service"
+    # Topic on which a Bool(True) is published when the STT service is called.
+    detection_publish_topic: "/keyword_detected"
@@ -2,10 +2,12 @@
 
 import rclpy
 from rclpy.node import Node
-from rclpy.executors import ExternalShutdownException
+from rclpy.executors import ExternalShutdownException, MultiThreadedExecutor
+from rclpy.callback_groups import MutuallyExclusiveCallbackGroup
 import grpc
 from frida_interfaces.msg import AudioData
-from std_msgs.msg import String
+from frida_interfaces.srv import STT
+from std_msgs.msg import Bool, String
 from speech.speech_api_utils import SpeechApiUtils
 
 import sys
@@ -37,24 +39,62 @@ def __init__(self):
         super().__init__("hear_node")
         self.get_logger().info("*Starting Hear Node*")
 
-        # Get the gRPC server address from parameters
         server_ip = (
             self.declare_parameter("STT_SERVER_IP", "127.0.0.1:50051")
             .get_parameter_value()
             .string_value
         )
+        start_service = (
+            self.declare_parameter("START_SERVICE", False)
+            .get_parameter_value()
+            .bool_value
+        )
+        service_name = (
+            self.declare_parameter("STT_SERVICE_NAME", "stt_service")
+            .get_parameter_value()
+            .string_value
+        )
+
+        # Initialize the Whisper gRPC client
         self.client = WhisperClient(server_ip)
 
         # Create a publisher for the transcriptions
         self.transcription_publisher = self.create_publisher(
             String, "/speech/raw_command", 10
         )
 
+        # Create groups for the subscription and service
+        subscription_group = MutuallyExclusiveCallbackGroup()
+        service_group = MutuallyExclusiveCallbackGroup()
+
         # Subscribe to audio data
         self.audio_subscription = self.create_subscription(
-            AudioData, "UsefulAudio", self.callback_audio, 10
+            AudioData,
+            "UsefulAudio",
+            self.callback_audio,
+            10,
+            callback_group=subscription_group,
         )
 
+        # Create a service
+        self.service_active = False
+        if start_service:
+            self.service_text = ""
+            detection_publish_topic = (
+                self.declare_parameter("detection_publish_topic", "/keyword_detected")
+                .get_parameter_value()
+                .string_value
+            )
+            self.KWS_publisher_mock = self.create_publisher(
+                Bool, detection_publish_topic, 10
+            )
+            self.stt_service = self.create_service(
+                STT,
+                service_name,
+                self.stt_service_callback,
+                callback_group=service_group,
+            )
+
         self.get_logger().info("*Hear Node is ready*")
 
     def callback_audio(self, data):
@@ -75,21 +115,41 @@ def callback_audio(self, data):
             # Publish the transcription
             msg = String()
             msg.data = transcription
-            self.transcription_publisher.publish(msg)
-            self.get_logger().info("Transcription published to ROS topic.")
+
+            if self.service_active:
+                # If the service is active, store the transcription
+                self.service_text = transcription
+                self.service_active = False
+            else:
+                # If the service is not active, publish the transcription
+                self.transcription_publisher.publish(msg)
+                self.get_logger().info("Transcription published to ROS topic.")
         except grpc.RpcError as e:
             self.get_logger().error(f"gRPC error: {e.code()}, {e.details()}")
         except Exception as ex:
             self.get_logger().error(f"Error during transcription: {str(ex)}")
 
+    def stt_service_callback(self, request, response):
+        self.get_logger().info("Keyword mock service activated, recording audio...")
+        self.service_active = True
+        self.KWS_publisher_mock.publish(Bool(data=True))
+        while self.service_active:
+            pass
+        response.text_heard = self.service_text
+        return response
+
 
 def main(args=None):
+    """Initialise rclpy, spin a HearNode on a MultiThreadedExecutor, and clean up on exit."""
     rclpy.init(args=args)
+    node = HearNode()
+    # The STT service callback blocks until callback_audio delivers a
+    # transcription, so the two callbacks must be able to run on separate threads.
+    executor = MultiThreadedExecutor()
+    executor.add_node(node)
     try:
-        rclpy.spin(HearNode())
+        executor.spin()
     except (ExternalShutdownException, KeyboardInterrupt):
         pass
     finally:
+        node.destroy_node()
         rclpy.shutdown()
 
 

@@ -5,8 +5,13 @@
 from faster_whisper import WhisperModel
 import os
 import torch
-from wav_utils import WavUtils
 import argparse
+import sys
+
+# Add the sibling "speech" directory to the Python path so that wav_utils (imported below) can be found
+sys.path.append(os.path.join(os.path.dirname(__file__), "speech"))
+
+from wav_utils import WavUtils
 
 
 class WhisperServicer(speech_pb2_grpc.SpeechServiceServicer):

@@ -24,23 +24,11 @@ python3 Whisper.py
 
 ## Running on CPU
 
-If testing or running on a PC/laptop, use the Dockerfile to run the scripts.
+If testing or running on a PC/laptop, use docker compose to run the scripts.
 
 ```bash
-# pwd -> /speech/scripts/stt
-docker build -t roborregos/home2:stt .
-
-# To run whisper for the first time:
-# pwd -> /speech/scripts/stt
-docker run -e SCRIPT_NAME=Whisper.py -e PORT=50051 -e MODEL_SIZE=base.en -p 50051:50051 --name whisper -v .:/app roborregos/home2:stt
-# In the future you can use:
-docker start -ai whisper
-
-# To run faster-whisper for the first time:
-# pwd -> /speech/scripts/stt
-docker run -e SCRIPT_NAME=Faster-whisper.py -e PORT=50051 -e MODEL_SIZE=base.en -p 50051:50051 --name faster-whisper -v .:/app roborregos/home2:stt
-# In the future you can use:
-docker start -ai faster-whisper
+# pwd -> /docker/hri
+docker compose -f stt.yaml up
 ```
 
 ## gRPC implementation

@@ -2,7 +2,6 @@ grpcio
 grpcio-tools
 torch
 torchaudio
-whisper
 faster-whisper
 pydub
 soundfile