fixing speech control #179

Open · wants to merge 6 commits into `melodic-devel`
48 changes: 16 additions & 32 deletions sr_speech_control/README.md
@@ -1,7 +1,7 @@
# sr_speech_control

A node for controlling various systems using speech.
Commands are sent to a chosen topic as `std_msgs/String` messages and can be
intercepted by other nodes in order to execute actions.

## Usage
@@ -12,38 +12,22 @@ rosrun sr_speech_control speech_control.py
```

The node will permanently listen to the microphone input, use Google speech
recognition to translate audio to text, check whether the text starts with a
chosen trigger word and, if so, publish the text after the trigger word to the
`sr_speech_control` topic (by default).
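The trigger-word check described above can be sketched in plain Python. This is a simplified illustration; the real node additionally corrects misheard words before matching, and `extract_command` is a hypothetical helper, not part of the package:

```python
def extract_command(transcript, trigger_word="shadow"):
    """Return the text after the trigger word, or None if the
    transcript does not start with the trigger word."""
    words = transcript.lower().split()
    if words and words[0] == trigger_word:
        return " ".join(words[1:])
    return None

print(extract_command("Shadow grasp"))  # -> grasp
print(extract_command("please grasp"))  # -> None
```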

The class constructor takes the following parameters to modify its behaviour:
- `trigger_word` - the word that precedes a command to be sent
- `command_words` - list of commands that are allowed to be sent
- `command_topic` - topic the recognized commands are published to (defaults to `sr_speech_control`)
- `similar_words_dict_path` - path to a yaml file containing a dictionary mapping words that are easily mistaken for the trigger word or one of the command words to their intended values
- `non_speaking_duration` - seconds of non-speaking audio to keep on both sides of the recording
- `pause_threshold` - seconds of non-speaking audio before a phrase is considered complete

## Known problems
Testing revealed that microphone devices available within a Docker container are
significantly worse than on the host machine. This can easily be demonstrated by
comparing Google Chrome's speech recognition on the host and inside a Docker
container. A known workaround is to use the pulseaudio server on the host
machine for sound capture. To do so, install the `paprefs` program on the host:
```
sudo apt-get install paprefs
```
and run it. In the "Network Server" tab, check the "Enable network access to
local sound devices" checkbox together with its two sub-checkboxes so that no
authentication is required. You might need to reboot the host machine for this
setting to take effect.
Run the `pax11publish` utility to find out the pulseaudio server port (most
likely 4713).
In the container, run:
```
export "PULSE_SERVER=tcp:<host IP address>:<host pulseaudio port>"
```
For example, if your host IP address is `192.168.1.2`, run:
```
export "PULSE_SERVER=tcp:192.168.1.2:4713"
```
Alternatively, it is possible to map unix sockets instead of tcp, but this
requires additional parameters when launching the Docker container.
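The export above can be assembled from variables, so the address is easy to adapt. The IP and port here are placeholders; substitute your host's values:

```shell
# Placeholder values: substitute your host's IP address and the
# pulseaudio port reported by pax11publish.
HOST_IP=192.168.1.2
PULSE_PORT=4713
export PULSE_SERVER="tcp:${HOST_IP}:${PULSE_PORT}"
echo "$PULSE_SERVER"
```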
Example usage can be seen in the `speech_control.py` file:
```python
trigger_word = "shadow"
command_words = ["grasp", "release", "disable", "enable", "engage"]
similar_words_dict_path = rospkg.RosPack().get_path('sr_speech_control') + '/config/similar_words_dict.yaml'

sc = SpeechControl(trigger_word, command_words, similar_words_dict_path=similar_words_dict_path)
sc.run()
```

To use the node with a pulseaudio microphone, specify the `prefer_microphone` parameter:
```
rosrun sr_speech_control speech_control.py _prefer_microphone:=pulse
```
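The word filtering the node performs — first mapping commonly misheard words via the yaml dictionary, then fuzzy-matching against the allowed words — can be sketched with the standard library's `difflib`. The dictionary is inlined here for illustration, and `filter_word` is a simplified stand-in for the class's internal method:

```python
from difflib import get_close_matches

# Inlined equivalent of config/similar_words_dict.yaml.
SIMILAR_WORDS = {"shallow": "shadow", "shiloh": "shadow", "app": "up"}
COMMAND_WORDS = ["grasp", "release", "disable", "enable", "engage"]

def filter_word(word, dictionary, cutoff=0.5):
    """Correct known mishearings, then fuzzy-match against allowed words."""
    word = SIMILAR_WORDS.get(word, word)
    matches = get_close_matches(word, dictionary, 1, cutoff)
    return matches[0] if matches else word

print(filter_word("shallow", ["shadow"]))   # -> shadow
print(filter_word("grass", COMMAND_WORDS))  # -> grasp
```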
3 changes: 3 additions & 0 deletions sr_speech_control/config/similar_words_dict.yaml
@@ -0,0 +1,3 @@
shallow: shadow
shiloh: shadow
app: up
89 changes: 62 additions & 27 deletions sr_speech_control/src/sr_speech_control/speech_control.py
@@ -15,33 +15,52 @@
# with this program. If not, see <http://www.gnu.org/licenses/>.

import rospy
import rospkg
import time
import speech_recognition as sr
from difflib import get_close_matches
from std_msgs.msg import String
import yaml


class SpeechControl(object):
    def __init__(self, trigger_word, command_words, command_topic='sr_speech_control',
                 similar_words_dict_path=None, non_speaking_duration=0.2, pause_threshold=0.2):
        self.trigger_word = trigger_word
        self.command_words = command_words
        self.recognizer = sr.Recognizer()
        self.command_publisher = rospy.Publisher(command_topic, String, queue_size=1)
        self.command_to_be_executed = None
        self.similar_words_dict = {}

        if similar_words_dict_path:
            self.parse_similar_words_dict(similar_words_dict_path)
        self._init_recognizer(non_speaking_duration, pause_threshold)
        self._stop_listening = self.recognizer.listen_in_background(self.microphone, self._recognizer_callback)

    def parse_similar_words_dict(self, path_name):
        with open(path_name, 'r') as stream:
            self.similar_words_dict = yaml.safe_load(stream)

    def _init_recognizer(self, non_speaking_duration, pause_threshold):
        for idx, mic in enumerate(sr.Microphone.list_microphone_names()):
            rospy.loginfo('{}: {}'.format(idx, mic))

        while not rospy.is_shutdown():
            try:
                idx = raw_input("Pick one of the microphones from the list above. Type the index and execute.\n")
                if not idx:
                    self.microphone = sr.Microphone()
                else:
                    self.microphone = sr.Microphone(device_index=int(idx))
                with self.microphone as source:
                    self.recognizer.adjust_for_ambient_noise(source)
                break
            except OSError:
                rospy.logwarn("Wrong microphone. Try again.")

        self.recognizer.non_speaking_duration = non_speaking_duration
        self.recognizer.pause_threshold = pause_threshold

    def _recognizer_callback(self, recognizer, audio):
        try:
@@ -53,21 +72,37 @@ def _recognizer_callback(self, recognizer, audio):
            return

        result = [str(x).lower() for x in result.split(' ')]

        if self._filter_word(result[0], [self.trigger_word]) == self.trigger_word:
            command = self._filter_word(''.join(result[1:]), self.command_words)
            if command in self.command_words:
                self.command_to_be_executed = command

    def _filter_word(self, word, dictionary, offset=0.5):
        if word in self.similar_words_dict:
            word = self.similar_words_dict[word]

        result = get_close_matches(word, dictionary, 1, offset)
        if not result:
            return word
        return result[0]

    def run(self):
        rospy.loginfo("Started speech control. Trigger word: {}".format(self.trigger_word))
        while not rospy.is_shutdown():
            if self.command_to_be_executed:
                rospy.loginfo("Executing: {}.".format(self.command_to_be_executed))
                self.command_publisher.publish(self.command_to_be_executed)
                self.command_to_be_executed = None
        self._stop_listening(wait_for_stop=False)


if __name__ == "__main__":
    rospy.init_node('example_speech_control', anonymous=True)

    trigger_word = "shadow"
    command_words = ["grasp", "release", "disable", "enable", "engage"]
    similar_words_dict_path = rospkg.RosPack().get_path('sr_speech_control') + '/config/similar_words_dict.yaml'

    sc = SpeechControl(trigger_word, command_words, similar_words_dict_path=similar_words_dict_path)
    sc.run()