From a7bba52887e9df338b750b981315801742d371a2 Mon Sep 17 00:00:00 2001 From: mikramarc Date: Wed, 26 May 2021 09:49:07 +0000 Subject: [PATCH 1/4] fixnig --- sr_speech_control/README.md | 48 ++++------ .../config/similar_words_dict.yaml | 3 + .../src/sr_speech_control/speech_control.py | 87 +++++++++++++------ 3 files changed, 79 insertions(+), 59 deletions(-) create mode 100644 sr_speech_control/config/similar_words_dict.yaml diff --git a/sr_speech_control/README.md b/sr_speech_control/README.md index 008ec04f..3b716dfd 100644 --- a/sr_speech_control/README.md +++ b/sr_speech_control/README.md @@ -1,7 +1,7 @@ # sr_speech_control A node for controlling various systems using speech. -Commands are being sent to a topic `sr_speech_control` in a form of std_msgs +Commands are being sent to a chosen topic in a form of std_msgs String and can be intercepted by other nodes in order to execute actions. ## Usage @@ -12,38 +12,22 @@ rosrun sr_speech_control speech_control.py ``` The node will permanenly listen to the microphone input, will use Google speech -recognition to translate audio to text, check if text starts with a trigger -word `shadow` and if so will publish text after the trigger word to topic -`sr_speech_control`. +recognition to translate audio to text, check if text starts with the chosen trigger word and, if so, will publish the word after the trigger word to the `sr_speech_control` topic (by default).
-## Known problems +The class accepts the following parameters that can be provided in order to modify its behaviour: +- `trigger_word` - sets a word that precedes a command to be sent +- `command_words` - list of commands that are allowed to be sent +- `similar_words_dict_path` - path to a yaml file containing a dictionary of words that are easily mistaken for a trigger word or one of the command words +- `non_speaking_duration` - seconds of non-speaking audio to keep on both sides of the recording +- `pause_threshold` - seconds of non-speaking audio before a phrase is considered complete -Testing revealed that microphone devices available within Docker container are -significantly worse than on the host machine. That can be easily demonstrated -by testing Google Chrome browser speech recognition on the host and inside -Docker container. Known workaround for that is to use pulseaudio server on host -machine for sound capture. For that install `paprefs` program on the host: -``` -sudo apt-get install paprefs -``` -and run it. In "Network Server" tab, and check the "Enable network access to -local sound devices" checkbox and other two sub-checkboxes in order not to -require authentications. You might need to reboot host machine for this setting -to be used. -Run `pax11publish` utility program to find out pulseaudio server port (most -likely 4713). -On the container run: -``` -export "PULSE_SERVER=tcp::" -``` -For example if your host IP address is `192.168.1.2`, then run: -``` -export "PULSE_SERVER=tcp:192.168.1.2:4713" -``` -Alternatively it is possible to map unix sockets instead of tcp but it requires -adding new parameters when launching Docker container.
+An example usage can be seen in the `speech_control.py` file: +```python + trigger_word = "shadow" + command_words = ["grasp", "release", "disable", "enable", "engage"] + similar_words_dict_path = rospkg.RosPack().get_path('sr_speech_control') + '/config/similar_words_dict.yaml' + + sc = SpeechControl(trigger_word, command_words, similar_words_dict_path=similar_words_dict_path) + sc.run() -To use node with pulseaudio microphone specify `prefer_microphone` parameter: -``` -rosrun sr_speech_control speech_control.py _prefer_microphone:=pulse ``` diff --git a/sr_speech_control/config/similar_words_dict.yaml b/sr_speech_control/config/similar_words_dict.yaml new file mode 100644 index 00000000..140b1ee1 --- /dev/null +++ b/sr_speech_control/config/similar_words_dict.yaml @@ -0,0 +1,3 @@ +shallow: shadow +shiloh: shadow +app: up diff --git a/sr_speech_control/src/sr_speech_control/speech_control.py b/sr_speech_control/src/sr_speech_control/speech_control.py index ae6570c6..a952ab04 100755 --- a/sr_speech_control/src/sr_speech_control/speech_control.py +++ b/sr_speech_control/src/sr_speech_control/speech_control.py @@ -15,33 +15,50 @@ # with this program. If not, see . 
import rospy +import rospkg +import time import speech_recognition as sr from difflib import get_close_matches from std_msgs.msg import String +import yaml class SpeechControl(object): - def __init__(self): - self.microphone = sr.Microphone() - prefer_microphone = rospy.get_param('~prefer_microphone') - if prefer_microphone: - for i, microphone_name in enumerate(sr.Microphone.list_microphone_names()): - if prefer_microphone in microphone_name: - self.microphone = sr.Microphone(device_index=i) - rospy.loginfo("Using preferred microphone: {}".format(microphone_name)) - break - self.trigger_word = rospy.get_param('~trigger_word', 'shadow') + def __init__(self, trigger_word, command_words, command_topic='sr_speech_control', + similar_words_dict_path=None, non_speaking_duration=0.2, pause_threshold=0.2): + self.trigger_word = trigger_word + self.command_words = command_words self.recognizer = sr.Recognizer() - self._set_param_if_provided(self.recognizer, 'non_speaking_duration') - self._set_param_if_provided(self.recognizer, 'pause_threshold') - self.command_words = rospy.get_param('~command_words', []) - topic = rospy.get_param('~topic', 'sr_speech_control') - self.command_publisher = rospy.Publisher(topic, String, queue_size=1) + self.command_publisher = rospy.Publisher(command_topic, String, queue_size=1) + self.command_to_be_executed = None + self.similar_words_dict = {} + + if similar_words_dict_path: + self.parse_similar_words_dict(similar_words_dict_path) + self._init_recognizer(non_speaking_duration, pause_threshold) self._stop_listening = self.recognizer.listen_in_background(self.microphone, self._recognizer_callback) - def _set_param_if_provided(self, object_to_set, param_name): - if rospy.has_param('~' + param_name): - setattr(object_to_set, param_name, rospy.get_param('~' + param_name)) + + def parse_similar_words_dict(self, path_name): + with open(path_name, 'r') as stream: + self.similar_words_dict = yaml.safe_load(stream) + + def _init_recognizer(self, 
non_speaking_duration, pause_threshold): + for idx, mic in enumerate(sr.Microphone.list_microphone_names()): + rospy.loginfo('{}: {}'.format(idx, mic)) + + while True: + try: + idx = input("Choose one of the microphones from the list above. Type the index and press [RETURN]\n") + self.microphone = sr.Microphone(device_index=int(idx)) + with self.microphone as source: + self.recognizer.adjust_for_ambient_noise(source) + break + except OSError: + rospy.logwarn("Wrong microphone. Try again.") + + self.recognizer.non_speaking_duration = non_speaking_duration + self.recognizer.pause_threshold = pause_threshold def _recognizer_callback(self, recognizer, audio): try: @@ -53,21 +70,37 @@ def _recognizer_callback(self, recognizer, audio): return result = [str(x).lower() for x in result.split(' ')] - if len(result) > 1: - if self._filter_word(result[0], [self.trigger_word]) == self.trigger_word: - self.command_publisher.publish(' '.join([self._filter_word(x, self.command_words) for x in result[1:]])) + + if self._filter_word(result[0], self.trigger_word) == self.trigger_word: + command = self._filter_word(''.join(result[1:]), self.command_words) + if command in self.command_words: + self.command_to_be_executed = command def _filter_word(self, word, dictionary, offset=0.5): + if word in self.similar_words_dict: + word = self.similar_words_dict[word] + result = get_close_matches(word, dictionary, 1, offset) if not result: return word return result[0] + def run(self): + rospy.loginfo("Started speech control. 
Trigger word: {}".format(self.trigger_word)) + while not rospy.is_shutdown(): + if self.command_to_be_executed: + rospy.loginfo("Executing: {}.".format(self.command_to_be_executed)) + self.command_publisher.publish(self.command_to_be_executed) + self.command_to_be_executed = None + self._stop_listening(wait_for_stop=False) + + if __name__ == "__main__": - rospy.init_node('sr_speech_control', anonymous=True) + rospy.init_node('example_speech_control', anonymous=True) + + trigger_word = "shadow" + command_words = ["grasp", "release", "disable", "enable", "engage"] + similar_words_dict_path = rospkg.RosPack().get_path('sr_speech_control') + '/config/similar_words_dict.yaml' - sc = SpeechControl() - rospy.loginfo("Started speech control. Trigger word: {}, command words: {}".format( - sc.trigger_word, sc.command_words)) - rospy.spin() - sc._stop_listening(wait_for_stop=False) + sc = SpeechControl(trigger_word, command_words, similar_words_dict_path=similar_words_dict_path) + sc.run() From e4d2d2f379812cde5cd720cb887c05bf0dbaf074 Mon Sep 17 00:00:00 2001 From: mikramarc Date: Wed, 26 May 2021 09:51:40 +0000 Subject: [PATCH 2/4] fixing typo --- .../config/{similar_words_dict.yaml => similar_words_dict.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sr_speech_control/config/{similar_words_dict.yaml => similar_words_dict.yaml} (100%) diff --git a/sr_speech_control/config/similar_words_dict.yaml b/sr_speech_control/config/similar_words_dict.yaml similarity index 100% rename from sr_speech_control/config/similar_words_dict.yaml rename to sr_speech_control/config/similar_words_dict.yaml From f9e080387e4663e7d11dda99eb2a30e2e4e36a43 Mon Sep 17 00:00:00 2001 From: mikramarc Date: Wed, 26 May 2021 11:30:15 +0100 Subject: [PATCH 3/4] Update speech_control.py --- .../src/sr_speech_control/speech_control.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sr_speech_control/src/sr_speech_control/speech_control.py 
b/sr_speech_control/src/sr_speech_control/speech_control.py index a952ab04..1401090b 100755 --- a/sr_speech_control/src/sr_speech_control/speech_control.py +++ b/sr_speech_control/src/sr_speech_control/speech_control.py @@ -38,7 +38,6 @@ def __init__(self, trigger_word, command_words, command_topic='sr_speech_control self._init_recognizer(non_speaking_duration, pause_threshold) self._stop_listening = self.recognizer.listen_in_background(self.microphone, self._recognizer_callback) - def parse_similar_words_dict(self, path_name): with open(path_name, 'r') as stream: self.similar_words_dict = yaml.safe_load(stream) @@ -47,10 +46,13 @@ def _init_recognizer(self, non_speaking_duration, pause_threshold): for idx, mic in enumerate(sr.Microphone.list_microphone_names()): rospy.loginfo('{}: {}'.format(idx, mic)) - while True: + while not rospy.is_shutdown(): try: - idx = input("Choose one of the microphones from the list above. Type the index and press [RETURN]\n") - self.microphone = sr.Microphone(device_index=int(idx)) + idx = raw_input("Choose one of the microphones from the list above. Type the index and press [RETURN]\n") + if not idx: + self.microphone = sr.Microphone() + else: + self.microphone = sr.Microphone(device_index=int(idx)) with self.microphone as source: self.recognizer.adjust_for_ambient_noise(source) break @@ -88,10 +90,10 @@ def _filter_word(self, word, dictionary, offset=0.5): def run(self): rospy.loginfo("Started speech control. 
Trigger word: {}".format(self.trigger_word)) while not rospy.is_shutdown(): - if self.command_to_be_executed: - rospy.loginfo("Executing: {}.".format(self.command_to_be_executed)) - self.command_publisher.publish(self.command_to_be_executed) - self.command_to_be_executed = None + if self.command_to_be_executed: + rospy.loginfo("Executing: {}.".format(self.command_to_be_executed)) + self.command_publisher.publish(self.command_to_be_executed) + self.command_to_be_executed = None self._stop_listening(wait_for_stop=False) From 3d7e19bb5e77f111686b5b1b70325f33c5eb1de7 Mon Sep 17 00:00:00 2001 From: mikramarc Date: Wed, 26 May 2021 12:07:36 +0000 Subject: [PATCH 4/4] lint --- sr_speech_control/src/sr_speech_control/speech_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sr_speech_control/src/sr_speech_control/speech_control.py b/sr_speech_control/src/sr_speech_control/speech_control.py index 1401090b..b5282f64 100755 --- a/sr_speech_control/src/sr_speech_control/speech_control.py +++ b/sr_speech_control/src/sr_speech_control/speech_control.py @@ -48,7 +48,7 @@ def _init_recognizer(self, non_speaking_duration, pause_threshold): while not rospy.is_shutdown(): try: - idx = raw_input("Choose one of the microphones from the list above. Type the index and press [RETURN]\n") + idx = raw_input("Pick one of the microphones from the list above. Type the index and execute.\n") if not idx: self.microphone = sr.Microphone() else: