Initial commit

BuethSam · May 11, 2023 · 1f84fe5 · 1f84fe5
commit 1f84fe5
Show file tree

Hide file tree

Showing 10 changed files with 298 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,3 @@
+.github
+docker-compose.yml
+Dockerfile
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,44 @@
+---
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+# Builds the image from ./Dockerfile and pushes it to Docker Hub whenever a
+# GitHub release is published.
+name: Publish Docker image
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    # Least privilege: the job only needs to read the repository contents.
+    # Pushing to Docker Hub authenticates with the secrets below, not with
+    # the GITHUB_TOKEN.
+    permissions:
+      contents: read
+    steps:
+      # NOTE(review): referenced by tag, unlike the SHA-pinned actions below;
+      # consider pinning to a commit SHA as recommended in the header.
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      # Exposes `tags` and `labels` outputs consumed by the build step.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: samboo/wyoming-tts
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.7
+
+WORKDIR /app
+
+# Create the /data volume directory and link it to /root/.local/share/tts so
+# that anything written under that path ends up in the mounted volume.
+RUN mkdir /data \
+    && mkdir -p /root/.local/share \
+    && ln -s /data /root/.local/share/tts
+
+# Install dependencies before copying the sources so this layer stays cached
+# when only application code changes.
+COPY requirements.txt ./requirements.txt
+
+RUN pip3 install --no-cache-dir --requirement requirements.txt
+
+COPY . .
+
+VOLUME ["/data"]
+
+# Runs the wyoming_tts package directory (its __main__.py); arguments given
+# to `docker run` / compose `command` are appended.
+ENTRYPOINT ["python3", "wyoming_tts"]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Sam Büth
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,13 @@
+# wyoming TTS
+
+coqui-ai TTS Wyoming protocol implementation. 
+
+## TODO
+
+- [ ] Multi-lingual and multi-speaker selection via wyoming protocol (currently not transmitted by home-assistant)
+
+- [ ] GPU support
+
+## Contributions
+
+Pull requests are very welcome.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3.0'
+
+services:
+  tts:
+    image: samboo/wyoming-tts
+    restart: always
+    # Appended to the image ENTRYPOINT; the port must match `ports` below.
+    command: --uri tcp://0.0.0.0:10201 --voice tts_models/de/thorsten/vits
+    environment:
+      # optional
+      - COQUI_STUDIO_TOKEN=
+    volumes:
+      - ./tts:/data
+    ports:
+      # Quoted so the HOST:CONTAINER pair is always read as a string.
+      - '10201:10201'
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+wyoming==0.0.1
+tts~=0.13.3
diff --git a/wyoming_tts/__init__.py b/wyoming_tts/__init__.py
@@ -0,0 +1 @@
+"""Wyoming server for tts."""
diff --git a/wyoming_tts/__main__.py b/wyoming_tts/__main__.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import argparse
+import asyncio
+import logging
+from functools import partial
+
+from TTS.api import TTS
+from wyoming.info import Attribution, Info, TtsProgram, TtsVoice
+from wyoming.server import AsyncServer
+
+from handler import PiperEventHandler
+
+_LOGGER = logging.getLogger(__name__)
+
+async def main() -> None:
+    """Main entry point."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--voice",
+        default=None,
+        help="The Voice to use for TTS",
+    )
+    parser.add_argument(
+        "--speaker",
+        help="Set the target speaker",
+    )
+    parser.add_argument(
+        "--language",
+        help="Set the target language",
+    )
+    parser.add_argument("--samples-per-chunk", type=int, default=1024)
+    parser.add_argument("--uri", required=True, help="unix:// or tcp://")
+    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    if (args.voice == None): 
+        _LOGGER.info("The following voices are available (specify with --voice [model_name]): ")
+        _LOGGER.info("\n".join(TTS.list_models()))
+        exit()
+
+    tts = TTS(args.voice)
+
+    if (tts.is_multi_lingual and args.language is None): 
+        _LOGGER.error("The following languages are available (specify with --language [lang]): ")
+        _LOGGER.info("\n".join(tts.languages))
+        exit()
+    if (tts.is_multi_speaker and args.speaker is None):
+        _LOGGER.error("The following speakers are available (specify with --speakers [speaker]): ")
+        _LOGGER.info("\n".join(tts.speakers))
+        exit()
+
+    language = None
+    if (tts.is_multi_lingual is False):
+        language = args.voice.split("/")[1]
+        _LOGGER.info("Using language: %s", language)
+
+    _LOGGER.info("TTS ready")
+
+    wyoming_info = Info(
+        tts=[
+            TtsProgram(
+                name="coqui-ai TTS",
+                attribution=Attribution(
+                    name="coqui-ai", url="https://github.com/coqui-ai/TTS"
+                ),
+                installed=True,
+                voices=[
+                    TtsVoice(
+                        name=speaker,
+                        attribution=Attribution(
+                            name="coqui-ai", url="https://github.com/coqui-ai/TTS"
+                        ),
+                        installed=True,
+                        languages=tts.languages if tts.is_multi_lingual else [language],
+                    ) for speaker in ([args.speaker] if tts.is_multi_speaker else ["Default"]) # Preparation for multi speaker support in wyoming event
+                ],
+            )
+        ],
+    )
+
+    server = AsyncServer.from_uri(args.uri)
+    _LOGGER.info("Ready")
+    await server.run(
+        partial(
+                PiperEventHandler,
+                wyoming_info,
+                args,
+                tts
+            )
+    )
+
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/wyoming_tts/handler.py b/wyoming_tts/handler.py
@@ -0,0 +1,88 @@
+"""Event handler for clients of the server."""
+import argparse
+import logging
+import math
+import wave
+
+from TTS.api import TTS
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.event import Event
+from wyoming.info import Describe, Info
+from wyoming.server import AsyncEventHandler
+from wyoming.tts import Synthesize
+
+_LOGGER = logging.getLogger(__name__)
+
+class PiperEventHandler(AsyncEventHandler):
+    def __init__(
+        self,
+        wyoming_info: Info,
+        cli_args: argparse.Namespace,
+        tts: TTS,
+        *args,
+    ) -> None:
+        super().__init__(*args)
+        self.cli_args = cli_args
+        self.wyoming_info_event = wyoming_info.event()
+        self.tts = tts
+
+    async def handle_event(self, event: Event) -> bool:
+        if Describe.is_type(event.type):
+            await self.write_event(self.wyoming_info_event)
+            _LOGGER.debug("Sent info")
+            return True
+
+        if not Synthesize.is_type(event.type):
+            _LOGGER.warning("Unexpected event: %s", event)
+            return True
+        synthesize = Synthesize.from_event(event)
+        raw_text = synthesize.text
+        text = raw_text.strip()
+
+        output_path = "/tmp/output.wav"
+        _LOGGER.debug(event)
+        tts_args = dict()
+        if (self.tts.is_multi_lingual):
+            tts_args["language"] = self.cli_args.language
+
+        if (self.tts.is_multi_speaker):
+            tts_args["speaker"] = self.cli_args.speaker
+        self.tts.tts_to_file(text, **tts_args, file_path=output_path)
+        wav_file: wave.Wave_read = wave.open(output_path, "rb")
+        with wav_file:
+            rate = wav_file.getframerate()
+            width = wav_file.getsampwidth()
+            channels = wav_file.getnchannels()
+
+            await self.write_event(
+                AudioStart(
+                    rate=rate,
+                    width=width,
+                    channels=channels,
+                ).event(),
+            )
+
+            # Audio
+            audio_bytes = wav_file.readframes(wav_file.getnframes())
+            bytes_per_sample = width * channels
+            bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk
+            num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk))
+
+            # Split into chunks
+            for i in range(num_chunks):
+                offset = i * bytes_per_chunk
+                chunk = audio_bytes[offset : offset + bytes_per_chunk]
+                await self.write_event(
+                    AudioChunk(
+                        audio=chunk,
+                        rate=rate,
+                        width=width,
+                        channels=channels,
+                    ).event(),
+                )
+
+        await self.write_event(AudioStop().event())
+        _LOGGER.debug("Completed request")
+
+
+        return True