use-whisper web (#26)

* use-whisper web * data for kannada asr * init-indic-tts * init-parler tts * update libraries
slabstech · Dec 4, 2024 · 3d7a43e · 3d7a43e
1 parent db20c84
commit 3d7a43e
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 0 deletions.
diff --git a/tutorials/indic/README.md b/tutorials/indic/README.md
@@ -0,0 +1,19 @@
+Indic - LLM
+
+- Solutions for Indian languages
+
+- tts
+    - https://huggingface.co/ai4bharat/indic-parler-tts
+
+- text - llm
+    - sarvam-1
+
+- datasets
+    - 
+
+- ASR for kannada
+    - https://huggingface.co/ai4bharat/indicconformer_stt_kn_hybrid_ctc_rnnt_large
+
+    - git clone https://github.com/AI4Bharat/NeMo.git && cd NeMo && git checkout nemo-v2 && bash reinstall.sh
+
+    - https://github.com/AI4Bharat/NeMo
diff --git a/tutorials/indic/tts/Dockerfile b/tutorials/indic/tts/Dockerfile
@@ -0,0 +1,15 @@
+FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
+# Image for the TTS server started by `python main.py` and exposed on port 8000.
+WORKDIR /app
+# curl is invoked by the compose healthcheck (`curl --fail .../health`), so install it alongside ffmpeg.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir parler-tts transformers torch
+# NOTE(review): main.py is not among the files added by this commit; confirm it is in the build context.
+COPY . .
+
+EXPOSE 8000
+
+CMD ["python", "main.py"]
diff --git a/tutorials/indic/tts/README.md b/tutorials/indic/tts/README.md
@@ -0,0 +1,41 @@
+TTS -
+
+- Download model from Huggingface
+  - huggingface-cli download ai4bharat/indic-parler-tts
+
+- Run with Docker compose
+  - docker compose -f indic-tts-compose.yml  up --detach parler-tts-server
+
+- Test output
+  - kannada
+    - curl -s -H "content-type: application/json" localhost:8000/v1/audio/speech -d '{"input": "ಉದ್ಯಾನದಲ್ಲಿ ಮಕ್ಕಳ ಆಟವಾಡುತ್ತಿದ್ದಾರೆ ಮತ್ತು ಪಕ್ಷಿಗಳು ಚಿಲಿಪಿಲಿ ಮಾಡುತ್ತಿವೆ."}' -o audio.mp3
+
+  - hindi
+    -  curl -s -H "content-type: application/json" localhost:8000/v1/audio/speech -d '{"input": "अरे, तुम आज कैसे हो?"}' -o audio.mp3
+
+  - curl -s -H "content-type: application/json" localhost:8000/v1/audio/speech -d '{"input": "Hey, how are you?", "voice": "Feminine, speedy, and cheerful"}' -o audio_2.mp3
+
+ -  
+
+--- 
+TODO
+
+- Create docker image
+  - docker build -t indic-parler-tts .
+
+- Run the container
+  -  docker run -d -p 8000:8000 -v ~/.cache/huggingface:/root/.cache/huggingface indic-parler-tts
+
+
+
+-  huggingface-cli download  parler-tts/parler-tts-mini-expresso
+
+- with slabstech/parler-tts
+  - ai4bharat/indic-parler-tts
+    - huggingface-cli download ai4bharat/indic-parler-tts
+    - docker run --detach --volume ~/.cache/huggingface:/root/.cache/huggingface --publish 8000:8000 --env MODEL="ai4bharat/indic-parler-tts" slabstech/parler-tts-server
+  - parler-tts/parler-tts-mini-expresso
+    -  huggingface-cli download  parler-tts/parler-tts-mini-expresso
+    - docker run --detach --volume ~/.cache/huggingface:/root/.cache/huggingface --publish 8000:8000 --env MODEL="parler-tts/parler-tts-mini-expresso" slabstech/parler-tts-server
+
+- curl -s -H "content-type: application/json" localhost:8000/v1/audio/speech -d '{"input": "Hey, how are you?"}' -o audio.mp3
diff --git a/tutorials/indic/tts/indic-tts-compose.yml b/tutorials/indic/tts/indic-tts-compose.yml
@@ -0,0 +1,34 @@
+services:
+  parler-tts-server:  # TTS server built from ./Dockerfile, published on host port 8000
+    image: slabstech/parler-tts-server
+    build:
+      dockerfile: Dockerfile
+      context: .
+      platforms:
+        - linux/amd64
+      tags:
+        - slabstech/parler-tts-server
+    develop:
+      watch:
+        - path: ./parler_tts_server  # NOTE(review): directory is not added by this commit; confirm it exists
+          action: rebuild
+    volumes:
+      - ~/.cache/huggingface:/root/.cache/huggingface  # share the host's Hugging Face cache directory
+    restart: unless-stopped
+    ports:
+      - "8000:8000"  # quoted so the host:container mapping is always parsed as a string
+    environment:
+      - MODEL=ai4bharat/indic-parler-tts
+    healthcheck:  # probes loopback; 0.0.0.0 is a bind address, not a portable destination
+      test: curl --fail http://127.0.0.1:8000/health || exit 1
+      interval: 10s
+      timeout: 10s
+      retries: 3
+      start_period: 15s
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['0']  # reserves the GPU with id 0 only
+              capabilities: [gpu]
diff --git a/tutorials/indic/tts/test-compose.yml b/tutorials/indic/tts/test-compose.yml
@@ -0,0 +1,28 @@
+services:
+  parler-tts:
+    build: .
+    image: indic-parler-tts  # tag for the local build; reusing the upstream pytorch tag would shadow the base image
+    ports:
+      - "8000:8000"
+    volumes:
+      - ~/.cache/huggingface:/root/.cache/huggingface
+    environment:
+      - TZ=UTC
+      - MODEL_NAME=ai4bharat/indic-parler-tts  # NOTE(review): indic-tts-compose.yml sets MODEL; confirm which name the app reads
+      - PYTHONUNBUFFERED=1
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['0']
+              capabilities: [gpu]
+    restart: unless-stopped
+    healthcheck:  # NOTE(review): nvidia-smi only shows the GPU is visible, not that port 8000 answers
+      test: ["CMD", "nvidia-smi"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+# NOTE(review): huggingface_cache is declared but no service references it (the service uses a bind mount).
+volumes:
+  huggingface_cache:
diff --git a/tutorials/whisper/web/README.md b/tutorials/whisper/web/README.md
@@ -1,5 +1,17 @@
 Whisper Web
 
+- Whisper Web UI 
+    - https://gitlab.com/aadnk/whisper-webui
+
+    - https://gitlab.com/aadnk/whisper-diarization
+
+    - https://gitlab.com/users/aadnk/projects
+- https://huggingface.co/spaces/openai/whisper
+
+- https://huggingface.co/spaces/hf-audio/whisper-large-v3-turbo
+- 
+
+
 - Web GPU - https://github.com/xenova/whisper-web/tree/experimental-webgpu
 - Spaces - https://huggingface.co/spaces/Xenova/whisper-web
 - https://github.com/xenova/whisper-web

diff --git a/tutorials/whisper/web/gradio_api.py b/tutorials/whisper/web/gradio_api.py
@@ -0,0 +1,9 @@
+from gradio_client import Client
+
+client = Client("https://openai-whisper.hf.space/")
+result = client.predict(
+				"https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav",	# str (filepath or URL to file) in 'inputs' Audio component
+				"transcribe",	# str in 'Task' Radio component
+				api_name="/predict"
+)
+print(result)
diff --git a/tutorials/whisper/web/requirements.txt b/tutorials/whisper/web/requirements.txt
@@ -0,0 +1,20 @@
+anyio==4.6.2.post1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+exceptiongroup==1.2.2
+filelock==3.16.1
+fsspec==2024.10.0
+gradio_client==1.5.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.0
+huggingface-hub==0.26.3
+idna==3.10
+packaging==24.2
+PyYAML==6.0.2
+requests==2.32.3
+sniffio==1.3.1
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==2.2.3
+websockets==12.0