From e3c7cef06c93dbd451a002c211b1cb1046e65296 Mon Sep 17 00:00:00 2001 From: luluw Date: Sat, 7 Dec 2024 12:14:29 +0545 Subject: [PATCH] :rocket: [Update] Web Flask Script and Templates --- .../app.py | 129 +++++++++------- .../static/assets/index.html | 20 +++ .../static/assets/script.js | 141 +++++++++++++----- .../static/assets/style.css | 104 ++++++------- .../templates/index.html | 23 --- 5 files changed, 257 insertions(+), 160 deletions(-) create mode 100644 src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/index.html delete mode 100644 src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/templates/index.html diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/app.py b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/app.py index b8d1102..0109d3a 100644 --- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/app.py +++ b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/app.py @@ -1,69 +1,96 @@ -import webbrowser +import os import sys import argparse -from os.path import join, dirname -from fastapi import FastAPI, Request -from fastapi.staticfiles import StaticFiles -from fastapi.responses import HTMLResponse -from fastapi.templating import Jinja2Templates + +from flask import Flask, request, jsonify, send_from_directory +from werkzeug.utils import secure_filename # ASR engine path sys.path.append(join(dirname(__file__), 'Automatic_Speech_Recognition')) -from engine import SpeechRecognitionEngine +from neuralnet.dataset import get_featurizer +from decoder import SpeechRecognitionEngine +from engine import Recorder -app = FastAPI() +app = Flask(__name__) -# Serve static files -app.mount("/static", StaticFiles(directory="static"), name="static") +# Configuration +UPLOAD_FOLDER = 'uploads' +STATIC_FOLDER = 'static/assets' -# Setup Jinja2 templates -templates = Jinja2Templates(directory="templates") +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER -# Global variable for ASR engine -global asr_engine +# Ensure upload folder exists +os.makedirs(UPLOAD_FOLDER, exist_ok=True) -@app.get("/", response_class=HTMLResponse) -async def index(request: Request): - return templates.TemplateResponse("index.html", {"request": request}) +# Initialize shared objects at startup +recorder = Recorder() # Initialize recorder +featurizer = get_featurizer(16000) # Initialize featurizer -@app.get("/start_asr") -async def start(): - action = DemoAction() - asr_engine.run(action) - return {"message": "Speech recognition started successfully!"} +asr_engine = None # Initialize to None, to avoid issues during startup -@app.get("/get_audio") -async def get_audio(): - with open('transcript.txt', 'r') as f: - transcript = f.read() - return {"transcript": transcript} +# Serve static files (index.html) +@app.route("/", methods=["GET"]) +def index(): + return send_from_directory(STATIC_FOLDER, "index.html") + + +@app.route("/transcribe/", methods=["POST"]) +def transcribe_audio(): + """ + Transcribe an uploaded audio file using the preloaded ASR engine. 
+ """ + try: + if asr_engine is None: + return jsonify({"error": "ASR Engine is not initialized."}), 500 + + # Check if file is in request + if "file" not in request.files: + return jsonify({"error": "No file provided"}), 400 + + file = request.files["file"] + if file.filename == "": + return jsonify({"error": "No file selected"}), 400 + + # Secure filename and save file + filename = secure_filename(file.filename) + file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename) + file.save(file_path) + + print(f"File saved: {file_path}") + + recorded_audio = recorder.record() + recorder.save(recorded_audio, "audio_temp.wav") + print("\nAudio recorded") + + # Use the preloaded ASR Engine to transcribe + transcript = asr_engine.transcribe(asr_engine.model, featurizer, "audio_temp.wav") + + print("\nTranscription:") + print(transcript) + + return jsonify({"transcription": transcript}) + + except Exception as e: + return jsonify({"error": f"Internal server error: {e}"}), 500 + + +def main(args): + global asr_engine + print("Loading Speech Recognition Engine into cache...") + try: + asr_engine = SpeechRecognitionEngine(args.model_file, args.token_path) + print("ASR Engine loaded successfully.") + except Exception as e: + print(f"Error loading ASR Engine: {e}") + asr_engine = None -class DemoAction: - def __init__(self): - self.asr_results = "" - self.current_beam = "" - - def __call__(self, x): - results, current_context_length = x - self.current_beam = results - transcript = " ".join(self.asr_results.split() + results.split()) - self.save_transcript(transcript) - if current_context_length > 10: - self.asr_results = transcript - - def save_transcript(self, transcript): - with open("transcript.txt", 'w+') as f: - print(transcript) - f.write(transcript) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Demo of Automatic Speech Recognition") - parser.add_argument("--model_file", type=str, required=True, help="Path to the model file") - parser.add_argument("--kenlm_file", type=str, default=None, help="Path to the KenLM file") + parser = argparse.ArgumentParser(description="ASR Demo: Record and Transcribe Audio") + parser.add_argument('--model_file', type=str, required=True, help='Path to the optimized ASR model.') + parser.add_argument('--token_path', type=str, default="assets/tokens.txt", help='Path to the tokens file.') args = parser.parse_args() - asr_engine = SpeechRecognitionEngine(args.model_file, args.kenlm_file) - webbrowser.open_new('http://127.0.0.1:3000/') + main(args) - import uvicorn - uvicorn.run(app, host="127.0.0.1", port=3000) \ No newline at end of file + app.run(host="127.0.0.1", port=8080, debug=True) \ No newline at end of file diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/index.html b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/index.html new file mode 100644 index 0000000..499c248 --- /dev/null +++ b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/index.html @@ -0,0 +1,20 @@ + + + + + + Audio Transcription + + + + + + +
+ [index.html body markup stripped in rendering; per script.js it contains the record button (#recordBtn), the visualizer <main>, the transcription output (#result), and the script include]
+ + + \ No newline at end of file diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/script.js b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/script.js index 5ad936a..6a167a7 100644 --- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/script.js +++ b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/script.js @@ -1,3 +1,14 @@ +let isRecording = false; +let audioVisualizer; +let audioContext = new (window.AudioContext || window.webkitAudioContext)(); +let mediaRecorder; +let audioData = []; +let recordBtn = document.getElementById("recordBtn"); +let resultDisplay = document.getElementById("result"); + + +recordBtn.addEventListener("click", toggleRecording); + class AudioVisualizer { constructor(audioContext, processFrame, processError) { this.audioContext = audioContext; @@ -15,10 +26,9 @@ class AudioVisualizer { connectStream(stream) { this.analyser = this.audioContext.createAnalyser(); const source = this.audioContext.createMediaStreamSource(stream); - console.log(source) source.connect(this.analyser); this.analyser.smoothingTimeConstant = 0.5; - this.analyser.fftSize = 32; + this.analyser.fftSize = 256; // Increased fftSize for better frequency resolution this.initRenderLoop(this.analyser); } @@ -40,60 +50,119 @@ class AudioVisualizer { const visualMainElement = document.querySelector('main'); const visualValueCount = 16; let visualElements; + +const colors = ['#ff6347', '#4682b4', '#32cd32', '#ff1493', '#ffd700', '#8a2be2', '#ff4500', '#00fa9a', '#a52a2a', '#5f9ea0', '#f0e68c', '#dda0dd', '#0000ff', '#ff00ff', '#adff2f', '#c71585']; + const createDOMElements = () => { - let i; - for (i = 0; i < visualValueCount; ++i) { + for (let i = 0; i < visualValueCount; ++i) { const elm = document.createElement('div'); visualMainElement.appendChild(elm); - } + // Assign a color from the predefined array + elm.style.background = colors[i % colors.length]; + } visualElements = document.querySelectorAll('main div'); }; -createDOMElements(); - -const startTranscriptions = () => { - fetch("/start_asr").then(res => res.json()).then(data => console.log(data)); - setInterval(() => { - fetch("/get_audio") - .then(res => res.json()) - .then(data => { - let doc = document.getElementById("transcript"); - if (data.transcript && data.transcript !== "") { - doc.innerText = data.transcript; - } - }) - .catch(error => console.error('Error:', error)); - }, 100); -}; const init = () => { // Creating initial DOM elements const audioContext = new AudioContext(); const initDOM = () => { - visualMainElement.innerHTML = ''; - createDOMElements(); + visualMainElement.innerHTML = ''; + createDOMElements(); }; initDOM(); // Swapping values around for a better visual effect const dataMap = { 0: 15, 1: 10, 2: 8, 3: 9, 4: 6, 5: 5, 6: 2, 7: 1, 8: 0, 9: 4, 10: 3, 11: 7, 12: 11, 13: 12, 14: 13, 15: 14 }; const processFrame = (data) => { - const values = Object.values(data); - let i; - for (i = 0; i < visualValueCount; ++i) { - const value = values[dataMap[i]] / 255; - const elmStyles = visualElements[i].style; - elmStyles.transform = `scaleY( ${value} )`; - elmStyles.opacity = Math.max(.25, value); - } + const values = Object.values(data); + let i; + for (i = 0; i < visualValueCount; ++i) { + const value = values[dataMap[i]] / 255; + const elmStyles = visualElements[i].style; + elmStyles.transform = `scaleY( ${value} )`; + elmStyles.opacity = Math.max(.25, value); + } }; const processError = () => { - 
visualMainElement.classList.add('error'); - visualMainElement.innerText = 'Allow access to your microphone in order to see this demo.'; + visualMainElement.classList.add('error'); + visualMainElement.innerText = 'Please allow access to your microphone'; + } + + const audioVisualizer = new AudioVisualizer(audioContext, processFrame, processError); + +}; + +// Initialize the visualizer and the DOM once on page load +document.addEventListener('DOMContentLoaded', init); + +// Toggle the recording state +function toggleRecording() { + if (isRecording) { + stopRecording(); + } else { + startRecording(); } +} + +function startRecording() { + recordBtn.textContent = "Record"; // Change button text to stop recording + isRecording = true; + audioData = []; // Clear previous audio data + + // Access microphone and initialize both MediaRecorder and Visualizer + navigator.mediaDevices.getUserMedia({ audio: true }) + .then((stream) => { + // Initialize MediaRecorder + mediaRecorder = new MediaRecorder(stream); + mediaRecorder.ondataavailable = (event) => { + audioData.push(event.data); // Collect audio chunks + }; + + mediaRecorder.onstop = () => { + // When recording stops, create the audio blob + const audioBlob = new Blob(audioData, { type: 'audio/wav' }); + transcribeAudio(audioBlob); // Send for transcription + }; + + mediaRecorder.start(); + console.log("Recording started..."); + + }) + .catch((err) => { + console.error("Error accessing microphone: ", err); + alert("Could not access the microphone."); + isRecording = false; + recordBtn.textContent = "Start Recording"; // Reset button text + }); +} + +function stopRecording() { + if (!isRecording || !mediaRecorder) return; + + isRecording = false; + mediaRecorder.stop(); // Stop recording + console.log("Recording stopped."); + + recordBtn.textContent = "Clear"; // Reset button text +} - const a = new AudioVisualizer(audioContext, processFrame, processError); +function transcribeAudio(audioBlob) { + const formData = new FormData(); + formData.append("file", audioBlob, "audio_temp.wav"); // Append the audio blob - startTranscriptions() -}; \ No newline at end of file + fetch("/transcribe/", { + method: "POST", + body: formData, + }) + .then(response => response.json()) + .then(data => { + resultDisplay.textContent = data.transcription; + }) + .catch((err) => { + console.error("Error during transcription:", err); + resultDisplay.textContent = "Error during transcription."; + }); +} \ No newline at end of file diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/style.css b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/style.css index 0359e03..d188ec1 100644 --- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/style.css +++ b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/static/assets/style.css @@ -1,76 +1,80 @@ *, *::before, *::after { - box-sizing: border-box; + font-family: 'Roboto', sans-serif; + box-sizing: border-box; } body, main { - margin: 0; - padding: 0; - min-width: 100%; - min-height: 100vh; - font-family: sans-serif; - text-align: center; - color: #fff; - background: #000; + margin: 0; + padding: 0; + min-width: 100%; + /* min-height: 100vh; */ + text-align: center; + color: #fff; + background: #000; } button { - position: absolute; - left: 50%; - top: 50%; - width: 5em; - height: 2em; - margin-left: -2.5em; - margin-top: -1em; - z-index: 100; - padding: .25em .5em; - color: #fff; - background: #000; - border: 1px solid #fff; - border-radius: 4px; - cursor: 
pointer; - font-size: 1.15em; - font-weight: 200; - box-shadow: 0 0 10px rgba(255, 255, 255, 0.5); - transition: box-shadow .5s; + position: absolute; + left: 50%; + top: 45%; + width: 5em; + height: 2em; + margin-left: -2.5em; + margin-top: -1em; + z-index: 100; + padding: .25em .5em; + color: #fff; + background: #000; + border: 1px solid #fff; + border-radius: 4px; + cursor: pointer; + font-size: 1.15em; + font-weight: 200; + box-shadow: 0 0 10px rgba(255, 255, 255, 0.5); + transition: box-shadow .5s; } button:hover { - box-shadow: 0 0 30px 5px rgba(255, 255, 255, 0.75); + box-shadow: 0 0 30px 5px rgba(255, 255, 255, 0.75); } main { - position: relative; - display: flex; - justify-content: center; - align-items: center; + position: absolute; + top: 30%; + display: flex; + justify-content: center; + width: 100%; + height: 70%; } main>div { - display: inline-block; - width: 3px; - height: 100px; - margin: 0 7px; - background: currentColor; - transform: scaleY(.5); - opacity: .25; + display: inline-block; + width: 1.5px; + height: 100px; + top: 30%; + margin: 0 7px; + background: red; + transform: scaleY(.6); + opacity: .35; } main.error { - color: #f7451d; - min-width: 20em; - max-width: 30em; - margin: 0 auto; - white-space: pre-line; + color: #f7451d; + min-width: 20em; + max-width: 30em; + margin: 0 auto; + white-space: pre-line; } #transcript { - position: fixed; - top: 60%; - width: 100vw; - padding-left: 20vw; - padding-right: 20vw; - font-size: x-large; + position: absolute; + top: 55%; + width: 100vw; + padding-left: 20vw; + padding-right: 20vw; + font-size: x-medium; + color: #fff; } \ No newline at end of file diff --git a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/templates/index.html b/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/templates/index.html deleted file mode 100644 index 7bb4b33..0000000 --- a/src/ASR-with-Speech-Sentiment-Analysis-Text-Summarizer/templates/index.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - Speech Recognition Demo - - - - - - -
- [remaining template markup stripped in rendering; the removed page provided the <main> visualizer and the #transcript display used by the old FastAPI demo]
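
For quick verification, a minimal client sketch (not part of the patch) can POST a recording to the new Flask endpoint. The route "/transcribe/", the "file" form field, the "transcription" response key, and host/port 127.0.0.1:8080 all come from app.py above; the requests dependency, the sample.wav file name, and the model/token paths are illustrative assumptions.

    # Hypothetical smoke test for the /transcribe/ route added in app.py.
    # Assumes the server was started roughly as in the __main__ block:
    #   python app.py --model_file <path-to-model> --token_path assets/tokens.txt
    # and that a local WAV file named sample.wav exists (placeholder name).
    import requests

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            "http://127.0.0.1:8080/transcribe/",             # host/port match app.run() in the patch
            files={"file": ("sample.wav", f, "audio/wav")},  # field name "file" is what the route checks
        )
    resp.raise_for_status()
    # On success the server returns {"transcription": ...}; on failure, {"error": ...}
    print(resp.json().get("transcription", resp.json()))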