Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Getting empty results from online streaming asr. Please help me!!!!! thanks a lot. #9533

Open
Enkar-Bolat opened this issue Jun 25, 2024 · 0 comments
Assignees
Labels
bug Something isn't working

Comments

@Enkar-Bolat
Copy link

Following is my code. I am running in colab, and i copied some of the code from online streaming asr using microphone.(https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb). And i am getting empty text from the model. i used https://huggingface.co/datasets/speechcolab/gigaspeech/viewer/xl this dataset to fake as streaming dataset.
import time
import torch
import copy
import numpy as np
from datetime import datetime
from logging import Logger
from omegaconf import OmegaConf, open_dict
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE

class Listener:

def __init__(self, ):

    self.init_model()
    self.init_preprocessor()
    self.run()

def init_model(self):
    self.asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name='stt_en_fastconformer_hybrid_large_streaming_multi')
    self.lookahead_size = 80
    self.encoder_step_length = 80
    self.left_context_size = self.asr_model.encoder.att_context_size[0]
    self.asr_model.encoder.set_default_att_context_size([self.left_context_size, int(self.lookahead_size / self.encoder_step_length)])
    self.asr_model.change_decoding_strategy(decoder_type='rnnt')
    self.decoding_cfg = self.asr_model.cfg.decoding
    self.set_decoding_strategy()
    self.asr_model.eval()
    self.cache_last_channel, self.cache_last_time, self.cache_last_channel_len = self.asr_model.encoder.get_initial_cache_state(batch_size=1)
    self.previous_hypotheses = None
    self.pred_out_stream = None
    self.step_num = 0
    self.pre_encode_cache_size = self.asr_model.encoder.streaming_cfg.pre_encode_cache_size[1]
    self.num_channels = self.asr_model.cfg.preprocessor.features
    self.cache_pre_encode = torch.zeros((1, self.num_channels, self.pre_encode_cache_size), device=self.asr_model.device)


def init_preprocessor(self):
    cfg = copy.deepcopy(self.asr_model._cfg)
    OmegaConf.set_struct(cfg.preprocessor, False)

    # some changes for streaming scenario
    cfg.preprocessor.dither = 0.0
    cfg.preprocessor.pad_to = 0
    cfg.preprocessor.normalize = "None"
    
    self.preprocessor = EncDecCTCModelBPE.from_config_dict(cfg.preprocessor)
    self.preprocessor.to(self.asr_model.device)
    
def set_decoding_strategy(self):
    with open_dict(self.decoding_cfg):
        self.decoding_cfg.strategy = "greedy"
        self.decoding_cfg.preserve_alignments = False
        if hasattr(self.asr_model, 'joint'):  # if an RNNT model
            self.decoding_cfg.greedy.max_symbols = 10
            self.decoding_cfg.fused_batch_size = -1
        self.asr_model.change_decoding_strategy(self.decoding_cfg)

def preprocess_audio(self, audio):
    audio = np.frombuffer(audio, dtype=np.int16)
    audio = audio.astype(np.float32) / 32768.0
    audio = np.clip(audio, -1.0, 1.0)
    device = self.asr_model.device
    audio_signal = torch.from_numpy(audio).unsqueeze_(0).to(device)
    audio_signal_len = torch.Tensor([audio.shape[0]]).to(device)
    processed_signal, processed_signal_length = self.preprocessor(
        input_signal=audio_signal, length=audio_signal_len
    )
    return processed_signal, processed_signal_length

def transcribe(self, audio):
    processed_signal, processed_signal_length = self.preprocess_audio(audio)
    processed_signal = torch.cat([self.cache_pre_encode, processed_signal], dim=-1)
    processed_signal_length += self.cache_pre_encode.shape[1]
    self.cache_pre_encode = processed_signal[:, :, -self.pre_encode_cache_size:]
    with torch.no_grad():
        (
            self.pred_out_stream,
            transcribed_texts,
            self.cache_last_channel,
            self.cache_last_time,
            self.cache_last_channel_len,
            self.previous_hypotheses,
        ) = self.asr_model.conformer_stream_step(
            processed_signal=processed_signal,
            processed_signal_length=processed_signal_length,
            cache_last_channel=self.cache_last_channel,
            cache_last_time=self.cache_last_time,
            cache_last_channel_len=self.cache_last_channel_len,
            keep_all_outputs=False,
            previous_hypotheses=self.previous_hypotheses,
            previous_pred_out=self.pred_out_stream,
            drop_extra_pre_encoded=None,
            return_transcription=True,
        )
    
    print(transcribed_texts[0].text)
    print(len(transcribed_texts))
    self.step_num += 1    
def run(self):
  from huggingface_hub import notebook_login
  notebook_login()
  gigaspeech = load_dataset("speechcolab/gigaspeech", "xs", use_auth_token=True,token="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",streaming=True)
  i=0
  while True:
    audio_bytes = next(iter(gigaspeech["test"]))['audio']['array']
    self.transcribe(audio_bytes)
    i+=1
    if i==10:
      break
    # time.sleep(1)

Listener().run()
resullt is 1 and empty line.

@Enkar-Bolat Enkar-Bolat added the bug Something isn't working label Jun 25, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

2 participants