k2-fsa · glynpu · Mar 25, 2021 · Mar 29, 2021 · Mar 29, 2021 · Mar 30, 2021
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,14 @@
+[flake8]
+show-source=true
+statistics=true
+max-line-length=80
+exclude =
+  .git,
+
+ignore =
+  # E127 continuation line over-indented for visual indent
+  E127,
+  # F401, module imported but unused
+  F401,
+  # W504, line break after binary operator
+  W504,
diff --git a/egs/librispeech/asr/nnlm/local/common.py b/egs/librispeech/asr/nnlm/local/common.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2020  Xiaomi Corporation (author: Liyong Guo)
+# Apache 2.0
+
+# Modified from
+# https://github.com/k2-fsa/snowfall/blob/master/snowfall/common.py
+# to save/load non-acoustic models.
+import logging
+import os
+import torch
+
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+# A filesystem path given either as a plain string or as a pathlib.Path.
+Pathlike = Union[str, Path]
+# Optional dict of extra entries; save_checkpoint merges it into the saved
+# checkpoint when it is not None.
+Info = Union[dict, None]
+
+
+def load_checkpoint(filename: Pathlike,
+                    model: torch.nn.Module,
+                    info: Info = None) -> Dict[str, Any]:
+    '''Restore the weights of a model from a checkpoint file.
+
+    Args:
+        filename: path of the checkpoint to read.
+        model: module that receives the stored 'state_dict' entry.
+        info: not used by this function.
+
+    Returns:
+        The whole checkpoint object as loaded (mapped to CPU), so callers
+        can read any entry stored next to 'state_dict'.
+    '''
+    logging.info(f'load checkpoint from {filename}')
+
+    ckpt = torch.load(filename, map_location='cpu')
+    weights = ckpt['state_dict']
+    model.load_state_dict(weights)
+
+    return ckpt
+
+
+def save_checkpoint(filename: Pathlike,
+                    model: torch.nn.Module,
+                    info: Info = None) -> None:
+    '''Write the weights of a model, plus optional extras, to a file.
+
+    Args:
+        filename: destination path; its parent directory is created when
+            it does not exist yet.
+        model: module whose state_dict() is stored under 'state_dict'.
+        info: optional dict merged into the saved checkpoint. An entry
+            named 'state_dict' in it replaces the weights of the model.
+    '''
+    parent_dir = os.path.dirname(filename)
+    if not os.path.exists(parent_dir):
+        Path(parent_dir).mkdir(parents=True, exist_ok=True)
+    logging.info(f'Save checkpoint to {filename}')
+
+    payload = {'state_dict': model.state_dict()}
+    if info is not None:
+        payload.update(info)
+
+    torch.save(payload, filename)
diff --git a/egs/librispeech/asr/nnlm/local/dataset.py b/egs/librispeech/asr/nnlm/local/dataset.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2020  Xiaomi Corporation (author: Liyong Guo)
+# Apache 2.0
+
+import time
+from torch.utils.data import Dataset, DataLoader
+from torch.nn.utils.rnn import pad_sequence
+from typing import List
+
+import numpy as np
+import os
+import torch
+
+
+class CollateFunc(object):
+    '''Collate function for LMDataset.
+
+    Pads a batch of token id sequences to a common length and derives the
+    (input, target) pair used for next-token prediction.
+    '''
+
+    def __init__(self, pad_index=0):
+        # pad_index should be identical to ignore_index of torch.nn.NLLLoss
+        self.pad_index = pad_index
+
+    def __call__(self, batch: List[List[int]]):
+        '''Turn a ragged batch into two padded 2-d tensors.
+
+           batch can be viewed as a ragged 2-d array in which each row is
+           one tokenized text, whose format is:
+           <bos_id> token_id token_id token_id *** <eos_id>
+
+           Returns (xs_pad, ys_pad): the padded batch without its last
+           column and the padded batch without its first column.
+        '''
+        sequences = [torch.from_numpy(np.array(ids)).long() for ids in batch]
+        padded = pad_sequence(sequences,
+                              batch_first=True,
+                              padding_value=self.pad_index)
+        # Targets are the inputs shifted left by one position.
+        inputs, targets = padded[:, :-1], padded[:, 1:]
+        return inputs, targets
+
+
+class LMDataset(Dataset):
+    '''In-memory dataset of tokenized text for Language Model train/dev.
+
+    Every line of the input file is one utterance written as whitespace
+    separated integer token ids. Lines with two or fewer ids are dropped.
+    '''
+
+    def __init__(self, text_file: str):
+        '''Read text_file and keep all usable examples in self.data.
+
+        Args:
+            text_file: text file, token ids for one utt per line.
+        '''
+        assert os.path.exists(
+            text_file
+        ), "text_file: {} does not exist, please check that.".format(text_file)
+        # TODO(Liyong Guo): add bos_id and eos_id to each piece of example
+        # then each valid example should be longer than 2
+        examples = []
+        with open(text_file, 'r') as f:
+            for line in f:
+                ids = list(map(int, line.split()))
+                if len(ids) <= 2:
+                    continue
+                examples.append(ids)
+        self.data = examples
+
+    def __len__(self):
+        '''Number of examples kept.'''
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        '''Token id list of the idx-th example.'''
+        return self.data[idx]
+
+
+if __name__ == '__main__':
+    # Manual smoke test: print a few padded (input, target) batches built
+    # from the dev set.
+    dev_file = "./data/nnlm/text/dev.txt.tokens"
+    loader = DataLoader(LMDataset(dev_file),
+                        batch_size=2,
+                        shuffle=True,
+                        num_workers=0,
+                        collate_fn=CollateFunc())
+    for batch in loader:
+        xs, ys = batch
+        print(xs)
+        print(ys)
+        print(batch)
diff --git a/egs/librispeech/asr/nnlm/local/generate_lexicon.py b/egs/librispeech/asr/nnlm/local/generate_lexicon.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2020  Xiaomi Corporation (author: Liyong Guo)
+# Apache 2.0
+
+import argparse
+import collections
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+from tokenizers import decoders
+
+
+def get_args():
+    '''Parse the command line options of this script.
+
+    Returns:
+        argparse.Namespace with the attributes lexicon_path,
+        tokenizer_path and train_file (all strings).
+    '''
+    arg_parser = argparse.ArgumentParser(
+        description='generate words.txt tokens.txt and lexicon.txt')
+    # (flag, default value, help text); every option is a string.
+    options = (
+        ('--lexicon-path', 'data/nnlm/lexicon',
+         "path to save lexicon files"),
+        ('--tokenizer-path', './data/lm_train/tokenizer-librispeech.json',
+         "path to load tokenizer"),
+        ('--train-file', 'data/nnlm/text/librispeech.txt',
+         """file to be tokenized"""),
+    )
+    for flag, default_value, help_text in options:
+        arg_parser.add_argument(flag,
+                                type=str,
+                                default=default_value,
+                                help=help_text)
+    return arg_parser.parse_args()
+
+
+def generate_tokens(args):
+    ''' Extract symbols and their corresponding ids from a tokenizer,
+        and save them as tokens.txt under args.lexicon_path.
+        Symbols are lower-cased and written in increasing id order.
+        An example file looks like:
+        a 1
+        b 2
+        c 3
+        ...
+        it 100
+        sh 101
+
+        Args:
+            args: namespace providing tokenizer_path (tokenizer file to
+                load) and lexicon_path (existing output directory).
+    '''
+
+    tokenizer = Tokenizer.from_file(args.tokenizer_path)
+    symbols = tokenizer.get_vocab()
+    id2sym = {idx: sym.lower() for sym, idx in symbols.items()}
+    tokens_file = '{}/tokens.txt'.format(args.lexicon_path)
+    # The context manager closes the file even when the assertion below
+    # fails part-way through.
+    with open(tokens_file, 'w') as tokens_f:
+        for idx in range(len(symbols)):
+            # ids are expected to be contiguous: 0 .. len(symbols) - 1
+            assert idx in id2sym
+            tokens_f.write('{} {}\n'.format(id2sym[idx], idx))
+
+
+def generate_lexicon(args, words):
+    ''' Tokenize every word in words and save the result as lexicon.txt.
+        Each line represents a word and its tokenized representation,
+        i.e. a sequence of tokens. A word and its tokens are separated
+        by a tab. Special words (e.g. <eps>, !SIL, <UNK>) are not
+        tokenized; they are mapped to [unk].
+
+        An example file looks like:
+
+        abbreviating	abb ##re ##via ##ting
+        abbreviation	abb ##re ##via ##t ##ion
+        abbreviations	abb ##re ##via ##t ##ions
+
+        Args:
+            args: namespace providing tokenizer_path (tokenizer file to
+                load) and lexicon_path (existing output directory).
+            words: iterable of words to put into the lexicon.
+    '''
+    special_words = {
+        '<eps>', '!SIL', '<SPOKEN_NOISE>', '<UNK>', '<s>', '</s>', '#0'
+    }
+    lexicon_file = '{}/lexicon.txt'.format(args.lexicon_path)
+    # Load the tokenizer before opening the output, so a bad tokenizer path
+    # does not leave an empty lexicon.txt behind.
+    tokenizer = Tokenizer.from_file(args.tokenizer_path)
+    tokenizer.decoder = decoders.WordPiece()
+    with open(lexicon_file, 'w') as lf:
+        for word in words:
+            # Match both spellings: some special words are upper case
+            # (!SIL), others lower case (<eps>, <s>).
+            if (word.upper() in special_words or
+                    word.lower() in special_words):
+                tokens = '[unk]'
+            else:
+                tokens = ' '.join(tokenizer.encode(word).tokens)
+            lf.write("{}\t{}\n".format(word.lower(), tokens.lower()))
+
+
+def load_words(args):
+    '''Read the word column of words.txt found under args.lexicon_path.
+
+    Only the first whitespace-separated field of every line is used, and
+    it is lower-cased. Blank lines are skipped.
+
+    Args:
+        args: namespace providing lexicon_path.
+
+    Returns:
+        List of lower-cased words in file order.
+    '''
+    words_file = '{}/words.txt'.format(args.lexicon_path)
+
+    words = []
+    with open(words_file) as f:
+        for line in f:
+            fields = line.split()
+            # A blank (or whitespace-only) line has no word to read.
+            if not fields:
+                continue
+            words.append(fields[0].lower())
+
+    return words
+
+
+def main():
+    '''Write tokens.txt, then build lexicon.txt from the words.txt words.'''
+    args = get_args()
+    generate_tokens(args)
+    generate_lexicon(args, load_words(args))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py b/egs/librispeech/asr/nnlm/local/huggingface_tokenizer.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# Copyright (c)  2020  Xiaomi Corporation (author: Liyong Guo)
+# Apache 2.0
+
+# reference: https://huggingface.co/docs/tokenizers/python/latest/quicktour.html
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+from tokenizers import Tokenizer
+from tokenizers.models import WordPiece
+from tokenizers import normalizers
+from tokenizers.normalizers import Lowercase, NFD, StripAccents
+from tokenizers.pre_tokenizers import Whitespace
+from tokenizers.trainers import WordPieceTrainer
+from tokenizers import decoders
+
+
+def get_args():
+    '''Parse the command line options of this script.
+
+    Returns:
+        argparse.Namespace with the attributes train_file, vocab_size,
+        tokenizer_path and test_file. Options that are not given are None,
+        except vocab_size which falls back to 10000.
+    '''
+    arg_parser = argparse.ArgumentParser(
+        description='train and tokenize with huggingface tokenizer')
+    # (flag, value type, default value, help text)
+    options = (
+        ('--train-file', str, None, """file to train tokenizer"""),
+        ('--vocab-size', int, 10000,
+         """number of tokens of the tokenizer"""),
+        ('--tokenizer-path', str, None, "path to save or load tokenizer"),
+        ('--test-file', str, None, """file to be tokenized"""),
+    )
+    for flag, value_type, default_value, help_text in options:
+        arg_parser.add_argument(flag,
+                                type=value_type,
+                                default=default_value,
+                                help=help_text)
+    return arg_parser.parse_args()
+
+
+def train_tokenizer(train_files, save_path, vocab_size):
+    '''Train a WordPiece tokenizer and save it to save_path.
+
+    Text is normalized with NFD, lower-casing and accent stripping, and is
+    pre-tokenized on whitespace. '[UNK]' is the only special token.
+
+    Args:
+        train_files: list of text files to train on.
+        save_path: file the trained tokenizer is written to. A file that
+            already exists there is first moved to '<save_path>.bk'.
+        vocab_size: vocabulary size handed to the trainer.
+    '''
+    if os.path.exists(save_path):
+        logging.warning(
+            "{} already exists. Backing up that.".format(save_path))
+        # The backup needs a name of its own: moving the file onto itself
+        # would keep nothing, and the save below would overwrite it.
+        shutil.move(save_path, '{}.bk'.format(save_path))
+    else:
+        Path(os.path.dirname(save_path)).mkdir(parents=True, exist_ok=True)
+
+    tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
+    tokenizer.normalizer = normalizers.Sequence(
+        [NFD(), Lowercase(), StripAccents()])
+    tokenizer.pre_tokenizer = Whitespace()
+
+    # default vocab_size=30000
+    trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=['[UNK]'])
+    tokenizer.train(train_files, trainer)
+    tokenizer.save(save_path)
+
+
+def tokenize_text(test_file, tokenizer_path):
+    '''
+    tokenize text
+    input format looks like:
+        BOY IS BETTER UNBORN THAN
+        BRAVE OFFICER
+
+
+    output format looks like:
+        355 127 794 4824 346 370
+        1330 1898
+
+    The result is written to '<test_file>.tokens'; a result file that is
+    already there is first moved to '<test_file>.tokens.bk'. Lines that
+    produce no token ids are left out of the output.
+
+    Args:
+        test_file: text file to tokenize, one sentence per line.
+        tokenizer_path: tokenizer file to load. When it does not exist a
+            warning is logged and nothing is written.
+    '''
+    if not os.path.exists(tokenizer_path):
+        logging.warning("Tokenizer {} does not exist.".format(tokenizer_path))
+        return
+    tokenizer = Tokenizer.from_file(tokenizer_path)
+    tokenizer.decoder = decoders.WordPiece()
+    tokenized_file = "{}.tokens".format(test_file)
+    if os.path.exists(tokenized_file):
+        logging.warning(
+            "The input file seems already tokenized. Buckupping previous result"
+        )
+        shutil.move(tokenized_file, "{}.bk".format(tokenized_file))
+    logging.warning("Tokenizing {}.".format(test_file))
+    # Open the input first, so a missing input does not leave an empty
+    # output file behind; both handles are closed even if encoding fails.
+    with open(test_file) as fin, open(tokenized_file, 'w') as fout:
+        for line in fin:
+            output = tokenizer.encode(line.strip())
+            if len(output.ids) > 0:
+                fout.write(' '.join(str(i) for i in output.ids) + '\n')
+
+
+def main():
+    '''Train a tokenizer and/or tokenize a file, depending on the options.
+
+    Training runs when --train-file is given and tokenizing runs when
+    --test-file is given; with both, training happens first.
+    '''
+    args = get_args()
+
+    wants_training = args.train_file is not None
+    if wants_training:
+        train_tokenizer([args.train_file], args.tokenizer_path,
+                        args.vocab_size)
+
+    wants_tokenizing = args.test_file is not None
+    if wants_tokenizing:
+        tokenize_text(args.test_file, args.tokenizer_path)
+
+
+if __name__ == '__main__':
+    main()