diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
index 06d27d6e..3fbd847c 100644
--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@@ -16,7 +16,7 @@ tensorflow-datasets==4.4.0
 tensorflow-hub==0.12.0
 tensorflow-text==2.8.1
 tensorflow==2.8.1
-tensorboard==2.8.1
+tensorboard==2.8.0
 tokenizers==0.10.3
 torchinfo==0.0.8
 tqdm==4.62.3
diff --git a/binder/requirements.txt b/binder/requirements.txt
index 06d27d6e..3fbd847c 100644
--- a/binder/requirements.txt
+++ b/binder/requirements.txt
@@ -16,7 +16,7 @@ tensorflow-datasets==4.4.0
 tensorflow-hub==0.12.0
 tensorflow-text==2.8.1
 tensorflow==2.8.1
-tensorboard==2.8.1
+tensorboard==2.8.0
 tokenizers==0.10.3
 torchinfo==0.0.8
 tqdm==4.62.3
diff --git a/etc/how-to-run.md b/etc/how-to-run.md
index 1a79b2b2..955009c1 100644
--- a/etc/how-to-run.md
+++ b/etc/how-to-run.md
@@ -11,9 +11,7 @@ After you install miniconda, you need to clone the repository and create a virtu
 ```bash
 git clone http://github.com/microsoft/ai-for-beginners
 cd ai-for-beginners
-cd .devcontainer
-conda env create --name ai4beg --file environment.yml
-cd ..
+conda env create --name ai4beg --file .devcontainer/environment.yml
 conda activate ai4beg
 ```
 
diff --git a/lessons/5-NLP/14-Embeddings/torchnlp.py b/lessons/5-NLP/14-Embeddings/torchnlp.py
index cd709f0d..e8563177 100644
--- a/lessons/5-NLP/14-Embeddings/torchnlp.py
+++ b/lessons/5-NLP/14-Embeddings/torchnlp.py
@@ -23,9 +23,16 @@ def load_dataset(ngrams=1,min_freq=1):
     vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
     return train_dataset,test_dataset,classes,vocab
 
+stoi_hash = {}
 def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
     v = vocab if voc is None else voc
-    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
 
 def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
     optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
diff --git a/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb b/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb
index a621a012..06be3723 100644
--- a/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb
+++ b/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb
@@ -30,21 +30,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Loading dataset...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "d:\\WORK\\ai-for-beginners\\5-NLP\\16-RNN\\data\\train.csv: 29.5MB [00:01, 28.3MB/s] \n",
-      "d:\\WORK\\ai-for-beginners\\5-NLP\\16-RNN\\data\\test.csv: 1.86MB [00:00, 9.72MB/s] \n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "Loading dataset...\n",
       "Building vocab...\n"
      ]
     }
@@ -461,10 +447,10 @@
  ],
  "metadata": {
   "interpreter": {
-   "hash": "0cb620c6d4b9f7a635928804c26cf22403d89d98d79684e4529119355ee6d5a5"
+   "hash": "16af2a8bbb083ea23e5e41c7f5787656b2ce26968575d8763f2c4b17f9cd711f"
  },
  "kernelspec": {
-   "display_name": "py37_pytorch",
+   "display_name": "Python 3.8.12 ('py38')",
   "language": "python",
   "name": "python3"
  },
diff --git a/lessons/5-NLP/16-RNN/torchnlp.py b/lessons/5-NLP/16-RNN/torchnlp.py
index cd709f0d..e8563177 100644
--- a/lessons/5-NLP/16-RNN/torchnlp.py
+++ b/lessons/5-NLP/16-RNN/torchnlp.py
@@ -23,9 +23,16 @@ def load_dataset(ngrams=1,min_freq=1):
     vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
     return train_dataset,test_dataset,classes,vocab
 
+stoi_hash = {}
 def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
     v = vocab if voc is None else voc
-    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
 
 def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
     optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
diff --git a/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py b/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py
index cd709f0d..e8563177 100644
--- a/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py
+++ b/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py
@@ -23,9 +23,16 @@ def load_dataset(ngrams=1,min_freq=1):
     vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
     return train_dataset,test_dataset,classes,vocab
 
+stoi_hash = {}
 def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
     v = vocab if voc is None else voc
-    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
 
 def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
     optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
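
Note on the three identical torchnlp.py hunks: torchtext's Vocab.get_stoi() returns a token-to-index dict, and the old code rebuilt that dict on every encode call; the new module-level stoi_hash memoizes one dict per vocab object, which is the entire speedup here. A minimal standalone sketch of the same caching pattern (names hypothetical; it assumes the vocab object is hashable by identity, which the diff itself relies on by using it as a dict key):

    # Cache the dict produced by get_stoi(), keyed by vocab object,
    # so the conversion runs once per vocab instead of once per call.
    _stoi_cache = {}

    def encode(text, voc, unk=0, tokenizer=str.split):
        stoi = _stoi_cache.get(voc)
        if stoi is None:
            stoi = voc.get_stoi()   # costly: builds a new dict each time
            _stoi_cache[voc] = stoi
        return [stoi.get(tok, unk) for tok in tokenizer(text)]

dict.get() above is the more idiomatic equivalent of the diff's `if v in stoi_hash.keys()` membership test; both are O(1). Since the same torchnlp.py is vendored into the 14-Embeddings, 16-RNN, and 17-GenerativeNetworks lessons, the hunk is applied three times.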