Update environment and torchnlp.py
shwars committed May 25, 2022 · 1 parent bfb93b0 · commit de581e2
Showing 7 changed files with 30 additions and 25 deletions.
.devcontainer/requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ tensorflow-datasets==4.4.0
 tensorflow-hub==0.12.0
 tensorflow-text==2.8.1
 tensorflow==2.8.1
-tensorboard==2.8.1
+tensorboard==2.8.0
 tokenizers==0.10.3
 torchinfo==0.0.8
 tqdm==4.62.3
binder/requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ tensorflow-datasets==4.4.0
 tensorflow-hub==0.12.0
 tensorflow-text==2.8.1
 tensorflow==2.8.1
-tensorboard==2.8.1
+tensorboard==2.8.0
 tokenizers==0.10.3
 torchinfo==0.0.8
 tqdm==4.62.3
etc/how-to-run.md (4 changes: 1 addition & 3 deletions)
@@ -11,9 +11,7 @@ After you install miniconda, you need to clone the repository and create a virtual environment:
 ```bash
 git clone http://github.com/microsoft/ai-for-beginners
 cd ai-for-beginners
-cd .devcontainer
-conda env create --name ai4beg --file environment.yml
-cd ..
+conda env create --name ai4beg --file .devcontainer/environment.yml
 conda activate ai4beg
 ```

lessons/5-NLP/14-Embeddings/torchnlp.py (9 changes: 8 additions & 1 deletion)
@@ -23,9 +23,16 @@ def load_dataset(ngrams=1,min_freq=1):
     vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
     return train_dataset,test_dataset,classes,vocab
 
+stoi_hash = {}
 def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
     v = vocab if voc is None else voc
-    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
 
 def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
     optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
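
The new `encode` above memoizes the vocabulary's string-to-index mapping: in the torchtext API this repo uses, each call to `get_stoi()` materializes the whole mapping as a Python dict, so fetching it anew for every sample is wasteful, and the commit caches the dict per vocab object instead. Below is a minimal, self-contained sketch of the same pattern; the toy corpus, default arguments, and printed indices are illustrative only, not part of the commit.

```python
# Standalone sketch of the caching pattern introduced above (assumes the same
# torchtext vocab/tokenizer API the repo uses; the toy corpus is made up).
import collections
import torchtext

tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
counter = collections.Counter(tokenizer('the quick brown fox jumps over the lazy dog'))
vocab = torchtext.vocab.vocab(counter, min_freq=1)

stoi_hash = {}  # cache: vocab object -> its string-to-index dict

def encode(x, voc=vocab, unk=0, tokenizer=tokenizer):
    if voc in stoi_hash:           # plain `in` on a dict suffices (no .keys() needed)
        stoi = stoi_hash[voc]
    else:
        stoi = voc.get_stoi()      # built once per vocab...
        stoi_hash[voc] = stoi      # ...and reused on every subsequent call
    return [stoi.get(s, unk) for s in tokenizer(x)]

print(encode('the quick fox'))    # -> [0, 1, 3] with this corpus's insertion order
```

Keying the cache on the vocab object itself means a rebuilt vocabulary simply gets its own entry; the stale dict lingers until its vocab is dropped, a reasonable trade-off for a teaching module that holds one or two vocabularies.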
lessons/5-NLP/16-RNN/RNNPyTorch.ipynb (20 changes: 3 additions & 17 deletions)
@@ -30,21 +30,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Loading dataset...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "d:\\WORK\\ai-for-beginners\\5-NLP\\16-RNN\\data\\train.csv: 29.5MB [00:01, 28.3MB/s] \n",
-      "d:\\WORK\\ai-for-beginners\\5-NLP\\16-RNN\\data\\test.csv: 1.86MB [00:00, 9.72MB/s] \n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "Loading dataset...\n",
      "Building vocab...\n"
     ]
    }
@@ -461,10 +447,10 @@
  ],
  "metadata": {
   "interpreter": {
-   "hash": "0cb620c6d4b9f7a635928804c26cf22403d89d98d79684e4529119355ee6d5a5"
+   "hash": "16af2a8bbb083ea23e5e41c7f5787656b2ce26968575d8763f2c4b17f9cd711f"
   },
   "kernelspec": {
-   "display_name": "py37_pytorch",
+   "display_name": "Python 3.8.12 ('py38')",
    "language": "python",
    "name": "python3"
   },
lessons/5-NLP/16-RNN/torchnlp.py (9 changes: 8 additions & 1 deletion)
@@ -23,9 +23,16 @@ def load_dataset(ngrams=1,min_freq=1):
     vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
     return train_dataset,test_dataset,classes,vocab
 
+stoi_hash = {}
 def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
     v = vocab if voc is None else voc
-    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
 
 def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
     optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
lessons/5-NLP/17-GenerativeNetworks/torchnlp.py (9 changes: 8 additions & 1 deletion)
@@ -23,9 +23,16 @@ def load_dataset(ngrams=1,min_freq=1):
     vocab = torchtext.vocab.vocab(counter, min_freq=min_freq)
     return train_dataset,test_dataset,classes,vocab
 
+stoi_hash = {}
 def encode(x,voc=None,unk=0,tokenizer=tokenizer):
+    global stoi_hash
     v = vocab if voc is None else voc
-    return [v.get_stoi().get(s,unk) for s in tokenizer(x)]
+    if v in stoi_hash.keys():
+        stoi = stoi_hash[v]
+    else:
+        stoi = v.get_stoi()
+        stoi_hash[v]=stoi
+    return [stoi.get(s,unk) for s in tokenizer(x)]
 
 def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
     optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
