Add DDP (distributed data parallel) multi-GPU training

ChenghaoMou · ChenghaoMou · commit 697d672ed06e · 2021-03-29T12:22:06.000-07:00
diff --git a/README.md b/README.md
@@ -2,7 +2,6 @@
 
 ![PyPI](https://img.shields.io/pypi/v/pytorch-pqrnn?style=plastic) ![Maintenance](https://img.shields.io/maintenance/yes/2021?style=plastic) ![PyPI - License](https://img.shields.io/pypi/l/pytorch-pqrnn?style=plastic)
 
-<<<<<<< HEAD
 ## Installation
 
 ```bash
@@ -15,17 +14,6 @@ poetry install
 ## Environment
 
 Because of [this issue](https://github.com/salesforce/pytorch-qrnn/issues/29), `pytorch-qrnn` is no longer compatible with pytorch and it is also not actively maintained. If you want to use a QRNN layer in this model, you have to install `pytorch-qrnn` with `torch <= 1.4` first.
-=======
-## Note
-
-Because of [this issue](https://github.com/salesforce/pytorch-qrnn/issues/29), [QRNN](https://github.com/salesforce/pytorch-qrnn) is not supported with `torch >= 1.7`. If you want to use a QRNN layer with this repo, please follow the instructions [here](https://github.com/salesforce/pytorch-qrnn) to install `python-qrnn` first with  downgraded `torch <= 1.4`. Otherwise, you can directly run 
-
-```
-pip install -r requirements.txt
-```
-
-to set up the env.
->>>>>>> d83b7c7e27e32583a585d93e463d7f82192622c4
 
 ## Usage
 
@@ -97,7 +85,7 @@ Datasets
 | ------------------------ | ---------- | -------------------------- | ----------------- | --------------------------- | ---------------------------------------------------------------- |
 | ~~PQRNN (this repo)~~<sup>0</sup>    | ~~78K~~    | ~~6.3~~                    | ~~70.4~~          | ~~TODO~~                    | `--b 128 --d 64 --num_layers 4 --rnn_type QRNN`                  |
 | PRNN (this repo)         | 90K        | 5.5                        | **70.7**          | 95.57                       | `--b 128 --d 64 --num_layers 1 --rnn_type GRU`                   |
-| PTransformer (this repo) | 617K       | 10.8                       | 68              | 86.5                        | `--b 128 --d 64 --num_layers 1 --rnn_type Transformer --nhead 2` |
+| PTransformer (this repo) | 618K       | 10.8                       | 68              | 92.4                        | `--b 128 --d 64 --num_layers 1 --rnn_type Transformer --nhead 8` |
 | PRADO<sup>1</sup>        | 175K       |                            | 65.9              |                             |                                                                  |
 | BERT                     | 335M       | **1.81**                   | 70.58             | **98.856**<sup>2</sup>      |                                                                  |
 0.  Not supported with `torch >= 1.7`
diff --git a/pytorch_pqrnn/model.py b/pytorch_pqrnn/model.py
@@ -15,11 +15,8 @@
     import pytorch_lightning as pl
     import torch
     import torch.nn as nn
-    from pytorch_lightning.metrics.functional import f1_score
-    from pytorch_lightning.metrics.functional.classification import (
-        accuracy,
-        auroc,
-    )
+    from pytorch_lightning.metrics.functional import accuracy, auroc
+    from pytorch_lightning.metrics.functional import f1 as f1_score
     from torch.nn import TransformerEncoder, TransformerEncoderLayer
     from torch.optim.lr_scheduler import ReduceLROnPlateau
 
@@ -184,7 +181,14 @@ def validation_epoch_end(self, outputs):
                 "val_auroc",
                 np.mean(
                     [
-                        auroc(logits[:, i], labels[:, i]).detach().cpu().item()
+                        auroc(
+                            torch.sigmoid(logits[:, i]),
+                            labels[:, i],
+                            pos_label=1,
+                        )
+                        .detach()
+                        .cpu()
+                        .item()
                         for i in range(logits.shape[1])
                     ]
                 ),
diff --git a/run.py b/run.py
@@ -3,6 +3,7 @@
 import torch
 from pytorch_lightning import loggers as pl_loggers
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
+from pytorch_lightning.plugins import DeepSpeedPlugin
 from pytorch_pqrnn.dataset import create_dataloaders
 from pytorch_pqrnn.model import PQRNN
 from rich.console import Console
@@ -46,6 +47,36 @@ def train(
     data_path: str,
 ):
 
+    deepspeed_config = {
+        "zero_allow_untested_optimizer": True,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": lr,
+                "betas": [0.998, 0.999],
+                "eps": 1e-5,
+                "weight_decay": 1e-9,
+            },
+        },
+        "scheduler": {
+            "type": "WarmupLR",
+            "params": {
+                "last_batch_iteration": -1,
+                "warmup_min_lr": 0,
+                "warmup_max_lr": 3e-5,
+                "warmup_num_steps": 100,
+            },
+        },
+        "zero_optimization": {
+            "stage": 2,  # Enable Stage 2 ZeRO (Optimizer/Gradient state partitioning)
+            "cpu_offload": True,  # Enable Offloading optimizer state/calculation to the host CPU
+            "contiguous_gradients": True,  # Reduce gradient fragmentation.
+            "overlap_comm": True,  # Overlap reduce/backward operation of gradients for speed.
+            "allgather_bucket_size": 2e8,  # Number of elements to all gather at once.
+            "reduce_bucket_size": 2e8,  # Number of elements we reduce/allreduce at once.
+        },
+    }
+
     train_dataloader, dev_dataloader = create_dataloaders(
         task,
         batch_size=batch_size,
@@ -69,16 +100,18 @@ def train(
 
     trainer = pl.Trainer(
         logger=pl_loggers.TensorBoardLogger("lightning_logs", log_graph=False),
-        callbacks=[EarlyStopping(monitor="val_loss", patience=10)],
+        callbacks=[EarlyStopping(monitor="val_loss", patience=5)],
         checkpoint_callback=ModelCheckpoint(
             "./checkpoints/", monitor="val_loss"
         ),
         min_epochs=2,
         deterministic=True,
         val_check_interval=0.2,
-        gpus=[0] if torch.cuda.is_available() else None,
+        gpus=list(range(torch.cuda.device_count()))
+        if torch.cuda.is_available()
+        else None,
         gradient_clip_val=1.0,
-        plugins="deepspeed" if torch.cuda.is_available() else None,
+        accelerator="ddp" if torch.cuda.is_available() else None,
         precision=16 if torch.cuda.is_available() else 32,
         accumulate_grad_batches=2 if rnn_type == "Transformer" else 1,
     )