DRAGON release; update dependencies
Note: This commit requires an update to the environment and may break
existing code.
ccsasuke committed Feb 16, 2023
1 parent 4d31084 commit a761800
Showing 82 changed files with 4,966 additions and 143 deletions.
65 changes: 65 additions & 0 deletions README.md
@@ -7,14 +7,79 @@ Scalable implementation of dense retrieval.
- https://github.com/facebookresearch/dpr-scale/tree/main/spar
- [CITADEL: Conditional Token Interaction via Dynamic Lexical Routing for Efficient and Effective Multi-Vector Retrieval](https://arxiv.org/abs/2211.10411)
- https://github.com/facebookresearch/dpr-scale/tree/citadel
- [How to Train Your DRAGON: Diverse Augmentation Towards Generalizable Dense Retrieval](https://arxiv.org/abs/2302.07452)
- https://github.com/facebookresearch/dpr-scale/tree/main/dragon


## Input Data Format (JSONL)
A line-delimited JSON (JSONL) file where each row typically looks like:
```
{
"question": ...,
"positive_ctxs": [
{
"title": "...",
"text": "....",
<optional>
"id": ...,
"relevance": ...
}, {...}, ...
],
"hard_negative_ctxs": [{...}, {...}, ...]
}
```

or
```
{
"question": ...,
"id": ...,
"ctxs": [
{
"has_answer": True or False,
"title": "...",
"text": "....",
<optional>
"id": ...,
"relevance": ...
}, {...}, ...
]
}
```
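Either variant can be parsed line by line with the standard `json` module. Below is a minimal sketch; the function name, file path, and the fallback from `positive_ctxs`/`hard_negative_ctxs` to the `has_answer` flags in `ctxs` are illustrative assumptions, not dpr-scale APIs.
```
import json

def load_examples(path):
    """Yield (question, positives, hard_negatives) per JSONL row (hypothetical helper)."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            ex = json.loads(line)
            question = ex["question"]
            # Variant 1: explicit positive / hard-negative context lists.
            positives = ex.get("positive_ctxs", [])
            hard_negatives = ex.get("hard_negative_ctxs", [])
            # Variant 2: a single "ctxs" list, split on the has_answer flag.
            if not positives and "ctxs" in ex:
                positives = [c for c in ex["ctxs"] if c.get("has_answer")]
                hard_negatives = [c for c in ex["ctxs"] if not c.get("has_answer")]
            yield question, positives, hard_negatives
```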

If your training data is large, you can use a lightweight format that specifies only the line number (`docidx`, starting from 0) of each document in the corpus, without storing its title and text:
```
{
"question": ...,
"positive_ctxs": [
{
"docidx": ..., # denote the position of the passage in the corpus, starting from 0
<optional>
"id": ...,
"relevance": ...
}, {...}, ...
],
"hard_negative_ctxs": [{...}, {...}, ...]
}
```
This format requires you to use `DenseRetrieverMultiJsonlDataModule` and set `--corpus_path` during training; see the example below with the `msmarco_baseline.yaml` config. The corpus follows the default Wiki corpus format, with a header in the first line:
```
id"\t"text"\t"title
<id>"\t"<text>"\t"<title>
...
```
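As a rough illustration of how the lightweight `docidx` entries line up with this corpus file, here is a hedged Python sketch; the file names are placeholders, it assumes `docidx` counts documents after the header row, and the real lookup is done inside `DenseRetrieverMultiJsonlDataModule`.
```
import json

def load_corpus(tsv_path):
    """Read the id<TAB>text<TAB>title corpus into a list, skipping the header."""
    corpus = []
    with open(tsv_path, encoding="utf-8") as f:
        next(f)  # skip the id\ttext\ttitle header line
        for line in f:
            doc_id, text, title = line.rstrip("\n").split("\t", 2)
            corpus.append({"id": doc_id, "text": text, "title": title})
    return corpus

def resolve_ctx(ctx, corpus):
    """Map a lightweight {"docidx": ...} context back to its id, text, and title."""
    return corpus[ctx["docidx"]]

# Hypothetical usage with placeholder file names:
# corpus = load_corpus("collection.tsv")
# with open("train.jsonl", encoding="utf-8") as f:
#     example = json.loads(next(f))
#     print(resolve_ctx(example["positive_ctxs"][0], corpus))
```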

## Training on cluster

By default, training runs locally:

```
PYTHONPATH=.:$PYTHONPATH python dpr_scale/main.py trainer.gpus=1
```

You can try our baseline training example locally on the MS MARCO dataset using the lightweight data format:
```
PYTHONPATH=.:$PYTHONPATH python dpr_scale/main.py -m --config-name msmarco_baseline.yaml
```

### SLURM Training
3 changes: 1 addition & 2 deletions dpr_scale/conf/ccnews_ict.yaml
@@ -29,5 +29,4 @@ trainer:
gradient_clip_val: 2.0
accumulate_grad_batches: 1
precision: 16
plugins: ddp_sharded
accelerator: ddp
strategy: ddp_sharded
3 changes: 1 addition & 2 deletions dpr_scale/conf/config.py
@@ -1,9 +1,8 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# (c) Facebook, Inc. and its affiliates. Confidential and proprietary.

from dataclasses import dataclass, field
from typing import List, Any

# @manual "//github/facebookresearch/hydra:hydra"
from hydra.core.config_store import ConfigStore
from omegaconf import MISSING

3 changes: 1 addition & 2 deletions dpr_scale/conf/convai2.yaml
@@ -30,8 +30,7 @@ trainer:
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
accelerator: ddp
plugins: ddp_sharded
strategy: ddp_sharded

checkpoint_callback:
monitor: valid_accuracy@1
4 changes: 4 additions & 0 deletions dpr_scale/conf/datamodule/generate_multivec_query_emb.yaml
@@ -0,0 +1,4 @@
# @package _global_
datamodule:
_target_: dpr_scale.datamodule.citadel.DenseRetrieverQueriesDataModule
test_batch_size: 128
4 changes: 4 additions & 0 deletions dpr_scale/conf/datamodule/generate_query_emb.yaml
@@ -0,0 +1,4 @@
# @package _global_
datamodule:
_target_: dpr_scale.datamodule.dpr.DenseRetrieverQueriesDataModule
test_batch_size: 128
35 changes: 35 additions & 0 deletions dpr_scale/conf/dragon_aws.yaml
@@ -0,0 +1,35 @@
defaults:
- config
- override trainer: slurm_aws
- override task/optim: adamw

task:
shared_model: false
in_batch_eval: false
optim:
lr: 3e-5
warmup_steps: 10000

datamodule:
_target_: dpr_scale.datamodule.dpr.DenseRetrieverMultiJsonlDataModule
corpus_path: /fsx/s269lin/data/msmarco/msmarco-passage/corpus/collection.tsv
train_path: [/fsx/s269lin/data/msmarco/msmarco-passage/train/uniCOIL.sentence.train.jsonl,/fsx/s269lin/data/msmarco/msmarco-passage/train/uniCOIL.doct5query.train.jsonl]
val_path: /fsx/s269lin/data/msmarco/msmarco-passage/train/dev.jsonl
test_path: /fsx/s269lin/data/msmarco/msmarco-passage/train/dev.jsonl
batch_size: 64
num_negative: 1
pos_ctx_sample: true
num_val_negative: 10
num_test_negative: 50
drop_last: false
use_title: false

trainer:
gpus: 8
num_nodes: 4
max_epochs: 20
num_sanity_val_steps: 0
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
strategy: ddp
3 changes: 1 addition & 2 deletions dpr_scale/conf/dstc7.yaml
@@ -32,8 +32,7 @@ trainer:
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
accelerator: ddp
plugins: ddp_sharded
strategy: ddp_sharded

checkpoint_callback:
monitor: valid_accuracy@1
4 changes: 2 additions & 2 deletions dpr_scale/conf/msmarco.yaml
@@ -6,7 +6,7 @@ defaults:
task:
shared_model: false
in_batch_eval: false
optim:
lr: 2.0e-5
warmup_steps: 10000

@@ -31,4 +31,4 @@ trainer:
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
plugins: ddp_sharded
strategy: ddp_sharded
32 changes: 32 additions & 0 deletions dpr_scale/conf/msmarco_aws.yaml
@@ -0,0 +1,32 @@
defaults:
- config
- override trainer: slurm_aws
- override task/optim: adamw

task:
shared_model: false
in_batch_eval: false
optim:
lr: 2e-5
warmup_steps: 3000

datamodule:
train_path: /fsx/mhli/msmarco_passage/msmarco_exp_train.jsonl
val_path: /fsx/mhli/msmarco_passage/msmarco_exp_dev.jsonl
test_path: /fsx/mhli/msmarco_passage/msmarco_exp_dev.jsonl
batch_size: 16
num_negative: 1
num_val_negative: 1
num_test_negative: 50
drop_last: false
use_title: true

trainer:
gpus: 8
num_nodes: 1
max_epochs: 20
num_sanity_val_steps: 0
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
strategy: ddp
35 changes: 35 additions & 0 deletions dpr_scale/conf/msmarco_baseline.yaml
@@ -0,0 +1,35 @@
defaults:
- config
- override trainer: gpu_1_host
- override task/optim: adamw

task:
shared_model: false
in_batch_eval: false
optim:
lr: 3e-5
warmup_steps: 10000

datamodule:
_target_: dpr_scale.datamodule.dpr.DenseRetrieverMultiJsonlDataModule
corpus_path: /fsx/s269lin/data/msmarco/msmarco-passage/corpus/collection.tsv
train_path: [/fsx/s269lin/data/msmarco/msmarco-passage/train/official_train.jsonl]
val_path: /fsx/s269lin/data/msmarco/msmarco-passage/train/dev.jsonl
test_path: /fsx/s269lin/data/msmarco/msmarco-passage/train/dev.jsonl
batch_size: 8
num_negative: 7
pos_ctx_sample: true
num_val_negative: 10
num_test_negative: 50
drop_last: false
use_title: false

trainer:
gpus: 1
num_nodes: 1
max_epochs: 3
num_sanity_val_steps: 0
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
strategy: ddp
3 changes: 1 addition & 2 deletions dpr_scale/conf/nq.yaml
@@ -28,5 +28,4 @@ trainer:
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
accelerator: ddp
plugins: ddp_sharded
strategy: ddp_sharded
31 changes: 31 additions & 0 deletions dpr_scale/conf/nq_aws.yaml
@@ -0,0 +1,31 @@
defaults:
- config
- override trainer: slurm_aws

task:
shared_model: false
in_batch_eval: false
optim:
lr: 2e-5
warmup_steps: 1237

datamodule:
train_path: /fsx/xilun/dpr_data/retriever/nq-train.jsonl
val_path: /fsx/xilun/dpr_data/retriever/nq-dev.jsonl
test_path: /fsx/xilun/dpr_data/retriever/nq-dev.jsonl
batch_size: 16
num_negative: 1
num_val_negative: 1
num_test_negative: 50
drop_last: false
use_title: true

trainer:
gpus: 8
num_nodes: 1
max_epochs: 40
num_sanity_val_steps: 0
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
strategy: ddp_sharded
2 changes: 1 addition & 1 deletion dpr_scale/conf/nq_roberta.yaml
@@ -29,4 +29,4 @@ trainer:
log_every_n_steps: 10
gradient_clip_val: 2.0
precision: 16
plugins: ddp_sharded
strategy: ddp_sharded
3 changes: 1 addition & 2 deletions dpr_scale/conf/orcas.yaml
@@ -34,5 +34,4 @@ trainer:
gradient_clip_val: 2.0
accumulate_grad_batches: 1
precision: 16
plugins: ddp_sharded
accelerator: ddp
strategy: ddp_sharded
8 changes: 8 additions & 0 deletions dpr_scale/conf/task/cross_encoder_rerank.yaml
@@ -0,0 +1,8 @@
# @package _group_
_target_: dpr_scale.task.cross_encoder_eval_task.RerankCrossEncoderTask
shared_model: false
in_batch_eval: false
# Temporary till we reformat LightningModule
datamodule: null
warmup_steps: null
pretrained_checkpoint_path: null
2 changes: 1 addition & 1 deletion dpr_scale/conf/task/dpr.yaml
@@ -4,5 +4,5 @@ shared_model: false
in_batch_eval: false
# Temporary till we reformat LightningModule
datamodule: null
warmup_steps: null
warmup_steps: 0 # avoid error in inference
pretrained_checkpoint_path:
8 changes: 8 additions & 0 deletions dpr_scale/conf/task/dpr_rerank.yaml
@@ -0,0 +1,8 @@
# @package _group_
_target_: dpr_scale.task.dpr_rerank_task.RerankDenseRetrieverTask
shared_model: false
in_batch_eval: false
# Temporary till we reformat LightningModule
datamodule: null
warmup_steps: null
pretrained_checkpoint_path:
6 changes: 6 additions & 0 deletions dpr_scale/conf/task/model/citadel_model.yaml
@@ -0,0 +1,6 @@
# @package _group_
_target_: dpr_scale.models.citadel_models.citadel_model.CITADELEncoder
model_path: bert-base-uncased
cls_projection_dim: 128
tok_projection_dim: 32
dropout: 0.1
6 changes: 6 additions & 0 deletions dpr_scale/conf/task/model/coil_model.yaml
@@ -0,0 +1,6 @@
# @package _group_
_target_: dpr_scale.models.citadel_models.coil_model.COILEncoder
model_path: bert-base-uncased
projection_dim: 128
cls_projection_dim: 128
dropout: 0.1
5 changes: 5 additions & 0 deletions dpr_scale/conf/task/model/colbert_model.yaml
@@ -0,0 +1,5 @@
# @package _group_
_target_: dpr_scale.models.citadel_models.colbert_model.ColBERTEncoder
model_path: bert-base-uncased
projection_dim: 128
dropout: 0.1
3 changes: 3 additions & 0 deletions dpr_scale/conf/task/model/cross_encoder.yaml
@@ -0,0 +1,3 @@
# @package _group_
_target_: dpr_scale.models.citadel_models.cross_encoder.CrossEncoder
model_path: /fsx/mhli/pre-trained/ms-marco-MiniLM-L-6-v2
4 changes: 4 additions & 0 deletions dpr_scale/conf/task/model/splade_model.yaml
@@ -0,0 +1,4 @@
# @package _group_
_target_: dpr_scale.models.citadel_models.splade_model.SPLADEEncoder
model_path: bert-base-uncased
dropout: 0.1
8 changes: 8 additions & 0 deletions dpr_scale/conf/task/multivec.yaml
@@ -0,0 +1,8 @@
# @package _group_
_target_: dpr_scale.task.citadel_task.MultiVecRetrieverTask
shared_model: false
in_batch_eval: false
# Temporary till we reformat LightningModule
datamodule: null
warmup_steps: null
pretrained_checkpoint_path:
8 changes: 8 additions & 0 deletions dpr_scale/conf/task/multivec_rerank.yaml
@@ -0,0 +1,8 @@
# @package _group_
_target_: dpr_scale.task.citadel_eval_task.RerankMultiVecRetrieverTask
shared_model: false
in_batch_eval: false
# Temporary till we reformat LightningModule
datamodule: null
warmup_steps: null
pretrained_checkpoint_path: null
8 changes: 8 additions & 0 deletions dpr_scale/conf/task/multivec_retrieval.yaml
@@ -0,0 +1,8 @@
# @package _group_
_target_: dpr_scale.task.citadel_retrieval_task.CITADELRetrievalTask
shared_model: false
in_batch_eval: false
# Temporary till we reformat LightningModule
datamodule: null
warmup_steps: null
pretrained_checkpoint_path:
3 changes: 1 addition & 2 deletions dpr_scale/conf/trainer/slurm.yaml
@@ -11,8 +11,7 @@ trainer:
log_every_n_steps: 10
gradient_clip_val: 2.0
accumulate_grad_batches: 1
plugins: ddp_sharded
accelerator: ddp
strategy: ddp_sharded
precision: 16

hydra: