From f5f4cb15cb4384d04d52472d304373cdbec55fd2 Mon Sep 17 00:00:00 2001 From: Benjamin Piwowarski Date: Thu, 7 Dec 2023 09:54:12 +0100 Subject: [PATCH] Move monobert/duobert to xpmir/cross-encoders project --- src/xpmir/papers/duobert/__init__.py | 6 - src/xpmir/papers/duobert/configuration.py | 23 --- src/xpmir/papers/duobert/experiment.py | 153 ----------------- src/xpmir/papers/duobert/normal.yaml | 52 ------ src/xpmir/papers/duobert/small.yaml | 48 ------ src/xpmir/papers/monobert/__init__.py | 13 -- src/xpmir/papers/monobert/configuration.py | 51 ------ src/xpmir/papers/monobert/experiment.py | 185 -------------------- src/xpmir/papers/monobert/finetune.py | 188 --------------------- src/xpmir/papers/monobert/normal.yaml | 37 ---- src/xpmir/papers/monobert/small.yaml | 39 ----- 11 files changed, 795 deletions(-) delete mode 100644 src/xpmir/papers/duobert/__init__.py delete mode 100644 src/xpmir/papers/duobert/configuration.py delete mode 100644 src/xpmir/papers/duobert/experiment.py delete mode 100644 src/xpmir/papers/duobert/normal.yaml delete mode 100644 src/xpmir/papers/duobert/small.yaml delete mode 100644 src/xpmir/papers/monobert/__init__.py delete mode 100644 src/xpmir/papers/monobert/configuration.py delete mode 100644 src/xpmir/papers/monobert/experiment.py delete mode 100644 src/xpmir/papers/monobert/finetune.py delete mode 100644 src/xpmir/papers/monobert/normal.yaml delete mode 100644 src/xpmir/papers/monobert/small.yaml diff --git a/src/xpmir/papers/duobert/__init__.py b/src/xpmir/papers/duobert/__init__.py deleted file mode 100644 index 8d63ef26..00000000 --- a/src/xpmir/papers/duobert/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .. import Experiment - -# Experiments -PAPERS = [ - Experiment("msmarco", "duo-BERT trained on MS-Marco passages", "experiment:cli") -] diff --git a/src/xpmir/papers/duobert/configuration.py b/src/xpmir/papers/duobert/configuration.py deleted file mode 100644 index 0baea94f..00000000 --- a/src/xpmir/papers/duobert/configuration.py +++ /dev/null @@ -1,23 +0,0 @@ -from attrs import Factory -from xpmir.papers.helpers import configuration -from xpmir.papers.monobert.configuration import ( - Monobert, - Learner as BaseLearner, - Retrieval as BaseRetrieval, -) - - -@configuration() -class Retrieval(BaseRetrieval): - base_k: int = 30 - - -@configuration() -class Learner(BaseLearner): - base_validation_top_k: int = 30 - - -@configuration() -class DuoBERT(Monobert): - duobert: Learner = Factory(Learner) - retrieval: Retrieval = Factory(Retrieval) diff --git a/src/xpmir/papers/duobert/experiment.py b/src/xpmir/papers/duobert/experiment.py deleted file mode 100644 index a0dbc966..00000000 --- a/src/xpmir/papers/duobert/experiment.py +++ /dev/null @@ -1,153 +0,0 @@ -# Implementation of the experiments in the paper Multi-Stage Document Ranking -# with BERT (Rodrigo Nogueira, Wei Yang, Kyunghyun Cho, Jimmy Lin). 2019. -# https://arxiv.org/abs/1910.14424 - -# An imitation of examples/msmarco-reranking.py - - -from functools import partial -import logging -from experimaestro.launcherfinder import find_launcher -from xpmir.distributed import DistributedHook -from xpmir.learning.learner import Learner -from xpmir.letor.learner import ValidationListener -from xpmir.learning.optim import TensorboardService - -import xpmir.letor.trainers.pairwise as pairwise -from xpmir.neural.cross import DuoCrossScorer -from experimaestro import experiment, setmeta -from xpmir.learning.batchers import PowerAdaptativeBatcher -from xpmir.papers.cli import paper_command -from xpmir.papers.helpers.samplers import ( - prepare_collection, - msmarco_v1_docpairs_sampler, - msmarco_v1_tests, - msmarco_v1_validation_dataset, -) -from xpmir.text.huggingface import DualDuoBertTransformerEncoder -from xpmir.papers.monobert.experiment import ( - get_retrievers, - run as monobert_run, -) -from xpmir.papers.results import PaperResults -from .configuration import DuoBERT - -logging.basicConfig(level=logging.INFO) - - -def run(xp: experiment, cfg: DuoBERT, tensorboard_service: TensorboardService): - monobert_results = monobert_run(xp, cfg, tensorboard_service) - - launcher_learner = find_launcher(cfg.monobert.requirements) - launcher_evaluate = find_launcher(cfg.retrieval.requirements) - - monobert_scorer = monobert_results.models["monobert-RR@10"] - documents = prepare_collection("irds.msmarco-passage.documents") - device = cfg.device - random = cfg.random - ds_val = msmarco_v1_validation_dataset(cfg.validation) - tests = msmarco_v1_tests() - - # ------Start the code for the duobert - - # Define the trainer for the duobert - duobert_trainer = pairwise.DuoPairwiseTrainer( - lossfn=pairwise.PairwiseLossWithTarget().tag("loss", "duo_logp"), - sampler=msmarco_v1_docpairs_sampler(), - batcher=PowerAdaptativeBatcher(), - batch_size=cfg.duobert.optimization.batch_size, - ) - - # The scorer(model) for the duobert - duobert_scorer: DuoCrossScorer = DuoCrossScorer( - encoder=DualDuoBertTransformerEncoder(trainable=True, dropout=0.1) - ).tag("duo-reranker", "duobert") - - # Validation: we use monoBERT but only keep validation_top_k - # results - - retrievers, model_based_retrievers = get_retrievers(cfg) - - monobert_val_retrievers = partial( - model_based_retrievers, - retrievers=partial(retrievers, k=cfg.duobert.base_validation_top_k), - top_k=cfg.duobert.validation_top_k, - scorer=monobert_scorer, - ) - - val_retriever = model_based_retrievers( - documents, retrievers=monobert_val_retrievers, scorer=duobert_scorer - ) - - # The validation listener evaluates the full retriever - # (retriever + reranker) and keep the best performing model - # on the validation set - validation = ValidationListener( - id="bestval", - dataset=ds_val, - retriever=val_retriever, - validation_interval=cfg.duobert.validation_interval, - metrics={"RR@10": True, "AP": False, "nDCG": False}, - ) - - # The learner for the duobert. - learner = Learner( - # Misc settings - device=device, - random=random, - # How to train the model - trainer=duobert_trainer, - # The model to train - model=duobert_scorer, - # Optimization settings - steps_per_epoch=cfg.duobert.optimization.steps_per_epoch, - optimizers=cfg.duobert.optimization.optimizer, - max_epochs=cfg.duobert.optimization.max_epochs, - # The listeners (here, for validation) - listeners=[validation], - # The hook used for evaluation - hooks=[setmeta(DistributedHook(models=[duobert_scorer]), True)], - use_fp16=True, - ) - - # Submit job and link - outputs = learner.submit(launcher=launcher_learner) - tensorboard_service.add(learner, learner.logpath) - - # Evaluate the neural model on test collections - - monobert_test_retrievers = partial( - model_based_retrievers, - retrievers=partial(retrievers, k=cfg.retrieval.base_k), - top_k=cfg.retrieval.k, - scorer=monobert_scorer, - ) - test_retrievers = partial( - model_based_retrievers, - retrievers=monobert_test_retrievers, - scorer=duobert_scorer, - ) - - for metric_name in validation.monitored(): - model = outputs.listeners["bestval"][metric_name] # type: DuoCrossScorer - tests.evaluate_retriever( - partial( - model_based_retrievers, - scorer=model, - retrievers=test_retrievers, - device=device, - ), - launcher_evaluate, - model_id=f"duobert-{metric_name}", - ) - - return PaperResults( - models={"duobert-RR@10": outputs.listeners["bestval"]["RR@10"]}, - evaluations=tests, - tb_logs={"duobert-RR@10": learner.logpath}, - ) - - -@paper_command(package=__package__, schema=DuoBERT, tensorboard_service=True) -def cli(xp: experiment, cfg: DuoBERT, tensorboard_service: TensorboardService): - return run(xp, cfg, tensorboard_service) diff --git a/src/xpmir/papers/duobert/normal.yaml b/src/xpmir/papers/duobert/normal.yaml deleted file mode 100644 index 846422af..00000000 --- a/src/xpmir/papers/duobert/normal.yaml +++ /dev/null @@ -1,52 +0,0 @@ -id: duobert -title: "DuoBERT trained on MS-Marco" -description: | - DuoBERT model - - R. Nogueira, W. Yang, K. Cho, et J. Lin, « Multi-Stage Document Ranking with BERT », arXiv:1910.14424 [cs], oct. 2019. http://arxiv.org/abs/1910.14424 - - -gpu: true -indexation: - requirements: duration=6 days & cpu(mem=4G, cores=8) - -validation: - size: 500 - -retrieval: - requirements: duration=2 days & cuda(mem=24G) - k: 100 - base_k: 50 - -monobert: - requirements: duration=4 days & cuda(mem=24G) * 2 - - optimization: - steps_per_epoch: 32 - batch_size: 64 - max_epochs: 3200 - num_warmup_steps: 10000 - warmup_min_factor: 0 - lr: 3.0e-6 - weight_decay: .01 - - validation_interval: 32 - -duobert: - requirements: duration=4 days & cuda(mem=24G) * 2 - - optimization: - # Train on 100k iterations - max_epochs: 1_000 - steps_per_epoch: 100 - - # Learning rate warmup over the first 10,000 steps, and linear decay of the learning rate - num_warmup_steps: 10_000 - batch_size: 64 - warmup_min_factor: 0 - lr: 3.0e-6 - - # Validate 20 times over the 3200 epochs - validation_interval: 50 - base_validation_top_k: 1000 - validation_top_k: 50 diff --git a/src/xpmir/papers/duobert/small.yaml b/src/xpmir/papers/duobert/small.yaml deleted file mode 100644 index d94f830c..00000000 --- a/src/xpmir/papers/duobert/small.yaml +++ /dev/null @@ -1,48 +0,0 @@ -id: duobert-small -title: "DuoBERT (debug version)" -description: | - DuoBERT model (debug version) - - R. Nogueira, W. Yang, K. Cho, et J. Lin, « Multi-Stage Document Ranking with BERT », arXiv:1910.14424 [cs], oct. 2019. http://arxiv.org/abs/1910.14424 - - -gpu: true -validation: - size: 10 - -indexation: - requirements: duration=2 days & cpu(mem=4G) - -retrieval: - requirements: duration=2 days & cuda(mem=8G) - k: 20 - base_k: 40 - -monobert: - optimization: - scheduler: false - steps_per_epoch: 32 - batch_size: 16 - max_epochs: 4 - num_warmup_steps: 30 - warmup_min_factor: 0.1 - - validation_interval: 1 - validation_top_k: 20 - requirements: duration=2 days & cuda(mem=8G) - -duobert: - optimization: - scheduler: false - steps_per_epoch: 32 - batch_size: 16 - max_epochs: 16 - num_warmup_steps: 20 - warmup_min_factor: 0 - lr: 3.0e-6 - - validation_interval: 1 - base_validation_top_k: 20 - validation_top_k: 10 - - requirements: duration=2 days & cuda(mem=8G) * 2 diff --git a/src/xpmir/papers/monobert/__init__.py b/src/xpmir/papers/monobert/__init__.py deleted file mode 100644 index 6dcc9da5..00000000 --- a/src/xpmir/papers/monobert/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .. import Experiment - -# Experiments -PAPERS = [ - Experiment( - "msmarco", "mono-BERT trained on MS-Marco passages (v1)", "experiment:cli" - ), - Experiment( - "finetune", - "Finetune monoBERT on a specific dataset (temporary)", - "finetune:cli", - ), -] diff --git a/src/xpmir/papers/monobert/configuration.py b/src/xpmir/papers/monobert/configuration.py deleted file mode 100644 index a6e94a5f..00000000 --- a/src/xpmir/papers/monobert/configuration.py +++ /dev/null @@ -1,51 +0,0 @@ -from attrs import Factory, field -from xpmir.papers import configuration -from xpmir.papers.helpers import LauncherSpecification -from xpmir.papers.helpers.optim import TransformerOptimization -from xpmir.papers.helpers.msmarco import RerankerMSMarcoV1Configuration - - -@configuration() -class Indexation(LauncherSpecification): - requirements: str = "duration=6 days & cpu(cores=8)" - - -@configuration() -class Learner: - validation_interval: int = field(default=32) - validation_top_k: int = 1000 - - optimization: TransformerOptimization = Factory(TransformerOptimization) - requirements: str = "duration=4 days & cuda(mem=24G) * 2" - sample_rate: float = 1.0 - """Sample rate for triplets""" - - sample_max: int = 0 - """Maximum number of samples considered (before shuffling). 0 for no limit.""" - - -@configuration() -class Retrieval: - k: int = 1000 - batch_size: int = 512 - requirements: str = "duration=2 days & cuda(mem=24G)" - - -@configuration() -class Preprocessing: - requirements: str = "duration=12h & cpu(cores=4)" - - -@configuration() -class Monobert(RerankerMSMarcoV1Configuration): - indexation: Indexation = Factory(Indexation) - retrieval: Retrieval = Factory(Retrieval) - - monobert: Learner = Factory(Learner) - preprocessing: Preprocessing = Factory(Preprocessing) - - dev_test_size: int = 0 - """Development test size (0 to leave it like this)""" - - base: str = "bert-base-uncased" - """Identifier for the base model""" diff --git a/src/xpmir/papers/monobert/experiment.py b/src/xpmir/papers/monobert/experiment.py deleted file mode 100644 index 852bbce5..00000000 --- a/src/xpmir/papers/monobert/experiment.py +++ /dev/null @@ -1,185 +0,0 @@ -from functools import partial -import logging - -from xpmir.distributed import DistributedHook -from xpmir.learning.learner import Learner -from xpmir.letor.learner import ValidationListener -import xpmir.letor.trainers.pairwise as pairwise -from xpmir.neural.cross import CrossScorer -from experimaestro import experiment, setmeta -from experimaestro.launcherfinder import find_launcher -from xpmir.learning.batchers import PowerAdaptativeBatcher -from xpmir.learning.optim import ( - TensorboardService, -) -from xpmir.papers.cli import paper_command -from xpmir.rankers.standard import BM25 -from xpmir.text.huggingface import DualTransformerEncoder -from xpmir.papers.results import PaperResults -from xpmir.papers.helpers.samplers import ( - msmarco_v1_docpairs_sampler, - msmarco_v1_tests, - msmarco_v1_validation_dataset, - prepare_collection, -) -from .configuration import Monobert -import xpmir.interfaces.anserini as anserini -from xpmir.rankers import scorer_retriever, RandomScorer - -logging.basicConfig(level=logging.INFO) - - -def get_retrievers(cfg: Monobert): - """Returns retrievers - - - :param cfg: The configuration - :return: A tuple composed of (1) a retriever factory based on the base model - (BM25) and (2) - """ - launcher_index = cfg.indexation.launcher - - base_model = BM25().tag("model", "bm25") - - retrievers = partial( - anserini.retriever, - anserini.index_builder(launcher=launcher_index), - model=base_model, - ) #: Anserini based retrievers - - model_based_retrievers = partial( - scorer_retriever, - batch_size=cfg.retrieval.batch_size, - batcher=PowerAdaptativeBatcher(), - device=cfg.device, - ) #: Model-based retrievers - - return retrievers, model_based_retrievers - - -def run( - xp: experiment, cfg: Monobert, tensorboard_service: TensorboardService -) -> PaperResults: - """monoBERT model""" - - launcher_learner = find_launcher(cfg.monobert.requirements) - launcher_evaluate = find_launcher(cfg.retrieval.requirements) - launcher_preprocessing = find_launcher(cfg.preprocessing.requirements) - device = cfg.device - random = cfg.random - - documents = prepare_collection("irds.msmarco-passage.documents") - ds_val = msmarco_v1_validation_dataset( - cfg.validation, launcher=launcher_preprocessing - ) - - tests = msmarco_v1_tests(cfg.dev_test_size) - - # Setup indices and validation/test base retrievers - retrievers, model_based_retrievers = get_retrievers(cfg) - val_retrievers = partial( - retrievers, store=documents, k=cfg.monobert.validation_top_k - ) - test_retrievers = partial( - retrievers, store=documents, k=cfg.retrieval.k - ) #: Test retrievers - - # Search and evaluate with a random re-ranker - random_scorer = RandomScorer(random=random).tag("scorer", "random") - tests.evaluate_retriever( - partial( - model_based_retrievers, - retrievers=test_retrievers, - scorer=random_scorer, - device=None, - ), - launcher=launcher_preprocessing, - ) - - # Search and evaluate with the base model - tests.evaluate_retriever(test_retrievers, cfg.indexation.launcher) - - # Define the different launchers - - # define the trainer for monobert - monobert_trainer = pairwise.PairwiseTrainer( - lossfn=pairwise.PointwiseCrossEntropyLoss(), - sampler=msmarco_v1_docpairs_sampler( - sample_rate=cfg.monobert.sample_rate, - sample_max=cfg.monobert.sample_max, - launcher=launcher_preprocessing, - ), - batcher=PowerAdaptativeBatcher(), - batch_size=cfg.monobert.optimization.batch_size, - ) - - monobert_scorer: CrossScorer = CrossScorer( - encoder=DualTransformerEncoder( - model_id=cfg.base, trainable=True, maxlen=512, dropout=0.1 - ) - ).tag("scorer", "monobert") - - # The validation listener evaluates the full retriever - # (retriever + scorer) and keep the best performing model - # on the validation set - validation = ValidationListener( - id="bestval", - dataset=ds_val, - retriever=model_based_retrievers( - documents, - retrievers=val_retrievers, - scorer=monobert_scorer, - device=device, - ), - validation_interval=cfg.monobert.validation_interval, - metrics={"RR@10": True, "AP": False, "nDCG": False}, - ) - - # The learner trains the model - learner = Learner( - # Misc settings - device=device, - random=random, - # How to train the model - trainer=monobert_trainer, - # The model to train - model=monobert_scorer, - # Optimization settings - steps_per_epoch=cfg.monobert.optimization.steps_per_epoch, - optimizers=cfg.monobert.optimization.optimizer, - max_epochs=cfg.monobert.optimization.max_epochs, - # The listeners (here, for validation) - listeners=[validation], - # The hook used for evaluation - hooks=[setmeta(DistributedHook(models=[monobert_scorer]), True)], - ) - - # Submit job and link - outputs = learner.submit(launcher=launcher_learner) - tensorboard_service.add(learner, learner.logpath) - - # Evaluate the neural model on test collections - for metric_name in validation.monitored(): - load_model = outputs.listeners[validation.id][metric_name] - tests.evaluate_retriever( - partial( - model_based_retrievers, - scorer=monobert_scorer, - retrievers=test_retrievers, - device=device, - ), - launcher_evaluate, - model_id=f"monobert-{metric_name}", - init_tasks=[load_model], - ) - - return PaperResults( - models={"monobert-RR@10": outputs.listeners[validation.id]["RR@10"]}, - evaluations=tests, - tb_logs={"monobert-RR@10": learner.logpath}, - ) - - -@paper_command(schema=Monobert, package=__package__, tensorboard_service=True) -def cli(xp: experiment, cfg: Monobert, tensorboard_service: TensorboardService): - return run(xp, cfg, tensorboard_service) diff --git a/src/xpmir/papers/monobert/finetune.py b/src/xpmir/papers/monobert/finetune.py deleted file mode 100644 index b7c2f47c..00000000 --- a/src/xpmir/papers/monobert/finetune.py +++ /dev/null @@ -1,188 +0,0 @@ -from functools import partial -import logging - -from xpmir.distributed import DistributedHook -from xpmir.learning.learner import Learner -from xpmir.letor.learner import ValidationListener -import xpmir.letor.trainers.pointwise as pointwise -from xpmir.letor.samplers import PointwiseModelBasedSampler -from xpmir.neural.cross import CrossScorer -from experimaestro import experiment, setmeta -from experimaestro.launcherfinder import find_launcher -from xpmir.learning.batchers import PowerAdaptativeBatcher -from xpmir.learning.optim import ( - TensorboardService, -) -from xpmir.papers.cli import paper_command -from xpmir.rankers.standard import BM25 -from xpmir.text.huggingface import DualTransformerEncoder -from xpmir.papers.results import PaperResults -from xpmir.papers.helpers.samplers import ( - finetuning_validation_dataset, - prepare_collection, - msmarco_v1_tests, -) -from .configuration import Monobert -import xpmir.interfaces.anserini as anserini -from xpmir.rankers import scorer_retriever, RandomScorer - -logging.basicConfig(level=logging.INFO) - - -def get_retrievers(cfg: Monobert): - """Returns retrievers - - - :param cfg: The configuration - :return: A tuple composed of (1) a retriever factory based on the base model - (BM25) and (2) - """ - launcher_index = cfg.indexation.launcher - - base_model = BM25().tag("model", "bm25") - - retrievers = partial( - anserini.retriever, - anserini.index_builder(launcher=launcher_index), - model=base_model, - ) #: Anserini based retrievers - - model_based_retrievers = partial( - scorer_retriever, - batch_size=cfg.retrieval.batch_size, - batcher=PowerAdaptativeBatcher(), - device=cfg.device, - ) #: Model-based retrievers - - return retrievers, model_based_retrievers - - -def run( - xp: experiment, cfg: Monobert, tensorboard_service: TensorboardService -) -> PaperResults: - """monoBERT model""" - - launcher_learner = find_launcher(cfg.monobert.requirements) - launcher_evaluate = find_launcher(cfg.retrieval.requirements) - launcher_preprocessing = find_launcher(cfg.preprocessing.requirements) - device = cfg.device - random = cfg.random - - documents = prepare_collection("irds.msmarco-passage.documents") - - train_dataset = prepare_collection("irds.msmarco-passage.train") - ds_val = finetuning_validation_dataset( - cfg.validation, - dataset_id="irds.msmarco-passage.dev", - launcher=launcher_preprocessing, - ) - - tests = msmarco_v1_tests() - - # Setup indices and validation/test base retrievers - retrievers, model_based_retrievers = get_retrievers(cfg) - train_retrievers = partial(retrievers, store=documents, k=cfg.retrieval.k) - - val_retrievers = partial( - retrievers, store=documents, k=cfg.monobert.validation_top_k - ) - test_retrievers = partial( - retrievers, store=documents, k=cfg.retrieval.k - ) #: Test retrievers - - # Search and evaluate with a random re-ranker - random_scorer = RandomScorer(random=random).tag("scorer", "random") - tests.evaluate_retriever( - partial( - model_based_retrievers, - retrievers=test_retrievers, - scorer=random_scorer, - device=None, - ), - launcher=launcher_preprocessing, - ) - - # Search and evaluate with the base model - tests.evaluate_retriever(test_retrievers, cfg.indexation.launcher) - - train_retrievers = train_retrievers(train_dataset.documents) - - monobert_trainer = pointwise.PointwiseTrainer( - lossfn=pointwise.BinaryCrossEntropyLoss(), - sampler=PointwiseModelBasedSampler( - dataset=train_dataset, retriever=train_retrievers - ), - batcher=PowerAdaptativeBatcher(), - batch_size=cfg.monobert.optimization.batch_size, - ) - - monobert_scorer: CrossScorer = CrossScorer( - encoder=DualTransformerEncoder( - model_id=cfg.base, trainable=True, maxlen=512, dropout=0.1 - ) - ).tag("scorer", "monobert") - - # The validation listener evaluates the full retriever - # (retriever + scorer) and keep the best performing model - # on the validation set - validation = ValidationListener( - id="bestval", - dataset=ds_val, - retriever=model_based_retrievers( - documents, - retrievers=val_retrievers, - scorer=monobert_scorer, - device=device, - ), - validation_interval=cfg.monobert.validation_interval, - metrics={"RR@10": True, "AP": False, "nDCG": False}, - ) - - # The learner trains the model - learner = Learner( - # Misc settings - device=device, - random=random, - # How to train the model - trainer=monobert_trainer, - # The model to train - model=monobert_scorer, - # Optimization settings - steps_per_epoch=cfg.monobert.optimization.steps_per_epoch, - optimizers=cfg.monobert.optimization.optimizer, - max_epochs=cfg.monobert.optimization.max_epochs, - # The listeners (here, for validation) - # listeners=[validation], - listeners=[], - # The hook used for evaluation - hooks=[setmeta(DistributedHook(models=[monobert_scorer]), True)], - ) - - # Submit job and link - outputs = learner.submit(launcher=launcher_learner) - tensorboard_service.add(learner, learner.logpath) - - # Evaluate the neural model on test collections - for metric_name in validation.monitored(): - model = outputs.learned_model # type: CrossScorer - tests.evaluate_retriever( - partial( - model_based_retrievers, - scorer=model, - retrievers=test_retrievers, - device=device, - ), - launcher_evaluate, - model_id=f"monobert-{metric_name}", - ) - - return PaperResults( - models={"monobert-last": outputs.learned_model}, - evaluations=tests, - tb_logs={"monobert-last": learner.logpath}, - ) - - -@paper_command(schema=Monobert, package=__package__, tensorboard_service=True) -def cli(xp: experiment, cfg: Monobert, tensorboard_service: TensorboardService): - return run(xp, cfg, tensorboard_service) diff --git a/src/xpmir/papers/monobert/normal.yaml b/src/xpmir/papers/monobert/normal.yaml deleted file mode 100644 index 3369e9eb..00000000 --- a/src/xpmir/papers/monobert/normal.yaml +++ /dev/null @@ -1,37 +0,0 @@ -id: monobert -title: "monoBERT trained on MS-Marco" -description: | - Passage Re-ranking with BERT (Rodrigo Nogueira, Kyunghyun Cho). 2019. - https://arxiv.org/abs/1901.04085 - - This model has been trained on MsMarco v1 - -gpu: true - -preprocessing: - requirements: duration=6h & cpu(mem=4G, cores=8) - -indexation: - requirements: duration=6h & cpu(mem=4G, cores=8) - -validation: - # Use 500 topics for validation - size: 500 - -monobert: - optimization: - steps_per_epoch: 32 - batch_size: 64 - max_epochs: 3200 - num_warmup_steps: 10000 - warmup_min_factor: 0 - weight_decay: 0.01 - lr: 3.0e-6 - eps: 1.0e-6 - - validation_interval: 400 - requirements: duration=4 days & cuda(mem=24G) * 2 - -retrieval: - requirements: duration=12h & cuda(mem=24G) * 2 - k: 100 diff --git a/src/xpmir/papers/monobert/small.yaml b/src/xpmir/papers/monobert/small.yaml deleted file mode 100644 index b100b23c..00000000 --- a/src/xpmir/papers/monobert/small.yaml +++ /dev/null @@ -1,39 +0,0 @@ -id: monobert-small -title: "monoBERT trained on MS-Marco (debug)" -description: | - Passage Re-ranking with BERT (Rodrigo Nogueira, Kyunghyun Cho). 2019. - https://arxiv.org/abs/1901.04085 - - This model has been trained on MsMarco v1 but only a few iterations (debug) - -gpu: true -base: "microsoft/MiniLM-L12-H384-uncased" -dev_test_size: 50 - -validation: - size: 10 - -indexation: - requirements: duration=1 days & cpu(cores=4) - -retrieval: - requirements: duration=2 days & cuda(mem=8G) - k: 20 - -monobert: - optimization: - scheduler: false - steps_per_epoch: 32 - max_epochs: 4 - batch_size: 16 - num_warmup_steps: 30 - warmup_min_factor: 0.1 - eps: 1.0e-6 - - # Only use 1% of the 100_000 triplets - sample_rate: .01 - sample_max: 100_000 - - validation_interval: 1 - validation_top_k: 20 - requirements: duration=1 days & cuda(mem=8G)