add placeholders and unify format
Parry-Parry committed Nov 4, 2024
1 parent 04ece91 commit a7585a6
Showing 4 changed files with 127 additions and 28 deletions.
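The recurring change across cat.py, dot.py and seq2seq.py is a class-level cls_architecture hook: each model wrapper and PyTerrier transformer now declares which transformers Auto* class loads its underlying weights, and from_pretrained / load_state_dict resolve the model through that single attribute instead of hard-coding AutoModel or AutoModelForSequenceClassification. A minimal sketch of the pattern (the Ranker and CrossEncoderRanker names are hypothetical, not from the repository):

from transformers import (AutoConfig, AutoModel,
                          AutoModelForSequenceClassification, AutoTokenizer)

class Ranker:
    # Hypothetical base class illustrating the cls_architecture hook.
    cls_architecture = AutoModel  # subclasses override this with the Auto* class they need

    def __init__(self, model, tokenizer, config):
        self.model = model
        self.tokenizer = tokenizer
        self.config = config

    @classmethod
    def from_pretrained(cls, model_dir_or_name, **kwargs):
        # One loader for every subclass: weights are resolved through cls_architecture.
        config = AutoConfig.from_pretrained(model_dir_or_name)
        model = cls.cls_architecture.from_pretrained(model_dir_or_name, **kwargs)
        tokenizer = AutoTokenizer.from_pretrained(model_dir_or_name)
        return cls(model, tokenizer, config)

class CrossEncoderRanker(Ranker):
    # Swapping the architecture is a one-line override, as in Cat/Dot/Seq2Seq below.
    cls_architecture = AutoModelForSequenceClassification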
3 changes: 2 additions & 1 deletion rankers/modelling/bge.py
@@ -83,10 +83,11 @@ def __init__(
self,
encoder : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
config : DotConfig,
config : BGEConfig,
encoder_d : PreTrainedModel = None,
pooler : Pooler = None,
):
raise NotImplementedError("Incomplete, do not use")
super().__init__(config)
self.encoder = encoder
self.tokenizer = tokenizer
14 changes: 8 additions & 6 deletions rankers/modelling/cat.py
@@ -21,6 +21,7 @@ class Cat(PreTrainedModel):
the configuration for the model
"""
model_architecture = 'Cat'
cls_architecture = AutoModelForSequenceClassification
def __init__(
self,
model: PreTrainedModel,
@@ -50,10 +51,9 @@ def save_pretrained(self, model_dir, **kwargs):
self.model.save_pretrained(model_dir)
self.tokenizer.save_pretrained(model_dir)


def load_state_dict(self, model_dir):
"""Load state dict from a directory"""
return self.model.load_state_dict(AutoModelForSequenceClassification.from_pretrained(model_dir).state_dict())
return self.model.load_state_dict(self.cls_architecture.from_pretrained(model_dir).state_dict())

def to_pyterrier(self) -> "pt.Transformer":
return CatTransformer.from_model(self.model, self.tokenizer, text_field='text')
@@ -62,11 +62,12 @@ def to_pyterrier(self) -> "pt.Transformer":
def from_pretrained(cls, model_dir_or_name : str, num_labels=2):
"""Load model from a directory"""
config = AutoConfig.from_pretrained(model_dir_or_name)
model = AutoModelForSequenceClassification.from_pretrained(model_dir_or_name, num_labels=num_labels)
model = cls.cls_architecture.from_pretrained(model_dir_or_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_dir_or_name)
return cls(model, tokenizer, config)

class CatTransformer(pt.Transformer):
cls_architecture = AutoModelForSequenceClassification
def __init__(self,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
@@ -93,7 +94,7 @@ def from_pretrained(cls,
device : Union[str, torch.device] = None,
verbose : bool = False
):
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path).cuda().eval()
model = cls.cls_architecture.from_pretrained(model_name_or_path).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
config = AutoConfig.from_pretrained(model_name_or_path)
return cls(model, tokenizer, config, batch_size, text_field, device, verbose)
@@ -125,6 +126,7 @@ def transform(self, inp : pd.DataFrame) -> pd.DataFrame:
return pt.model.add_ranks(res)

class PairTransformer(pt.Transformer):
cls_architecture = AutoModelForSequenceClassification
def __init__(self,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
@@ -141,6 +143,7 @@ def __init__(self,
self.batch_size = batch_size
self.text_field = text_field
self.device = device if device is not None else 'cuda' if torch.cuda.is_available() else 'cpu'
self.verbose = verbose

@classmethod
def from_model(cls,
@@ -161,13 +164,12 @@ def from_pretrained(cls,
device : Union[str, torch.device] = None,
verbose : bool = False
):
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
model = cls.cls_architecture.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
config = AutoConfig.from_pretrained(model_name_or_path)
return cls(model, tokenizer, config, batch_size, text_field, device, verbose)

def transform(self, inp : pd.DataFrame) -> pd.DataFrame:
# TODO: Switch this to a pair-wise scoring
scores = []
it = inp[['query', self.text_field]].itertuples(index=False)
if self.verbose:
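For context, a hedged usage sketch of the CatTransformer entry points touched above; the checkpoint name and the toy frame are illustrative, the module path follows the file listing, and from_pretrained moves the model to CUDA, so a GPU is assumed:

import pandas as pd
from rankers.modelling.cat import CatTransformer

# Load a sequence-classification cross-encoder through cls_architecture (placeholder name).
ranker = CatTransformer.from_pretrained('org/some-cat-checkpoint', batch_size=32)

frame = pd.DataFrame([
    {'qid': 'q1', 'query': 'what is dense retrieval', 'docno': 'd1',
     'text': 'Dense retrieval encodes queries and documents as vectors.'},
    {'qid': 'q1', 'query': 'what is dense retrieval', 'docno': 'd2',
     'text': 'A recipe for sourdough bread.'},
])

scored = ranker.transform(frame)  # adds a score column and ranks via pt.model.add_ranks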
18 changes: 10 additions & 8 deletions rankers/modelling/dot.py
@@ -106,6 +106,7 @@ class Dot(PreTrainedModel):
the pooling layer
"""
model_architecture = 'Dot'
cls_architecture = AutoModel
def __init__(
self,
model : PreTrainedModel,
@@ -195,30 +196,31 @@ def save_pretrained(self, model_dir, **kwargs):
def load_state_dict(self, model_dir):
"""Load state dict from a directory"""
self.config = DotConfig.from_pretrained(model_dir)
self.model.load_state_dict(AutoModel.from_pretrained(model_dir).state_dict())
if not self.config.model_tied: self.model_d.load_state_dict(AutoModel.from_pretrained(model_dir + "/model_d").state_dict())
if self.config.use_pooler: self.pooler.load_state_dict(AutoModel.from_pretrained(model_dir + "/pooler").state_dict())
self.model.load_state_dict(self.cls_architecture.from_pretrained(model_dir).state_dict())
if not self.config.model_tied: self.model_d.load_state_dict(self.cls_architecture.from_pretrained(model_dir + "/model_d").state_dict())
if self.config.use_pooler: self.pooler.load_state_dict(self.cls_architecture.from_pretrained(model_dir + "/pooler").state_dict())

@classmethod
def from_pretrained(cls, model_dir_or_name, **kwargs):
"""Load model"""
if os.path.isdir(model_dir_or_name):
config = DotConfig.from_pretrained(model_dir_or_name, **kwargs)
model = AutoModel.from_pretrained(model_dir_or_name)
model = cls.cls_architecture.from_pretrained(model_dir_or_name)
tokenizer = AutoTokenizer.from_pretrained(model_dir_or_name)
model_d = None if config.model_tied else AutoModel.from_pretrained(model_dir_or_name + "/model_d")
model_d = None if config.model_tied else cls.cls_architecture.from_pretrained(model_dir_or_name + "/model_d")
pooler = None if not config.use_pooler else Pooler.from_pretrained(model_dir_or_name + "/pooler")

return cls(model, tokenizer, config, model_d, pooler)
config = DotConfig(model_dir_or_name, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_dir_or_name)
model = AutoModel.from_pretrained(model_dir_or_name)
model = cls.cls_architecture.from_pretrained(model_dir_or_name)
return cls(model, tokenizer, config)

def to_pyterrier(self) -> "DotTransformer":
return DotTransformer.from_model(self, self.tokenizer, text_field='text')

class DotTransformer(pt.Transformer):
cls_architecture = AutoModel
def __init__(self,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
@@ -254,8 +256,8 @@ def from_pretrained(cls,
config = DotConfig.from_pretrained(model_name_or_path)
config.mode = pooling
pooler = None if not config.use_pooler else Pooler.from_pretrained(model_name_or_path+"/pooler")
model_d = None if config.model_tied else AutoModel.from_pretrained(model_name_or_path + "/model_d")
model_q = AutoModel.from_pretrained(model_name_or_path)
model_d = None if config.model_tied else cls.cls_architecture.from_pretrained(model_name_or_path + "/model_d")
model_q = cls.cls_architecture.from_pretrained(model_name_or_path)
model = Dot(model_q, config, model_d, pooler)
return cls(model, AutoTokenizer.from_pretrained(model_name_or_path), config, batch_size, text_field, device, verbose)

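The Dot loading paths above imply a checkpoint layout with the query encoder at the root and optional model_d/ and pooler/ subdirectories. A hedged sketch of loading such a checkpoint (the local path is illustrative; subdirectory names are taken from the diff):

from rankers.modelling.dot import Dot

# Expected layout after save_pretrained:
#   dot-checkpoint/
#     config.json + model weights   <- query encoder, loaded via cls_architecture
#     model_d/                      <- document encoder, only when not config.model_tied
#     pooler/                       <- pooling head, only when config.use_pooler
model = Dot.from_pretrained('dot-checkpoint')  # illustrative local path
retriever = model.to_pyterrier()               # wraps the model as a DotTransformer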
120 changes: 107 additions & 13 deletions rankers/modelling/seq2seq.py
@@ -1,7 +1,7 @@
import pyterrier as pt
if not pt.started():
pt.init()
from transformers import PreTrainedModel, PreTrainedTokenizer, AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from transformers import PreTrainedModel, PreTrainedTokenizer, AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from typing import Union
import torch
import pandas as pd
@@ -23,6 +23,7 @@ class Seq2Seq(PreTrainedModel):
the configuration for the model
"""
model_architecture = 'Seq2Seq'
cls_architecture = AutoModelForSeq2SeqLM
def __init__(
self,
model: AutoModelForSeq2SeqLM,
@@ -60,13 +61,14 @@ def to_pyterrier(self) -> "Seq2SeqTransformer":
return Seq2SeqTransformer.from_model(self.model, self.tokenizer, text_field='text')

@classmethod
def from_pretrained(cls, model_dir_or_name : str, num_labels=2):
def from_pretrained(cls, model_dir_or_name : str, num_labels=2, **kwargs):
"""Load model from a directory"""
config = AutoConfig.from_pretrained(model_dir_or_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir_or_name, num_labels=num_labels)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir_or_name, num_labels=num_labels, **kwargs)
return cls(model, config)

class Seq2SeqTransformer(pt.Transformer):
cls_architecture = AutoModelForSeq2SeqLM
def __init__(self,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
@@ -76,7 +78,8 @@ def __init__(self,
device : Union[str, torch.device] = None,
pos_token : str = 'true',
neg_token : str = 'false',
prompt : str = None
prompt : str = None,
verbose : bool = False
) -> None:
super().__init__()
self.model = model
@@ -88,28 +91,33 @@ def __init__(self,
self.pos_token = self.tokenizer.encode(pos_token)[0]
self.neg_token = self.tokenizer.encode(neg_token)[0]
self.prompt = prompt if prompt is not None else DEFAULT_MONO_PROMPT
self.verbose = verbose

@classmethod
def from_pretrained(cls,
model_name_or_path : str,
batch_size : int = 64,
text_field : str = 'text',
device : Union[str, torch.device] = None
device : Union[str, torch.device] = None,
prompt : str = None,
verbose : bool = False,
**kwargs
):
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = cls.cls_architecture.from_pretrained(model_name_or_path, **kwargs).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
config = AutoConfig.from_pretrained(model_name_or_path)
return cls(model, tokenizer, config, batch_size, text_field, device)
return cls(model, tokenizer, config, batch_size, text_field, device, prompt, verbose=verbose)

@classmethod
def from_model(cls,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
batch_size : int = 64,
text_field : str = 'text',
verbose : bool = False
):
config = model.config
return cls(model, tokenizer, config, batch_size, text_field, model.device)
return cls(model, tokenizer, config, batch_size, text_field, model.device, verbose=verbose)

def transform(self, inp : pd.DataFrame) -> pd.DataFrame:
scores = []
@@ -138,13 +146,14 @@ def __init__(self,
device : Union[str, torch.device] = None,
pos_token : str = 'true',
neg_token : str = 'false',
prompt : str = None
prompt : str = None,
verbose : bool = False
) -> None:
super().__init__(model, tokenizer, config, batch_size, text_field, device, pos_token, neg_token, prompt)
raise NotImplementedError("Incomplete, do not use")
super().__init__(model, tokenizer, config, batch_size, text_field, device, pos_token, neg_token, prompt, verbose)
self.prompt = prompt if prompt is not None else DEFAULT_DUO_PROMPT

def transform(self, inp : pd.DataFrame) -> pd.DataFrame:
# TODO: Fix this mess
scores = []
it = inp[['query', self.text_field]].itertuples(index=False)
if self.verbose:
@@ -167,10 +176,95 @@ class CausalLM(Seq2Seq):
Parameters
----------
model : AutoModelForCausalLM
the model model
the underlying HF model
tokenizer : PreTrainedTokenizer
the tokenizer for the model
config : AutoConfig
the configuration for the model
"""
model_architecture = 'CausalLM'
cls_architecture = AutoModelForCausalLM
def __init__(self, model, tokenizer, config):
raise NotImplementedError("Incomplete, do not use")
super().__init__(model, tokenizer, config)

def prepare_outputs(self, logits):
return logits
raise NotImplementedError

def save_pretrained(self, model_dir, **kwargs):
"""Save model"""
self.config.save_pretrained(model_dir)
self.model.save_pretrained(model_dir)
self.tokenizer.save_pretrained(model_dir)

def load_state_dict(self, model_dir):
"""Load state dict from a directory"""
return self.model.load_state_dict(self.cls_architecture.from_pretrained(model_dir).state_dict())

def to_pyterrier(self) -> "Seq2SeqTransformer":
return CausalLMTransformer.from_model(self.model, self.tokenizer, text_field='text')

@classmethod
def from_pretrained(cls, model_dir_or_name : str, **kwargs):
"""Load model from a directory"""
config = AutoConfig.from_pretrained(model_dir_or_name, **kwargs)
model = cls.cls_architecture.from_pretrained(model_dir_or_name, **kwargs)
return cls(model, config)

class CausalLMTransformer(Seq2SeqTransformer):
cls_architecture = AutoModelForCausalLM
def __init__(self,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
config : AutoConfig,
batch_size : int,
text_field : str = 'text',
device : Union[str, torch.device] = None,
prompt : str = None,
verbose : bool = False
) -> None:
raise NotImplementedError("Incomplete, do not use")
super().__init__(model, tokenizer, config, batch_size, text_field, device, prompt, verbose)

@classmethod
def from_pretrained(cls,
model_name_or_path : str,
batch_size : int = 64,
text_field : str = 'text',
device : Union[str, torch.device] = None,
prompt : str = None,
verbose : bool = False,
**kwargs
):
model = cls.cls_architecture.from_pretrained(model_name_or_path, **kwargs).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
config = AutoConfig.from_pretrained(model_name_or_path)
return cls(model, tokenizer, config, batch_size, text_field, device, prompt, verbose=verbose)

@classmethod
def from_model(cls,
model : PreTrainedModel,
tokenizer : PreTrainedTokenizer,
batch_size : int = 64,
text_field : str = 'text',
verbose : bool = False
):
config = model.config
return cls(model, tokenizer, config, batch_size, text_field, model.device, verbose=verbose)

def transform(self, inp : pd.DataFrame) -> pd.DataFrame:
scores = []
it = inp[['query', self.text_field]].itertuples(index=False)
if self.verbose:
it = pt.tqdm(it, total=len(inp), unit='record', desc='Cat scoring')
with torch.no_grad():
for chunk in chunked(it, self.batch_size):
queries, texts = map(list, zip(*chunk))
prompts = [self.prompt.format(query=q, text=t) for q, t in zip(queries, texts)]
inps = self.tokenizer(prompts, return_tensors='pt', padding=True, truncation=True)
inps = {k: v.to(self.device) for k, v in inps.items()}
scores.append(self.model(**inps).logits[:, 0].cpu().detach().numpy())
res = inp.assign(score=np.concatenate(scores))
res = pt.model.add_ranks(res)
res = res.sort_values(['qid', 'rank'])
return res
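The pos_token / neg_token fields above encode the 'true' / 'false' targets of monoT5-style prompting (DEFAULT_MONO_PROMPT / DEFAULT_DUO_PROMPT). A hedged sketch of how a relevance score is typically derived from those two token logits; this is a standalone helper for illustration, not the classes' actual scoring code, which is truncated in this diff:

import torch

def mono_score(first_position_logits: torch.Tensor, pos_id: int, neg_id: int) -> torch.Tensor:
    # first_position_logits: [batch, vocab_size] logits for the first decoded token.
    # Softmax over just the 'false'/'true' logits, return log P('true' | prompt).
    pair = torch.stack([first_position_logits[:, neg_id],
                        first_position_logits[:, pos_id]], dim=-1)
    return torch.log_softmax(pair, dim=-1)[..., 1]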
