diff --git a/rankers/datasets/dataset.py b/rankers/datasets/dataset.py index d6a4c2a..0b0c12b 100644 --- a/rankers/datasets/dataset.py +++ b/rankers/datasets/dataset.py @@ -51,13 +51,6 @@ def _get_line_by_index(self, idx): f.seek(self.line_offsets[idx]) return json.loads(f.readline()) - def _data_generator(self): - """Generator for reading JSON lines from a compressed or uncompressed file.""" - - with open(self.training_dataset_file, 'r', encoding="utf-8") as f: - for line in f: - yield json.loads(line) - def __post_init__(self): assert self.corpus is not None, "Cannot instantiate a text-based dataset without a lookup" @@ -114,6 +107,7 @@ def __getitem__(self, idx): texts, scores = zip(*sorted(zip(texts, scores), key=lambda x: x[1], reverse=True)) return (query, texts[:self.group_size], scores[:self.group_size]) else: + breakpoint() texts, scores = zip(*random.sample(list(zip(texts, scores)), self.group_size)) return (query, texts, scores) else: