Skip to content

Commit

Permalink
fix: Use output samples rather than input samples
Browse files Browse the repository at this point in the history
  • Loading branch information
saattrupdan committed Apr 23, 2024
1 parent 16b2627 commit 645bedc
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions src/foqa/dataset.py
Original file line number Diff line number Diff line change
@@ -31,11 +31,6 @@ def build_dataset(config: DictConfig) -> None:
)
assert isinstance(dataset, Dataset)

num_samples = min(config.num_samples, len(dataset))
if num_samples < config.num_samples:
logger.info(f"Reduced number of samples to the maximal {num_samples:,}.")
dataset = dataset.select(range(num_samples))

records_path = Path(config.dirs.data) / config.dirs.raw / "records.jsonl"
if records_path.exists():
with records_path.open() as f:
@@ -45,6 +40,13 @@ def build_dataset(config: DictConfig) -> None:

with tqdm(dataset, desc=f"Generating samples with {config.model}") as pbar:
for sample in pbar:
if len(records) >= config.num_samples:
logger.info(
f"Reached the target number of samples ({config.num_samples:,}). "
"Stopping."
)
break

sample_exists = any(record["id"] == sample["url"] for record in records)
if sample_exists:
continue
Expand Down

0 comments on commit 645bedc

Please sign in to comment.