-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
35 lines (32 loc) · 1.12 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from datasets import load_dataset
class Dataset:
    """Loader for a pre-chunked arXiv-style dataset hosted on Hugging Face Hub.

    `get_dataset()` reshapes each record into the three fields commonly used
    for vector-store ingestion: a unique ``id``, the chunk ``text``, and a
    nested ``metadata`` dict.

    NOTE(review): this class shadows ``datasets.Dataset`` — consider renaming
    (e.g. ``ChunkedDatasetLoader``) to avoid confusion with the library type.
    """

    def __init__(self, dataset_name: str, split: str) -> None:
        """
        Args:
            dataset_name: Hugging Face Hub dataset identifier.
            split: Which split to load (e.g. ``"train"``).
        """
        self.dataset_name = dataset_name
        self.split = split

    def get_dataset(self):
        """Load the split and normalize records to ``id``/``text``/``metadata``.

        Returns:
            datasets.Dataset: one row per chunk with columns ``id`` (the
            source record id joined with its chunk id), ``text`` (the chunk
            body), and ``metadata`` (title, url, primary_category, published,
            updated, and a copy of the chunk text).
        """
        dataset = load_dataset(self.dataset_name, split=self.split)
        # Build a unique per-chunk id ("<id>-<chunk-id>") and fold the
        # descriptive columns into a single nested "metadata" dict.
        # The chunk text is duplicated into metadata["text"] so it travels
        # with the metadata payload as well as the top-level "text" field.
        dataset = dataset.map(lambda x: {
            "id": f'{x["id"]}-{x["chunk-id"]}',
            "text": x["chunk"],
            "metadata": {
                "title": x["title"],
                "url": x["source"],
                "primary_category": x["primary_category"],
                "published": x["published"],
                "updated": x["updated"],
                "text": x["chunk"],
            }
        })
        # Drop unneeded columns: everything now captured by "metadata"/"text",
        # plus raw fields (summary, authors, doi, ...) that are not carried over.
        dataset = dataset.remove_columns([
            "title", "summary", "source",
            "authors", "categories", "comment",
            "journal_ref", "primary_category",
            "published", "updated", "references",
            "doi", "chunk-id",
            "chunk"
        ])
        return dataset