-
Notifications
You must be signed in to change notification settings - Fork 0
/
croissant.py
137 lines (124 loc) · 4.31 KB
/
croissant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Generates Croissant JSON-LD metadata for datasets.
"""
import json
import hashlib
def create_croissant_jsonld(title, description, author="", distributions=None,
                            keywords=None, identifier="", publisher_name="",
                            citation="", date_published="", license_url="",
                            version=""):
    """
    Build a Croissant-style JSON-LD metadata dictionary for a dataset.

    Args:
        title (str): Title of the dataset. Also used, with spaces replaced
            by underscores, as the "name" entry.
        description (str): Description of the dataset.
        author: Author of the dataset; stored as given under "author".
            Defaults to an empty string.
        distributions (list): List of distribution metadata. The same list
            object is stored under both "distribution" and "fileObject".
            Defaults to a new empty list.
        keywords (list): List of keywords for the dataset, stored under
            "keyword". Defaults to a new empty list.
        identifier (str): Identifier of the dataset.
        publisher_name (str): Name of the publisher.
        citation (str): Citation for the dataset; stored under both
            "citation" and "citeAs".
        date_published (str): Publication date of the dataset.
        license_url (str): License URL for the dataset.
        version (str): Version of the dataset.

    Returns:
        dict: The JSON-LD metadata, ready to be serialised with ``json``.
    """
    # Create fresh lists per call: a shared default list would be embedded
    # in every returned dict, so mutating one result would leak into the next.
    if distributions is None:
        distributions = []
    if keywords is None:
        keywords = []

    # NOTE(review): several of the fixed placeholder keys below ("cr", "sc",
    # "dct", "rai", ...) look like Croissant context terms rather than
    # dataset properties -- confirm against the Croissant specification.
    croissant = {
        "@context": {
            "@vocab": "https://schema.org/",
            "croissant": "https://mlcommons.org/croissant#",
            "Dataset": "https://schema.org/Dataset",
            "FileObject": "https://schema.org/FileObject"
        },
        "@type": "Dataset",
        "name": title.replace(" ", "_"),
        "title": title,
        "description": description,
        "author": author,
        "identifier": identifier,
        "keyword": keywords,
        "publisher": {
            "@type": "Organization",
            "name": publisher_name
        },
        "distribution": distributions,
        "citation": citation,
        "datePublished": date_published,
        "license": license_url,
        "version": version,
        "isLiveDataset": True,
        "subField": [],
        "parentField": [],
        "format": "json-ld",
        "transform": [],
        "path": "",
        "cr": [],
        "key": [],
        "source": [],
        "field": [],
        "fileProperty": [],
        "extract": [],
        "repeated": False,
        "@language": "en",
        "data": [],
        "rai": [],
        "column": [],
        "citeAs": citation,
        "examples": [],
        "includes": [],
        "regex": "",
        "fileSet": [],
        "recordSet": [],
        "references": [],
        "dct": [],
        "jsonPath": "",
        "dataType": "Dataset",
        "fileObject": distributions,
        "conformsTo": "https://schema.org",
        "separator": ",",
        "sc": [],
        "md5": "",
        "replace": ""
    }
    return croissant
def create_distribution(title, format, download_url, description="",
                        sha256=None):
    """
    Build the metadata dictionary for one downloadable file of a dataset.

    Args:
        title (str): Title of the file. Also used, with spaces replaced by
            underscores, as the "name" entry.
        format (str): Stored under "encodingFormat".
        download_url (str): Stored under "contentUrl". Its SHA-256 hex
            digest is used as the entry's "identifier".
        description (str): Description of the file.
        sha256 (str): Optional value for the "sha256" entry. When omitted,
            the URL-derived identifier is stored there as a placeholder.

    Returns:
        dict: The distribution metadata with "@type" set to "FileObject".
    """
    # Generate a unique identifier using the SHA-256 hash of the download URL
    identifier = hashlib.sha256(download_url.encode()).hexdigest()
    print(f"Generated identifier for {title}: {identifier}")

    # The URL digest is only a stand-in for a checksum; callers that have a
    # real value can pass it via `sha256`.
    checksum = identifier if sha256 is None else sha256

    return {
        "@type": "FileObject",
        "name": title.replace(" ", "_"),
        "identifier": identifier,
        "title": title,
        "description": description,
        "encodingFormat": format,
        "contentUrl": download_url,
        "sha256": checksum
    }
# Example usage: build sample metadata and write it to a JSON file.
if __name__ == '__main__':
    title = "Example Dataset"
    description = "This is an example dataset."
    # The remaining sample values illustrate the other arguments accepted by
    # create_croissant_jsonld; only title and description are written below.
    identifier = "http://example.org/dataset/1"
    keywords = ["example", "dataset"]
    publisher_name = "Example Organization"
    citation = "Example Citation"
    date_published = "2022-01-01"
    license_url = "http://example.org/license"
    version = "1.0.0"

    # Keyword arguments keep format / URL / description in their proper
    # slots; passing them positionally in the wrong order stored the MIME
    # type as the download URL.
    distributions = [
        create_distribution("Example Dataset CSV",
                            format="text/csv",
                            download_url="http://example.org/dataset/1.csv",
                            description="CSV file of the example dataset."),
        create_distribution("Example Dataset Parquet",
                            format="application/x-parquet",
                            download_url="http://example.org/dataset/1.parquet",
                            description="Parquet file of the example dataset.")
    ]

    # Pass an explicit empty author so this call does not depend on the
    # parameter having a default value.
    croissant_jsonld = create_croissant_jsonld(title, description, author="")

    # Save to file
    with open("croissant_title_and_description_only.json", "w", encoding="utf-8") as f:
        json.dump(croissant_jsonld, f, indent=4)
    print("Croissant JSON-LD file created successfully.")