-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdcat.py
139 lines (119 loc) · 4.63 KB
/
dcat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Creates DCAT-compliant JSON-LD files for datasets.
"""
import json
import hashlib
import datetime
def generate_sha256_hash(file_path):
    """
    Generate the SHA-256 hash for a given file.

    When ``file_path`` names an existing regular file, the digest is computed
    over the file's bytes, read in fixed-size chunks so the whole file is
    never held in memory at once. For any other input (for example a URL or
    a placeholder path) the digest of the UTF-8 encoded path string is
    returned instead; that is the value this function previously returned
    for every input, so such callers keep getting the same result.

    Args:
        file_path (str): Path to the file.

    Returns:
        str: Hex-encoded SHA-256 digest.

    Raises:
        OSError: If the file exists but cannot be opened or read.
    """
    import os  # function-scope import so the module's import block is untouched

    digest = hashlib.sha256()
    if os.path.isfile(file_path):
        with open(file_path, "rb") as handle:
            # iter(callable, sentinel) keeps reading until read() returns b"".
            for chunk in iter(lambda: handle.read(65536), b""):
                digest.update(chunk)
    else:
        # Not a readable local file: fall back to hashing the string itself
        # rather than raising, to stay compatible with existing callers.
        digest.update(file_path.encode())
    return digest.hexdigest()
def create_dcat_jsonld(title, description, author, distributions=None,
                       keywords=None, identifier="", publisher_name="",
                       citation="", date_published="", license_url="",
                       version=""):
    """
    Create a DCAT-compliant JSON-LD document wrapping a single dataset.

    Args:
        title (str): Title of the dataset. Spaces are replaced with
            underscores to form the dataset's ``name``.
        description (str): Description of the dataset.
        author: Author of the dataset; stored unchanged under ``author``.
        distributions (list): List of distribution metadata. Defaults to a
            new empty list.
        keywords (list): List of keywords for the dataset. Defaults to a new
            empty list.
        identifier (str): Identifier of the dataset.
        publisher_name (str): Name of the publisher.
        citation (str): Citation for the dataset.
        date_published (str): Publication date of the dataset, emitted as
            ``datePublished``.
        license_url (str): License URL for the dataset.
        version (str): Version of the dataset.

    Returns:
        dict: The DCAT-compliant JSON-LD metadata.
    """
    # The lists end up inside the returned dict, so each call must get its
    # own fresh list; a shared default would leak mutations between calls.
    if distributions is None:
        distributions = []
    if keywords is None:
        keywords = []

    dataset = {
        "@type": "Dataset",
        "name": title.replace(" ", "_"),
        "title": title,
        "description": description,
        "identifier": identifier,
        "keyword": keywords,
        "author": author,
        "publisher": {
            "@type": "Organization",
            "name": publisher_name,
        },
        "distribution": distributions,
        "citation": citation,
        # Previously accepted as a parameter but never written to the output.
        "datePublished": date_published,
        "license": license_url,
        "version": version,
    }

    return {
        "@context": {
            "@vocab": "https://schema.org/",
            "dcat": "http://www.w3.org/ns/dcat#",
            "Dataset": "https://schema.org/Dataset",
            "FileObject": "https://schema.org/FileObject",
        },
        "@type": "Catalog",
        "title": "Dataset Catalog",
        "description": "A catalog of datasets.",
        "dataset": [dataset],
    }
def create_distribution(title, format, file_path, description=""):
    """
    Build the metadata record describing one distributable file.

    Args:
        title (str): Title of the distribution; spaces become underscores in
            the emitted ``name``.
        format (str): Format of the file, stored as ``encodingFormat``.
        file_path (str): Path to the file. It is stored as ``contentUrl`` and
            handed to ``generate_sha256_hash`` to obtain ``sha256``.
        description (str): Description of the distribution.

    Returns:
        dict: The distribution metadata including the SHA-256 hash.
    """
    checksum = generate_sha256_hash(file_path)

    # Keys are added in the order they should appear in the serialized JSON.
    record = {"@type": "FileObject"}
    record["name"] = "_".join(title.split(" "))
    record["description"] = description
    record["encodingFormat"] = format
    record["contentUrl"] = file_path
    record["sha256"] = checksum
    return record
if __name__ == '__main__':
    # Example usage: build a catalog with one dataset and two distributions,
    # then write it to disk as JSON.
    title = "Example Dataset"
    description = "This is an example dataset."
    author = "Example Organization"
    identifier = "http://example.org/dataset/1"
    keywords = ["example", "dataset"]
    publisher_name = "Example Organization"
    citation = "Example Organization (2024). Example Dataset. Retrieved from http://example.org/dataset/1"
    date_published = datetime.date.today().isoformat()  # Use today's date for example
    license_url = "http://example.org/license"
    version = "1.0.0"  # Ensure version follows MAJOR.MINOR.PATCH format

    # Paths to the example files
    csv_file_path = "path/to/example_dataset.csv"
    parquet_file_path = "path/to/example_dataset.parquet"

    # Keyword arguments are used on purpose: create_distribution takes
    # (title, format, file_path, description), so passing the description
    # second would silently land in ``format``. ``file_path`` is both hashed
    # and stored as ``contentUrl``.
    distributions = [
        create_distribution(
            title="Example Dataset CSV",
            format="text/csv",
            file_path=csv_file_path,
            description="CSV file of the example dataset.",
        ),
        create_distribution(
            title="Example Dataset Parquet",
            format="application/x-parquet",
            file_path=parquet_file_path,
            description="Parquet file of the example dataset.",
        ),
    ]

    # Same reasoning here: create_dcat_jsonld's third positional parameter is
    # ``author``, so every value is bound by name to avoid shifting fields.
    dcat_jsonld = create_dcat_jsonld(
        title=title,
        description=description,
        author=author,
        distributions=distributions,
        keywords=keywords,
        identifier=identifier,
        publisher_name=publisher_name,
        citation=citation,
        date_published=date_published,
        license_url=license_url,
        version=version,
    )

    # Save to file
    with open("../migration/dcat.json", "w", encoding="utf-8") as f:
        json.dump(dcat_jsonld, f, indent=4)
    print("DCAT JSON-LD file created successfully.")