Commit 1b1e403

Merge pull request #2 from kenlhlui/wip/v0.1.1
Merging wip/v0.1.1 into release/v0.1.1

kenlhlui authored Jan 28, 2025
2 parents 4bbd056 + 9421e33
Showing 5 changed files with 48 additions and 58 deletions.
23 changes: 16 additions & 7 deletions README.md
@@ -29,7 +29,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/
    cd ./dataverse-metadata-crawler
    ```

-3. Create an environment file (.env)
+3. Create an environment file (`.env`)
    ```sh
    touch .env # For Unix/MacOS
    nano .env # or vim .env, or your preferred editor
@@ -38,12 +38,17 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/
    notepad .env
    ```

-4. Configure the environment (.env) file using the text editor of your choice.
+4. Configure the environment (`.env`) file using the text editor of your choice.
    ```sh
    # .env file
-   BASE_URL = "TARGET_REPO_URL" # e.g., "https://demo.borealisdata.ca/"
+   BASE_URL = "TARGET_REPO_URL" # Base URL of the repository; e.g., "https://demo.borealisdata.ca/"
    API_KEY = "YOUR_API_KEY" # Found in your Dataverse account settings. Can also be specified in the CLI interface using the -a flag.
    ```
+   Your `.env` file should look like this:
+   ```sh
+   BASE_URL = "https://demo.borealisdata.ca/"
+   API_KEY = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXX"
+   ```

 5. Set up virtual environment (recommended)
    ```sh
@@ -115,16 +120,20 @@ exported_files/
 ├── json_files/
 │   └── ds_metadata_yyyymmdd-HHMMSS.json # With -d flag enabled
 │   └── empty_dv_yyyymmdd-HHMMSS.json # With -e flag enabled
-│   └── failed_metadata_uris_yyyymmdd-HHMMSS.json
-│   └── permission_dict_yyyymmdd-HHMMSS.json # With -p flag enabled
-│   └── pid_dict_yyyymmdd-HHMMSS.json # Only exported if -p flag is used without -d flag
-│   └── pid_dict_dd_yyyymmdd-HHMMSS.json # Hierarchical information of deaccessioned/draft datasets
+│   └── failed_metadata_uris_yyyymmdd-HHMMSS.json # With -f flag enabled
+│   └── permission_dict_yyyymmdd-HHMMSS.json # With only -p flag enabled
+│   └── pid_dict_yyyymmdd-HHMMSS.json # With only -p flag enabled
+│   └── pid_dict_dd_yyyymmdd-HHMMSS.json # Hierarchical information of deaccessioned/draft datasets.
 ├── csv_files/
 │   └── ds_metadata_yyyymmdd-HHMMSS.csv # with -s flag enabled
 └── logs_files/
     └── log_yyyymmdd-HHMMSS.txt # Exported by default, without specifying --no-log
 ```

+## ⚠️Disclaimer
+> [!WARNING]
+> To retrieve data about unpublished datasets or information that is not available publicly (e.g. collaborators/permissions), you will need to have necessary access rights. **Please note that any publication or use of non-publicly available data may require review by a Research Ethics Board**.
 ## ✅Tests
 No tests have been written yet. Contributions welcome!
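As a quick orientation to the flags referenced in the tree above, a hypothetical invocation might look as follows; the entry-point command and positional arguments are assumptions, and only the flags themselves appear in this diff:

```sh
# Hypothetical invocation; entry point and positional arguments are assumptions.
# Flags: -d dataset/file metadata, -p permissions, -s spreadsheet, -a API token.
python dvmeta/main.py <collection_alias> <version> -d -p -s -a "YOUR_API_KEY"
```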

20 changes: 8 additions & 12 deletions dvmeta/func.py
@@ -178,28 +178,24 @@ def add_path_info(meta_dict: dict, ds_dict: dict) -> tuple:
     return meta_dict, ds_dict_copy


-def add_perrmission_info(meta_dict: dict, permission_dict: Optional[dict] = None) -> tuple:
+def add_permission_info(meta_dict: dict, permission_dict: Optional[dict] = None) -> dict:
     """Add permission_info to the metadata dictionary, handling nested structures."""
     if isinstance(permission_dict, dict):
-        permission_dict_copy = permission_dict.copy()
-        for pid_key, pid_value in list(permission_dict_copy.items()):
+        for pid_key, pid_value in list(permission_dict.items()):
             pid_key_str = str(pid_key)
             # Traverse the meta_dict to find matching datasetId
             for _meta_key, meta_value in meta_dict.items():
                 if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId') == int(pid_key_str):
                     # Add path_info to the appropriate nested dictionary
                     meta_value['permission_info'] = pid_value
-                    # Remove from permission_dict_copy
-                    permission_dict_copy.pop(pid_key)
+                    # Remove from permission_dict
+                    permission_dict.pop(pid_key)
                     break
-        for _meta_key, meta_value in meta_dict.items():
-            if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId'):
-                if 'permission_info' not in meta_value:
-                    meta_value['permission_info'] = {'status': 'NA', 'data': []}
-
-        return meta_dict, permission_dict_copy
+    for _meta_key, meta_value in meta_dict.items():
+        if 'permission_info' not in meta_value:
+            meta_value['permission_info'] = {'status': 'NA', 'data': []}

-    return meta_dict, None
+    return meta_dict


 def load_env() -> dict:
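To make the function's new single-return contract concrete, a minimal usage sketch (the sample dictionaries are invented for illustration):

```python
# Minimal sketch of the new add_permission_info contract; sample data invented.
meta_dict = {'doi:10.5072/FK2/ABC123': {'data': {'datasetId': 42}}}
permission_dict = {42: {'status': 'OK', 'data': [{'assignee': '@user'}]}}

merged = add_permission_info(meta_dict, permission_dict)
# merged['doi:10.5072/FK2/ABC123']['permission_info'] -> {'status': 'OK', ...}

# Without permission metadata, every dataset receives a placeholder instead:
merged = add_permission_info({'doi:10.5072/FK2/XYZ9': {'data': {'datasetId': 7}}}, None)
# merged['doi:10.5072/FK2/XYZ9']['permission_info'] -> {'status': 'NA', 'data': []}
```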
6 changes: 3 additions & 3 deletions dvmeta/httpxclient.py
@@ -63,7 +63,7 @@ async def __aexit__(self,
         await self.async_client.aclose()
         self.sync_client.close()

-    async def _async_semaphore_client(self, url: str) -> httpx.Response | None:
+    async def _async_semaphore_client(self, url: str) -> httpx.Response | list[str]:
         """Asynchronous HTTP client with semaphore.

         Args:
@@ -79,9 +79,9 @@ async def _async_semaphore_client(self, url: str) -> httpx.Response | None:
                 # print(f'HTTP request Error for {url}: {response.status_code}')
                 return response
             return response
-        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
+        except (httpx.HTTPStatusError, httpx.RequestError):
             # print(f'HTTP request Error for {url}: {exc}')
-            return None
+            return [url, 'Error']

     def sync_get(self, url: str) -> httpx.Response | None:
         """Synchronous GET request.
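For readers unfamiliar with the pattern this method implements, a self-contained sketch of a semaphore-bounded async fetcher; the names and concurrency limit here are illustrative, not the project's actual values:

```python
# Illustrative sketch of a semaphore-bounded async fetcher; names and the
# concurrency limit are assumptions, not the project's actual implementation.
import asyncio

import httpx


async def fetch_all(urls: list[str], limit: int = 10) -> list[httpx.Response | list[str]]:
    semaphore = asyncio.Semaphore(limit)  # at most `limit` requests in flight

    async with httpx.AsyncClient() as client:
        async def fetch(url: str) -> httpx.Response | list[str]:
            async with semaphore:
                try:
                    return await client.get(url)
                except (httpx.HTTPStatusError, httpx.RequestError):
                    return [url, 'Error']  # same sentinel shape as in this diff

        return await asyncio.gather(*(fetch(u) for u in urls))
```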
53 changes: 18 additions & 35 deletions dvmeta/main.py
@@ -64,15 +64,13 @@ def main(
         False, '--spreadsheet', '-s', help='Output a CSV file of the metadata of datasets'
     ),
 ):
-    """A command line utility that crawls a dataverse repository, extracting metadata for dataverses, datasets, and permissions, and then stores it in JSON format."""
-    # Load the environment variables #! This need to be modified as it nullifies the auth token provided by the user
+    """A Python CLI tool for extracting and exporting metadata from Dataverse repositories to JSON and CSV formats."""
+    # Load the environment variables
     config: dict = func.load_env()

     config['COLLECTION_ALIAS'] = collection_alias
     config['VERSION'] = version
-    config['API_KEY'] = (
-        auth if auth else config['API_KEY']
-    )  # Reassign the API_KEY and replace it specified in the .env file
+    config['API_KEY'] = (auth if auth else config['API_KEY'])  # Reassign the API_KEY and replace it specified in the .env file, if provided in the CLI interface

     # Check if -s flag is provided without -d flag
     func.validate_spreadsheet(spreadsheet, dvdfds_matadata)
@@ -164,30 +162,15 @@ async def main_crawler():
             # Add the path_info to the metadata
             meta_dict, pid_dict_dd = func.add_path_info(meta_dict, ds_dict)

-            if not permission:  # Delay the merging of permission metadata until the permission metadata is crawled
-
-                # Export the metadata to a JSON file
-                meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict')
-                json_file_checksum_dict.append(
-                    {
-                        'type': 'Dataset Metadata (Representation & File)',
-                        'path': meta_json_file_path,
-                        'checksum': meta_json_checksum,
-                    }
-                )
-                print(
-                    f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n'
-                )
-
-                # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file
-                pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd')
-                json_file_checksum_dict.append(
-                    {
-                        'type': 'Hierarchical Information of Datasets(deaccessioned/draft)',
-                        'path': pid_dict_json,
-                        'checksum': pid_dict_checksum,
-                    }
-                )
+            # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file
+            pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd')
+            json_file_checksum_dict.append(
+                {
+                    'type': 'Hierarchical Information of Datasets(deaccessioned/draft)',
+                    'path': pid_dict_json,
+                    'checksum': pid_dict_checksum,
+                }
+            )

             if failed:
                 failed_metadata_uris_json, failed_metadata_uris_checksum = utils.orjson_export(
@@ -232,21 +215,21 @@
                     }
                 )

-            # Combine the metadata and permission metadata
-            if dvdfds_matadata and permission:
-                if isinstance(permission_dict, dict):
-                    meta_dict = func.add_perrmission_info(meta_dict, permission_dict)[0]
+            # Combine the metadata and permission metadata, if both are provided
+            # Else write dummy permission metadata to the metadata
+            meta_dict = func.add_permission_info(meta_dict, permission_dict if isinstance(permission_dict, dict) and permission_dict else None)

             if meta_dict:
                 # Export the metadata to a JSON file

-                meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission')
+                meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'ds_metadata')
                 json_file_checksum_dict.append(
                     {
                         'type': 'Dataset Metadata (Representation, File & Permission)',
                         'path': meta_json_file_path,
                         'checksum': meta_json_checksum,
                     }
                 )
+                print(f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n')

             if empty_dv:
                 empty_dv_json, empty_dv_checksum = utils.orjson_export(empty_dv_dict, 'empty_dv')
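All of the export calls above share one contract: serialize to JSON, then return (path, checksum). A plausible sketch of what a utils.orjson_export-style helper does — an assumption for illustration, not the project's actual code:

```python
# Hypothetical sketch of a utils.orjson_export-style helper; the real one may differ.
import hashlib
from datetime import datetime
from pathlib import Path

import orjson


def orjson_export(data: dict, prefix: str, out_dir: str = 'exported_files/json_files') -> tuple[str, str]:
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    path = Path(out_dir) / f'{prefix}_{timestamp}.json'
    path.parent.mkdir(parents=True, exist_ok=True)

    payload = orjson.dumps(data, option=orjson.OPT_INDENT_2)
    path.write_bytes(payload)

    checksum = hashlib.sha256(payload).hexdigest()  # checksum of the written bytes
    return str(path), checksum
```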
4 changes: 3 additions & 1 deletion dvmeta/metadatacrawler.py
@@ -100,8 +100,10 @@ async def get_datasets_meta(self, id_list: list) -> tuple[dict, dict]:
             if item and item.status_code == self.http_success_status and item.json():
                 dataset_persistent_idd = item.json().get('data').get('datasetPersistentId')
                 dataset_meta[dataset_persistent_idd] = item.json()
-            else:
+            elif item and item.status_code != self.http_success_status:
                 failed_dataset_meta[str(item.url)] = item.status_code
+            elif isinstance(item, list):
+                failed_dataset_meta[item[0]] = item[1]

         return dataset_meta, failed_dataset_meta
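To show how the [url, 'Error'] sentinel from httpxclient.py flows into this branch, a standalone sketch of the triage logic — simplified, with the isinstance check moved first so list sentinels are never treated as responses:

```python
# Standalone sketch of the response-triage logic above; simplified, with the
# isinstance check placed first so list sentinels never hit attribute access.
import httpx

HTTP_SUCCESS = 200


def triage(items: list[httpx.Response | list[str]]) -> tuple[dict, dict]:
    dataset_meta: dict = {}
    failed_dataset_meta: dict = {}
    for item in items:
        if isinstance(item, list):  # [url, 'Error'] sentinel from the client
            failed_dataset_meta[item[0]] = item[1]
        elif item.status_code == HTTP_SUCCESS and item.json():
            pid = item.json().get('data', {}).get('datasetPersistentId')
            dataset_meta[pid] = item.json()
        else:
            failed_dataset_meta[str(item.url)] = item.status_code
    return dataset_meta, failed_dataset_meta
```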
