Feat: Check whether item has been preserved prior to processing (Issue #85) #93

Closed
app.py (46 changes: 37 additions & 9 deletions)
@@ -186,27 +186,55 @@ def main():
# Start collections processing after completing fetching data from API and articles processing.
processed_collections_versions_count = collection_obj.process_collections(collection_data)

log.write_log_in_file('info', '------- Summary -------')
log.write_log_in_file('info',
"Total articles versions processed/fetched: \t\t\t"
+ f'{processed_articles_versions_count} / {published_articles_versions_count}',
"Total unique articles published/fetched: \t\t\t\t\t"
+ f'{published_articles_count} / {published_unpublished_count}',
True)
log.write_log_in_file('info',
"Total processed articles bags already in preservation storage: \t"
+ f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
"Total articles versions matchable/published: \t\t\t\t\t"
+ f'{published_articles_versions_count - article_obj.no_preserved} / {published_articles_versions_count}',
True)
log.write_log_in_file('info',
"Total collections versions processed/fetched: \t\t\t"
"Total articles versions matched/matchable: \t\t\t\t\t"
+ f'{article_obj.no_matched} / {published_articles_versions_count - article_obj.no_preserved}',
True)
log.write_log_in_file('info',
"Total articles versions processed/matched: \t\t\t\t\t"
+ f'{processed_articles_versions_count} / {article_obj.no_matched}',
True)
log.write_log_in_file('info',
"Total articles versions unmatched (matchable-matched): \t\t\t\t"
+ f'{article_obj.no_unmatched}',
True)
log.write_log_in_file('info',
"Total articles versions already preserved (skipped matching, processing): \t"
+ f'{article_obj.no_preserved}',
True)
# log.write_log_in_file('info',
# "Total articles versions already preserved (processed but bag not uploaded): \t"
# + f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
# True)
log.write_log_in_file('info',
"Total unique collections published/fetched: \t\t\t\t\t"
+ f'{collections_count} / {collections_count}',
True)
log.write_log_in_file('info',
"Total collections versions processed/published: \t\t\t\t\t"
+ f'{processed_collections_versions_count} / {collections_versions_count}',
True)
log.write_log_in_file('info',
"Total processed collections bags already in preservation storage: "
"Total collections already preserved: \t\t\t\t\t\t"
+ f'{collection_obj.processor.duplicate_bag_in_preservation_storage_count}',
True)

if processed_articles_versions_count != published_articles_versions_count or processed_collections_versions_count != collections_versions_count:
if article_obj.no_matched != published_articles_versions_count - article_obj.no_preserved:
log.write_log_in_file('warning',
'The number of matchable articles versions is different than the number matched. Check the log for details.', True)
if processed_articles_versions_count != article_obj.no_matched or processed_collections_versions_count != collections_versions_count:
log.write_log_in_file('warning',
'The number of articles versions or collections versions sucessfully processed is different'
+ ' than the number fetched. Check the log for details.', True)
'The number of articles versions or collections versions successfully processed is different'
+ ' than the number matched. Check the log for details.', True)

log.write_log_in_file('info',
f"ReBACH finished with {log.warnings_count} warnings and {log.errors_count} errors",
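For orientation, the reworked summary lines form a simple chain (hypothetical numbers, not taken from this PR): with 100 published article versions of which 10 are already preserved, 90 are matchable; if 88 of those match a curation folder and 85 process successfully, the first warning fires because matched (88) differs from matchable (90), and the second fires because processed (85) differs from matched (88).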
figshare/Article.py (57 changes: 41 additions & 16 deletions)
@@ -45,6 +45,9 @@ def __init__(self, config, log, ids):
self.article_non_match_info = {}
self.input_articles_id = ids
self.matched_curation_folder_list = []
self.no_preserved = 0
self.no_matched = 0
self.no_unmatched = 0
self.processor = Integration(self.config_obj, self.logs)

"""
@@ -707,7 +710,9 @@ def __copy_files_ual_rdm(self, version_data, folder_name):
"""
def find_matched_articles(self, articles):
article_data = {}
no_matched = 0
self.no_matched = 0
self.no_unmatched = 0
self.no_preserved = 0
i = 0
for article in articles:
if (articles[article] is not None):
@@ -716,19 +721,37 @@ def find_matched_articles(self, articles):
for version_data in article_versions_list:
# check curation folder for required files and setup data for further processing.
if (version_data is not None and len(version_data) > 0):
data = self.__check_curation_dir(version_data)
version_no = "v" + str(data["version"]).zfill(2)
i += 1
if (data["matched"] is True):
total_file_size = version_data['size']
self.total_all_articles_file_size += total_file_size
article_data[version_data['id']].append(data)
no_matched += 1
self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
if (self.input_articles_id):
self.matched_curation_folder_list.append(data['author_dir'])
# check whether the current version is already in preservation storage
# 1. Search for item in preservation storage using only the article_id and version
# 2. If found, check the metadata hash
# 3. If hash is the same, it means the item in figshare is unchanged
# 4. add this item to the exclusion list
# 5. If hash is different, it means the item changed in figshare and has not been preserved yet.
# 6. continue processing using the existing code
# 7. If not found, item has not been preserved, continue processing using existing code

# dummy code to test updated logic and logging
is_preserved = False
if version_data['id'] == 14192885:
is_preserved = True

if not is_preserved:
data = self.__check_curation_dir(version_data)
version_no = "v" + str(data["version"]).zfill(2)
i += 1
if (data["matched"] is True):
total_file_size = version_data['size']
self.total_all_articles_file_size += total_file_size
article_data[version_data['id']].append(data)
self.no_matched += 1
self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
if (self.input_articles_id):
self.matched_curation_folder_list.append(data['author_dir'])
else:
self.article_non_match_info[i] = f"article {data['id']} {version_no}"
else:
self.article_non_match_info[i] = f"article {data['id']} {version_no}"
self.logs.write_log_in_file('info', f"{version_data['url_public_html']} already in preservation. Skipping", True)
self.no_preserved += 1

matched_articles = []
if (self.article_match_info):
@@ -746,8 +769,9 @@ def find_matched_articles(self, articles):
self.logs.write_log_in_file('error', f"Unable to fetch matched article id - {self.article_match_info[index]}", True)

unmatched_articles = []
self.no_unmatched = len(self.article_non_match_info)
if (self.article_non_match_info):
self.logs.write_log_in_file('warning', "Curation folder not found for below articles", True)
self.logs.write_log_in_file('info', "Curation folder not found for below articles", True)

# log unmatched articles id, and version
for index in self.article_non_match_info:
@@ -762,8 +786,9 @@ def find_matched_articles(self, articles):

self.logs.write_log_in_file("info", f"Total matched unique articles: {len(set(matched_articles))}.", True)
self.logs.write_log_in_file("info", f"Total unmatched unique articles: {len(set(unmatched_articles))}.", True)
self.logs.write_log_in_file("info", f"Total matched article versions: {no_matched}.", True)
self.logs.write_log_in_file("info", f"Total unmatched article versions: {len(self.article_non_match_info)}.", True)
self.logs.write_log_in_file("info", f"Total matched article versions: {self.no_matched}.", True)
self.logs.write_log_in_file("info", f"Total unmatched article versions: {self.no_unmatched}.", True)
self.logs.write_log_in_file("info", f"Total skipped article versions (preserved): {self.no_preserved}.", True)

if len(set(unmatched_articles)) > 0 or len(self.article_non_match_info) > 0:
self.logs.write_log_in_file("warning", "There were unmatched articles or article versions."
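The numbered comment in find_matched_articles above describes the intended preservation check: look the article id and version up in preservation storage, compare a metadata hash, and skip the version only when the hashes match. The hard-coded is_preserved flag is a stand-in for that check. A minimal sketch of what it might look like, assuming a hypothetical storage layout of one bag directory per article version with a stored metadata-hash file (the directory scheme, file name, and hashing choice are illustrative assumptions, not the project's actual conventions), is:

import hashlib
import json
import os


def is_version_preserved(preservation_root, article_id, version, version_metadata):
    # Hypothetical layout: <preservation_root>/<article_id>_v<NN>/metadata_hash.txt
    bag_dir = os.path.join(preservation_root, f"{article_id}_v{str(version).zfill(2)}")
    hash_file = os.path.join(bag_dir, "metadata_hash.txt")

    # Step 7: no stored hash found, so this version has never been preserved.
    if not os.path.isfile(hash_file):
        return False

    with open(hash_file) as fh:
        stored_hash = fh.read().strip()

    # Steps 2-3: hash the current Figshare metadata the same way it would have
    # been hashed at bagging time (stable key order keeps the digest reproducible).
    current_hash = hashlib.sha256(
        json.dumps(version_metadata, sort_keys=True).encode("utf-8")
    ).hexdigest()

    # Step 4: same hash, item unchanged, safe to skip.
    # Steps 5-6: different hash, item changed on figshare, process it again.
    return current_hash == stored_hash

In find_matched_articles this would replace the dummy test, for example is_preserved = is_version_preserved(storage_path, version_data['id'], version_data['version'], version_data), with storage_path read from the preservation-storage setting in the configuration; both the function name and its signature here are illustrative only.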