From 28f0fb74dc149e2e8c552e47191b51801027efbe Mon Sep 17 00:00:00 2001
From: zoidy
Date: Sun, 29 Oct 2023 03:38:20 +0000
Subject: [PATCH 1/3] Outline process for checking if item is preserved

---
 figshare/Article.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/figshare/Article.py b/figshare/Article.py
index 1727441..a1e8368 100644
--- a/figshare/Article.py
+++ b/figshare/Article.py
@@ -716,6 +716,15 @@ def find_matched_articles(self, articles):
                 for version_data in article_versions_list:
                     # check curation folder for required files and setup data for further processing.
                     if (version_data is not None and len(version_data) > 0):
+                        # check whether the current version is already in preservation storage
+                        # 1. Search for item in preservation storage using only the article_id and version
+                        # 2. If found, check the metadata hash
+                        # 3. If hash is the same, it means the item in figshare is unchanged
+                        # 4. Add this item to the exclusion list
+                        # 5. If hash is different, it means the item changed in figshare and has not been preserved yet.
+                        # 6. continue processing using the existing code
+                        # 7. If not found, item has not been preserved, continue processing using existing code
+
                         data = self.__check_curation_dir(version_data)
                         version_no = "v" + str(data["version"]).zfill(2)
                         i += 1
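Note on PATCH 1/3: the numbered outline above is only a comment; the check itself lands in the later commits. As a rough illustration, the steps could map onto a pair of helpers like the sketch below. This is a minimal sketch assuming the dict-shaped `version_data` used in Article.py; `preserved_index`, `compute_metadata_hash`, and `is_already_preserved` are hypothetical names, not part of the ReBACH codebase.

    import hashlib
    import json

    def compute_metadata_hash(version_data):
        # Hash a canonical JSON rendering of the version metadata so that any
        # change made in figshare yields a different digest (assumed scheme).
        canonical = json.dumps(version_data, sort_keys=True).encode("utf-8")
        return hashlib.sha256(canonical).hexdigest()

    def is_already_preserved(version_data, preserved_index):
        # preserved_index: hypothetical mapping of (article_id, version) to
        # the metadata hash recorded when the item was preserved (step 1).
        key = (version_data["id"], version_data["version"])
        stored_hash = preserved_index.get(key)
        if stored_hash is None:
            return False  # step 7: never preserved, process as usual
        # steps 2-5: an identical hash means the figshare item is unchanged
        # and can be excluded; a different hash means it changed and has not
        # been preserved yet.
        return stored_hash == compute_metadata_hash(version_data)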
From b728534e6bc7669b8f31aec0169ea2b107985f84 Mon Sep 17 00:00:00 2001
From: zoidy
Date: Fri, 3 Nov 2023 19:53:28 +0000
Subject: [PATCH 2/3] Add logic to skip already-preserved files prior to
 processing, update logging

---
 app.py              | 35 +++++++++++++++++++++++++--------
 figshare/Article.py | 48 ++++++++++++++++++++++++++++++---------------
 2 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/app.py b/app.py
index 81b4cd4..0160759 100644
--- a/app.py
+++ b/app.py
@@ -187,26 +187,45 @@ def main():
     processed_collections_versions_count = collection_obj.process_collections(collection_data)
 
     log.write_log_in_file('info',
-                          "Total articles versions processed/fetched: \t\t\t"
-                          + f'{processed_articles_versions_count} / {published_articles_versions_count}',
+                          "Total articles versions matchable/fetched: \t\t\t\t\t"
+                          + f'{published_articles_versions_count - article_obj.no_preserved} / {published_articles_versions_count}',
                           True)
     log.write_log_in_file('info',
-                          "Total processed articles bags already in preservation storage: \t"
-                          + f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
+                          "Total articles versions matched/matchable: \t\t\t\t\t"
+                          + f'{article_obj.no_matched} / {published_articles_versions_count - article_obj.no_preserved}',
                           True)
     log.write_log_in_file('info',
-                          "Total collections versions processed/fetched: \t\t\t"
+                          "Total articles versions processed/matched: \t\t\t\t\t"
+                          + f'{processed_articles_versions_count} / {article_obj.no_matched}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total articles versions umatched (matchable-matched): \t\t\t\t"
+                          + f'{article_obj.no_unmatched}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total articles versions already preserved (skipped matching, processing): \t"
+                          + f'{article_obj.no_preserved}',
+                          True)
+#    log.write_log_in_file('info',
+#                          "Total articles versions already preserved (processed but bag not uploaded): \t"
+#                          + f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
+#                          True)
+    log.write_log_in_file('info',
+                          "Total collections versions processed/fetched: \t\t\t\t\t"
                           + f'{processed_collections_versions_count} / {collections_versions_count}',
                           True)
     log.write_log_in_file('info',
-                          "Total processed collections bags already in preservation storage: "
+                          "Total collections already preserved: \t\t\t\t\t\t"
                           + f'{collection_obj.processor.duplicate_bag_in_preservation_storage_count}',
                           True)
 
-    if processed_articles_versions_count != published_articles_versions_count or processed_collections_versions_count != collections_versions_count:
+    if article_obj.no_matched != published_articles_versions_count - article_obj.no_preserved:
+        log.write_log_in_file('warning',
+                              'The number of matchable articles versions is different than the number matched. Check the log for details.', True)
+    if processed_articles_versions_count != article_obj.no_matched or processed_collections_versions_count != collections_versions_count:
         log.write_log_in_file('warning',
                               'The number of articles versions or collections versions sucessfully processed is different'
-                              + ' than the number fetched. Check the log for details.', True)
+                              + ' than the number matched. Check the log for details.', True)
 
     log.write_log_in_file('info',
                           f"ReBACH finished with {log.warnings_count} warnings and {log.errors_count} errors",
diff --git a/figshare/Article.py b/figshare/Article.py
index a1e8368..b12b3f5 100644
--- a/figshare/Article.py
+++ b/figshare/Article.py
@@ -45,6 +45,9 @@ def __init__(self, config, log, ids):
         self.article_non_match_info = {}
         self.input_articles_id = ids
         self.matched_curation_folder_list = []
+        self.no_preserved = 0
+        self.no_matched = 0
+        self.no_unmatched = 0
         self.processor = Integration(self.config_obj, self.logs)
 
     """
@@ -707,7 +710,9 @@ def __copy_files_ual_rdm(self, version_data, folder_name):
     """
     def find_matched_articles(self, articles):
         article_data = {}
-        no_matched = 0
+        self.no_matched = 0
+        self.no_unmatched = 0
+        self.no_preserved = 0
         i = 0
         for article in articles:
             if (articles[article] is not None):
@@ -725,19 +730,28 @@
                         # 6. continue processing using the existing code
                         # 7. If not found, item has not been preserved, continue processing using existing code
 
-                        data = self.__check_curation_dir(version_data)
-                        version_no = "v" + str(data["version"]).zfill(2)
-                        i += 1
-                        if (data["matched"] is True):
-                            total_file_size = version_data['size']
-                            self.total_all_articles_file_size += total_file_size
-                            article_data[version_data['id']].append(data)
-                            no_matched += 1
-                            self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
-                            if (self.input_articles_id):
-                                self.matched_curation_folder_list.append(data['author_dir'])
+                        # dummy code to test updated logic and logging
+                        is_preserved = False
+                        if version_data['id'] == 14192885:
+                            is_preserved = True
+
+                        if not is_preserved:
+                            data = self.__check_curation_dir(version_data)
+                            version_no = "v" + str(data["version"]).zfill(2)
+                            i += 1
+                            if (data["matched"] is True):
+                                total_file_size = version_data['size']
+                                self.total_all_articles_file_size += total_file_size
+                                article_data[version_data['id']].append(data)
+                                self.no_matched += 1
+                                self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
+                                if (self.input_articles_id):
+                                    self.matched_curation_folder_list.append(data['author_dir'])
+                            else:
+                                self.article_non_match_info[i] = f"article {data['id']} {version_no}"
                         else:
-                            self.article_non_match_info[i] = f"article {data['id']} {version_no}"
+                            self.logs.write_log_in_file('info', f"{version_data['url_public_html']} already in preservation. Skipping", True)
+                            self.no_preserved += 1
 
         matched_articles = []
         if (self.article_match_info):
@@ -755,8 +769,9 @@
                 self.logs.write_log_in_file('error', f"Unable to fetch matched article id - {self.article_match_info[index]}", True)
 
         unmatched_articles = []
+        self.no_unmatched = len(self.article_non_match_info)
         if (self.article_non_match_info):
-            self.logs.write_log_in_file('warning', "Curation folder not found for below articles", True)
+            self.logs.write_log_in_file('info', "Curation folder not found for below articles", True)
 
             # log unmatched articles id, and version
             for index in self.article_non_match_info:
@@ -771,8 +786,9 @@
         self.logs.write_log_in_file("info", f"Total matched unique articles: {len(set(matched_articles))}.", True)
         self.logs.write_log_in_file("info", f"Total unmatched unique articles: {len(set(unmatched_articles))}.", True)
-        self.logs.write_log_in_file("info", f"Total matched article versions: {no_matched}.", True)
-        self.logs.write_log_in_file("info", f"Total unmatched article versions: {len(self.article_non_match_info)}.", True)
+        self.logs.write_log_in_file("info", f"Total matched article versions: {self.no_matched}.", True)
+        self.logs.write_log_in_file("info", f"Total unmatched article versions: {self.no_unmatched}.", True)
+        self.logs.write_log_in_file("info", f"Total skipped article versions (preserved): {self.no_preserved}.", True)
 
         if len(set(unmatched_articles)) > 0 or len(self.article_non_match_info) > 0:
             self.logs.write_log_in_file("warning",
                                         "There were unmatched articles or article versions."
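Note on PATCH 2/3: the hardcoded id 14192885 is an acknowledged stand-in (see the "dummy code" comment) for the real preservation check outlined in PATCH 1/3. The three new counters are what the reworked summary and the new "matchable" warning in app.py depend on. The intended bookkeeping, assuming every fetched version passes the `version_data` validity check, is sketched below; `check_version_accounting` is an illustrative helper, not part of the patch:

    def check_version_accounting(article_obj, fetched_versions_count):
        # Each fetched version is counted exactly once: skipped as already
        # preserved, matched to a curation folder, or recorded as unmatched.
        matchable = fetched_versions_count - article_obj.no_preserved
        assert matchable == article_obj.no_matched + article_obj.no_unmatched
        return matchable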
From 821840a69dcc7f2a90e27ea471659f7e38a57b7c Mon Sep 17 00:00:00 2001
From: zoidy
Date: Fri, 2 Feb 2024 20:44:10 +0000
Subject: [PATCH 3/3] Improve summary data displayed after a run

---
 app.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/app.py b/app.py
index 0160759..cf6894b 100644
--- a/app.py
+++ b/app.py
@@ -186,8 +186,13 @@ def main():
 
     # Start collections processing after completing fetching data from API and articles processing.
     processed_collections_versions_count = collection_obj.process_collections(collection_data)
 
+    log.write_log_in_file('info', '------- Summary -------')
     log.write_log_in_file('info',
-                          "Total articles versions matchable/fetched: \t\t\t\t\t"
+                          "Total unique articles published/fetched: \t\t\t\t\t"
+                          + f'{published_articles_count} / {published_unpublished_count}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total articles versions matchable/published: \t\t\t\t\t"
                           + f'{published_articles_versions_count - article_obj.no_preserved} / {published_articles_versions_count}',
                           True)
     log.write_log_in_file('info',
                           "Total articles versions matched/matchable: \t\t\t\t\t"
                           + f'{article_obj.no_matched} / {published_articles_versions_count - article_obj.no_preserved}',
                           True)
     log.write_log_in_file('info',
                           "Total articles versions processed/matched: \t\t\t\t\t"
                           + f'{processed_articles_versions_count} / {article_obj.no_matched}',
                           True)
     log.write_log_in_file('info',
-                          "Total articles versions umatched (matchable-matched): \t\t\t\t"
+                          "Total articles versions unmatched (matchable-matched): \t\t\t\t"
                           + f'{article_obj.no_unmatched}',
                           True)
     log.write_log_in_file('info',
                           "Total articles versions already preserved (skipped matching, processing): \t"
                           + f'{article_obj.no_preserved}',
                           True)
 #    log.write_log_in_file('info',
 #                          "Total articles versions already preserved (processed but bag not uploaded): \t"
 #                          + f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
 #                          True)
     log.write_log_in_file('info',
-                          "Total collections versions processed/fetched: \t\t\t\t\t"
+                          "Total unique collections published/fetched: \t\t\t\t\t"
+                          + f'{collections_count} / {collections_count}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total collections versions processed/published: \t\t\t\t\t"
                           + f'{processed_collections_versions_count} / {collections_versions_count}',
                           True)
     log.write_log_in_file('info',
@@ -224,7 +233,7 @@
                               'The number of matchable articles versions is different than the number matched. Check the log for details.', True)
     if processed_articles_versions_count != article_obj.no_matched or processed_collections_versions_count != collections_versions_count:
         log.write_log_in_file('warning',
-                              'The number of articles versions or collections versions sucessfully processed is different'
+                              'The number of articles versions or collections versions successfully processed is different'
                               + ' than the number matched. Check the log for details.', True)
 
     log.write_log_in_file('info',
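Note on PATCH 3/3: the summary columns are aligned with hand-counted \t runs, which is why the tab counts have to be readjusted every time a label changes length (as the patches above repeatedly do). A possible alternative, not part of this patch set, is fixed-width formatting; `log_summary_line` is a hypothetical helper and the field width of 76 is an arbitrary illustrative choice:

    def log_summary_line(log, label, value):
        # Left-justify the label in a fixed 76-character field so the values
        # line up in one column regardless of the reader's tab-stop settings.
        log.write_log_in_file('info', f"{label:<76}{value}", True)

    # Example, using the names from main() above:
    log_summary_line(log, "Total unique articles published/fetched:",
                     f"{published_articles_count} / {published_unpublished_count}")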