diff --git a/app.py b/app.py
index 81b4cd4..cf6894b 100644
--- a/app.py
+++ b/app.py
@@ -186,27 +186,55 @@ def main():
     # Start collections processing after completing fetching data from API and articles processing.
     processed_collections_versions_count = collection_obj.process_collections(collection_data)
 
+    log.write_log_in_file('info', '------- Summary -------')
     log.write_log_in_file('info',
-                          "Total articles versions processed/fetched: \t\t\t"
-                          + f'{processed_articles_versions_count} / {published_articles_versions_count}',
+                          "Total unique articles published/fetched: \t\t\t\t\t"
+                          + f'{published_articles_count} / {published_unpublished_count}',
                           True)
     log.write_log_in_file('info',
-                          "Total processed articles bags already in preservation storage: \t"
-                          + f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
+                          "Total articles versions matchable/published: \t\t\t\t\t"
+                          + f'{published_articles_versions_count - article_obj.no_preserved} / {published_articles_versions_count}',
                           True)
     log.write_log_in_file('info',
-                          "Total collections versions processed/fetched: \t\t\t"
+                          "Total articles versions matched/matchable: \t\t\t\t\t"
+                          + f'{article_obj.no_matched} / {published_articles_versions_count - article_obj.no_preserved}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total articles versions processed/matched: \t\t\t\t\t"
+                          + f'{processed_articles_versions_count} / {article_obj.no_matched}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total articles versions unmatched (matchable-matched): \t\t\t\t"
+                          + f'{article_obj.no_unmatched}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total articles versions already preserved (skipped matching, processing): \t"
+                          + f'{article_obj.no_preserved}',
+                          True)
+#    log.write_log_in_file('info',
+#                          "Total articles versions already preserved (processed but bag not uploaded): \t"
+#                          + f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
+#                          True)
+    log.write_log_in_file('info',
+                          "Total unique collections published/fetched: \t\t\t\t\t"
+                          + f'{collections_count} / {collections_count}',
+                          True)
+    log.write_log_in_file('info',
+                          "Total collections versions processed/published: \t\t\t\t\t"
                           + f'{processed_collections_versions_count} / {collections_versions_count}',
                           True)
     log.write_log_in_file('info',
-                          "Total processed collections bags already in preservation storage: "
+                          "Total collections already preserved: \t\t\t\t\t\t"
                           + f'{collection_obj.processor.duplicate_bag_in_preservation_storage_count}',
                           True)
 
-    if processed_articles_versions_count != published_articles_versions_count or processed_collections_versions_count != collections_versions_count:
+    if article_obj.no_matched != published_articles_versions_count - article_obj.no_preserved:
+        log.write_log_in_file('warning',
+                              'The number of matchable articles versions is different than the number matched. Check the log for details.', True)
+    if processed_articles_versions_count != article_obj.no_matched or processed_collections_versions_count != collections_versions_count:
         log.write_log_in_file('warning',
-                              'The number of articles versions or collections versions sucessfully processed is different'
-                              + ' than the number fetched. Check the log for details.', True)
+                              'The number of articles versions or collections versions successfully processed is different'
+                              + ' than the number matched. Check the log for details.', True)
 
     log.write_log_in_file('info',
                           f"ReBACH finished with {log.warnings_count} warnings and {log.errors_count} errors",
diff --git a/figshare/Article.py b/figshare/Article.py
index 1727441..b12b3f5 100644
--- a/figshare/Article.py
+++ b/figshare/Article.py
@@ -45,6 +45,9 @@ def __init__(self, config, log, ids):
         self.article_non_match_info = {}
         self.input_articles_id = ids
         self.matched_curation_folder_list = []
+        self.no_preserved = 0
+        self.no_matched = 0
+        self.no_unmatched = 0
         self.processor = Integration(self.config_obj, self.logs)
 
     """
@@ -707,7 +710,9 @@ def __copy_files_ual_rdm(self, version_data, folder_name):
     """
     def find_matched_articles(self, articles):
         article_data = {}
-        no_matched = 0
+        self.no_matched = 0
+        self.no_unmatched = 0
+        self.no_preserved = 0
         i = 0
         for article in articles:
             if (articles[article] is not None):
@@ -716,19 +721,37 @@ def find_matched_articles(self, articles):
                 for version_data in article_versions_list:
                     # check curation folder for required files and setup data for further processing.
                     if (version_data is not None and len(version_data) > 0):
-                        data = self.__check_curation_dir(version_data)
-                        version_no = "v" + str(data["version"]).zfill(2)
-                        i += 1
-                        if (data["matched"] is True):
-                            total_file_size = version_data['size']
-                            self.total_all_articles_file_size += total_file_size
-                            article_data[version_data['id']].append(data)
-                            no_matched += 1
-                            self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
-                            if (self.input_articles_id):
-                                self.matched_curation_folder_list.append(data['author_dir'])
+                        # Check whether the current version is already in preservation storage:
+                        # 1. Search for the item in preservation storage using only the article_id and version
+                        # 2. If found, check the metadata hash
+                        # 3. If the hash is the same, the item in figshare is unchanged
+                        # 4. Add this item to the exclusion list
+                        # 5. If the hash is different, the item changed in figshare and has not been preserved yet
+                        # 6. Continue processing using the existing code
+                        # 7. If not found, the item has not been preserved; continue processing using the existing code
+
+                        # dummy code to test updated logic and logging
+                        is_preserved = False
+                        if version_data['id'] == 14192885:
+                            is_preserved = True
+
+                        if not is_preserved:
+                            data = self.__check_curation_dir(version_data)
+                            version_no = "v" + str(data["version"]).zfill(2)
+                            i += 1
+                            if (data["matched"] is True):
+                                total_file_size = version_data['size']
+                                self.total_all_articles_file_size += total_file_size
+                                article_data[version_data['id']].append(data)
+                                self.no_matched += 1
+                                self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
+                                if (self.input_articles_id):
+                                    self.matched_curation_folder_list.append(data['author_dir'])
+                            else:
+                                self.article_non_match_info[i] = f"article {data['id']} {version_no}"
                         else:
-                            self.article_non_match_info[i] = f"article {data['id']} {version_no}"
+                            self.logs.write_log_in_file('info', f"{version_data['url_public_html']} already in preservation. Skipping", True)
Skipping", True) + self.no_preserved += 1 matched_articles = [] if (self.article_match_info): @@ -746,8 +769,9 @@ def find_matched_articles(self, articles): self.logs.write_log_in_file('error', f"Unable to fetch matched article id - {self.article_match_info[index]}", True) unmatched_articles = [] + self.no_unmatched = len(self.article_non_match_info) if (self.article_non_match_info): - self.logs.write_log_in_file('warning', "Curation folder not found for below articles", True) + self.logs.write_log_in_file('info', "Curation folder not found for below articles", True) # log unmatched articles id, and version for index in self.article_non_match_info: @@ -762,8 +786,9 @@ def find_matched_articles(self, articles): self.logs.write_log_in_file("info", f"Total matched unique articles: {len(set(matched_articles))}.", True) self.logs.write_log_in_file("info", f"Total unmatched unique articles: {len(set(unmatched_articles))}.", True) - self.logs.write_log_in_file("info", f"Total matched article versions: {no_matched}.", True) - self.logs.write_log_in_file("info", f"Total unmatched article versions: {len(self.article_non_match_info)}.", True) + self.logs.write_log_in_file("info", f"Total matched article versions: {self.no_matched}.", True) + self.logs.write_log_in_file("info", f"Total unmatched article versions: {self.no_unmatched}.", True) + self.logs.write_log_in_file("info", f"Total skipped article versions (preserved): {self.no_preserved}.", True) if len(set(unmatched_articles)) > 0 or len(self.article_non_match_info) > 0: self.logs.write_log_in_file("warning", "There were unmatched articles or article versions."