|
| 1 | +# ========================================================================= |
| 2 | +# |
| 3 | +# Copyright Ziv Yaniv |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0.txt |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | +# |
| 17 | +# ========================================================================= |
| 18 | + |
| 19 | +import argparse |
| 20 | +import json |
| 21 | +import bibtexparser |
| 22 | +import pandas as pd |
| 23 | +import subprocess |
| 24 | +import sys |
| 25 | + |
"""
This utility script creates draft release notes highlighting the differences between
two tagged versions of the IBEX Imaging Community Knowledge-Base (KB). The script compares
the contents of the KB data files at the two tags and writes a file that serves as the
starting point for the official release notes on GitHub. See the KB GitHub Actions workflow
create_draft_release.yml for how the script is used when creating a new release.
"""
| 33 | + |
| 34 | + |
def _split_multi_value(cell):
    """Split a ";"-separated cell into a list of its stripped values.

    Empty fragments and the placeholder "NA" are dropped, so a cell that is
    empty or contains only "NA" yields an empty list.
    """
    fragments = (fragment.strip() for fragment in cell.split(";"))
    return [fragment for fragment in fragments if fragment not in ("", "NA")]


def read_reagent_resources_csv(reagent_file):
    """Read the reagent resources CSV and expand its multi-value columns.

    All columns are read as strings and no value is converted to NaN, so the
    literal "NA" survives the read and is filtered out explicitly when the
    multi-value columns are split.

    Args:
        reagent_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame in which the "Agree", "Disagree" and "MD5" columns hold
        lists of strings (possibly empty); all other columns hold strings.

    Raises:
        KeyError: If one of the multi-value columns is missing from the file.
    """
    df = pd.read_csv(reagent_file, dtype=str, keep_default_na=False)
    for column in ("Agree", "Disagree", "MD5"):
        df[column] = df[column].apply(_split_multi_value)
    return df
| 50 | + |
| 51 | + |
def _load_kb_snapshot(tag, csv_files, reagent_file, zenodo_file, publications_file):
    """Check out ``tag`` and load the KB data files from the working tree.

    Returns:
        A tuple ``(csv_dfs, reagent_df, zenodo_dict, bib_database)`` where
        ``csv_dfs`` is a list of DataFrames in the order of ``csv_files``.

    Raises:
        subprocess.CalledProcessError: If ``git checkout`` fails.
    """
    subprocess.run(["git", "checkout", tag], check=True, capture_output=True)
    csv_dfs = [
        pd.read_csv(csv_file, dtype=str, keep_default_na=False)
        for csv_file in csv_files
    ]
    reagent_df = read_reagent_resources_csv(reagent_file)
    with open(zenodo_file, encoding="utf-8") as fp:
        zenodo_dict = json.load(fp)
    with open(publications_file, encoding="utf-8") as fp:
        bib_database = bibtexparser.load(fp)
    return csv_dfs, reagent_df, zenodo_dict, bib_database


def _format_change_line(label, changes):
    """Return the bullet "* label: N verb, M verb" or "" if nothing changed.

    ``changes`` is an iterable of ``(count, verb)`` pairs; only pairs with a
    positive count are listed.
    """
    parts = [f"{count} {verb}" for count, verb in changes if count > 0]
    if not parts:
        return ""
    return f"* {label}: " + ", ".join(parts) + "\n"


def create_draft_release_notes(old_tag, new_tag, output_file):
    """Write draft release notes describing the changes from old_tag to new_tag.

    The KB data files are read from the working tree after checking out each
    tag in turn, so the function must run from inside the git repository.
    NOTE: the repository is left checked out at ``new_tag`` when the function
    returns; the previously checked out ref is not restored.

    Args:
        old_tag: Git tag (or any ref accepted by ``git checkout``) of the old
            release.
        new_tag: Git tag of the new release.
        output_file: Writable text file object that receives the notes.

    Raises:
        Exception: If a git checkout fails ("Git operation failed: ...") or if
            anything else goes wrong while reading or comparing the data
            ("Error reading files: ..."). The original exception is chained.
    """
    csv_labels = [
        ("data/protocols.csv", "Protocols"),
        ("data/videos.csv", "Videos"),
        ("data/datasets.csv", "Datasets"),
        ("data/software.csv", "Software tools"),
    ]
    data_files = (
        [csv_file for csv_file, _ in csv_labels],
        "data/reagent_resources.csv",
        ".zenodo.json",
        "data/publications.bib",
    )

    notes = [
        "We are proud to announce a new release of the IBEX Imaging "
        "Community Knowledge-Base!\n\n"
        "All changes and updates have been integrated into the "
        "[official Knowledge-Base web-site]"
        "(https://ibeximagingcommunity.github.io/ibex_imaging_knowledge_base).\n\n"
        "This release includes the following updates compared to the "
        "previous release:\n"
    ]

    try:
        old_dfs, old_reagents, old_zenodo, old_bib = _load_kb_snapshot(
            old_tag, *data_files
        )
        new_dfs, new_reagents, new_zenodo, new_bib = _load_kb_snapshot(
            new_tag, *data_files
        )

        # A reagent validation configuration is defined by every column except
        # the ones listed here, which only describe who validated it and the
        # supporting images.
        cols_to_ignore = {
            "Agree",
            "Disagree",
            "Contributor",
            "Image Files",
            "Captions",
            "MD5",
        }
        cols_to_use = [col for col in new_reagents.columns if col not in cols_to_ignore]
        # Outer merge on the configuration columns. The indicator column
        # "_merge" is "both" for configurations present in both releases,
        # "left_only" for new ones and "right_only" for removed ones. The
        # ignored columns are carried along with "_new"/"_old" suffixes.
        comparison_df = new_reagents.merge(
            old_reagents,
            on=cols_to_use,
            suffixes=("_new", "_old"),
            indicator=True,
            how="outer",
        )
        added_rows = comparison_df[comparison_df["_merge"] == "left_only"]
        removed_rows = comparison_df[comparison_df["_merge"] == "right_only"]
        common_rows = comparison_df[comparison_df["_merge"] == "both"]

        # Replications of configurations that already existed: growth in the
        # number of Agree/Disagree entries between the two releases.
        replications_existing = sum(
            len(agree_new) + len(disagree_new) - len(agree_old) - len(disagree_old)
            for agree_new, disagree_new, agree_old, disagree_old in zip(
                common_rows["Agree_new"],
                common_rows["Disagree_new"],
                common_rows["Agree_old"],
                common_rows["Disagree_old"],
            )
        )
        # Replications of new configurations: the first entry is the original
        # validation, not a replication. Clamp at zero so a row without any
        # Agree/Disagree entry cannot reduce the total.
        replications_new = sum(
            max(len(agree) + len(disagree) - 1, 0)
            for agree, disagree in zip(
                added_rows["Agree_new"], added_rows["Disagree_new"]
            )
        )
        notes.append(
            _format_change_line(
                "Reagent validation results",
                [
                    (len(added_rows), "added"),
                    (len(removed_rows), "removed"),
                    (replications_existing + replications_new, "replicated"),
                ],
            )
        )

        # Supporting images are compared via the set of MD5 hashes listed
        # across all rows of each release.
        old_images = {md5 for md5_list in old_reagents["MD5"] for md5 in md5_list}
        new_images = {md5 for md5_list in new_reagents["MD5"] for md5 in md5_list}
        notes.append(
            _format_change_line(
                "Images supporting reagent validation results",
                [
                    (len(new_images - old_images), "added"),
                    (len(old_images - new_images), "removed"),
                ],
            )
        )

        # The remaining csv files are compared by row count only, so the
        # reported number is the net change.
        for old_df, (_, label), new_df in zip(old_dfs, csv_labels, new_dfs):
            row_delta = len(new_df) - len(old_df)
            notes.append(
                _format_change_line(
                    label, [(row_delta, "added"), (-row_delta, "removed")]
                )
            )

        # Publications are identified by their doi field; a missing doi raises
        # KeyError, which is reported through the generic handler below.
        old_dois = {bib_entry["doi"] for bib_entry in old_bib.entries}
        new_dois = {bib_entry["doi"] for bib_entry in new_bib.entries}
        notes.append(
            _format_change_line(
                "Publications",
                [
                    (len(new_dois - old_dois), "added"),
                    (len(old_dois - new_dois), "removed"),
                ],
            )
        )

        # ORCIDs of first time contributors, sorted so the generated notes are
        # reproducible between runs.
        previous_contributors = {c["orcid"] for c in old_zenodo["creators"]}
        current_contributors = {c["orcid"] for c in new_zenodo["creators"]}
        first_time_contributors = sorted(current_contributors - previous_contributors)
        if first_time_contributors:
            notes.append(
                "## Congratulations\n"
                "Congratulations and thank you to everyone who contributed to this release.\n"
                "We would like to especially recognize the following new contributors "
                "(identified by their unique Open Researcher and Contributor ID):\n"
            )
            notes.append(
                ", ".join(
                    f"[{orcid}](https://orcid.org/{orcid})"
                    for orcid in first_time_contributors
                )
            )
            notes.append("\n")

        output_file.write("".join(notes))
    except subprocess.CalledProcessError as e:
        # The checkout output was captured, so include git's own error text;
        # otherwise the reason for the failure is lost.
        git_stderr = e.stderr
        if isinstance(git_stderr, bytes):
            git_stderr = git_stderr.decode(errors="replace")
        git_stderr = (git_stderr or "").strip()
        details = f"{e}\n{git_stderr}" if git_stderr else f"{e}"
        raise Exception(f"Git operation failed: {details}") from e
    except Exception as e:
        raise Exception(f"Error reading files: {e}") from e
| 229 | + |
| 230 | + |
def main(argv=None):
    """Parse the command line and create the draft release notes.

    Args:
        argv: List of command line arguments (without the program name).
            ``None`` makes argparse read ``sys.argv``.

    Returns:
        0 on success, 1 if creating the release notes failed (the error
        message is printed to stderr). Invalid command line arguments make
        argparse exit via ``SystemExit``.
    """
    parser = argparse.ArgumentParser(
        description="Create draft release notes highlighting differences between two "
        "tagged KB versions. Script must be run from the directory containing the git repository."
    )
    parser.add_argument(
        "old_release_tag",
        type=str,
        help="Git tag for the old release.",
    )
    parser.add_argument(
        "new_release_tag",
        type=str,
        help="Git tag for the new release.",
    )
    parser.add_argument(
        "output_file",
        type=argparse.FileType("w"),
        help="File to write the draft release notes to.",
    )
    args = parser.parse_args(argv)

    try:
        create_draft_release_notes(
            args.old_release_tag,
            args.new_release_tag,
            args.output_file,
        )
    except Exception as e:
        print(
            f"{e}",
            file=sys.stderr,
        )
        return 1
    finally:
        # argparse.FileType opened the file while parsing and nothing else
        # closes it. "-" maps to sys.stdout, which must stay open.
        if args.output_file is not sys.stdout:
            args.output_file.close()
    return 0
| 266 | + |
| 267 | + |
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments