diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..8434b4b
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E203, W503
diff --git a/.gitignore b/.gitignore
index 888ee0f..9dad33e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,5 +41,5 @@ dmypy.json
 token.txt
 src/test-model.py
-
+src/pyosmeta/_version_generated.py
 .pdm-build/*
diff --git a/pyproject.toml b/pyproject.toml
index 63934bf..6e8e431 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,14 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
 ]
-dependencies = ["ruamel-yaml>=0.17.21", "requests", "python-dotenv", "pydantic"]
+
+dependencies = [
+    "ruamel-yaml>=0.17.21",
+    "requests",
+    "python-dotenv",
+    "pydantic>=2.0",
+]
+
 # This is the metadata that pip reads to understand what versions your package supports
 requires-python = ">=3.10"
 readme = "README.md"
@@ -52,14 +59,26 @@ license = { text = "MIT" }
 # for a user to run directly from the package.
 [project.scripts] # Optional
 update-contributors = "pyosmeta.cli.update_contributors:main"
-update-reviews = "pyosmeta.cli.update_reviews:main"
-update-reviewers = "pyosmeta.cli.update_review_contribs:main"
+update-reviews = "pyosmeta.cli.process_reviews:main"
+update-review-teams = "pyosmeta.cli.update_review_teams:main"

 # Right now I'm not using pdm to add dependencies.
 # Will explore that later
 # Dynamic versioning below works like setuptools-scm
-[tool.pdm]
+[tool.black]
+line-length = 79
+target-version = ['py310']
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+py_version = 310
+
+# flake8 does not read configuration from pyproject.toml, so the settings
+# live in the .flake8 file added at the repo root
+[tool.flake8]
+extend-ignore = ["E203", "W503"]

 [tool.pdm.build]
@@ -70,6 +89,7 @@ package-dir = "src"
 # Versioning is a backend feature - instructions are in pdm-backend docs
 # https://pdm-backend.fming.dev/metadata/
+
 [tool.pdm.version]
 # Note that you need to create the tag after all commits are created - otherwise
 # pdm adds dev info after the tag number which won't publish to pypi
diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py
index d745d01..e4d2c24 100644
--- a/src/pyosmeta/__init__.py
+++ b/src/pyosmeta/__init__.py
@@ -1,6 +1,14 @@
-# SPDX-FileCopyrightText: 2023-present Leah Wasser
-#
-# SPDX-License-Identifier: MIT
+from .contributors import PersonModel, ProcessContributors
+from .parse_issues import ProcessIssues, ReviewModel
+
+# Trick suggested by a flake8 maintainer to ensure the imports above don't
+# get flagged as "unused"
+__all__ = (
+    "ProcessIssues",
+    "ReviewModel",
+    "PersonModel",
+    "ProcessContributors",
+)

 try:
     from ._version_generated import __version__
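With the new package-level exports above, downstream code can import the models and processors straight from the package root — a quick check:

```python
from pyosmeta import PersonModel, ProcessContributors, ProcessIssues, ReviewModel

print(ReviewModel(package_name="demo").package_name)  # "demo"
```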
diff --git a/src/pyosmeta/cli/process_reviews.py b/src/pyosmeta/cli/process_reviews.py
new file mode 100644
index 0000000..6756c63
--- /dev/null
+++ b/src/pyosmeta/cli/process_reviews.py
@@ -0,0 +1,61 @@
+"""
+Script that parses metadata from an issue and adds it to a yml file for the
+website. It also grabs some of the package metadata such as stars,
+last commit, etc.
+
+Output: packages.yml file containing a list of
+ 1. all packages with accepted reviews
+ 2. information related to the review including reviewers, editors
+ 3. basic package stats including stars, etc.
+
+To run at the CLI: update-reviews
+"""
+
+# TODO: if we export files we might want packages.yml and then under_review.yml
+# thus we'd want to add a second input parameter, file_name
+# TODO: feature - Would be cool to create an "under review now" list as well -
+# ideally this could be passed as a CLI argument with the label we want to
+# search for
+
+import pickle
+
+from pydantic import ValidationError
+
+from pyosmeta import ProcessIssues, ReviewModel
+
+# TODO: change the template to ask for date accepted format year-month-day
+
+
+def main():
+    process_review = ProcessIssues(
+        org="pyopensci",
+        repo_name="software-submission",
+        label_name="6/pyOS-approved 🚀🚀🚀",
+    )
+
+    # Get all issues for approved packages - load as dict
+    issues = process_review.return_response()
+    accepted_reviews = process_review.parse_issue_header(issues, 45)
+
+    # Update gh metrics via api for all packages
+    repo_endpoints = process_review.get_repo_endpoints(accepted_reviews)
+    all_reviews = process_review.get_gh_metrics(
+        repo_endpoints, accepted_reviews
+    )
+
+    # Populate model objects with review data + metrics
+    final_reviews = {}
+    for key, review in all_reviews.items():
+        # First add gh meta to each dict
+        print("Parsing & validating", key)
+        try:
+            final_reviews[key] = ReviewModel(**review)
+        except ValidationError as ve:
+            print(key, ":", ve)
+
+    with open("all_reviews.pickle", "wb") as f:
+        pickle.dump(final_reviews, f)
+
+
+if __name__ == "__main__":
+    main()
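The pickle written above is the hand-off to the other CLI scripts; a minimal sketch of reading it back, assuming you run from the directory where `main()` wrote the file:

```python
import pickle

with open("all_reviews.pickle", "rb") as f:
    reviews = pickle.load(f)  # dict: package name -> ReviewModel

for name, review in reviews.items():
    print(name, review.date_accepted)
```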
diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py
index 31c29ac..b0e1bb2 100644
--- a/src/pyosmeta/cli/update_contributors.py
+++ b/src/pyosmeta/cli/update_contributors.py
@@ -1,15 +1,17 @@
 import argparse
 import pickle

-from pyosmeta.contributors import ProcessContributors
-from pyosmeta.file_io import clean_export_yml, load_website_yml
+import pydantic
+from pydantic import ValidationError

-# TODO: will this still run in gh actions??
-# TODO: add update=True like i did for update_reviews
-# TODO: still need to add a flag to not update specific fields
-# TODO: if i use composition and there are helpers in a class
-# that are used in a method that i call via composition are the helpers
-# still available?
+from pyosmeta.contributors import PersonModel, ProcessContributors
+from pyosmeta.file_io import create_paths, open_yml_file
+
+print(pydantic.__version__)
+
+# TODO - https://stackoverflow.com
+# /questions/55762673/how-to-parse-list-of-models-with-pydantic
+# I can use TypeAdapter to convert the json data to model objects!


 def main():
@@ -20,16 +22,14 @@ def main():
     parser.add_argument(
         "--update",
         type=str,
-        help="Will force update contrib info from GitHub for every contributor",
+        help="Force update contrib info from GitHub for every contributor",
     )
     args = parser.parse_args()
+    update_value = args.update

-    if args:
+    if update_value:
         update_all = True

-    # TODO - maybe add these as an attr in the contribs class?
-    base_url = "https://raw.githubusercontent.com/pyOpenSci/"
-    end_url = "/main/.all-contributorsrc"
     repos = [
         "python-package-guide",
         "software-peer-review",
@@ -37,48 +37,71 @@ def main():
         "software-review",
         "update-web-metadata",
     ]
-    json_files = [base_url + repo + end_url for repo in repos]
+    json_files = create_paths(repos)

     # Get existing contribs from pyopensci.github.io repo (website data)
-    web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml"
+    base_url = "https://raw.githubusercontent.com/pyOpenSci/"
+    web_yaml_path = (
+        base_url + "pyopensci.github.io/main/_data/contributors.yml"
+    )

-    process_contribs = ProcessContributors(json_files)
+    web_contribs = open_yml_file(web_yaml_path)
+
+    # Populate all existing contribs into model objects
+    all_contribs = {}
+    for a_contrib in web_contribs:
+        print(a_contrib["github_username"])
+        try:
+            all_contribs[a_contrib["github_username"].lower()] = PersonModel(
+                **a_contrib
+            )
+        except ValidationError as ve:
+            print(a_contrib["github_username"])
+            print(ve)

-    # Returns a list of dict objects with gh usernames (lowercase) as keys
-    # TODO: File io module (could just be a function)
-    web_contribs = load_website_yml(url=web_yaml_path, key="github_username")
-    bot_all_contribs_dict = process_contribs.combine_json_data()
+    print("Done processing all-contribs")

-    # Parse through each user in the web yaml, if they don't exist, add them
-    # finally - update contrib types
-    for key, users in bot_all_contribs_dict.items():
+    # Create a list of all contributors across repositories
+    process_contribs = ProcessContributors(json_files)
+    bot_all_contribs = process_contribs.combine_json_data()
+
+    print("Updating contrib types and searching for new users now")
+    for key, users in bot_all_contribs.items():
         for gh_user in users:
-            # Add any new contributors
-            if gh_user not in web_contribs.keys():
-                print("I found a new contributor! Adding:", gh_user)
-                web_contribs.update(
-                    # TODO: this is also used in the other 2 scripts
-                    # but add user info is in the contribs class - i do
-                    # think it belongs there
-                    process_contribs.check_add_user(gh_user, web_contribs)
-                )
-
-            # Update contrib type list
-            existing_contribs = web_contribs[gh_user]["contributor_type"]
-            # TODO: This helper is used in all three scripts but defined
-            # in the contribs class
-            web_contribs[gh_user][
-                "contributor_type"
-            ] = process_contribs.update_contrib_list(existing_contribs, key)
+            # Find and populate data for any new contributors
+            if gh_user not in all_contribs.keys():
+                print("Missing", gh_user, "Adding them now")
+                new_contrib = process_contribs.get_user_info(gh_user)
+                all_contribs[gh_user] = PersonModel(**new_contrib)
+
+            # Update contribution type list for all users
+            all_contribs[gh_user].add_unique_value("contributor_type", key)

     if update_all:
-        gh_data = process_contribs.get_gh_data(web_contribs)
-        web_contribs = process_contribs.update_contrib_data(web_contribs, gh_data)
+        for user in all_contribs.keys():
+            print("Updating all user info from github", user)
+            new_gh_data = process_contribs.get_user_info(user)
+
+            # TODO: turn this into a small update method
+            existing = all_contribs[user].model_dump()
+
+            for key, item in new_gh_data.items():
+                if key == "mastodon":
+                    # Mastodon isn't available in the GH api yet
+                    continue
+                # Don't replace the value if there is a noupdate flag
+                # TODO: This approach doesn't work, ruamel-yaml doesn't
+                # preserve inline comments
+                if key == "name" and existing[key]:
+                    continue
+                else:
+                    existing[key] = item
+
+            all_contribs[user] = PersonModel(**existing)

-    # Export data
-    # Pickle supports updates after parsing reviews
+    # Export to pickle which supports updates after parsing reviews
     with open("all_contribs.pickle", "wb") as f:
-        pickle.dump(web_contribs, f)
+        pickle.dump(all_contribs, f)


 if __name__ == "__main__":
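The TODO above ("turn this into a small update method") could take a shape like the following — a hypothetical helper, not part of this diff, that mirrors the merge rules of the inline loop:

```python
from pyosmeta import PersonModel


def update_from_gh(person: PersonModel, gh_data: dict) -> PersonModel:
    """Merge fresh GitHub data into an existing PersonModel.

    Keeps a curated name if one exists and skips fields the GitHub
    API can't provide, matching the loop in main() above.
    """
    existing = person.model_dump()
    for key, item in gh_data.items():
        if key == "mastodon":
            continue  # not available from the GitHub API yet
        if key == "name" and existing[key]:
            continue  # keep the curated name
        existing[key] = item
    return PersonModel(**existing)
```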
- -""" - -import os - -from pyosmeta.contributors import ProcessContributors -from pyosmeta.file_io import clean_export_yml, load_pickle - - -def get_clean_user(username: str): - return username.lower().strip() - - -def main(): - # TODO: move refresh contribs and contribs dict attr to - # processContribs and remove this module altogether - updateContribs = ProcessContributors([]) - - # Two pickle files are outputs of the two other scripts - # use that data to limit web calls - contribs = load_pickle("all_contribs.pickle") - - # Output of update_reviews.py - packages = load_pickle("all_reviews.pickle") - - contrib_types = updateContribs.contrib_types - - for pkg_name, issue_meta in packages.items(): - print("Processing review team for:", pkg_name) - for issue_role in contrib_types.keys(): - if issue_role == "all_current_maintainers": - if issue_role in issue_meta: - # Loop through each maintainer in the list - for i, a_maintainer in enumerate(issue_meta.get(issue_role)): - gh_user = get_clean_user(a_maintainer["github_username"]) - - if gh_user not in contribs.keys(): - contribs.update( - updateContribs.check_add_user(gh_user, contribs) - ) - - # Update contrib packages for peer review - ( - contrib_key, - pkg_list, - ) = updateContribs.refresh_contribs( - contribs[gh_user], - pkg_name, # new contribs - issue_role, - ) - # Update users contrib list - contribs[gh_user][contrib_key] = pkg_list - - _, contrib_list = updateContribs.refresh_contribs( - contribs[gh_user], - None, - issue_role, - ) - contribs[gh_user]["contributor_type"] = contrib_list - - # If name is missing in issue summary, populate from contribs - if a_maintainer["name"] == "": - packages[pkg_name]["all_current_maintainers"][i][ - "name" - ] = contribs[gh_user]["name"] - - else: - print( - "All maintainers is missing in the review for ", - pkg_name, - ) - - else: - # Else we are processing editors, reviewers... - gh_user = get_clean_user( - packages[pkg_name][issue_role]["github_username"] - ) - - if gh_user not in contribs.keys(): - # If they aren't already in contribs, add them - contribs.update(updateContribs.check_add_user(gh_user, contribs)) - # Update user package contributions - ( - contrib_key, - pkg_list, - ) = updateContribs.refresh_contribs( - contribs[gh_user], - pkg_name, # new contribs - issue_role, - ) - - # Update users contrib list - contribs[gh_user][contrib_key] = pkg_list - - _, contrib_list = updateContribs.refresh_contribs( - contribs[gh_user], - None, - issue_role, - ) - contribs[gh_user]["contributor_type"] = contrib_list - - # If users's name is missing in issue, populate from contribs dict - if issue_meta[issue_role]["name"] == "": - packages[pkg_name][issue_role]["name"] = contribs[gh_user]["name"] - - # Export to yaml - clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) - clean_export_yml(packages, os.path.join("_data", "packages.yml")) - - -if __name__ == "__main__": - main() diff --git a/src/pyosmeta/cli/update_review_teams.py b/src/pyosmeta/cli/update_review_teams.py new file mode 100644 index 0000000..5ff10b0 --- /dev/null +++ b/src/pyosmeta/cli/update_review_teams.py @@ -0,0 +1,128 @@ +""" +This script parses through our reviews and contributors and: + +1. Updates reviewer, editor and maintainer data in the contributor.yml file to +ensure all packages they supported are listed there. +1b: And that they have a listing as peer-review under contributor type +2. Updates the packages metadata with the participants names if it's missing +3. 
diff --git a/src/pyosmeta/cli/update_review_teams.py b/src/pyosmeta/cli/update_review_teams.py
new file mode 100644
index 0000000..5ff10b0
--- /dev/null
+++ b/src/pyosmeta/cli/update_review_teams.py
@@ -0,0 +1,128 @@
+"""
+This script parses through our reviews and contributors and:
+
+1. Updates reviewer, editor and maintainer data in the contributor.yml file to
+ensure all packages they supported are listed there.
+1b: And that they have a listing as peer-review under contributor type
+2. Updates the packages metadata with the participants names if it's missing
+3. FUTURE: finally it looks to see if we are missing review participants from
+the review issues in the contributor file and updates that file.
+
+This script assumes that update_contributors and update_reviews have been run.
+Rather than hit any APIs it just updates information from the issues.
+To run: update-review-teams
+
+# TODO - FEATURE we have some packages that were NOT approved but we had
+# editors and reviewers.
+# We need to acknowledge these people as well. maybe tag them with waiting on
+# maintainer response??
+# TODO: package-wide feature: create no update flag for entries
+# TODO: make sure we can add a 3rd or 4th reviewer - crowsetta has this as
+# will biocypher
+
+"""
+import os
+
+from pydantic import ValidationError
+
+from pyosmeta.contributors import PersonModel, ProcessContributors
+from pyosmeta.file_io import clean_export_yml, load_pickle
+
+
+def get_clean_user(username: str) -> str:
+    """A small helper that removes whitespace and ensures username is
+    lower case"""
+    return username.lower().strip()
+
+
+def main():
+    process_contribs = ProcessContributors([])
+
+    # Two pickle files are outputs of the two other scripts
+    # use that data to limit web calls
+    contribs = load_pickle("all_contribs.pickle")
+    packages = load_pickle("all_reviews.pickle")
+
+    contrib_types = process_contribs.contrib_types
+
+    for pkg_name, issue_meta in packages.items():
+        print("Processing review team for:", pkg_name)
+        for issue_role in contrib_types.keys():
+            if issue_role == "all_current_maintainers":
+                # Loop through each maintainer in the list
+                for i, a_maintainer in enumerate(
+                    issue_meta.all_current_maintainers
+                ):
+                    gh_user = get_clean_user(a_maintainer["github_username"])
+
+                    if gh_user not in contribs.keys():
+                        print("Found a new user!", gh_user)
+                        new_contrib = process_contribs.get_user_info(gh_user)
+                        try:
+                            contribs[gh_user] = PersonModel(**new_contrib)
+                        except ValidationError as ve:
+                            print(ve)
+
+                    # Update user package contributions (if it's unique)
+                    review_key = contrib_types[issue_role][0]
+                    contribs[gh_user].add_unique_value(
+                        review_key, pkg_name.lower()
+                    )
+
+                    # Update user contrib list (if it's unique)
+                    review_roles = contrib_types[issue_role][1]
+                    contribs[gh_user].add_unique_value(
+                        "contributor_type", review_roles
+                    )
+
+                    # If name is missing in issue, populate from contribs
+                    if a_maintainer["name"] == "":
+                        name = getattr(contribs[gh_user], "name")
+                        packages[pkg_name].all_current_maintainers[i][
+                            "name"
+                        ] = name
+
+            else:
+                # Else we are processing editors, reviewers...
+                gh_user = get_clean_user(
+                    getattr(packages[pkg_name], issue_role)["github_username"]
+                )
+
+                if gh_user not in contribs.keys():
+                    # If they aren't already in contribs, add them
+                    print("Found a new user!", gh_user)
+                    new_contrib = process_contribs.get_user_info(gh_user)
+                    try:
+                        contribs[gh_user] = PersonModel(**new_contrib)
+                    except ValidationError as ve:
+                        print(ve)
+
+                # Update user package contributions (if it's unique)
+                review_key = contrib_types[issue_role][0]
+                contribs[gh_user].add_unique_value(
+                    review_key, pkg_name.lower()
+                )
+
+                # Update user contrib list (if it's unique)
+                review_roles = contrib_types[issue_role][1]
+                contribs[gh_user].add_unique_value(
+                    "contributor_type", review_roles
+                )
+
+                # If user's name is missing in issue, populate from contribs
+                if getattr(issue_meta, issue_role)["name"] == "":
+                    attribute_value = getattr(packages[pkg_name], issue_role)
+                    attribute_value["name"] = getattr(
+                        contribs[gh_user], "name"
+                    )
+
+    # Export to yaml
+    contribs_ls = [model.model_dump() for model in contribs.values()]
+    pkgs_ls = [model.model_dump() for model in packages.values()]
+
+    clean_export_yml(contribs_ls, os.path.join("_data", "contributors.yml"))
+    clean_export_yml(pkgs_ls, os.path.join("_data", "packages.yml"))
+
+
+if __name__ == "__main__":
+    main()
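The `review_key` / `review_roles` lookups above index into `ProcessContributors.contrib_types` (defined in `contributors.py` further down in this diff); the shape being indexed:

```python
from pyosmeta import ProcessContributors

contrib_types = ProcessContributors([]).contrib_types
review_key, review_roles = contrib_types["reviewer_1"]
print(review_key)    # "packages_reviewed"
print(review_roles)  # ["reviewer", "peer-review"]
```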
diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py
deleted file mode 100644
index 8e632cd..0000000
--- a/src/pyosmeta/cli/update_reviews.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Script that parses metadata from na issue and adds it to a yml file for the
-website. It also grabs some of the package metadata such as stars,
-last commit, etc.
-
-Output: packages.yml file containing a list of
- 1. all packages with accepted reviews
- 2. information related to the review including reviewers, editors
- 3. basic package stats including stars, etc.
-
-To run at the CLI: parse_issue_metadata
-"""
-
-# TODO: if we export files we might want packages.yml and then under_review.yml
-# thus we'd want to add a second input parameters which was file_name
-# TODO: feature - Would be cool to create an "under review now" list as well -
-# ideally this could be passed as a CLI argument with the label we want to
-# search for
-
-import argparse
-import pickle
-
-from pyosmeta import ProcessIssues
-from pyosmeta.file_io import clean_export_yml, load_website_yml
-
-
-def main():
-    update_all = False
-    parser = argparse.ArgumentParser(
-        description="A CLI script to update pyOpenSci reviews"
-    )
-    parser.add_argument(
-        "--update",
-        type=str,
-        help="Will force update review info from GitHub for every review",
-    )
-    args = parser.parse_args()
-
-    if args:
-        update_all = True
-
-    web_reviews_path = "https://raw.githubusercontent.com/pyOpenSci/pyopensci.github.io/main/_data/packages.yml"
-
-    issueProcess = ProcessIssues(
-        org="pyopensci",
-        repo_name="software-submission",
-        label_name="6/pyOS-approved 🚀🚀🚀",
-    )
-
-    # Open web yaml & return dict with package name as key
-    web_reviews = load_website_yml(key="package_name", url=web_reviews_path)
-
-    # Get all issues for approved packages
-    issues = issueProcess.return_response()
-    all_accepted_reviews = issueProcess.parse_issue_header(issues, 15)
-
-    # Parse through reviews, identify new ones, fix case
-    if update_all == True:
-        for review_key, review_meta in all_accepted_reviews.items():
-            web_reviews[review_key.lower()] = review_meta
-    else:
-        for review_key, review_meta in all_accepted_reviews.items():
-            if review_key.lower() not in web_reviews.keys():
-                print("Yay - pyOS has a new package:", review_key)
-                web_reviews[review_key.lower()] = review_meta
-
-    # Update gh metrics via api for all packages
-    repo_endpoints = issueProcess.get_repo_endpoints(web_reviews)
-    gh_stats = [
-        "name",
-        "description",
-        "homepage",
-        "created_at",
-        "stargazers_count",
-        "watchers_count",
-        "forks",
-        "open_issues_count",
-        "forks_count",
-    ]
-
-    # Get gh metadata for each package submission
-    all_repo_meta = {}
-    for package_name in repo_endpoints.keys():
-        print("Getting GitHub stats for", package_name)
-        package_api = repo_endpoints[package_name]
-        all_repo_meta[package_name] = issueProcess.get_repo_meta(package_api, gh_stats)
-
-        all_repo_meta[package_name]["contrib_count"] = issueProcess.get_repo_contribs(
-            package_api
-        )
-        all_repo_meta[package_name]["last_commit"] = issueProcess.get_last_commit(
-            package_api
-        )
-        # Add github meta to review metadata
-        web_reviews[package_name]["gh_meta"] = all_repo_meta[package_name]
-
-    with open("all_reviews.pickle", "wb") as f:
-        pickle.dump(web_reviews, f)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py
index 4944473..22e22aa 100644
--- a/src/pyosmeta/contributors.py
+++ b/src/pyosmeta/contributors.py
@@ -1,16 +1,171 @@
 import json
 import os
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+import re

 import requests
+from dataclasses import dataclass
 from dotenv import load_dotenv
+from pydantic import (
+    AliasChoices,
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_serializer,
+    field_validator,
+)
+from typing import List, Optional, Set, Tuple, Union
+
+
+class UrlValidatorMixin:
+    # Check fields is false given mixin is used by two diff classes
+    @field_validator(
+        "website", "documentation", mode="before", check_fields=False
+    )
+    @classmethod
+    def format_url(cls, url: str) -> str:
+        """Append https to the beginning of URL if it doesn't exist & cleanup
+        If the url doesn't have https add it
+        If the url starts with http change it to https
+        Else do nothing
+
+        Parameters
+        ----------
+        url : str
+            String representing the url grabbed from the GH api
+
+        """
+
+        if not url:
+            return url  # Returns empty string if url is empty
+        else:
+            if url.startswith("http://"):
+                print(f"{url} 'http://' replacing w 'https://'")
+                url = url.replace("http://", "https://")
+            elif not url.startswith("http"):
+                print("Oops, missing http")
+                url = "https://" + url
+            if cls._check_url(url=url):
+                return url
+            else:
+                return None
+
+    @staticmethod
+    def _check_url(url: str) -> bool:
+        """Test url. Return true if there's a valid response, False if not
+
+        Parameters
+        ----------
+        url : str
+            String for a url to a website to test.
+
+        """
+
+        try:
+            response = requests.get(url, timeout=6)
+            return response.status_code == 200
+        except Exception:
+            print("Oops, url", url, "is not valid, removing it")
+            return False
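A quick, hedged check of the mixin via `PersonModel` (defined next); `_check_url` makes a live request, so this assumes network access and that the site responds:

```python
from pyosmeta import PersonModel

person = PersonModel(
    github_username="octocat",
    github_image_id=583231,
    website="www.pyopensci.org",  # scheme intentionally missing
)
print(person.website)  # "https://www.pyopensci.org" if the site responds
```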
+class PersonModel(BaseModel, UrlValidatorMixin):
+    model_config = ConfigDict(
+        populate_by_name=True,
+        str_strip_whitespace=True,
+        validate_assignment=True,
+    )
+
+    name: Optional[str] = None
+    title: Optional[Union[list[str], str]] = None
+    sort: Optional[int] = None
+    bio: Optional[str] = None
+    organization: Optional[str] = Field(
+        None, validation_alias=AliasChoices("company")
+    )
+    github_username: str = Field(None, validation_alias=AliasChoices("login"))
+    github_image_id: int = Field(None, validation_alias=AliasChoices("id"))
+    deia_advisory: Optional[bool] = False
+    editorial_board: Optional[bool] = Field(
+        None, validation_alias=AliasChoices("editorial-board")
+    )
+    advisory: Optional[bool] = False
+    twitter: Optional[str] = Field(
+        None, validation_alias=AliasChoices("twitter_username")
+    )
+    mastodon: Optional[str] = Field(
+        None, validation_alias=AliasChoices("mastodon_username", "mastodon")
+    )
+    orcidid: Optional[str] = None
+    website: Optional[str] = Field(
+        None, validation_alias=AliasChoices("blog", "website")
+    )
+    board: Optional[bool] = False
+    contributor_type: Set[str] = set()
+    packages_editor: Set[str] = set()
+    packages_submitted: Set[str] = set()
+    packages_reviewed: Set[str] = set()
+    location: Optional[str] = None
+    email: Optional[str] = None
+
+    @field_validator(
+        "packages_reviewed",
+        "packages_submitted",
+        "packages_editor",
+        "contributor_type",
+        mode="before",
+    )
+    @classmethod
+    def convert_to_set(cls, value: list[str]):
+        if isinstance(value, list):
+            if not value:
+                return set()
+            elif value[0] is None:
+                return set()
+            else:
+                value = [a_val.lower() for a_val in value]
+                return set(value)
+        elif value is None:
+            return set()
+        return {value.lower()}
+
+    def add_unique_value(self, attr_name: str, values: Union[str, list[str]]):
+        """A helper that will add only unique values to an existing set"""
+        if isinstance(values, str):
+            values = [values]
+        attribute = getattr(self, attr_name)
+        if isinstance(attribute, set):
+            attribute.update(values)
+        else:
+            raise ValueError(f"{attr_name} is not a set attribute")
+
+    @field_serializer(
+        "packages_reviewed",
+        "packages_submitted",
+        "packages_editor",
+        "contributor_type",
+    )
+    def serialize_set(self, items: Set[str]):
+        """This is a serializer that runs on export. It ensures sets are
+        converted to lists"""
+        return sorted(list(items))
+
+    @field_validator("bio", mode="before")
+    @classmethod
+    def clean_strings(cls, string: str) -> str:
+        """This is a cleaning step that will remove spurious
+        characters from string fields.
+
+        """
+        if isinstance(string, str):
+            string = re.sub(r"[\r\n]", "", string)
+        return string
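A short round trip through the set handling above (hypothetical user; note that `model_dump()` returns the sets as sorted lists via `serialize_set`):

```python
from pyosmeta import PersonModel

p = PersonModel(github_username="some-user", github_image_id=1234)
p.add_unique_value("packages_reviewed", "pandera")
p.add_unique_value("packages_reviewed", ["pandera", "movingpandas"])  # dupes dropped
print(p.model_dump()["packages_reviewed"])  # ['movingpandas', 'pandera']
```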
 @dataclass
 class ProcessContributors:
-    # When initializing how do you decide what should be an input
-    # attribute vs just something a method accepted when called?
+    """A class that contains some basic methods to support populating and
+    updating contributor data."""
+
     def __init__(self, json_files: List) -> None:
         """
         Parameters
@@ -38,15 +193,15 @@ def __init__(self, json_files: List) -> None:
         ]

         self.contrib_types = {
-            "reviewer_1": ["packages-reviewed", ["reviewer", "peer-review"]],
-            "reviewer_2": ["packages-reviewed", ["reviewer", "peer-review"]],
-            "editor": ["packages-editor", ["editor", "peer-review"]],
+            "reviewer_1": ["packages_reviewed", ["reviewer", "peer-review"]],
+            "reviewer_2": ["packages_reviewed", ["reviewer", "peer-review"]],
+            "editor": ["packages_editor", ["editor", "peer-review"]],
             "submitting_author": [
-                "packages-submitted",
+                "packages_submitted",
                 ["maintainer", "submitting-author", "peer-review"],
             ],
             "all_current_maintainers": [
-                "packages-submitted",
+                "packages_submitted",
                 ["maintainer", "peer-review"],
             ],
         }
@@ -63,124 +218,6 @@ def get_token(self) -> str:
         load_dotenv()
         return os.environ["GITHUB_TOKEN"]

-    def refresh_contribs(self, contribs: Dict, new_contribs, review_role):
-        """Need to add ....
-
-        Parameters
-        ----------
-
-
-        Returns
-        -------
-        """
-        contrib_types = self.contrib_types
-        contrib_key_yml = ""
-        # Contributor type will be updated which is a list of roles
-        if new_contribs:
-            contrib_key_yml = contrib_types[review_role][0]
-            existing_contribs = contribs[contrib_key_yml]
-        # Else this is a specific review role meant to update package list
-        else:
-            new_contribs = contrib_types[review_role][1]
-            existing_contribs = contribs["contributor_type"]
-
-        final_list = self.update_contrib_list(existing_contribs, new_contribs)
-        return (contrib_key_yml, final_list)
-
-    def create_contrib_template(self) -> Dict:
-        """A small helper that creates a template for a new contributor
-        that we are adding to our contributor.yml file"""
-
-        return {
-            "name": "",
-            "bio": "",
-            "organization": "",
-            "title": "",
-            "github_username": "",
-            "github_image_id": "",
-            "editorial-board": "",
-            "twitter": "",
-            "mastodon": "",
-            "orcidid": "",
-            "website": "",
-            "contributor_type": [],
-            "packages-editor": [],
-            "packages-submitted": [],
-            "packages-reviewed": [],
-            "location": "",
-            "email": "",
-        }
-
-    # TODO - This utility is used across all scripts.
-    def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
-        """Helper function that takes an input object as a list or string.
-        If it is a list containing none, it returns an empty list
-        if it is a string is returns the string as a list
-        removes 'None' if that is in the list. and returns
-        either an empty clean list of the list as is."""
-
-        if isinstance(a_list, str):
-            a_list = [a_list]
-        elif not a_list:
-            a_list = []
-        # Remove None from list
-        a_list = list(filter(lambda x: x, a_list))
-        return a_list
-
-    # TODO - There is likely a better way to do this. If it returns an
-    # empty list then we know there are no new vals... so it likely can
-    # return a single thing
-    def unique_new_vals(
-        self, a_list: List[str], a_item: List[str]
-    ) -> Tuple[bool, Optional[List[str]]]:
-        """Checks two objects either a list and string or two lists
-        and evaluates whether there are differences between them.
-
-        Returns
-        -------
-        Tuple
-            Containing a boolean representing whether there are difference
-            or not and a list containing new value if there are differences.
-
-        """
-
-        default = (False, None)
-        list_lower = [al.lower() for al in a_list]
-        item_lower = [ai.lower() for ai in a_item]
-        diff = list(set(item_lower) - set(list_lower))
-        if len(diff) > 0:
-            default = (True, diff)
-        return default
-
-    # TODO - also a helper used by all scripts
-    def update_contrib_list(
-        self,
-        existing_contribs: Union[List, str],
-        new_contrib: Union[List, str],
-    ) -> List:
-        """Method that gets an existing list of contribs.
-        cleans the list and then checks the list against a
-        new contribution to see if it should be added.
-
-        Parameters
-        ----------
-        existing_contribs: list or str
-            A users existing contributions
-        new_contrib: list or str
-            a list or a single new contribution to be added
-
-        """
-
-        # Cleanup first
-        cleaned_list = self.clean_list(existing_contribs)
-        new_contrib = self.clean_list(new_contrib)
-
-        unique_vals, new_vals = self.unique_new_vals(cleaned_list, new_contrib)
-        if unique_vals:
-            cleaned_list += new_vals
-
-        return cleaned_list
-
     def check_contrib_type(self, json_file: str):
         """
         Determine the type of contribution the person
@@ -211,22 +248,26 @@ def check_contrib_type(self, json_file: str):
         contrib_type = "community"
         return contrib_type

-    def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None:
-        """Check to make sure user exists and if not, add them
+    # TODO possibly could repurpose this as a check in the code
+    # but it should return get_user_info
+    # def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None:
+    #     """Check to make sure user exists in the existing contrib data.
+    #     If they don't exist, add them

-        Parameters
-        ----------
-        gh_user : str
-            github username
-        contribs: dict
-            A dictionary containing contributors with gh user being the key
+    #     Parameters
+    #     ----------
+    #     gh_user : str
+    #         github username
+    #     contribs: dict
+    #         A dictionary containing contributors with gh user being the key

-        This returns the updated dictionary with a new user at the end.
+    #     This returns the updated dictionary with a new user at the end.

-        """
-        if gh_user not in contribs.keys():
-            print("Missing user", gh_user, "adding them now.")
-            return self.add_new_user(gh_user)
+    #     """
+    #     if gh_user not in contribs.keys():
+    #         print("Missing user", gh_user, "adding them now.")
+    #         return self.get_user_info(gh_user)

     def load_json(self, json_path: str) -> dict:
         """
@@ -239,6 +280,7 @@ def load_json(self, json_path: str) -> dict:
             print(ae)
         return json.loads(response.text)
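For reference, a sketch of how the loader methods below fit together when building the cross-repo contributor list (requires network access; the dict keys are the contrib types returned by `check_contrib_type`, and the usernames are illustrative):

```python
from pyosmeta import ProcessContributors
from pyosmeta.file_io import create_paths

pc = ProcessContributors(create_paths(["python-package-guide"]))
bot_all_contribs = pc.combine_json_data()
# e.g. {"packaging-guide": ["user-a", "user-b", ...]}
```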
+    # TODO: check whether I'm using the contrib-type part of this method
     def process_json_file(self, json_file: str) -> Tuple[str, List]:
         """Deserialize a JSON file from a URL and cleanup data
@@ -278,6 +320,8 @@ def combine_json_data(self) -> dict:
         # Create an empty dictionary to hold the combined data
         combined_data = {}

+        # TODO: to make this faster, it might be better to return a dict
+        # with username : [contrib1, contrib2]
         for json_file in self.json_files:
             # Process the JSON file and add the data to the combined dictionary
             try:
@@ -287,22 +331,9 @@ def combine_json_data(self) -> dict:
                 print("Oops - can't process", json_file, e)
         return combined_data

-    def get_gh_usernames(self, contrib_data: List) -> List:
-        """Get a list of all gh usernames
-
-        Parameters
-        ----------
-        contrib_data : list
-            Dict containing all of the contributor information for the website.
-
-        """
-        all_usernames = []
-        for item in contrib_data:
-            all_usernames.append(item["github_username"])
-
-        return all_usernames
-
-    def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
+    def get_user_info(
+        self, username: str, aname: Optional[str] = None
+    ) -> dict:
         """
         Get a single user's information from their GitHub username using the
         GitHub API
@@ -314,6 +345,7 @@ def get_user_info(
             Github username to retrieve data for
         aname : str default=None
             A user's name from the contributors.yml file.
+            https://docs.github.com/en/rest/users/users?apiVersion=2022-11-28#get-a-user

         Returns
         -------
@@ -327,7 +359,6 @@ def get_user_info(
         # if message = Bad credentials
         response_json = response.json()

-        user_data = {}
         # TODO: make an attribute and call it here?
         update_keys = {
             "name": "name",
@@ -342,20 +373,9 @@ def get_user_info(
             "github_username": "login",
         }

-        user_data[username] = {}
-        for akey in update_keys:
-            # If the key is name, check to see if there is name data
-            # already there. don't force update if there's a name!
-            if akey == "name":
-                if aname is None:
-                    user_data[username][akey] = response_json.get(
-                        update_keys[akey], None
-                    )
-                else:
-                    # Else just keep the original name
-                    user_data[username][akey] = aname
-            else:
-                user_data[username][akey] = response_json.get(update_keys[akey], None)
+        user_data = {}
+        for key in update_keys:
+            user_data[key] = response_json.get(update_keys[key], None)

         return user_data
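Callers should note the shape change here: `get_user_info` no longer nests the payload under the username. A hedged illustration (assumes a GITHUB_TOKEN is configured; values depend on the profile queried):

```python
from pyosmeta import ProcessContributors

pc = ProcessContributors([])
info = pc.get_user_info("octocat")
# Before this change: {"octocat": {"name": ..., "github_username": "octocat", ...}}
# Now a flat dict:    {"name": ..., "github_username": "octocat", ...}
print(info["github_username"])  # "octocat"
```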
- - """ - - new = {} - new[gh_user] = self.create_contrib_template() - gh_data = self.get_gh_data([gh_user]) - # Update their metadata in the dict and return - updated_data = self.update_contrib_data(new, gh_data) - return updated_data - - def get_gh_data(self, contribs: Union[Dict[str, str], List]) -> dict[str, str]: - """Parses through each GitHub username and hits the GitHub - API to grab user information. - - Parameters - ---------- - contribs : dict - Dict containing all current contrib info - - Returns - ------- - Dict - A dict of updated user data via a list of github usernames - """ - all_user_info = {} - for gh_user in contribs: - print("Getting github data for: ", gh_user) - # If the user already has a name in the dict, don't update - # Important to allow us to update names to ensure correct spelling, - # etc on website - if isinstance(contribs, list): - aname = None - else: - aname = contribs[gh_user]["name"] - - all_user_info[gh_user] = self.get_user_info(gh_user, aname) - return all_user_info - - def _check_url(self, url: str) -> bool: - """Test a url and return true if it works, false if not - - Parameters - ---------- - url : str - String for a url to a website to test. - - """ - - try: - response = requests.get(url, timeout=6) - return response.status_code == 200 - except: - print("Oops, url", url, "is not valid, removing it") - return False - - def update_contrib_data(self, contrib_data: dict, gh_data: dict): - """Update contributor data from the GH API return. - - Use the GitHub API to grab user profile data such as twitter handle, - mastodon, website, email and location and update contributor - information. GitHub profile data is the source of truth source for - contributor metadata. - - Parameters - ---------- - contrib_data : dict - A dict containing contributor data to be updated - gh_data : dict - Updated contributor data pulled from github API - - Returns - ------- - dict - Dictionary containing updated contributor data. 
- """ - - for i, gh_name in enumerate(contrib_data.keys()): - print(i, gh_name) - # Update the key:value pairs for data pulled from GitHub - for akey in self.update_keys: - if akey == "website": - url = gh_data[gh_name][gh_name][akey] - # Fix the url format and check to see if it works online - url = self.format_url(url) - # It url is valid, add to dict - if self._check_url(url): - contrib_data[gh_name][akey] = url - else: - contrib_data[gh_name][akey] = "" - else: - contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][akey] - - return contrib_data - - def format_url(self, url: str) -> str: - """Append https to the beginning of URL if it doesn't exist - If the url doesn't have https add it - If the url starts with http change it to https - Else do nothing - - Parameters - ---------- - url : str - String representing the url grabbed from the GH api - - """ - if not url: - return url # returns empty string if url is empty - elif url.startswith("https://"): - return url - elif url.startswith("http://"): - print("Fixing", url, "https://" + url[7:]) - return "https://" + url[7:] - else: - print("Missing https://, adding to ", url) - return "https://" + url diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index 9c7c7da..8521477 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -1,8 +1,8 @@ import pickle import urllib.request -from typing import Dict, List, Optional, Tuple, Union import ruamel.yaml +from typing import Dict, List, Union def load_pickle(filename): @@ -27,6 +27,25 @@ def _list_to_dict(a_list: List, a_key: str) -> Dict: return {a_dict[a_key].lower(): a_dict for a_dict in a_list} +def create_paths(repos: Union[list[str], str]) -> Union[list[str], str]: + """ """ + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + if isinstance(repos, list): + all_paths = [base_url + repo + end_url for repo in repos] + else: + all_paths = base_url + repos + end_url + + return all_paths + + def load_website_yml(key: str, url: str): """ This opens a website contrib yaml file and turns it in a @@ -37,28 +56,6 @@ def load_website_yml(key: str, url: str): return _list_to_dict(yml_list, key) -# def dict_to_list(pyos_meta: Dict[str, Union[str, List[str]]]) -> List[Dict]: -# """Turn dict into list for parsing to jekyll friendly yaml - -# Parameters -# ---------- -# pyos_meta : Dict -# A dictionary containing metadata for pyos contributors or review issues - -# Returns -# ------- -# List -# A list of dictionaries containing pyos metadata for contribs or reviews - -# """ -# print("a") -# # Turn dict into list for parsing -# return [pyos_meta[key] for key in pyos_meta] -# # for key in pyos_meta: -# # final_contribs.append(pyos_meta[key]) -# # return final_contribs - - def open_yml_file(file_path: str) -> dict: """Open & deserialize YAML file to dictionary. 
diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index 61d5953..8267998 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -1,13 +1,160 @@
-from dataclasses import dataclass
+import re
 from datetime import datetime

 import requests
+from dataclasses import dataclass
+from pydantic import (
+    AliasChoices,
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_validator,
+)
+from typing import Any, Optional
+
+from pyosmeta.contributors import ProcessContributors, UrlValidatorMixin
+
+
+def clean_date(a_date: Optional[str]) -> str:
+    """Cleans up a datetime from github and returns a date string
+
+    In some cases the string is manually entered month-day-year and in
+    others it's a gh time stamp. Finally, sometimes it could be missing
+    or text. Handle all of those cases with this validator.
+    """
+
+    if a_date is None or a_date == "missing":
+        return "missing"
+    # elif len(a_date) < 11:
+    #     new_date = a_date.replace("/", "-").split("-")
+    #     return f"{new_date[0]}-{new_date[1]}-{new_date[2]}"
+    else:
+        try:
+            return (
+                datetime.strptime(a_date, "%Y-%m-%dT%H:%M:%SZ")
+                .date()
+                .strftime("%Y-%m-%d")
+            )
+        except TypeError as t_error:
+            print("Oops - missing data. Setting date to missing", t_error)
+            return "missing"
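For reference, the two paths through `clean_date`:

```python
from pyosmeta.parse_issues import clean_date

print(clean_date("2023-05-10T06:10:00Z"))  # "2023-05-10"
print(clean_date(None))                    # "missing"
```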
+class GhMeta(BaseModel, UrlValidatorMixin):
+    name: str
+    description: str
+    created_at: str
+    stargazers_count: int
+    watchers_count: int
+    forks: int
+    open_issues_count: int
+    forks_count: int
+    documentation: Optional[str]  # Jointly is missing documentation
+    contrib_count: int
+    last_commit: str
+
+    @field_validator(
+        "last_commit",
+        "created_at",
+        mode="before",
+    )
+    @classmethod
+    def clean_date(cls, a_date: Optional[str]) -> str:
+        """Cleans up a datetime from github and returns a date string
+
+        Runs the general clean_date function in this module as a validator.
+        """
+
+        return clean_date(a_date)
+
+
+class ReviewModel(BaseModel):
+    # Make sure model populates both aliases and original attr name
+    model_config = ConfigDict(
+        populate_by_name=True,
+        str_strip_whitespace=True,
+        validate_assignment=True,
+    )
+
+    package_name: Optional[str] = ""
+    package_description: str = Field(
+        "", validation_alias=AliasChoices("one-line_description_of_package")
+    )
+    submitting_author: dict[str, Optional[str]] = {}
+    all_current_maintainers: list[dict[str, str | None]] = []
+    repository_link: Optional[str] = None
+    version_submitted: Optional[str] = None
+    categories: Optional[list[str]] = None
+    editor: dict[str, str | None] = {}
+    reviewer_1: dict[str, str | None] = {}
+    reviewer_2: dict[str, str | None] = {}
+    archive: Optional[str] = None
+    version_accepted: Optional[str] = None
+    date_accepted: Optional[str] = None
+    created_at: str = None
+    updated_at: str = None
+    closed_at: Optional[str] = None
+    issue_link: str = None
+    joss: Optional[str] = None
+    gh_meta: Optional[GhMeta] = None
+
+    @field_validator(
+        "date_accepted",
+        mode="before",
+    )
+    @classmethod
+    def clean_date_review(cls, a_date: Optional[str]) -> str:
+        """Clean a manually added datetime that is added to a review by an
+        editor when the review package is accepted.
+
+        """
+        if a_date is None or a_date in ["missing", "TBD"]:
+            return "missing"
+        else:
+            new_date = a_date.replace("/", "-").split("-")
+            if len(new_date[0]) == 4:
+                return f"{new_date[0]}-{new_date[1]}-{new_date[2]}"
+            else:
+                return f"{new_date[2]}-{new_date[0]}-{new_date[1]}"
+
+    @field_validator(
+        "created_at",
+        "updated_at",
+        "closed_at",
+        mode="before",
+    )
+    @classmethod
+    def clean_date(cls, a_date: Optional[str]) -> str:
+        """Cleans up a datetime from github and returns a date string
+
+        Runs the general clean_date function in this module as a validator.
+
+        """
+
+        return clean_date(a_date)
+
+    @field_validator(
+        "editor",
+        "reviewer_1",
+        "reviewer_2",
+        mode="before",
+    )
+    @classmethod
+    def clean_gh_url(cls, user: dict[str, str]) -> dict[str, str]:
+        """Remove markdown link remnants from gh usernames and name.
+
+        Sometimes editors and reviewers add names using github links.
+        Remove the link data.
+        """
+
+        user["github_username"] = user["github_username"].replace(
+            "https://github.com/", ""
+        )
+        user["name"] = re.sub(r"\[|\]", "", user["name"])

-from pyosmeta.contributors import ProcessContributors
+        return user
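`clean_date_review` normalizes both date orderings editors have used; a small demonstration (other fields take their defaults):

```python
from pyosmeta import ReviewModel

print(ReviewModel(date_accepted="2023/06/28").date_accepted)  # "2023-06-28"
print(ReviewModel(date_accepted="6/28/2023").date_accepted)   # "2023-6-28" (no zero padding)
print(ReviewModel(date_accepted="TBD").date_accepted)         # "missing"
```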

-# main reason to use this is attributes .. avoiding them being changed
-# in other instances...
 @dataclass
 class ProcessIssues:
     """
@@ -17,7 +164,6 @@ class ProcessIssues:

     """

-    # TODO: turn file io into functions and remove inheritance here
     def __init__(self, org, repo_name, label_name):
         """
         More here...
@@ -41,9 +187,25 @@ def __init__(self, org, repo_name, label_name):

         self.GITHUB_TOKEN = self.contrib_instance.get_token()

+    gh_stats = [
+        "name",
+        "description",
+        "homepage",
+        "created_at",
+        "stargazers_count",
+        "watchers_count",
+        "forks",
+        "open_issues_count",
+        "forks_count",
+    ]
+
     @property
     def api_endpoint(self):
-        return f"https://api.github.com/repos/{self.org}/{self.repo_name}/issues?labels={self.label_name}&state=all"
+        url = (
+            f"https://api.github.com/repos/{self.org}/{self.repo_name}/"
+            f"issues?labels={self.label_name}&state=all"
+        )
+        return url

     # Set up the API endpoint
     def _get_response(self):
@@ -106,7 +268,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]:
         line_item : list
             A single list item representing a single line in the issue
             containing metadata for the review.
-            This comment is the metadata for the review that the author fills out.
+            This comment is metadata for the review that the author fills out.

         Returns
         -------
@@ -126,7 +288,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]:
                 # Add each maintainer to the dict
                 user = aname.split("@")
                 # Clean
-                user = [self._clean_name(l) for l in user]
+                user = [self._clean_name(a_str) for a_str in user]
                 a_maint = {
                     "name": self._clean_name(user[0]),
                     "github_username": self._clean_name(user[1]),
@@ -152,7 +314,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]:
         return meta

     def parse_issue_header(
-        self, issues: list[str], total_lines: int = 15
+        self, issues: list[str], total_lines: int = 20
     ) -> dict[str, str]:
         """
         A function that parses through the header of an issue.
@@ -165,7 +327,7 @@ def parse_issue_header(
             metadata at the top of each issue
         total_lines : int
             an integer representing the total number of lines to parse in the
-            issue header. Default = 15
+            issue header. Default = 20

         Returns
         -------
@@ -174,78 +336,54 @@ def parse_issue_header(
             package name, description, review team, version submitted etc.
         """
-        # Reorder data
-        key_order = [
-            "package_name",
-            "package_description",
-            "submitting_author",
-            "all_current_maintainers",
-            "repository_link",
-            "version_submitted",
-            "categories",
-            "editor",
-            "reviewer_1",
-            "reviewer_2",
-            "archive",
-            "version_accepted",
-            "date_accepted",
-            "created_at",
-            "updated_at",
-            "closed_at",
-            "issue_link",
-        ]
+
         meta_dates = ["created_at", "updated_at", "closed_at"]

         review = {}
         for issue in issues:
-            package_name, body_data = self.parse_comment(issue)
-            if not package_name:
+            pkg_name, body_data = self.parse_comment(issue)
+            if not pkg_name:
                 continue

             # The first total_lines rows should include date accepted
-            issue_meta = self.get_issue_meta(body_data, total_lines)
-            # Add issue open and close date to package meta
-            # Created, opened & closed dates are in GitHub Issue response
+            review[pkg_name] = self.get_issue_meta(body_data, total_lines)
+            # Add issue open and close date to package meta from GH response
+            # Date cleaning happens via pydantic validator not here
             for a_date in meta_dates:
-                issue_meta[a_date] = self._clean_date(issue[a_date])
-
-            # Date accepted is a manually added value. Fix format separately
-            # Using dashes because it's jekyll friendly
-            try:
-                the_date = issue_meta["date_accepted"].replace("/", "-").split("-")
-                if the_date[0] == "TBD":
-                    continue
-                else:
-                    issue_meta[
-                        "date_accepted"
-                    ] = f"{the_date[2]}-{the_date[0]}-{the_date[1]}"
-            except KeyError as ke:
-                print("Oops,", package_name, "is missing date_accepted key.")
-            # Clean markdown url's from editor, and reviewer lines
-            types = ["editor", "reviewer_1", "reviewer_2"]
-            user_values = ["github_username", "name"]
-            for a_type in types:
-                for user_value in user_values:
-                    issue_meta[a_type][user_value] = (
-                        issue_meta[a_type][user_value]
-                        .replace("https://github.com/", "")
-                        .replace("[", "")
-                        .replace("]", "")
-                    )
-
-            review[package_name] = issue_meta
-            review[package_name]["categories"] = self.get_categories(body_data)
-            review[package_name]["issue_link"] = issue["url"].replace(
+                review[pkg_name][a_date] = issue[a_date]
+            # Get categories and issue review link
+            review[pkg_name]["categories"] = self.get_categories(body_data)
+            review[pkg_name]["issue_link"] = issue["url"].replace(
                 "https://api.github.com/repos/", "https://github.com/"
             )
-            # Rename package description & reorder keys
-            review[package_name]["package_description"] = review[package_name].pop(
-                "one-line_description_of_package", ""
-            )
-            review[package_name] = {
-                key: review[package_name][key]
-                for key in key_order
-                if review[package_name].get(key)
+
+            review_clean = {
+                key: value
+                for key, value in review[pkg_name].items()
+                if not key.startswith("##")
+                and not key.startswith("---")
+                and not key.startswith("-_[x]_i_agree")
             }
+            review[pkg_name] = review_clean
+            # filtered = {}
+            # for key, value in review.items():
+            #     print(key)
+            #     if not key.startswith("##") and not key.startswith("-"):
+            #         filtered[key] = value
+
+            # # Clean markdown url's from editor, and reviewer lines
+            # TODO - this could be a reviewer name cleanup validator
+            # types = ["editor", "reviewer_1", "reviewer_2"]
+            # user_values = ["github_username", "name"]
+            # for a_type in types:
+            #     for user_value in user_values:
+            #         issue_meta[a_type][user_value] = (
+            #             issue_meta[a_type][user_value]
+            #             .replace("https://github.com/", "")
+            #             .replace("[", "")
+            #             .replace("]", "")
+            #         )
+
+            # review[pkg_name] = issue_meta

         return review
@@ -260,11 +398,11 @@ def get_issue_meta(
         Parameters
         ----------
         body_data : list
-            A list containing all of the body data for the top comment in an issue.
+            A list containing all body data for the top comment in an issue.
         end_range : int
-            The number of lines to parse at the top of the issue (this may change
-            over time so this variable allows us to have different processing
-            based upon the date of the issue being opened)
+            The number of lines to parse at the top of the issue (this may
+            change over time so this variable allows us to have different
+            processing based upon the date of the issue being opened)

         Returns
         -------
@@ -279,7 +417,9 @@ def get_issue_meta(

         return issue_meta

-    def get_repo_endpoints(self, review_issues: dict[str, str]) -> dict[str, str]:
+    def get_repo_endpoints(
+        self, review_issues: dict[str, str]
+    ) -> dict[str, str]:
         """
         Returns a list of repository endpoints

@@ -291,7 +431,7 @@ def get_repo_endpoints(
         Returns
         -------
         Dict
-            Containing package_name: endpoint for each review.
+            Containing pkg_name: endpoint for each review.

         """

@@ -299,7 +439,13 @@ def get_repo_endpoints(
         all_repos = {}
         for a_package in review_issues.keys():
             repo = review_issues[a_package]["repository_link"].strip("/")
             owner, repo = repo.split("/")[-2:]
-            all_repos[a_package] = f"https://api.github.com/repos/{owner}/{repo}"
+            # TODO: could be simpler code - Remove any link remnants
+            pattern = r"[\(\)\[\]?]"
+            owner = re.sub(pattern, "", owner)
+            repo = re.sub(pattern, "", repo)
+            all_repos[
+                a_package
+            ] = f"https://api.github.com/repos/{owner}/{repo}"
         return all_repos
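The owner/repo parsing above, in isolation (pure string handling, no API call):

```python
import re

repo = "https://github.com/pyopensci/pyosmeta/".strip("/")
owner, repo = repo.split("/")[-2:]
owner = re.sub(r"[\(\)\[\]?]", "", owner)
repo = re.sub(r"[\(\)\[\]?]", "", repo)
print(f"https://api.github.com/repos/{owner}/{repo}")
# https://api.github.com/repos/pyopensci/pyosmeta
```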
""" @@ -299,7 +439,13 @@ def get_repo_endpoints(self, review_issues: dict[str, str]) -> dict[str, str]: for a_package in review_issues.keys(): repo = review_issues[a_package]["repository_link"].strip("/") owner, repo = repo.split("/")[-2:] - all_repos[a_package] = f"https://api.github.com/repos/{owner}/{repo}" + # TODO: could be simpler code - Remove any link remnants + pattern = r"[\(\)\[\]?]" + owner = re.sub(pattern, "", owner) + repo = re.sub(pattern, "", repo) + all_repos[ + a_package + ] = f"https://api.github.com/repos/{owner}/{repo}" return all_repos def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: @@ -315,49 +461,70 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: Returns ------- - package_name : str + pkg_name : str The name of the package comment : list A list containing the comment elements in order """ - # TODO: this var isn't used - comments_url = issue["comments_url"] body = issue["body"] - # Here sometimes the lines are split with \n, others \r\n - # To clean split on \n but may have to remove the \r + # Clean line breaks (could be done with a regex too) lines = body.split("\n") lines = [a_line.strip("\r").strip() for a_line in lines] # Some users decide to hold the issue titles. # For those, clean the markdown bold ** element - lines = [line.replace("**", "").strip() for line in lines if line.strip() != ""] + lines = [ + line.replace("**", "").strip() + for line in lines + if line.strip() != "" + ] # You need a space after : or else it will break https:// in two body_data = [line.split(": ") for line in lines if line.strip() != ""] # Loop through issue header and grab relevant review metadata name_index = next( - (i for i, sublist in enumerate(body_data) if sublist[0] == "Package Name"), + ( + i + for i, sublist in enumerate(body_data) + if sublist[0] == "Package Name" + ), None, ) - package_name = body_data[name_index][1] if name_index else None + pkg_name = body_data[name_index][1] if name_index else None - return package_name, body_data + return pkg_name, body_data - def _clean_date(self, date: str) -> str: - """Cleans up a datetime from github and returns a date string""" + def get_gh_metrics( + self, + endpoints: dict[str, str], + reviews: dict[str, dict[str, Any]], + ) -> dict[str, dict[str, Any]]: + """ + Get GitHub metrics for each review based on provided endpoints. - try: - date_clean = ( - datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") - .date() - .strftime("%Y-%m-%d") - ) - except: - print("Oops - i need a string to process date") - print("setting date to missing") - date_clean = "missing" - return date_clean + Parameters: + ---------- + endpoints : dict + A dictionary mapping package names to their GitHub URLs. + reviews : dict + A dictionary containing review data. + + Returns: + ------- + dict + Updated review data with GitHub metrics. + """ + pkg_meta = {} + for pkg_name, url in endpoints.items(): + pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) + + pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) + pkg_meta[pkg_name]["last_commit"] = self.get_last_commit(url) + # Add github meta to review metadata + reviews[pkg_name]["gh_meta"] = pkg_meta[pkg_name] + + return reviews def get_repo_meta(self, url: str, stats_list: list) -> dict: """ @@ -365,8 +532,7 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: """ stats_dict = {} - # Small script to get the url (normally the docs) and description of a repo! 
-        print(url)
+        # Get the url (normally the docs) and description of a repo!
         response = requests.get(
             url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"}
         )
@@ -387,7 +553,9 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict:
             for astat in stats_list:
                 stats_dict[astat] = data[astat]
             stats_dict["documentation"] = stats_dict.pop("homepage")
-            stats_dict["created_at"] = self._clean_date(stats_dict["created_at"])
+            # stats_dict["created_at"] = self._clean_date(
+            #     stats_dict["created_at"]
+            # )

         return stats_dict

@@ -404,7 +572,7 @@ def get_repo_contribs(self, url: str) -> dict:
         )

         if response.status_code == 404:
-            print("Can't find: ", url, ". Did the repo url change?")
+            print("Can't find: ", repo_contribs, ". Did the repo url change?")
         # Extract the description and homepage URL from the JSON response
         else:
             return len(response.json())
@@ -422,59 +590,48 @@ def get_last_commit(self, repo: str) -> str:
         response = requests.get(
             url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"}
         ).json()
-        date = (
-            response[0]["commit"]["author"]["date"]
-            # if 0 in response
-            # else "1970-01-01T00:00:00Z"
-        )
+        date = response[0]["commit"]["author"]["date"]

-        return self._clean_date(date)
+        return date

     def get_categories(
-        self, issue_body_list: list[list[str]], fmt: bool = True
+        self, issue_list: list[list[str]], fmt: bool = True
     ) -> list[str]:
         """Parse through a pyOS review issue and grab categories associated
         with a package

         Parameters
         ----------
-        issue_body_list : list[list[str]]
-            The first comment from the issue split into lines and then the lines split as by self.parse_comment()
+        issue_list : list[list[str]]
+            The first comment from the issue split into lines and then the
+            lines split as by self.parse_comment()
         fmt : bool
-            Applies some formatting changes to the categories to match what is required for the website.
+            Applies some formatting changes to the categories to match what is
+            required for the website.
         """
         # Find the starting index of the category section
-        start_index = None
-        for i in range(len(issue_body_list)):
-            if issue_body_list[i][0].startswith("- Please indicate which"):
-                start_index = i + 1
-                break
-        # NOTE - some issues have line after that startswith "Check out our"
-        # For those issues advance i += 1
-        if issue_body_list[start_index][0].startswith("Check out our"):
-            start_index += 1
-
-        if start_index is None:
-            # If we couldn't find the starting index, return an empty list
-            return []
-
-        # Iterate through the lines starting at the starting index and grab the relevant text
-        cat_matches = ["[x]", "[X]"]
-        categories: list[str] = []
-        for i in range(start_index, len(issue_body_list)):  # 30):
-            line = issue_body_list[i][0].strip()
-            checked = any([x in line for x in cat_matches])
-
-            if line.startswith("- [") and checked:
-                category = line[line.index("]") + 2 :]
-                categories.append(category)
-            elif not line.startswith("- ["):
-                break
-
-        if fmt:
-            categories = [c.lower().replace(" ", "-") for c in categories]
-        return categories
-
-
-# https://api.github.com/repos/pyopensci/python-package-guide/commits
+        try:
+            index = next(
+                i
+                for i, sublist in enumerate(issue_list)
+                if "## Scope" in sublist
+            )
+            # Iterate from scope index to first line starting with "- ["
+            # to find the list of category check boxes
+            for i in range(index + 1, len(issue_list)):
+                if issue_list[i] and issue_list[i][0].startswith("- ["):
+                    cat_index = i
+                    break
+        except StopIteration:
+            print("'## Scope' not found in the list.")
+            return []
+
+        # Get checked categories for package
+        cat_list = issue_list[cat_index : cat_index + 10]
+        categories = [
+            re.sub(r"- \[[xX]\] ", "", item[0])
+            for item in cat_list
+            if re.search(r"- \[[xX]\] ", item[0])
+        ]

+        return [item.lower().replace("[^1]", "") for item in categories]