From a5b0d282502c56ace859d6fe142361d8f6c3cab9 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Sun, 13 Aug 2023 19:04:20 -0600 Subject: [PATCH 01/12] Refactor: move to pydantic for validation --- .../cli/personmodel-update-contribs.py | 119 ++++++++++ src/pyosmeta/contributors.py | 211 +++++++++++++----- src/pyosmeta/file_io.py | 9 +- 3 files changed, 279 insertions(+), 60 deletions(-) create mode 100644 src/pyosmeta/cli/personmodel-update-contribs.py diff --git a/src/pyosmeta/cli/personmodel-update-contribs.py b/src/pyosmeta/cli/personmodel-update-contribs.py new file mode 100644 index 0000000..8ff341e --- /dev/null +++ b/src/pyosmeta/cli/personmodel-update-contribs.py @@ -0,0 +1,119 @@ +import argparse +import os +import pickle + +import pydantic +from pydantic import ValidationError + +from pyosmeta.contributors import PersonModel, ProcessContributors +from pyosmeta.file_io import clean_export_yml, open_yml_file + +print(pydantic.__version__) +# TODO - fix the website by renaming packages-editor, packages-submitted: +# packages-reviewed: to use underscores. this will just make life easier + + +def main(): + parser = argparse.ArgumentParser( + description="A CLI script to update pyOpenSci contributors" + ) + parser.add_argument( + "--update", + type=str, + help="Will force update contrib info from GitHub for every contributor", + ) + args = parser.parse_args() + + if args: + update_all = True + + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + json_files = [base_url + repo + end_url for repo in repos] + + # Get existing contribs from pyopensci.github.io repo (website data) + web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml" + + web_contribs = open_yml_file(web_yaml_path) + + # Populate all existing contribs into model objects + all_contribs = {} + for a_contrib in web_contribs: + try: + if a_contrib["github_username"].lower() == "arianesasso": + print("pause") + all_contribs[a_contrib["github_username"].lower()] = PersonModel( + **a_contrib + ) + except ValidationError as ve: + print(a_contrib["github_username"]) + print(ve) + + print("Done processing all-contribs") + # TODO - maybe add these as an attr in the contribs class? + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + json_files = [base_url + repo + end_url for repo in repos] + + # Create a list of all contributors across repositories + process_contribs = ProcessContributors(json_files) + bot_all_contribs = process_contribs.combine_json_data() + + # TODO this is much slower than it should be + print("Updating contrib types and searching for new users now") + # bot_all contris is a dict of x contrib types with an associated list of + # users who contributed to that type. 
+ for key, users in bot_all_contribs.items(): + print(key) + for gh_user in users: + # Find and populate data for any new contributors + if gh_user not in all_contribs.keys(): + print("Missing", gh_user, "Adding them now") + new_contrib = process_contribs.get_user_info(gh_user) + all_contribs[gh_user] = PersonModel(**new_contrib) + + # Update contribution type list for all users + existing_contribs = all_contribs[gh_user].contributor_type + all_contribs[ + gh_user + ].contributor_type = process_contribs.update_contrib_list( + existing_contribs, key + ) + + if update_all: + for user in all_contribs.keys(): + print("Updating all user info from github", user) + new_contrib = process_contribs.get_user_info(user) + # Update person's data (should skip update for any text + # with # noupdate flag) + all_contribs[user] = all_contribs[user].update(new_contrib) + + # Export to pickle which supports updates after parsing reviews + with open("all_contribs.pickle", "wb") as f: + pickle.dump(all_contribs, f) + + alist = [] + for key, item in all_contribs.items(): + alist.append(item.model_dump()) + + # Test export + print(os.getcwd()) + clean_export_yml(alist, os.path.join("_data", "contribs.yml")) + + +if __name__ == "__main__": + main() diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 4944473..03502dd 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -1,16 +1,126 @@ import json import os +import re from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Literal, Optional, Tuple, Union import requests from dotenv import load_dotenv +from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, + field_validator) + + +class PersonModel(BaseModel): + # Make sure model populates both aliases and original attr name + model_config = ConfigDict(populate_by_name=True, anystr_strip_whitespace=True) + + name: Optional[str] = None + title: Optional[str] = None + sort: Optional[int] = None + bio: Optional[str] = None + organization: Optional[str] = Field(None, validation_alias=AliasChoices("company")) + github_username: str = Field(None, validation_alias=AliasChoices("login")) + github_image_id: int = Field(None, validation_alias=AliasChoices("id")) + deia_advisory: Optional[bool] = False + editorial_board: Optional[bool] = Field( + None, validation_alias=AliasChoices("editorial-board") + ) + advisory: Optional[bool] = False + twitter: Optional[str] = Field( + None, validation_alias=AliasChoices("twitter_username") + ) + mastodon: Optional[str] = Field( + None, validation_alias=AliasChoices("mastodon_username", "mastodon") + ) + orcidid: Optional[str] = None + website: Optional[str] = Field( + None, validation_alias=AliasChoices("blog", "website") + ) + board: Optional[bool] = False + contributor_type: Optional[list[str]] = [] + packages_editor: Optional[list[str | None]] = Field( + None, + validation_alias=AliasChoices("packages-editor"), + ) + packages_submitted: Optional[list[str | None]] = Field( + None, + validation_alias=AliasChoices("packages-submitted", "packages_submitted"), + ) + packages_reviewed: Optional[list[str | None]] = Field( + None, + validation_alias=AliasChoices("packages-reviewed", "packages_reviewed"), + ) + location: Optional[str] = None + email: Optional[str] = None + + # @field_validator("advisory", "deia_advisory", mode="before") + # def fix_bools(cls, value): + # value = "value" + # print(value) + # if value == "false": + # return False + # elif value == 
"true": + # return True + + @field_validator( + "packages_reviewed", + "packages_submitted", + "packages_editor", + mode="before", + ) + @classmethod + def string_to_list(cls, value): + """ + For fields such as packages-reviewed edited etc we want + a list of elements not just a single string. this will + fix that issue. + """ + # If the input value is a string, convert it to a list + if isinstance(value, list): + return value + if isinstance(value, str): + return [value] + # If the input value is None, return an empty list + elif value is None: + return [] + + @field_validator("bio", mode="before") + @classmethod + def clean_strings(cls, string: str) -> str: + """This is a cleaning step that will remove spurious + characters from string fields. + + """ + if isinstance(string, str): + # Remove "\r\n" from the string value + string = re.sub(r"[\r\n]", "", string) + return string + + def update(self, data: dict) -> "PersonModel": + """ + this doesn't currently validate the data - the discussion + below describes one way to do that but uses pydantic 1.x not + 2.x approach. + + https://github.com/pydantic/pydantic/discussions/3139#discussioncomment-4797649 + """ + + # Note that this will not validate new data :( + for aval in data.keys(): + if isinstance(getattr(self, aval), str) and "# noupdate" in getattr( + self, aval + ): + print("The", aval, "field has a noupdate flag. Skipping update.") + else: + setattr(self, aval, data[aval]) + return self @dataclass class ProcessContributors: - # When initializing how do you decide what should be an input - # attribute vs just something a method accepted when called? + """A class that contains some basic methods to support populating and + updating contributor data.""" + def __init__(self, json_files: List) -> None: """ Parameters @@ -87,29 +197,30 @@ def refresh_contribs(self, contribs: Dict, new_contribs, review_role): final_list = self.update_contrib_list(existing_contribs, new_contribs) return (contrib_key_yml, final_list) - def create_contrib_template(self) -> Dict: - """A small helper that creates a template for a new contributor - that we are adding to our contributor.yml file""" - - return { - "name": "", - "bio": "", - "organization": "", - "title": "", - "github_username": "", - "github_image_id": "", - "editorial-board": "", - "twitter": "", - "mastodon": "", - "orcidid": "", - "website": "", - "contributor_type": [], - "packages-editor": [], - "packages-submitted": [], - "packages-reviewed": [], - "location": "", - "email": "", - } + # TODO: this can go away now that i have a personmodel obj + # def create_contrib_template(self) -> Dict: + # """A small helper that creates a template for a new contributor + # that we are adding to our contributor.yml file""" + + # return { + # "name": "", + # "bio": "", + # "organization": "", + # "title": "", + # "github_username": "", + # "github_image_id": "", + # "editorial-board": "", + # "twitter": "", + # "mastodon": "", + # "orcidid": "", + # "website": "", + # "contributor_type": [], + # "packages-editor": [], + # "packages-submitted": [], + # "packages-reviewed": [], + # "location": "", + # "email": "", + # } # TODO - This utility is used across all scripts. 
    def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
@@ -212,7 +323,8 @@ def check_contrib_type(self, json_file: str):
         return contrib_type
 
     def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None:
-        """Check to make sure user exists and if not, add them
+        """Check to make sure user exists in the existing contrib data. If they
+        don't exist, add them
 
         Parameters
         ----------
@@ -278,6 +390,8 @@ def combine_json_data(self) -> dict:
         # Create an empty dictionary to hold the combined data
         combined_data = {}
 
+        # TODO: to make this faster, it might be better to return a dict
+        # with username : [contrib1, contrib2]
         for json_file in self.json_files:
             # Process the JSON file and add the data to the combined dictionary
             try:
@@ -287,20 +401,22 @@ def combine_json_data(self) -> dict:
                 print("Oops - can't process", json_file, e)
         return combined_data
 
-    def get_gh_usernames(self, contrib_data: List) -> List:
-        """Get a list of all gh usernames
+    # TODO: see if this is ever used. It seems completely unnecessary
+    # given we can use .keys()
+    # def get_gh_usernames(self, contrib_data: List) -> List:
+    #     """Get a list of all gh usernames
 
-        Parameters
-        ----------
-        contrib_data : list
-            Dict containing all of the contributor information for the website.
+    #     Parameters
+    #     ----------
+    #     contrib_data : list
+    #         Dict containing all of the contributor information for the website.
 
-        """
-        all_usernames = []
-        for item in contrib_data:
-            all_usernames.append(item["github_username"])
+    #     """
+    #     all_usernames = []
+    #     for item in contrib_data:
+    #         all_usernames.append(item["github_username"])
 
-        return all_usernames
+    #     return all_usernames
 
     def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
         """
@@ -327,7 +443,6 @@ def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
 
         # if message = Bad credentials
         response_json = response.json()
 
-        user_data = {}
         # TODO: make an attribute and call it here?
         update_keys = {
             "name": "name",
@@ -342,20 +457,9 @@ def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
             "github_username": "login",
         }
 
-        user_data[username] = {}
-        for akey in update_keys:
-            # If the key is name, check to see if there is name data
-            # already there. don't force update if there's a name!
-            if akey == "name":
-                if aname is None:
-                    user_data[username][akey] = response_json.get(
-                        update_keys[akey], None
-                    )
-                else:
-                    # Else just keep the original name
-                    user_data[username][akey] = aname
-            else:
-                user_data[username][akey] = response_json.get(update_keys[akey], None)
+        user_data = {}
+        for key in update_keys:
+            user_data[key] = response_json.get(update_keys[key], None)
 
         return user_data
 
@@ -456,6 +560,7 @@ def add_new_user(self, gh_user: str) -> dict:
         """
 
         new = {}
+        # Rather than this template i can use the person_model
         new[gh_user] = self.create_contrib_template()
         gh_data = self.get_gh_data([gh_user])
         # Update their metadata in the dict and return
diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py
index 9c7c7da..ef968ce 100644
--- a/src/pyosmeta/file_io.py
+++ b/src/pyosmeta/file_io.py
@@ -173,12 +173,7 @@ def clean_export_yml(a_dict: Dict[str, Union[str, List[str]]], filename: str) -
     None
        Outputs a yaml file with the input name containing the pyos meta
    """
-    # TODO: why doesn't .values() work here? it returns a representation
-    # error.
- # final_data = [] - # for key in a_dict: - # final_data.append(a_dict[key]) - # print("sdf") + # Export to yaml - export_yaml(filename, list(a_dict.values())) + export_yaml(filename, a_dict) clean_yaml_file(filename) From c9d99857327c411075021847594b30a3b66cae9f Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Tue, 15 Aug 2023 19:18:53 -0600 Subject: [PATCH 02/12] Fix: pyproject tomly flake8/black fix --- pyproject.toml | 14 +- src/pyosmeta/__init__.py | 14 +- src/pyosmeta/contributors.py | 91 +++++------- src/pyosmeta/file_io.py | 47 +++---- src/pyosmeta/parse_issues.py | 262 +++++++++++++++++++++++++---------- 5 files changed, 266 insertions(+), 162 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 63934bf..e646cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,16 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", # BE sure to specify that you use python 3.x "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", +version = "0.1.0" +description = "Tools for contributors" +authors = [{ name = "Leah Wasser", email = "leah@pyopensci.org" }] +dependencies = [ + "ruamel-yaml>=0.17.21", + "requests", + "python-dotenv", + "pydantic>=2.0", ] -dependencies = ["ruamel-yaml>=0.17.21", "requests", "python-dotenv", "pydantic"] # This is the metadata that pip reads to understand what versions your package supports requires-python = ">=3.10" readme = "README.md" @@ -59,6 +66,10 @@ update-reviewers = "pyosmeta.cli.update_review_contribs:main" # Right now i'm not using pdm to add dependencies. # Will explore that later # Below using dynamic versioning / which is setuptools scm like +[tool.flake8] +# List of error codes to ignore (comma-separated) +ignore = "E203, W503" + [tool.pdm] @@ -70,6 +81,7 @@ package-dir = "src" # Versioning is a backend feature - instructions are in pdm-backend docs # https://pdm-backend.fming.dev/metadata/ + [tool.pdm.version] # Note that you need to create the tag after all commits are created - otherwise # pdm adds dev info after the tag number which won't publish to pypi diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py index d745d01..5b35315 100644 --- a/src/pyosmeta/__init__.py +++ b/src/pyosmeta/__init__.py @@ -1,8 +1,16 @@ -# SPDX-FileCopyrightText: 2023-present Leah Wasser -# -# SPDX-License-Identifier: MIT +from .contributors import PersonModel, ProcessContributors +from .parse_issues import ProcessIssues, ReviewModel +<<<<<<< HEAD try: from ._version_generated import __version__ except ImportError: __version__ = "unreleased" +======= +__all__ = ( + "ProcessIssues", + "ReviewModel", + "PersonModel", + "ProcessContributors", +) +>>>>>>> 007299c (Fix: pyproject tomly flake8/black fix) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 03502dd..f178699 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -2,23 +2,30 @@ import os import re from dataclasses import dataclass -from typing import Dict, List, Literal, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import requests from dotenv import load_dotenv -from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, - field_validator) +from pydantic import ( + AliasChoices, + BaseModel, + ConfigDict, + Field, + field_validator, +) class PersonModel(BaseModel): # Make sure model populates both aliases and original attr name - model_config = ConfigDict(populate_by_name=True, anystr_strip_whitespace=True) + model_config = 
ConfigDict(populate_by_name=True, str_strip_whitespace=True)
 
     name: Optional[str] = None
-    title: Optional[str] = None
+    title: Optional[Union[list[str], str]] = None
     sort: Optional[int] = None
     bio: Optional[str] = None
-    organization: Optional[str] = Field(None, validation_alias=AliasChoices("company"))
+    organization: Optional[str] = Field(
+        None, validation_alias=AliasChoices("company")
+    )
     github_username: str = Field(None, validation_alias=AliasChoices("login"))
     github_image_id: int = Field(None, validation_alias=AliasChoices("id"))
     deia_advisory: Optional[bool] = False
@@ -44,23 +51,18 @@ class PersonModel(BaseModel):
     )
     packages_submitted: Optional[list[str | None]] = Field(
         None,
-        validation_alias=AliasChoices("packages-submitted", "packages_submitted"),
+        validation_alias=AliasChoices(
+            "packages-submitted", "packages_submitted"
+        ),
     )
     packages_reviewed: Optional[list[str | None]] = Field(
         None,
-        validation_alias=AliasChoices("packages-reviewed", "packages_reviewed"),
+        validation_alias=AliasChoices(
+            "packages-reviewed", "packages_reviewed"
+        ),
     )
     location: Optional[str] = None
-    email: Optional[str] = None
-
-    # @field_validator("advisory", "deia_advisory", mode="before")
-    # def fix_bools(cls, value):
-    #     value = "value"
-    #     print(value)
-    #     if value == "false":
-    #         return False
-    #     elif value == "true":
-    #         return True
+    email: Optional[str] = None
 
     @field_validator(
         "packages_reviewed",
         "packages_submitted",
         "packages_editor",
         mode="before",
     )
@@ -96,25 +98,6 @@ def clean_strings(cls, string: str) -> str:
             string = re.sub(r"[\r\n]", "", string)
         return string
 
-    def update(self, data: dict) -> "PersonModel":
-        """
-        this doesn't currently validate the data - the discussion
-        below describes one way to do that but uses pydantic 1.x not
-        2.x approach.
-
-        https://github.com/pydantic/pydantic/discussions/3139#discussioncomment-4797649
-        """
-
-        # Note that this will not validate new data :(
-        for aval in data.keys():
-            if isinstance(getattr(self, aval), str) and "# noupdate" in getattr(
-                self, aval
-            ):
-                print("The", aval, "field has a noupdate flag. Skipping update.")
-            else:
-                setattr(self, aval, data[aval])
-        return self
-
 
 @dataclass
 class ProcessContributors:
@@ -401,24 +384,9 @@ def combine_json_data(self) -> dict:
                 print("Oops - can't process", json_file, e)
         return combined_data
 
-    # TODO: see if this is ever used. It seems completely unnecessary
-    # given we can use .keys()
-    # def get_gh_usernames(self, contrib_data: List) -> List:
-    #     """Get a list of all gh usernames
-
-    #     Parameters
-    #     ----------
-    #     contrib_data : list
-    #         Dict containing all of the contributor information for the website.
-
-    #     """
-    #     all_usernames = []
-    #     for item in contrib_data:
-    #         all_usernames.append(item["github_username"])
-
-    #     return all_usernames
-
-    def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
+    def get_user_info(
+        self, username: str, aname: Optional[str] = None
+    ) -> dict:
         """
         Get a single user's information from their GitHub username using the
         GitHub API
@@ -430,6 +398,7 @@ def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
             Github username to retrieve data for
         aname : str default=None
             A user's name from the contributors.yml file.
+ https://docs.github.com/en/rest/users/users?apiVersion=2022-11-28#get-a-user Returns ------- @@ -527,7 +496,9 @@ def combine_users(self, repoDict: dict, webDict: dict) -> dict: if gh_user in webDict.keys(): # Return a list of updated contributor type keys and use it to # update the web dict - webDict[gh_user]["contributor_type"] = self._update_contrib_type( + webDict[gh_user][ + "contributor_type" + ] = self._update_contrib_type( webDict[gh_user]["contributor_type"], repoDict[gh_user]["contributor_type"], ) @@ -567,7 +538,9 @@ def add_new_user(self, gh_user: str) -> dict: updated_data = self.update_contrib_data(new, gh_data) return updated_data - def get_gh_data(self, contribs: Union[Dict[str, str], List]) -> dict[str, str]: + def get_gh_data( + self, contribs: Union[Dict[str, str], List] + ) -> dict[str, str]: """Parses through each GitHub username and hits the GitHub API to grab user information. @@ -647,7 +620,9 @@ def update_contrib_data(self, contrib_data: dict, gh_data: dict): else: contrib_data[gh_name][akey] = "" else: - contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][akey] + contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][ + akey + ] return contrib_data diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index ef968ce..5eb555d 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -1,6 +1,6 @@ import pickle import urllib.request -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Union import ruamel.yaml @@ -27,6 +27,25 @@ def _list_to_dict(a_list: List, a_key: str) -> Dict: return {a_dict[a_key].lower(): a_dict for a_dict in a_list} +def create_paths(repos: Union[list[str], str]) -> Union[list[str], str]: + """ """ + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + if isinstance(repos, list): + all_paths = [base_url + repo + end_url for repo in repos] + else: + all_paths = base_url + repos + end_url + + return all_paths + + def load_website_yml(key: str, url: str): """ This opens a website contrib yaml file and turns it in a @@ -37,28 +56,6 @@ def load_website_yml(key: str, url: str): return _list_to_dict(yml_list, key) -# def dict_to_list(pyos_meta: Dict[str, Union[str, List[str]]]) -> List[Dict]: -# """Turn dict into list for parsing to jekyll friendly yaml - -# Parameters -# ---------- -# pyos_meta : Dict -# A dictionary containing metadata for pyos contributors or review issues - -# Returns -# ------- -# List -# A list of dictionaries containing pyos metadata for contribs or reviews - -# """ -# print("a") -# # Turn dict into list for parsing -# return [pyos_meta[key] for key in pyos_meta] -# # for key in pyos_meta: -# # final_contribs.append(pyos_meta[key]) -# # return final_contribs - - def open_yml_file(file_path: str) -> dict: """Open & deserialize YAML file to dictionary. @@ -109,7 +106,7 @@ def export_yaml(filename: str, data_list: list): # function created def clean_string(astr: str) -> str: """ - Clean a string by removing occurrences of strings starting with "*id0" and "[]". + Clean - remove strings starting with "*id0" and "[]". 
Parameters ---------- @@ -139,7 +136,7 @@ def clean_yaml_file(filename): with open(filename, "r") as f: lines = f.readlines() - # TODO: regex would be cleaner here - https://stackoverflow.com/questions/27064964/python-replace-all-words-start-with + # TODO: regex would be cleaner here cleaned_lines = [] for i, line in enumerate(lines): if i == 0 and line.startswith(" "): diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 61d5953..5bb537d 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -1,13 +1,111 @@ from dataclasses import dataclass from datetime import datetime +from typing import Any, Optional import requests +from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, + field_validator) from pyosmeta.contributors import ProcessContributors -# main reason to use this is attributes .. avoiding them being changed -# in other instances... +def clean_date(a_date: Optional[str]) -> str: + """Cleans up a datetime from github and returns a date string + + In some cases the string is manually entered month-day-year and in + others it's a gh time stamp. finally sometimes it could be missing + or text. handle all of those cases with this validator. + """ + print(a_date) + if a_date is None or a_date == "missing": + return "missing" + elif len(a_date) < 11: + new_date = a_date.replace("/", "-").split("-") + return f"{new_date[2]}-{new_date[0]}-{new_date[1]}" + else: + try: + return ( + datetime.strptime(a_date, "%Y-%m-%dT%H:%M:%SZ") + .date() + .strftime("%Y-%m-%d") + ) + except: + print("Oops - missing data. Setting date to missing") + return "missing" + + +class GhMeta(BaseModel): + name: str + description: str + created_at: str + stargazers_count: int + watchers_count: int + forks: int + open_issues_count: int + forks_count: int + documentation: Optional[str] # Jointly is missing documentation + contrib_count: int + last_commit: str + + @field_validator( + "last_commit", + "created_at", + mode="before", + ) + @classmethod + def clean_date(cls, a_date: Optional[str]) -> str: + """Cleans up a datetime from github and returns a date string + + Runs the general clean_date function in this module as a validator. + """ + + return clean_date(a_date) + + +class ReviewModel(BaseModel): + # Make sure model populates both aliases and original attr name + model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True) + + package_name: Optional[str] = None + package_description: str = Field( + None, validation_alias=AliasChoices("one-line_description_of_package") + ) + submitting_author: dict[str, str] = None + all_current_maintainers: list[dict[str, str]] = None + repository_link: Optional[str] = None + version_submitted: Optional[str] = None + categories: Optional[str] = None + categories: list[str] = None + editor: dict[str, str] = None + reviewer_1: dict[str, str] = None + reviewer_2: dict[str, str] = None + archive: str = None + version_accepted: str = None + date_accepted: str = None + created_at: str = None + updated_at: str = None + closed_at: str = None + issue_link: str = None + gh_meta: GhMeta + + @field_validator( + "date_accepted", + "created_at", + "updated_at", + "closed_at", + mode="before", + ) + @classmethod + def clean_date(cls, a_date: Optional[str]) -> str: + """Cleans up a datetime from github and returns a date string + + Runs the general clean_date function in this module as a validator. 
+ + """ + + return clean_date(a_date) + + @dataclass class ProcessIssues: """ @@ -17,7 +115,6 @@ class ProcessIssues: """ - # TODO: turn file io into functions and remove inheritance here def __init__(self, org, repo_name, label_name): """ More here... @@ -41,6 +138,18 @@ def __init__(self, org, repo_name, label_name): self.GITHUB_TOKEN = self.contrib_instance.get_token() + gh_stats = [ + "name", + "description", + "homepage", + "created_at", + "stargazers_count", + "watchers_count", + "forks", + "open_issues_count", + "forks_count", + ] + @property def api_endpoint(self): return f"https://api.github.com/repos/{self.org}/{self.repo_name}/issues?labels={self.label_name}&state=all" @@ -174,26 +283,7 @@ def parse_issue_header( package name, description, review team, version submitted etc. See key_order below for the full list of keys. """ - # Reorder data - key_order = [ - "package_name", - "package_description", - "submitting_author", - "all_current_maintainers", - "repository_link", - "version_submitted", - "categories", - "editor", - "reviewer_1", - "reviewer_2", - "archive", - "version_accepted", - "date_accepted", - "created_at", - "updated_at", - "closed_at", - "issue_link", - ] + meta_dates = ["created_at", "updated_at", "closed_at"] review = {} @@ -202,50 +292,34 @@ def parse_issue_header( if not package_name: continue # Index of 15 should include date accepted in the review meta - issue_meta = self.get_issue_meta(body_data, total_lines) + review[package_name] = self.get_issue_meta(body_data, total_lines) # Add issue open and close date to package meta # Created, opened & closed dates are in GitHub Issue response for a_date in meta_dates: - issue_meta[a_date] = self._clean_date(issue[a_date]) - - # Date accepted is a manually added value. Fix format separately - # Using dashes because it's jekyll friendly - try: - the_date = issue_meta["date_accepted"].replace("/", "-").split("-") - if the_date[0] == "TBD": - continue - else: - issue_meta[ - "date_accepted" - ] = f"{the_date[2]}-{the_date[0]}-{the_date[1]}" - except KeyError as ke: - print("Oops,", package_name, "is missing date_accepted key.") - # Clean markdown url's from editor, and reviewer lines - types = ["editor", "reviewer_1", "reviewer_2"] - user_values = ["github_username", "name"] - for a_type in types: - for user_value in user_values: - issue_meta[a_type][user_value] = ( - issue_meta[a_type][user_value] - .replace("https://github.com/", "") - .replace("[", "") - .replace("]", "") - ) - - review[package_name] = issue_meta + # TODO: this could become a validator + review[package_name][a_date] = issue[ + a_date + ] # self._clean_date(issue[a_date]) + # Get categories and issue review link review[package_name]["categories"] = self.get_categories(body_data) review[package_name]["issue_link"] = issue["url"].replace( "https://api.github.com/repos/", "https://github.com/" ) - # Rename package description & reorder keys - review[package_name]["package_description"] = review[package_name].pop( - "one-line_description_of_package", "" - ) - review[package_name] = { - key: review[package_name][key] - for key in key_order - if review[package_name].get(key) - } + + # # Clean markdown url's from editor, and reviewer lines + # TODO - this could be a reviewer name cleanup validaotr + # types = ["editor", "reviewer_1", "reviewer_2"] + # user_values = ["github_username", "name"] + # for a_type in types: + # for user_value in user_values: + # issue_meta[a_type][user_value] = ( + # issue_meta[a_type][user_value] + # 
.replace("https://github.com/", "") + # .replace("[", "") + # .replace("]", "") + # ) + + # review[package_name] = issue_meta return review @@ -344,20 +418,56 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: return package_name, body_data - def _clean_date(self, date: str) -> str: - """Cleans up a datetime from github and returns a date string""" + # def _clean_date(self, date: str) -> str: + # """Cleans up a datetime from github and returns a date string""" + + # try: + # print(date) + # date_clean = ( + # datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") + # .date() + # .strftime("%Y-%m-%d") + # ) + # except: + # print( + # "date is this", date, "Oops - i need a string to process date" + # ) + # print("setting date to missing") + # date_clean = "missing" + # return date_clean + + def get_gh_metrics( + self, + endpoints: dict[str, str], + reviews: dict[str, dict[str, Any]], + ) -> dict[str, dict[str, Any]]: + """ + Get GitHub metrics for each review based on provided endpoints. - try: - date_clean = ( - datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") - .date() - .strftime("%Y-%m-%d") - ) - except: - print("Oops - i need a string to process date") - print("setting date to missing") - date_clean = "missing" - return date_clean + Parameters: + ---------- + endpoints : dict + A dictionary mapping package names to their GitHub URLs. + reviews : dict + A dictionary containing review data. + + Returns: + ------- + dict + Updated review data with GitHub metrics. + """ + pkg_meta = {} + for pkg_name, url in endpoints.items(): + print("Getting GitHub stats for", pkg_name) + + pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) + + pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) + pkg_meta[pkg_name]["last_commit"] = self.get_last_commit(url) + # Add github meta to review metadata + reviews[pkg_name]["gh_meta"] = pkg_meta[pkg_name] + + return reviews def get_repo_meta(self, url: str, stats_list: list) -> dict: """ @@ -387,7 +497,9 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: for astat in stats_list: stats_dict[astat] = data[astat] stats_dict["documentation"] = stats_dict.pop("homepage") - stats_dict["created_at"] = self._clean_date(stats_dict["created_at"]) + # stats_dict["created_at"] = self._clean_date( + # stats_dict["created_at"] + # ) return stats_dict @@ -428,7 +540,7 @@ def get_last_commit(self, repo: str) -> str: # else "1970-01-01T00:00:00Z" ) - return self._clean_date(date) + return date def get_categories( self, issue_body_list: list[list[str]], fmt: bool = True From d69e1007d2a4bd9cccf039abce687c207d841b16 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Tue, 15 Aug 2023 19:31:15 -0600 Subject: [PATCH 03/12] Fix: moving issues to pydantic and cleanup --- pyproject.toml | 9 ++ .../cli/personmodel-update-contribs.py | 119 ------------------ src/pyosmeta/cli/update_contributors.py | 114 ++++++++++------- src/pyosmeta/cli/update_review_contribs.py | 20 ++- src/pyosmeta/cli/update_reviews.py | 73 +++++------ src/pyosmeta/contributors.py | 14 +-- src/pyosmeta/file_io.py | 6 +- src/pyosmeta/parse_issues.py | 85 ++++++------- 8 files changed, 173 insertions(+), 267 deletions(-) delete mode 100644 src/pyosmeta/cli/personmodel-update-contribs.py diff --git a/pyproject.toml b/pyproject.toml index e646cd9..991b9d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,15 @@ update-reviewers = "pyosmeta.cli.update_review_contribs:main" # Right now i'm not using pdm to add dependencies. 
# Will explore that later # Below using dynamic versioning / which is setuptools scm like +[tool.black] +line-length = 79 +target-version = ['py310'] + +[tool.isort] +profile = "black" +multi_line_output = 3 +py_version = 27 + [tool.flake8] # List of error codes to ignore (comma-separated) ignore = "E203, W503" diff --git a/src/pyosmeta/cli/personmodel-update-contribs.py b/src/pyosmeta/cli/personmodel-update-contribs.py deleted file mode 100644 index 8ff341e..0000000 --- a/src/pyosmeta/cli/personmodel-update-contribs.py +++ /dev/null @@ -1,119 +0,0 @@ -import argparse -import os -import pickle - -import pydantic -from pydantic import ValidationError - -from pyosmeta.contributors import PersonModel, ProcessContributors -from pyosmeta.file_io import clean_export_yml, open_yml_file - -print(pydantic.__version__) -# TODO - fix the website by renaming packages-editor, packages-submitted: -# packages-reviewed: to use underscores. this will just make life easier - - -def main(): - parser = argparse.ArgumentParser( - description="A CLI script to update pyOpenSci contributors" - ) - parser.add_argument( - "--update", - type=str, - help="Will force update contrib info from GitHub for every contributor", - ) - args = parser.parse_args() - - if args: - update_all = True - - base_url = "https://raw.githubusercontent.com/pyOpenSci/" - end_url = "/main/.all-contributorsrc" - repos = [ - "python-package-guide", - "software-peer-review", - "pyopensci.github.io", - "software-review", - "update-web-metadata", - ] - json_files = [base_url + repo + end_url for repo in repos] - - # Get existing contribs from pyopensci.github.io repo (website data) - web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml" - - web_contribs = open_yml_file(web_yaml_path) - - # Populate all existing contribs into model objects - all_contribs = {} - for a_contrib in web_contribs: - try: - if a_contrib["github_username"].lower() == "arianesasso": - print("pause") - all_contribs[a_contrib["github_username"].lower()] = PersonModel( - **a_contrib - ) - except ValidationError as ve: - print(a_contrib["github_username"]) - print(ve) - - print("Done processing all-contribs") - # TODO - maybe add these as an attr in the contribs class? - base_url = "https://raw.githubusercontent.com/pyOpenSci/" - end_url = "/main/.all-contributorsrc" - repos = [ - "python-package-guide", - "software-peer-review", - "pyopensci.github.io", - "software-review", - "update-web-metadata", - ] - json_files = [base_url + repo + end_url for repo in repos] - - # Create a list of all contributors across repositories - process_contribs = ProcessContributors(json_files) - bot_all_contribs = process_contribs.combine_json_data() - - # TODO this is much slower than it should be - print("Updating contrib types and searching for new users now") - # bot_all contris is a dict of x contrib types with an associated list of - # users who contributed to that type. 
- for key, users in bot_all_contribs.items(): - print(key) - for gh_user in users: - # Find and populate data for any new contributors - if gh_user not in all_contribs.keys(): - print("Missing", gh_user, "Adding them now") - new_contrib = process_contribs.get_user_info(gh_user) - all_contribs[gh_user] = PersonModel(**new_contrib) - - # Update contribution type list for all users - existing_contribs = all_contribs[gh_user].contributor_type - all_contribs[ - gh_user - ].contributor_type = process_contribs.update_contrib_list( - existing_contribs, key - ) - - if update_all: - for user in all_contribs.keys(): - print("Updating all user info from github", user) - new_contrib = process_contribs.get_user_info(user) - # Update person's data (should skip update for any text - # with # noupdate flag) - all_contribs[user] = all_contribs[user].update(new_contrib) - - # Export to pickle which supports updates after parsing reviews - with open("all_contribs.pickle", "wb") as f: - pickle.dump(all_contribs, f) - - alist = [] - for key, item in all_contribs.items(): - alist.append(item.model_dump()) - - # Test export - print(os.getcwd()) - clean_export_yml(alist, os.path.join("_data", "contribs.yml")) - - -if __name__ == "__main__": - main() diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 31c29ac..5676033 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -1,35 +1,31 @@ import argparse import pickle -from pyosmeta.contributors import ProcessContributors -from pyosmeta.file_io import clean_export_yml, load_website_yml +import pydantic +from pydantic import ValidationError -# TODO: will this still run in gh actions?? -# TODO: add update=True like i did for update_reviews -# TODO: still need to add a flag to not update specific fields -# TODO: if i use composition and there are helpers in a class -# that are used in a method that i call via composition are the helpers -# still available? +from pyosmeta.contributors import PersonModel, ProcessContributors +from pyosmeta.file_io import create_paths, open_yml_file + +print(pydantic.__version__) +# TODO - fix the website by renaming packages-editor, packages-submitted: +# packages-reviewed: to use underscores. this will just make life easier def main(): - update_all = False parser = argparse.ArgumentParser( description="A CLI script to update pyOpenSci contributors" ) parser.add_argument( "--update", type=str, - help="Will force update contrib info from GitHub for every contributor", + help="Force update contrib info from GitHub for every contributor", ) args = parser.parse_args() if args: update_all = True - # TODO - maybe add these as an attr in the contribs class? 
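
One CLI detail worth a standalone illustration: the scripts in this series gate the refresh on "if args:", but an argparse.Namespace object is always truthy, so update_all gets set on every run whether or not --update was passed. A minimal sketch of the conventional boolean-flag pattern (same option name as the scripts, otherwise illustrative only):

    import argparse

    parser = argparse.ArgumentParser(
        description="A CLI script to update pyOpenSci contributors"
    )
    # store_true yields a real boolean instead of a string value
    parser.add_argument(
        "--update",
        action="store_true",
        help="Force update contrib info from GitHub for every contributor",
    )
    args = parser.parse_args()
    update_all = args.update  # False unless --update was passed
    print(update_all)
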
- base_url = "https://raw.githubusercontent.com/pyOpenSci/" - end_url = "/main/.all-contributorsrc" repos = [ "python-package-guide", "software-peer-review", @@ -37,48 +33,76 @@ def main(): "software-review", "update-web-metadata", ] - json_files = [base_url + repo + end_url for repo in repos] + json_files = create_paths(repos) # Get existing contribs from pyopensci.github.io repo (website data) - web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml" + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + web_yaml_path = ( + base_url + "pyopensci.github.io/main/_data/contributors.yml" + ) - process_contribs = ProcessContributors(json_files) + web_contribs = open_yml_file(web_yaml_path) + + # Populate all existing contribs into model objects + all_contribs = {} + for a_contrib in web_contribs: + try: + all_contribs[a_contrib["github_username"].lower()] = PersonModel( + **a_contrib + ) + except ValidationError as ve: + print(a_contrib["github_username"]) + print(ve) - # Returns a list of dict objects with gh usernames (lowercase) as keys - # TODO: File io module (could just be a function) - web_contribs = load_website_yml(url=web_yaml_path, key="github_username") - bot_all_contribs_dict = process_contribs.combine_json_data() + print("Done processing all-contribs") - # Parse through each user in the web yaml, if they don't exist, add them - # finally - update contrib types - for key, users in bot_all_contribs_dict.items(): + # Create a list of all contributors across repositories + process_contribs = ProcessContributors(json_files) + bot_all_contribs = process_contribs.combine_json_data() + + print("Updating contrib types and searching for new users now") + # bot_all_contribs: keys: contrib_type, value: ghuser contribs + for key, users in bot_all_contribs.items(): for gh_user in users: - # Add any new contributors - if gh_user not in web_contribs.keys(): - print("I found a new contributor! 
Adding:", gh_user)
-                web_contribs.update(
-                    # TODO: this is also used in the other 2 scripts
-                    # but add user info is in the contribs class - i do
-                    # think it belongs there
-                    process_contribs.check_add_user(gh_user, web_contribs)
-                )
-
-            # Update contrib type list
-            existing_contribs = web_contribs[gh_user]["contributor_type"]
-            # TODO: This helper is used in all three scripts but defined
-            # in the contribs class
-            web_contribs[gh_user][
-                "contributor_type"
-            ] = process_contribs.update_contrib_list(existing_contribs, key)
+            # Find and populate data for any new contributors
+            if gh_user not in all_contribs.keys():
+                print("Missing", gh_user, "Adding them now")
+                new_contrib = process_contribs.get_user_info(gh_user)
+                all_contribs[gh_user] = PersonModel(**new_contrib)
+
+            # Update contribution type list for all users
+            existing_contribs = all_contribs[gh_user].contributor_type
+            all_contribs[
+                gh_user
+            ].contributor_type = process_contribs.update_contrib_list(
+                existing_contribs, key
+            )
 
     if update_all:
-        gh_data = process_contribs.get_gh_data(web_contribs)
-        web_contribs = process_contribs.update_contrib_data(web_contribs, gh_data)
+        for user in all_contribs.keys():
+            print("Updating all user info from github", user)
+            new_gh_data = process_contribs.get_user_info(user)
+
+            # TODO: turn this into a small update method
+            existing = all_contribs[user].model_dump()
+
+            for key, item in new_gh_data.items():
+                if key == "mastodon":
+                    # Mastodon isn't available in the api yet
+                    continue
+                # Don't replace the value if there is a noupdate flag
+                # TODO: This approach doesn't work, ruamel-yaml doesn't
+                # preserve inline comments
+                if key == "name" and existing[key]:
+                    continue
+                else:
+                    existing[key] = item
+
+            all_contribs[user] = PersonModel(**existing)
 
-    # Export data
-    # Pickle supports updates after parsing reviews
+    # Export to pickle which supports updates after parsing reviews
     with open("all_contribs.pickle", "wb") as f:
-        pickle.dump(web_contribs, f)
+        pickle.dump(all_contribs, f)
 
 
 if __name__ == "__main__":
diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py
index 13d5413..c4c5aa1 100644
--- a/src/pyosmeta/cli/update_review_contribs.py
+++ b/src/pyosmeta/cli/update_review_contribs.py
@@ -53,12 +53,18 @@ def main():
             if issue_role == "all_current_maintainers":
                 if issue_role in issue_meta:
                     # Loop through each maintainer in the list
-                    for i, a_maintainer in enumerate(issue_meta.get(issue_role)):
-                        gh_user = get_clean_user(a_maintainer["github_username"])
+                    for i, a_maintainer in enumerate(
+                        issue_meta.get(issue_role)
+                    ):
+                        gh_user = get_clean_user(
+                            a_maintainer["github_username"]
+                        )
 
                         if gh_user not in contribs.keys():
                             contribs.update(
-                                updateContribs.check_add_user(gh_user, contribs)
+                                updateContribs.check_add_user(
+                                    gh_user, contribs
+                                )
                             )
 
             # Update contrib packages for peer review
@@ -100,7 +106,9 @@ def main():
 
             if gh_user not in contribs.keys():
                 # If they aren't already in contribs, add them
-                contribs.update(updateContribs.check_add_user(gh_user, contribs))
+                contribs.update(
+                    updateContribs.check_add_user(gh_user, contribs)
+                )
             # Update user package contributions
             (
                 contrib_key,
@@ -123,7 +131,9 @@ def main():
 
             # If user's name is missing in issue, populate from contribs dict
             if issue_meta[issue_role]["name"] == "":
-                packages[pkg_name][issue_role]["name"] = contribs[gh_user]["name"]
+                packages[pkg_name][issue_role]["name"] = contribs[gh_user][
+                    "name"
+                ]
 
     # Export to yaml
     clean_export_yml(contribs, os.path.join("_data", 
"contributors.yml")) diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index 8e632cd..634e7bf 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -16,12 +16,19 @@ # TODO: feature - Would be cool to create an "under review now" list as well - # ideally this could be passed as a CLI argument with the label we want to # search for +# TODO: 1. add gh metadata to the review object +# prior to parsing +# 2. work on update-all!! +# 3. i think package_description might not be parsing right? + import argparse import pickle -from pyosmeta import ProcessIssues -from pyosmeta.file_io import clean_export_yml, load_website_yml +from pydantic import ValidationError + +from pyosmeta import ProcessIssues, ReviewModel +from pyosmeta.file_io import load_website_yml def main(): @@ -37,11 +44,11 @@ def main(): args = parser.parse_args() if args: - update_all = True + update_all = False web_reviews_path = "https://raw.githubusercontent.com/pyOpenSci/pyopensci.github.io/main/_data/packages.yml" - issueProcess = ProcessIssues( + process_review = ProcessIssues( org="pyopensci", repo_name="software-submission", label_name="6/pyOS-approved 🚀🚀🚀", @@ -51,48 +58,34 @@ def main(): web_reviews = load_website_yml(key="package_name", url=web_reviews_path) # Get all issues for approved packages - issues = issueProcess.return_response() - all_accepted_reviews = issueProcess.parse_issue_header(issues, 15) + issues = process_review.return_response() + accepted_reviews = process_review.parse_issue_header(issues, 15) # Parse through reviews, identify new ones, fix case if update_all == True: - for review_key, review_meta in all_accepted_reviews.items(): - web_reviews[review_key.lower()] = review_meta + for key, meta in accepted_reviews.items(): + web_reviews[key.lower()] = meta else: - for review_key, review_meta in all_accepted_reviews.items(): - if review_key.lower() not in web_reviews.keys(): - print("Yay - pyOS has a new package:", review_key) - web_reviews[review_key.lower()] = review_meta + for key, meta in accepted_reviews.items(): + if key.lower() not in web_reviews.keys(): + print("Yay - pyOS has a new package:", key) + web_reviews[key.lower()] = meta # Update gh metrics via api for all packages - repo_endpoints = issueProcess.get_repo_endpoints(web_reviews) - gh_stats = [ - "name", - "description", - "homepage", - "created_at", - "stargazers_count", - "watchers_count", - "forks", - "open_issues_count", - "forks_count", - ] - - # Get gh metadata for each package submission - all_repo_meta = {} - for package_name in repo_endpoints.keys(): - print("Getting GitHub stats for", package_name) - package_api = repo_endpoints[package_name] - all_repo_meta[package_name] = issueProcess.get_repo_meta(package_api, gh_stats) - - all_repo_meta[package_name]["contrib_count"] = issueProcess.get_repo_contribs( - package_api - ) - all_repo_meta[package_name]["last_commit"] = issueProcess.get_last_commit( - package_api - ) - # Add github meta to review metadata - web_reviews[package_name]["gh_meta"] = all_repo_meta[package_name] + repo_endpoints = process_review.get_repo_endpoints(web_reviews) + web_reviews = process_review.get_gh_metrics(repo_endpoints, web_reviews) + + # Finally populate model objects with review data + metrics + # TODO: this is really close - it's erroring when populating date + # i suspect in the github metadata + all_reviews = {} + for key, review in web_reviews.items(): + # First add gh meta to each dict + print("Parsing & validating", key) + 
try: + all_reviews[key] = ReviewModel(**review) + except ValidationError as ve: + print(ve) with open("all_reviews.pickle", "wb") as f: pickle.dump(web_reviews, f) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index f178699..dcabaa8 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -1,18 +1,12 @@ import json import os import re -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union import requests +from dataclasses import dataclass from dotenv import load_dotenv -from pydantic import ( - AliasChoices, - BaseModel, - ConfigDict, - Field, - field_validator, -) +from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator +from typing import Dict, List, Optional, Tuple, Union class PersonModel(BaseModel): @@ -581,7 +575,7 @@ def _check_url(self, url: str) -> bool: try: response = requests.get(url, timeout=6) return response.status_code == 200 - except: + except Exception: print("Oops, url", url, "is not valid, removing it") return False diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index 5eb555d..6a773c1 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -1,8 +1,8 @@ import pickle import urllib.request -from typing import Dict, List, Union import ruamel.yaml +from typing import Dict, List, Union def load_pickle(filename): @@ -153,7 +153,9 @@ def clean_yaml_file(filename): f.write(cleaned_text) -def clean_export_yml(a_dict: Dict[str, Union[str, List[str]]], filename: str) -> None: +def clean_export_yml( + a_dict: Dict[str, Union[str, List[str]]], filename: str +) -> None: """Inputs a dictionary with keys - contribs or packages. It then converse to a list for export, and creates a cleaned YAML file that is jekyll friendly diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 5bb537d..d6ca4eb 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -1,10 +1,9 @@ -from dataclasses import dataclass from datetime import datetime -from typing import Any, Optional import requests -from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, - field_validator) +from dataclasses import dataclass +from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator +from typing import Any, Optional from pyosmeta.contributors import ProcessContributors @@ -29,8 +28,8 @@ def clean_date(a_date: Optional[str]) -> str: .date() .strftime("%Y-%m-%d") ) - except: - print("Oops - missing data. Setting date to missing") + except TypeError as te: + print("Oops - missing data. Setting date to missing", te) return "missing" @@ -152,7 +151,11 @@ def __init__(self, org, repo_name, label_name): @property def api_endpoint(self): - return f"https://api.github.com/repos/{self.org}/{self.repo_name}/issues?labels={self.label_name}&state=all" + url = ( + f"https://api.github.com/repos/{self.org}/{self.repo_name}/" + f"issues?labels={self.label_name}&state=all" + ) + return url # Set up the API endpoint def _get_response(self): @@ -215,7 +218,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]: line_item : list A single list item representing a single line in the issue containing metadata for the review. - This comment is the metadata for the review that the author fills out. + This comment is metadata for the review that the author fills out. 
Returns ------- @@ -235,7 +238,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]: # Add each maintainer to the dict user = aname.split("@") # Clean - user = [self._clean_name(l) for l in user] + user = [self._clean_name(a_str) for a_str in user] a_maint = { "name": self._clean_name(user[0]), "github_username": self._clean_name(user[1]), @@ -334,11 +337,11 @@ def get_issue_meta( Parameters ---------- body_data : list - A list containing all of the body data for the top comment in an issue. + A list containing all body data for the top comment in an issue. end_range : int - The number of lines to parse at the top of the issue (this may change - over time so this variable allows us to have different processing - based upon the date of the issue being opened) + The number of lines to parse at the top of the issue (this may + change over time so this variable allows us to have different + processing based upon the date of the issue being opened) Returns ------- @@ -353,7 +356,9 @@ def get_issue_meta( return issue_meta - def get_repo_endpoints(self, review_issues: dict[str, str]) -> dict[str, str]: + def get_repo_endpoints( + self, review_issues: dict[str, str] + ) -> dict[str, str]: """ Returns a list of repository endpoints @@ -373,7 +378,9 @@ def get_repo_endpoints(self, review_issues: dict[str, str]) -> dict[str, str]: for a_package in review_issues.keys(): repo = review_issues[a_package]["repository_link"].strip("/") owner, repo = repo.split("/")[-2:] - all_repos[a_package] = f"https://api.github.com/repos/{owner}/{repo}" + all_repos[ + a_package + ] = f"https://api.github.com/repos/{owner}/{repo}" return all_repos def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: @@ -395,22 +402,27 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: A list containing the comment elements in order """ - # TODO: this var isn't used - comments_url = issue["comments_url"] body = issue["body"] - # Here sometimes the lines are split with \n, others \r\n - # To clean split on \n but may have to remove the \r + # Clean line breaks (could be done with a regex too) lines = body.split("\n") lines = [a_line.strip("\r").strip() for a_line in lines] # Some users decide to hold the issue titles. 
# For those, clean the markdown bold ** element - lines = [line.replace("**", "").strip() for line in lines if line.strip() != ""] + lines = [ + line.replace("**", "").strip() + for line in lines + if line.strip() != "" + ] # You need a space after : or else it will break https:// in two body_data = [line.split(": ") for line in lines if line.strip() != ""] # Loop through issue header and grab relevant review metadata name_index = next( - (i for i, sublist in enumerate(body_data) if sublist[0] == "Package Name"), + ( + i + for i, sublist in enumerate(body_data) + if sublist[0] == "Package Name" + ), None, ) @@ -418,24 +430,6 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: return package_name, body_data - # def _clean_date(self, date: str) -> str: - # """Cleans up a datetime from github and returns a date string""" - - # try: - # print(date) - # date_clean = ( - # datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") - # .date() - # .strftime("%Y-%m-%d") - # ) - # except: - # print( - # "date is this", date, "Oops - i need a string to process date" - # ) - # print("setting date to missing") - # date_clean = "missing" - # return date_clean - def get_gh_metrics( self, endpoints: dict[str, str], @@ -475,7 +469,7 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: """ stats_dict = {} - # Small script to get the url (normally the docs) and description of a repo! + # get the url (normally the docs) and description of a repo! print(url) response = requests.get( url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"} @@ -551,10 +545,12 @@ def get_categories( Parameters ---------- issue_body_list : list[list[str]] - The first comment from the issue split into lines and then the lines split as by self.parse_comment() + The first comment from the issue split into lines and then the + lines split as by self.parse_comment() fmt : bool - Applies some formatting changes to the categories to match what is required for the website. + Applies some formatting changes to the categories to match what is + required for the website. 
""" # Find the starting index of the category section start_index = None @@ -571,7 +567,7 @@ def get_categories( # If we couldn't find the starting index, return an empty list return [] - # Iterate through the lines starting at the starting index and grab the relevant text + # Iterate through lines and grab the relevant text cat_matches = ["[x]", "[X]"] categories: list[str] = [] for i in range(start_index, len(issue_body_list)): # 30): @@ -587,6 +583,3 @@ def get_categories( if fmt: categories = [c.lower().replace(" ", "-") for c in categories] return categories - - -# https://api.github.com/repos/pyopensci/python-package-guide/commits From 91cab042861fe829fe836053fcc0ad1ef8bf7707 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Thu, 17 Aug 2023 09:21:41 -0600 Subject: [PATCH 04/12] Flake8 format --- src/pyosmeta/cli/update_reviews.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index 634e7bf..bfefc32 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -45,8 +45,10 @@ def main(): if args: update_all = False - - web_reviews_path = "https://raw.githubusercontent.com/pyOpenSci/pyopensci.github.io/main/_data/packages.yml" + web_reviews_path = ( + "htts://raw.githubusercontent.com/pyOpenSci/" + "pyopensci.github.io/main/_data/packages.yml" + ) process_review = ProcessIssues( org="pyopensci", @@ -62,7 +64,7 @@ def main(): accepted_reviews = process_review.parse_issue_header(issues, 15) # Parse through reviews, identify new ones, fix case - if update_all == True: + if update_all: for key, meta in accepted_reviews.items(): web_reviews[key.lower()] = meta else: From 91a61a2f26138f1a373bc39839276085f5a10526 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Thu, 17 Aug 2023 09:27:49 -0600 Subject: [PATCH 05/12] Flake8 format --- src/pyosmeta/contributors.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index dcabaa8..80d1635 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -5,7 +5,13 @@ import requests from dataclasses import dataclass from dotenv import load_dotenv -from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator +from pydantic import ( + AliasChoices, + BaseModel, + ConfigDict, + Field, + field_validator, +) from typing import Dict, List, Optional, Tuple, Union @@ -519,8 +525,8 @@ def add_new_user(self, gh_user: str) -> dict: Returns ------- Dict - Username is the key and the updated github profile info is contained - in the dict. + Username is the key and the updated github profile info is + contained in the dict. 
""" From 1724749ce7fa70a6bbc7f772b76aaf766a548f87 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Thu, 17 Aug 2023 16:44:43 -0600 Subject: [PATCH 06/12] Bug fixes --- .gitignore | 2 +- pyproject.toml | 6 +-- src/pyosmeta/__init__.py | 12 ++--- src/pyosmeta/cli/update_reviews.py | 5 +- src/pyosmeta/file_io.py | 7 ++- src/pyosmeta/parse_issues.py | 74 +++++++++++++++++++----------- 6 files changed, 63 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index 888ee0f..9dad33e 100644 --- a/.gitignore +++ b/.gitignore @@ -41,5 +41,5 @@ dmypy.json token.txt src/test-model.py - +src/pyosmeta/_version_generated.py .pdm-build/* diff --git a/pyproject.toml b/pyproject.toml index 991b9d0..5eb2518 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,9 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", # BE sure to specify that you use python 3.x "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", -version = "0.1.0" -description = "Tools for contributors" -authors = [{ name = "Leah Wasser", email = "leah@pyopensci.org" }] +] + + dependencies = [ "ruamel-yaml>=0.17.21", "requests", diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py index 5b35315..017c97d 100644 --- a/src/pyosmeta/__init__.py +++ b/src/pyosmeta/__init__.py @@ -1,16 +1,14 @@ from .contributors import PersonModel, ProcessContributors from .parse_issues import ProcessIssues, ReviewModel -<<<<<<< HEAD -try: - from ._version_generated import __version__ -except ImportError: - __version__ = "unreleased" -======= __all__ = ( "ProcessIssues", "ReviewModel", "PersonModel", "ProcessContributors", ) ->>>>>>> 007299c (Fix: pyproject tomly flake8/black fix) + +try: + from ._version_generated import __version__ +except ImportError: + __version__ = "unreleased" diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index bfefc32..bf3bbf6 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -46,7 +46,7 @@ def main(): if args: update_all = False web_reviews_path = ( - "htts://raw.githubusercontent.com/pyOpenSci/" + "https://raw.githubusercontent.com/pyOpenSci/" "pyopensci.github.io/main/_data/packages.yml" ) @@ -74,6 +74,7 @@ def main(): web_reviews[key.lower()] = meta # Update gh metrics via api for all packages + # TODO: for some reason cardsort is missing gh_metadata repo_endpoints = process_review.get_repo_endpoints(web_reviews) web_reviews = process_review.get_gh_metrics(repo_endpoints, web_reviews) @@ -87,7 +88,7 @@ def main(): try: all_reviews[key] = ReviewModel(**review) except ValidationError as ve: - print(ve) + print(key, ":", ve) with open("all_reviews.pickle", "wb") as f: pickle.dump(web_reviews, f) diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index 6a773c1..8521477 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -71,8 +71,11 @@ def open_yml_file(file_path: str) -> dict: # TODO: this used to be self.web_yml so i'll need to reorganized # the contrib class - with urllib.request.urlopen(file_path) as f: - return ruamel.yaml.safe_load(f) + try: + with urllib.request.urlopen(file_path) as f: + return ruamel.yaml.safe_load(f) + except urllib.error.URLError as url_error: + print("Oops - can find the url", file_path, url_error) def export_yaml(filename: str, data_list: list): diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index d6ca4eb..02291d0 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -1,8 
+import re
 from datetime import datetime

 import requests
 from dataclasses import dataclass
-from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator
+from pydantic import (
+    AliasChoices,
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_validator,
+)
 from typing import Any, Optional

 from pyosmeta.contributors import ProcessContributors

@@ -15,7 +22,7 @@ def clean_date(a_date: Optional[str]) -> str:
     others it's a gh time stamp. finally sometimes it could be missing
     or text. handle all of those cases with this validator.
     """
-    print(a_date)
+
     if a_date is None or a_date == "missing":
         return "missing"
     elif len(a_date) < 11:
@@ -28,8 +35,8 @@
                 .date()
                 .strftime("%Y-%m-%d")
             )
-        except TypeError as te:
-            print("Oops - missing data. Setting date to missing", te)
+        except TypeError as t_error:
+            print("Oops - missing data. Setting date to missing", t_error)
             return "missing"

@@ -65,25 +72,24 @@ class ReviewModel(BaseModel):
     # Make sure model populates both aliases and original attr name
     model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True)

-    package_name: Optional[str] = None
+    package_name: Optional[str] = ""
     package_description: str = Field(
-        None, validation_alias=AliasChoices("one-line_description_of_package")
+        "", validation_alias=AliasChoices("one-line_description_of_package")
     )
-    submitting_author: dict[str, str] = None
-    all_current_maintainers: list[dict[str, str]] = None
+    submitting_author: dict[str, Optional[str]] = {}
+    all_current_maintainers: list[dict[str, str | None]] = []
     repository_link: Optional[str] = None
     version_submitted: Optional[str] = None
-    categories: Optional[str] = None
-    categories: list[str] = None
-    editor: dict[str, str] = None
-    reviewer_1: dict[str, str] = None
-    reviewer_2: dict[str, str] = None
-    archive: str = None
-    version_accepted: str = None
-    date_accepted: str = None
+    categories: Optional[list[str]] = None
+    editor: dict[str, str | None] = {}
+    reviewer_1: dict[str, str | None] = {}
+    reviewer_2: dict[str, str | None] = {}
+    archive: Optional[str] = None
+    version_accepted: Optional[str] = None
+    date_accepted: Optional[str] = None
     created_at: str = None
     updated_at: str = None
-    closed_at: str = None
+    closed_at: Optional[str] = None
     issue_link: str = None
     gh_meta: GhMeta

@@ -309,6 +315,20 @@ def parse_issue_header(
                 "https://api.github.com/repos/", "https://github.com/"
             )

+            review_clean = {
+                key: value
+                for key, value in review[package_name].items()
+                if not key.startswith("##")
+                and not key.startswith("---")
+                and not key.startswith("-_[x]_i_agree")
+            }
+            review[package_name] = review_clean
+            # filtered = {}
+            # for key, value in review.items():
+            #     print(key)
+            #     if not key.startswith("##") and not key.startswith("-"):
+            #         filtered[key] = value
+
         # # Clean markdown url's from editor, and reviewer lines
         # TODO - this could be a reviewer name cleanup validator
         # types = ["editor", "reviewer_1", "reviewer_2"]
@@ -378,6 +398,10 @@ def get_repo_endpoints(
         for a_package in review_issues.keys():
             repo = review_issues[a_package]["repository_link"].strip("/")
             owner, repo = repo.split("/")[-2:]
+            # TODO: could be simpler code - Remove any link remnants
+            pattern = r"[\(\)\[\]?]"
+            owner = re.sub(pattern, "", owner)
+            repo = re.sub(pattern, "", repo)
             all_repos[
                 a_package
             ] = f"https://api.github.com/repos/{owner}/{repo}"
@@ -453,7 +477,6 @@ def get_gh_metrics(
         pkg_meta = {}
         for pkg_name, url in endpoints.items():
             print("Getting GitHub stats for", pkg_name)
-
pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) @@ -461,7 +484,7 @@ def get_gh_metrics( # Add github meta to review metadata reviews[pkg_name]["gh_meta"] = pkg_meta[pkg_name] - return reviews + return reviews def get_repo_meta(self, url: str, stats_list: list) -> dict: """ @@ -469,8 +492,7 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: """ stats_dict = {} - # get the url (normally the docs) and description of a repo! - print(url) + # Get the url (normally the docs) and description of a repo! response = requests.get( url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"} ) @@ -510,7 +532,7 @@ def get_repo_contribs(self, url: str) -> dict: ) if response.status_code == 404: - print("Can't find: ", url, ". Did the repo url change?") + print("Can't find: ", repo_contribs, ". Did the repo url change?") # Extract the description and homepage URL from the JSON response else: return len(response.json()) @@ -528,11 +550,7 @@ def get_last_commit(self, repo: str) -> str: response = requests.get( url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"} ).json() - date = ( - response[0]["commit"]["author"]["date"] - # if 0 in response - # else "1970-01-01T00:00:00Z" - ) + date = response[0]["commit"]["author"]["date"] return date @@ -575,7 +593,7 @@ def get_categories( checked = any([x in line for x in cat_matches]) if line.startswith("- [") and checked: - category = line[line.index("]") + 2 :] + category = line[line.index("]") + 2] categories.append(category) elif not line.startswith("- ["): break From e8d694b9df75aac97b5886e088fc3cbbc3d45472 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Fri, 18 Aug 2023 15:39:35 -0600 Subject: [PATCH 07/12] Move all code to model based workflow --- src/pyosmeta/cli/update_contributors.py | 2 - src/pyosmeta/cli/update_review_contribs.py | 131 ++++++++++--------- src/pyosmeta/cli/update_reviews.py | 2 +- src/pyosmeta/contributors.py | 145 +++++++++++---------- src/pyosmeta/parse_issues.py | 6 +- 5 files changed, 151 insertions(+), 135 deletions(-) diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 5676033..922e346 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -8,8 +8,6 @@ from pyosmeta.file_io import create_paths, open_yml_file print(pydantic.__version__) -# TODO - fix the website by renaming packages-editor, packages-submitted: -# packages-reviewed: to use underscores. this will just make life easier def main(): diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py index c4c5aa1..79b7f7b 100644 --- a/src/pyosmeta/cli/update_review_contribs.py +++ b/src/pyosmeta/cli/update_review_contribs.py @@ -12,58 +12,59 @@ Rather than hit any api's it just updates information from the issues. To run: update_reviewers -# TODO - FEATURE we have some packages that were NOT approved but we had editors and reviewers. -# We need to acknowledge these people as well. maybe tag them with waiting on maintainer response?? -# TODO: package-wide feature: create a flag for entries that we do not want to update +# TODO - FEATURE we have some packages that were NOT approved but we had +# editors and reviewers. +# We need to acknowledge these people as well. maybe tag them with waiting on +# maintainer response?? 
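(An illustrative aside, not part of the patch: the GitHub API call pattern
that get_repo_meta and get_last_commit above rely on. The repo endpoint and
token placeholder here are assumptions for the example.)

    import requests

    # Endpoint form used throughout: api.github.com/repos/<owner>/<repo>/...
    url = "https://api.github.com/repos/pyOpenSci/pyosmeta/commits"
    response = requests.get(
        url, headers={"Authorization": "token <GITHUB_TOKEN>"}
    )
    # The first element of the response list is the most recent commit
    last_commit_date = response.json()[0]["commit"]["author"]["date"]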
+# TODO: package-wide feature: create no update flag for entries # TODO: make sure we can add a 3rd or 4th reviewer - crowsetta has this as # will biocypher # TODO: make sure to add a current editor boolean to the current editors and # emeritus ones. -# TODO - ?create a class for person types?? """ -import os - -from pyosmeta.contributors import ProcessContributors +# TODO - running into validation errors again here.. but making lots +# of progress!! +from pyosmeta.contributors import PersonModel, ProcessContributors from pyosmeta.file_io import clean_export_yml, load_pickle -def get_clean_user(username: str): +def get_clean_user(username: str) -> str: + """A small helper that removes whitespace and ensures username is + lower case""" return username.lower().strip() def main(): # TODO: move refresh contribs and contribs dict attr to # processContribs and remove this module altogether - updateContribs = ProcessContributors([]) + process_contribs = ProcessContributors([]) # Two pickle files are outputs of the two other scripts # use that data to limit web calls - contribs = load_pickle("all_contribs.pickle") - - # Output of update_reviews.py + all_contribs = load_pickle("all_contribs.pickle") packages = load_pickle("all_reviews.pickle") - contrib_types = updateContribs.contrib_types + contrib_types = process_contribs.contrib_types for pkg_name, issue_meta in packages.items(): print("Processing review team for:", pkg_name) for issue_role in contrib_types.keys(): if issue_role == "all_current_maintainers": - if issue_role in issue_meta: + if issue_meta.all_current_maintainers: # Loop through each maintainer in the list for i, a_maintainer in enumerate( - issue_meta.get(issue_role) + issue_meta.all_current_maintainers ): gh_user = get_clean_user( a_maintainer["github_username"] ) - if gh_user not in contribs.keys(): - contribs.update( - updateContribs.check_add_user( - gh_user, contribs + if gh_user not in all_contribs.keys(): + all_contribs.update( + process_contribs.check_add_user( + gh_user, all_contribs ) ) @@ -71,73 +72,79 @@ def main(): ( contrib_key, pkg_list, - ) = updateContribs.refresh_contribs( - contribs[gh_user], - pkg_name, # new contribs + ) = process_contribs.refresh_contribs( + all_contribs[gh_user], + pkg_name, # new all_contribs issue_role, ) # Update users contrib list - contribs[gh_user][contrib_key] = pkg_list + setattr(all_contribs[gh_user], contrib_key, pkg_list) - _, contrib_list = updateContribs.refresh_contribs( - contribs[gh_user], + _, contrib_list = process_contribs.refresh_contribs( + all_contribs[gh_user], None, issue_role, ) - contribs[gh_user]["contributor_type"] = contrib_list - # If name is missing in issue summary, populate from contribs - if a_maintainer["name"] == "": - packages[pkg_name]["all_current_maintainers"][i][ - "name" - ] = contribs[gh_user]["name"] + setattr( + all_contribs[gh_user], + "contributor_type", + contrib_list, + ) + # If name is missing in issue summary, populate from + # all_contribs + # TODO: this is currently not working as maintainer is + # a string object + if a_maintainer["name"] == "": + maintainer = getattr( + packages[pkg_name], "all_current_maintainers" + )[i]["name"] + setattr( + packages[pkg_name], + "all_current_maintainers", + getattr(all_contribs[gh_user], "name"), + ) else: print( - "All maintainers is missing in the review for ", + "All maintainers is missing in the review for:", pkg_name, ) else: # Else we are processing editors, reviewers... 
gh_user = get_clean_user( - packages[pkg_name][issue_role]["github_username"] + getattr(packages[pkg_name], issue_role)["github_username"] ) - if gh_user not in contribs.keys(): - # If they aren't already in contribs, add them - contribs.update( - updateContribs.check_add_user(gh_user, contribs) - ) - # Update user package contributions - ( - contrib_key, - pkg_list, - ) = updateContribs.refresh_contribs( - contribs[gh_user], - pkg_name, # new contribs - issue_role, - ) + if gh_user not in all_contribs.keys(): + # If they aren't already in all_contribs, add them + print("Found a new user!", gh_user) + new_contrib = process_contribs.get_user_info(gh_user) + all_contribs[gh_user] = PersonModel(**new_contrib) - # Update users contrib list - contribs[gh_user][contrib_key] = pkg_list - - _, contrib_list = updateContribs.refresh_contribs( - contribs[gh_user], - None, - issue_role, + # Update user package contributions + print(gh_user) + # Only add new contrib if it's unique + review_key = contrib_types[issue_role][0] + all_contribs[gh_user].add_unique_value(review_key, pkg_name) + + # Update user contrib list + review_roles = contrib_types[issue_role][1] + all_contribs[gh_user].add_unique_value( + "contributor_type", review_roles ) - contribs[gh_user]["contributor_type"] = contrib_list - # If users's name is missing in issue, populate from contribs dict - if issue_meta[issue_role]["name"] == "": - packages[pkg_name][issue_role]["name"] = contribs[gh_user][ - "name" - ] + # If users's name is missing in issue, populate from contribs + if getattr(issue_meta, issue_role)["name"] == "": + attribute_value = getattr(packages[pkg_name], issue_role) + attribute_value["name"] = getattr( + all_contribs[gh_user], "name" + ) # Export to yaml - clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) - clean_export_yml(packages, os.path.join("_data", "packages.yml")) + # clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) + # clean_export_yml(packages, os.path.join("_data", "packages.yml")) if __name__ == "__main__": diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index bf3bbf6..8909a1f 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -91,7 +91,7 @@ def main(): print(key, ":", ve) with open("all_reviews.pickle", "wb") as f: - pickle.dump(web_reviews, f) + pickle.dump(all_reviews, f) if __name__ == "__main__": diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 80d1635..1ad4a1b 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -12,12 +12,16 @@ Field, field_validator, ) -from typing import Dict, List, Optional, Tuple, Union +from typing import Set, Dict, List, Optional, Tuple, Union class PersonModel(BaseModel): # Make sure model populates both aliases and original attr name - model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True) + model_config = ConfigDict( + populate_by_name=True, + str_strip_whitespace=True, + validate_assignment=True, + ) name: Optional[str] = None title: Optional[Union[list[str], str]] = None @@ -44,47 +48,67 @@ class PersonModel(BaseModel): None, validation_alias=AliasChoices("blog", "website") ) board: Optional[bool] = False - contributor_type: Optional[list[str]] = [] - packages_editor: Optional[list[str | None]] = Field( - None, - validation_alias=AliasChoices("packages-editor"), - ) - packages_submitted: Optional[list[str | None]] = Field( - None, - validation_alias=AliasChoices( - 
"packages-submitted", "packages_submitted" - ), - ) - packages_reviewed: Optional[list[str | None]] = Field( - None, - validation_alias=AliasChoices( - "packages-reviewed", "packages_reviewed" - ), - ) + contributor_type: Set[str] = set() + packages_editor: Set[str] = set() + packages_submitted: Set[str] = set() + packages_reviewed: Set[str] = set() location: Optional[str] = None - email: Optional[str] = ModuleNotFoundError + email: Optional[str] = None @field_validator( "packages_reviewed", "packages_submitted", "packages_editor", + "contributor_type", mode="before", ) - @classmethod - def string_to_list(cls, value): - """ - For fields such as packages-reviewed edited etc we want - a list of elements not just a single string. this will - fix that issue. - """ - # If the input value is a string, convert it to a list + def convert_to_set(cls, value): if isinstance(value, list): - return value - if isinstance(value, str): - return [value] - # If the input value is None, return an empty list + if value[0] is None: + return set() + else: + return set(value) elif value is None: - return [] + return set() + return set(value) + + def add_unique_value(self, attr_name: str, values: Union[str, list[str]]): + """A helper that will add only unique values to an existing list""" + if isinstance(values, str): + values = [values] + attribute = getattr(self, attr_name) + if isinstance(attribute, set): + attribute.update(values) + else: + raise ValueError(f"{attr_name} is not a set attribute") + + # @field_validator( + # "packages_reviewed", + # "packages_submitted", + # "packages_editor", + # "contributor_type", + # mode="before", + # ) + # @classmethod + # def string_to_list(cls, value): + # """ + # For fields such as packages-reviewed edited etc we want + # a list of elements not just a single string. this will + # fix that issue. + # """ + # # If the input value is a string, convert it to a list + # print("the value is", value) + # if isinstance(value, list): + # print("removing duplicates now") + # return list(set(value)) + # if isinstance(value, str): + # print("Found a string, turning to list") + # return [value] + # # If the input value is None, return an empty list + # # This may never happen + # elif value is None: + # print("Found a none, returning empty list. Value is", value) + # return [] @field_validator("bio", mode="before") @classmethod @@ -131,15 +155,15 @@ def __init__(self, json_files: List) -> None: ] self.contrib_types = { - "reviewer_1": ["packages-reviewed", ["reviewer", "peer-review"]], - "reviewer_2": ["packages-reviewed", ["reviewer", "peer-review"]], - "editor": ["packages-editor", ["editor", "peer-review"]], + "reviewer_1": ["packages_reviewed", ["reviewer", "peer-review"]], + "reviewer_2": ["packages_reviewed", ["reviewer", "peer-review"]], + "editor": ["packages_editor", ["editor", "peer-review"]], "submitting_author": [ - "packages-submitted", + "packages_submitted", ["maintainer", "submitting-author", "peer-review"], ], "all_current_maintainers": [ - "packages-submitted", + "packages_submitted", ["maintainer", "peer-review"], ], } @@ -156,7 +180,14 @@ def get_token(self) -> str: load_dotenv() return os.environ["GITHUB_TOKEN"] - def refresh_contribs(self, contribs: Dict, new_contribs, review_role): + def refresh_contribs( + self, + person: PersonModel, + new_contribs: Optional[ + str + ], # I think this will always be a package name? if so rename is pkg_name + review_role: str, + ): """Need to add .... 
         Parameters
         ----------


         Returns
         -------
         """
         contrib_types = self.contrib_types
-        contrib_key_yml = ""
         # Contributor type will be updated which is a list of roles
+        # TODO rename contribs to person
         if new_contribs:
             contrib_key_yml = contrib_types[review_role][0]
-            existing_contribs = contribs[contrib_key_yml]
-        # Else this is a specific review role meant to update package list
+            existing_contribs = getattr(person, contrib_key_yml)
+
         else:
-            new_contribs = contrib_types[review_role][1]
-            existing_contribs = contribs["contributor_type"]
+            # Else update review role(s) in contrib_type attribute
+            contrib_key_yml = contrib_types[review_role][1]
+            existing_contribs = person.contributor_type

         final_list = self.update_contrib_list(existing_contribs, new_contribs)
         return (contrib_key_yml, final_list)

-    # TODO: this can go away now that i have a personmodel obj
-    # def create_contrib_template(self) -> Dict:
-    #     """A small helper that creates a template for a new contributor
-    #     that we are adding to our contributor.yml file"""
-
-    #     return {
-    #         "name": "",
-    #         "bio": "",
-    #         "organization": "",
-    #         "title": "",
-    #         "github_username": "",
-    #         "github_image_id": "",
-    #         "editorial-board": "",
-    #         "twitter": "",
-    #         "mastodon": "",
-    #         "orcidid": "",
-    #         "website": "",
-    #         "contributor_type": [],
-    #         "packages-editor": [],
-    #         "packages-submitted": [],
-    #         "packages-reviewed": [],
-    #         "location": "",
-    #         "email": "",
-    #     }
-
     # TODO - This utility is used across all scripts.
     def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
         """Helper function that takes an input object as a list or string.
diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index 02291d0..fdc50c7 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -70,7 +70,11 @@ def clean_date(cls, a_date: Optional[str]) -> str:

 class ReviewModel(BaseModel):
     # Make sure model populates both aliases and original attr name
-    model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True)
+    model_config = ConfigDict(
+        populate_by_name=True,
+        str_strip_whitespace=True,
+        validate_assignment=True,
+    )

     package_name: Optional[str] = ""
     package_description: str = Field(

From 68900cf0e38253cdc3c304a2918490bd2eaf9939 Mon Sep 17 00:00:00 2001
From: Leah Wasser
Date: Fri, 18 Aug 2023 18:11:36 -0600
Subject: [PATCH 08/12] Fixed validation and updates are working now

---
 src/pyosmeta/cli/update_contributors.py    |   3 +
 src/pyosmeta/cli/update_review_contribs.py | 118 +++++++++------------
 src/pyosmeta/cli/update_reviews.py         |  10 +-
 src/pyosmeta/contributors.py               |  38 ++-----
 4 files changed, 68 insertions(+), 101 deletions(-)

diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py
index 922e346..716d1fa 100644
--- a/src/pyosmeta/cli/update_contributors.py
+++ b/src/pyosmeta/cli/update_contributors.py
@@ -9,6 +9,9 @@

 print(pydantic.__version__)

+# TODO - https://stackoverflow.com/questions/55762673/how-to-parse-list-of-models-with-pydantic
+# I can use TypeAdapter to convert the json data to model objects!
+

 def main():
     parser = argparse.ArgumentParser(
diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py
index 79b7f7b..b111545 100644
--- a/src/pyosmeta/cli/update_review_contribs.py
+++ b/src/pyosmeta/cli/update_review_contribs.py
@@ -24,8 +24,14 @@

 """

-# TODO - running into validation errors again here.. but making lots
-# of progress!!
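(An illustrative aside, not part of the patch: the TypeAdapter idea from the
TODO added to update_contributors.py above. A minimal sketch, assuming `data`
is the list of contributor dicts loaded from the JSON/web sources.)

    from pydantic import TypeAdapter

    adapter = TypeAdapter(list[PersonModel])
    # Validates every dict in one call, aggregating errors,
    # instead of looping over PersonModel(**a_contrib)
    people = adapter.validate_python(data)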
+# TODO - Case sensitivity is an issue with my validation using set +# - jointly +# - Jointly +# - devicely +# - Devicely +# - sevivi +import os + from pyosmeta.contributors import PersonModel, ProcessContributors from pyosmeta.file_io import clean_export_yml, load_pickle @@ -43,7 +49,7 @@ def main(): # Two pickle files are outputs of the two other scripts # use that data to limit web calls - all_contribs = load_pickle("all_contribs.pickle") + contribs = load_pickle("all_contribs.pickle") packages = load_pickle("all_reviews.pickle") contrib_types = process_contribs.contrib_types @@ -51,87 +57,56 @@ def main(): for pkg_name, issue_meta in packages.items(): print("Processing review team for:", pkg_name) for issue_role in contrib_types.keys(): + # I wonder if there is a clever way to skip review if this is missing? if issue_role == "all_current_maintainers": - if issue_meta.all_current_maintainers: - # Loop through each maintainer in the list - for i, a_maintainer in enumerate( - issue_meta.all_current_maintainers - ): - gh_user = get_clean_user( - a_maintainer["github_username"] - ) - - if gh_user not in all_contribs.keys(): - all_contribs.update( - process_contribs.check_add_user( - gh_user, all_contribs - ) - ) - - # Update contrib packages for peer review - ( - contrib_key, - pkg_list, - ) = process_contribs.refresh_contribs( - all_contribs[gh_user], - pkg_name, # new all_contribs - issue_role, - ) - # Update users contrib list - setattr(all_contribs[gh_user], contrib_key, pkg_list) - - _, contrib_list = process_contribs.refresh_contribs( - all_contribs[gh_user], - None, - issue_role, + # if issue_meta.all_current_maintainers: + # Loop through each maintainer in the list + for i, a_maintainer in enumerate( + issue_meta.all_current_maintainers + ): + gh_user = get_clean_user(a_maintainer["github_username"]) + + if gh_user not in contribs.keys(): + contribs.update( + process_contribs.check_add_user(gh_user, contribs) ) - setattr( - all_contribs[gh_user], - "contributor_type", - contrib_list, - ) + # Update user package contributions (if it's unique) + review_key = contrib_types[issue_role][0] + contribs[gh_user].add_unique_value(review_key, pkg_name) - # If name is missing in issue summary, populate from - # all_contribs - # TODO: this is currently not working as maintainer is - # a string object - if a_maintainer["name"] == "": - maintainer = getattr( - packages[pkg_name], "all_current_maintainers" - )[i]["name"] - setattr( - packages[pkg_name], - "all_current_maintainers", - getattr(all_contribs[gh_user], "name"), - ) - else: - print( - "All maintainers is missing in the review for:", - pkg_name, + # Update user contrib list (if it's unique) + review_roles = contrib_types[issue_role][1] + contribs[gh_user].add_unique_value( + "contributor_type", review_roles ) + # If name is missing in issue, populate from contribs + if a_maintainer["name"] == "": + name = getattr(contribs[gh_user], "name") + packages[pkg_name].all_current_maintainers[i][ + "name" + ] = name + else: # Else we are processing editors, reviewers... 
gh_user = get_clean_user( getattr(packages[pkg_name], issue_role)["github_username"] ) - if gh_user not in all_contribs.keys(): - # If they aren't already in all_contribs, add them + if gh_user not in contribs.keys(): + # If they aren't already in contribs, add them print("Found a new user!", gh_user) new_contrib = process_contribs.get_user_info(gh_user) - all_contribs[gh_user] = PersonModel(**new_contrib) + contribs[gh_user] = PersonModel(**new_contrib) - # Update user package contributions - print(gh_user) - # Only add new contrib if it's unique + # Update user package contributions (if it's unique) review_key = contrib_types[issue_role][0] - all_contribs[gh_user].add_unique_value(review_key, pkg_name) + contribs[gh_user].add_unique_value(review_key, pkg_name) - # Update user contrib list + # Update user contrib list (if it's unique) review_roles = contrib_types[issue_role][1] - all_contribs[gh_user].add_unique_value( + contribs[gh_user].add_unique_value( "contributor_type", review_roles ) @@ -139,12 +114,17 @@ def main(): if getattr(issue_meta, issue_role)["name"] == "": attribute_value = getattr(packages[pkg_name], issue_role) attribute_value["name"] = getattr( - all_contribs[gh_user], "name" + contribs[gh_user], "name" ) + print("Export") # Export to yaml - # clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) - # clean_export_yml(packages, os.path.join("_data", "packages.yml")) + contribs_ls = [model.model_dump() for model in contribs.values()] + # Getting error dumping packages + pkgs_ls = [model.model_dump() for model in packages.values()] + + clean_export_yml(contribs_ls, os.path.join("_data", "contributors.yml")) + clean_export_yml(pkgs_ls, os.path.join("_data", "packages.yml")) if __name__ == "__main__": diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index 8909a1f..5196f86 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -16,10 +16,6 @@ # TODO: feature - Would be cool to create an "under review now" list as well - # ideally this could be passed as a CLI argument with the label we want to # search for -# TODO: 1. add gh metadata to the review object -# prior to parsing -# 2. work on update-all!! -# 3. i think package_description might not be parsing right? import argparse @@ -30,6 +26,12 @@ from pyosmeta import ProcessIssues, ReviewModel from pyosmeta.file_io import load_website_yml +# todo - dates are wrong in issues + +# date_accepted: 5-2023-7 +# created_at: 01-2023-03 +# updated_at: 27-2023-07 + def main(): update_all = False diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 1ad4a1b..65fe87f 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -10,9 +10,10 @@ BaseModel, ConfigDict, Field, + field_serializer, field_validator, ) -from typing import Set, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple, Union class PersonModel(BaseModel): @@ -82,33 +83,14 @@ def add_unique_value(self, attr_name: str, values: Union[str, list[str]]): else: raise ValueError(f"{attr_name} is not a set attribute") - # @field_validator( - # "packages_reviewed", - # "packages_submitted", - # "packages_editor", - # "contributor_type", - # mode="before", - # ) - # @classmethod - # def string_to_list(cls, value): - # """ - # For fields such as packages-reviewed edited etc we want - # a list of elements not just a single string. this will - # fix that issue. 
- # """ - # # If the input value is a string, convert it to a list - # print("the value is", value) - # if isinstance(value, list): - # print("removing duplicates now") - # return list(set(value)) - # if isinstance(value, str): - # print("Found a string, turning to list") - # return [value] - # # If the input value is None, return an empty list - # # This may never happen - # elif value is None: - # print("Found a none, returning empty list. Value is", value) - # return [] + @field_serializer( + "packages_reviewed", + "packages_submitted", + "packages_editor", + "contributor_type", + ) + def serialize_set(self, items: Set[str]): + return list(items) @field_validator("bio", mode="before") @classmethod From fd6c5c96dda8d7dec73f9d5b3a43d98a6e64b6af Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Fri, 18 Aug 2023 20:00:11 -0600 Subject: [PATCH 09/12] Fix: dates and deprecated methods --- src/pyosmeta/cli/update_contributors.py | 23 ++++--- src/pyosmeta/cli/update_review_contribs.py | 10 ++-- src/pyosmeta/cli/update_reviews.py | 2 +- src/pyosmeta/contributors.py | 70 ++-------------------- src/pyosmeta/parse_issues.py | 2 +- 5 files changed, 29 insertions(+), 78 deletions(-) diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 716d1fa..356dc5c 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -9,11 +9,13 @@ print(pydantic.__version__) -# TODO - https://stackoverflow.com/questions/55762673/how-to-parse-list-of-models-with-pydantic +# TODO - https://stackoverflow.com +# /questions/55762673/how-to-parse-list-of-models-with-pydantic # I can use TypeAdapter to convert the json data to model objects! def main(): + update_all = False parser = argparse.ArgumentParser( description="A CLI script to update pyOpenSci contributors" ) @@ -72,12 +74,17 @@ def main(): all_contribs[gh_user] = PersonModel(**new_contrib) # Update contribution type list for all users - existing_contribs = all_contribs[gh_user].contributor_type - all_contribs[ - gh_user - ].contributor_type = process_contribs.update_contrib_list( - existing_contribs, key - ) + all_contribs[gh_user].add_unique_value("contributor_type", key) + + # existing_contribs = all_contribs[gh_user].contributor_type + # # TODO: i can move all of these update items to just use the + # # personmodel.add_unique_value then i can get rid of update + # # contrib list + # all_contribs[ + # gh_user + # ].contributor_type = process_contribs.update_contrib_list( + # existing_contribs, key + # ) if update_all: for user in all_contribs.keys(): @@ -89,7 +96,7 @@ def main(): for key, item in new_gh_data.items(): if key == "mastodon": - # Mastodon isn't available in the api yet + # Mastodon isn't available in the GH api yet continue # Don't replace the value if there is a noupdate flag # TODO: This approach doesn't work, ruemal-yaml doesn't diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py index b111545..d2faf63 100644 --- a/src/pyosmeta/cli/update_review_contribs.py +++ b/src/pyosmeta/cli/update_review_contribs.py @@ -57,9 +57,7 @@ def main(): for pkg_name, issue_meta in packages.items(): print("Processing review team for:", pkg_name) for issue_role in contrib_types.keys(): - # I wonder if there is a clever way to skip review if this is missing? 
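(An illustrative aside, not part of the patch: the set round trip that
PersonModel uses above, shown on a toy model. The before-mode validator
dedupes and lower-cases incoming lists; the serializer dumps the set back
to a list so the YAML export stays clean.)

    from pydantic import BaseModel, field_serializer, field_validator

    class Tags(BaseModel):  # hypothetical stand-in for PersonModel
        contributor_type: set[str] = set()

        @field_validator("contributor_type", mode="before")
        @classmethod
        def to_set(cls, value):
            # None -> empty set; lists are lower-cased and deduped
            if value is None:
                return set()
            return {v.lower() for v in value}

        @field_serializer("contributor_type")
        def as_list(self, items: set[str]):
            # Sets aren't YAML/JSON friendly; export a sorted list
            return sorted(items)

    Tags(contributor_type=["Editor", "editor"]).model_dump()
    # -> {'contributor_type': ['editor']}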
            if issue_role == "all_current_maintainers":
-                # if issue_meta.all_current_maintainers:
                 # Loop through each maintainer in the list
                 for i, a_maintainer in enumerate(
                     issue_meta.all_current_maintainers
@@ -73,7 +71,9 @@ def main():

                     # Update user package contributions (if it's unique)
                     review_key = contrib_types[issue_role][0]
-                    contribs[gh_user].add_unique_value(review_key, pkg_name)
+                    contribs[gh_user].add_unique_value(
+                        review_key, pkg_name.lower()
+                    )

                     # Update user contrib list (if it's unique)
                     review_roles = contrib_types[issue_role][1]
@@ -102,7 +102,9 @@ def main():

                 # Update user package contributions (if it's unique)
                 review_key = contrib_types[issue_role][0]
-                contribs[gh_user].add_unique_value(review_key, pkg_name)
+                contribs[gh_user].add_unique_value(
+                    review_key, pkg_name.lower()
+                )

                 # Update user contrib list (if it's unique)
                 review_roles = contrib_types[issue_role][1]
diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py
index 8909a1f..92e39aa 100644
--- a/src/pyosmeta/cli/update_reviews.py
+++ b/src/pyosmeta/cli/update_reviews.py
@@ -46,7 +46,7 @@ def main():
     args = parser.parse_args()

     if args:
-        update_all = False
+        update_all = True
     web_reviews_path = (
         "https://raw.githubusercontent.com/pyOpenSci/"
         "pyopensci.github.io/main/_data/packages.yml"
diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py
index 65fe87f..c2e798c 100644
--- a/src/pyosmeta/contributors.py
+++ b/src/pyosmeta/contributors.py
@@ -63,15 +63,18 @@ class PersonModel(BaseModel):
         "contributor_type",
         mode="before",
     )
-    def convert_to_set(cls, value):
+    def convert_to_set(cls, value: list[str]):
         if isinstance(value, list):
-            if value[0] is None:
+            if not value:
+                return set()
+            elif value[0] is None:
                 return set()
             else:
+                value = [aval.lower() for aval in value]
                 return set(value)
         elif value is None:
             return set()
-        return set(value)
+        return {value.lower()}

     def add_unique_value(self, attr_name: str, values: Union[str, list[str]]):
         """A helper that will add only unique values to an existing list"""
@@ -162,38 +165,6 @@ def get_token(self) -> str:
         load_dotenv()
         return os.environ["GITHUB_TOKEN"]

-    def refresh_contribs(
-        self,
-        person: PersonModel,
-        new_contribs: Optional[
-            str
-        ],  # I think this will always be a package name? if so rename is pkg_name
-        review_role: str,
-    ):
-        """Need to add ....
-
-        Parameters
-        ----------
-
-
-        Returns
-        -------
-        """
-        contrib_types = self.contrib_types
-        # Contributor type will be updated which is a list of roles
-        # TODO rename contribs to person
-        if new_contribs:
-            contrib_key_yml = contrib_types[review_role][0]
-            existing_contribs = getattr(person, contrib_key_yml)
-
-        else:
-            # Else update review role(s) in contrib_type attribute
-            contrib_key_yml = contrib_types[review_role][1]
-            existing_contribs = person.contributor_type
-
-        final_list = self.update_contrib_list(existing_contribs, new_contribs)
-        return (contrib_key_yml, final_list)
-
     # TODO - This utility is used across all scripts.
     def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
         """Helper function that takes an input object as a list or string.
@@ -235,35 +206,6 @@ def unique_new_vals(
             default = (True, diff)
         return default

-    # TODO - also a helper used by all scripts
-    def update_contrib_list(
-        self,
-        existing_contribs: Union[List, str],
-        new_contrib: Union[List, str],
-    ) -> List:
-        """Method that gets an existing list of contribs.
-        cleans the list and then checks the list against a
-        new contribution to see if it should be added.
-
-        Parameters
-        ----------
-        existing_contribs: list or str
-            A users existing contributions
-        new_contrib: list or str
-            a list or a single new contribution to be added
-
-        """
-
-        # Cleanup first
-        cleaned_list = self.clean_list(existing_contribs)
-        new_contrib = self.clean_list(new_contrib)
-
-        unique_vals, new_vals = self.unique_new_vals(cleaned_list, new_contrib)
-        if unique_vals:
-            cleaned_list += new_vals
-
-        return cleaned_list
-
     def check_contrib_type(self, json_file: str):
         """
         Determine the type of contribution the person
diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index fdc50c7..bce610e 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -27,7 +27,7 @@
         return "missing"
     elif len(a_date) < 11:
         new_date = a_date.replace("/", "-").split("-")
-        return f"{new_date[2]}-{new_date[0]}-{new_date[1]}"
+        return f"{new_date[0]}-{new_date[1]}-{new_date[2]}"
     else:
         try:
             return (

From 158141e8cd0442c341f28943aa2cf4a153ef118e Mon Sep 17 00:00:00 2001
From: Leah Wasser
Date: Sat, 19 Aug 2023 18:22:11 -0600
Subject: [PATCH 10/12] Fix: more bugs and cleanup

---
 pyproject.toml                                |   8 +-
 src/pyosmeta/cli/process_reviews.py           | 110 ++++++++++
 ...iew_contribs.py => update_review_teams.py} |  21 +-
 src/pyosmeta/cli/update_reviews.py            | 100 ---------
 src/pyosmeta/contributors.py                  | 194 +++++++++---------
 src/pyosmeta/parse_issues.py                  | 143 ++++++++-----
 6 files changed, 315 insertions(+), 261 deletions(-)
 create mode 100644 src/pyosmeta/cli/process_reviews.py
 rename src/pyosmeta/cli/{update_review_contribs.py => update_review_teams.py} (89%)
 delete mode 100644 src/pyosmeta/cli/update_reviews.py

diff --git a/pyproject.toml b/pyproject.toml
index 5eb2518..f80ed4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,8 +59,8 @@ license = { text = "MIT" }
 # for a user to run directly from the package.
 [project.scripts] # Optional
 update-contributors = "pyosmeta.cli.update_contributors:main"
-update-reviews = "pyosmeta.cli.update_reviews:main"
-update-reviewers = "pyosmeta.cli.update_review_contribs:main"
+process-reviews = "pyosmeta.cli.process_reviews:main"
+update-review-teams = "pyosmeta.cli.update_review_teams:main"


 # Right now i'm not using pdm to add dependencies.
@@ -77,9 +77,7 @@ py_version = 27

 [tool.flake8]
 # List of error codes to ignore (comma-separated)
-ignore = "E203, W503"
+ignore = ["E203", "W503"]

-[tool.pdm]

 [tool.pdm.build]
diff --git a/src/pyosmeta/cli/process_reviews.py b/src/pyosmeta/cli/process_reviews.py
new file mode 100644
index 0000000..a5862a8
--- /dev/null
+++ b/src/pyosmeta/cli/process_reviews.py
@@ -0,0 +1,110 @@
+"""
+Script that parses metadata from an issue and adds it to a yml file for the
+website. It also grabs some of the package metadata such as stars,
+last commit, etc.
+
+Output: packages.yml file containing a list of
+ 1. all packages with accepted reviews
+ 2. information related to the review including reviewers, editors
+ 3. basic package stats including stars, etc.
+ +To run at the CLI: parse_issue_metadata +""" + +# TODO: if we export files we might want packages.yml and then under_review.yml +# thus we'd want to add a second input parameters which was file_name +# TODO: feature - Would be cool to create an "under review now" list as well - +# ideally this could be passed as a CLI argument with the label we want to +# search for + + +# import argparse +import pickle + +from pydantic import ValidationError + +from pyosmeta import ProcessIssues, ReviewModel + +# from pyosmeta.file_io import load_website_yml + + +# TODO: change the template to ask for date accepted format year-month-day + + +def main(): + # update_all = False + # parser = argparse.ArgumentParser( + # description="A CLI script to update pyOpenSci reviews" + # ) + # parser.add_argument( + # "--update", + # type=str, + # help="Will force update review info from GitHub for every review", + # ) + # args = parser.parse_args() + + # if args: + # update_all = True + # web_reviews_path = ( + # "https://raw.githubusercontent.com/pyOpenSci/" + # "pyopensci.github.io/main/_data/packages.yml" + # ) + + process_review = ProcessIssues( + org="pyopensci", + repo_name="software-submission", + label_name="6/pyOS-approved 🚀🚀🚀", + ) + + # Open web yaml & return dict with package name as key + # web_reviews = load_website_yml(key="package_name", url=web_reviews_path) + + # Get all issues for approved packages - load as dict + issues = process_review.return_response() + accepted_reviews = process_review.parse_issue_header(issues, 45) + + # TODO: clean out extra fields from accepted reviews?? + + # Parse through reviews, identify new ones, fix case + # TODO - right now i've reverted back to always updating all reviews. + # Is there a use-case to only update a new package vs updating everything? + # if update_all: + # all_reviews = {} + # for key, meta in accepted_reviews.items(): + # try: + # all_reviews[key.lower()] = ReviewModel(**meta) + # except ValidationError as ve: + # print(ve) + + # else: + # for key, meta in accepted_reviews.items(): + # if key.lower() not in all_reviews.keys(): + # print("Yay - pyOS has a new package:", key) + # all_reviews[key.lower()] = ReviewModel(**meta) + + # Update gh metrics via api for all packages + # TODO: this is working but above i made everything a model object + # do i want to do that above or just do it all at once below? 
+ repo_endpoints = process_review.get_repo_endpoints(accepted_reviews) + all_reviews = process_review.get_gh_metrics( + repo_endpoints, accepted_reviews + ) + + # Finally populate model objects with review data + metrics + # TODO: this is really close - it's erroring when populating date + # i suspect in the github metadata + final_reviews = {} + for key, review in all_reviews.items(): + # First add gh meta to each dict + print("Parsing & validating", key) + try: + final_reviews[key] = ReviewModel(**review) + except ValidationError as ve: + print(key, ":", ve) + + with open("all_reviews.pickle", "wb") as f: + pickle.dump(final_reviews, f) + + +if __name__ == "__main__": + main() diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_teams.py similarity index 89% rename from src/pyosmeta/cli/update_review_contribs.py rename to src/pyosmeta/cli/update_review_teams.py index d2faf63..31d72f7 100644 --- a/src/pyosmeta/cli/update_review_contribs.py +++ b/src/pyosmeta/cli/update_review_teams.py @@ -32,7 +32,11 @@ # - sevivi import os -from pyosmeta.contributors import PersonModel, ProcessContributors +from pyosmeta.contributors import ( + PersonModel, + ProcessContributors, + ValidationError, +) from pyosmeta.file_io import clean_export_yml, load_pickle @@ -65,9 +69,12 @@ def main(): gh_user = get_clean_user(a_maintainer["github_username"]) if gh_user not in contribs.keys(): - contribs.update( - process_contribs.check_add_user(gh_user, contribs) - ) + print("Found a new user!", gh_user) + new_contrib = process_contribs.get_user_info(gh_user) + try: + contribs[gh_user] = PersonModel(**new_contrib) + except ValidationError as ve: + print(ve) # Update user package contributions (if it's unique) review_key = contrib_types[issue_role][0] @@ -98,7 +105,10 @@ def main(): # If they aren't already in contribs, add them print("Found a new user!", gh_user) new_contrib = process_contribs.get_user_info(gh_user) - contribs[gh_user] = PersonModel(**new_contrib) + try: + contribs[gh_user] = PersonModel(**new_contrib) + except ValidationError as ve: + print(ve) # Update user package contributions (if it's unique) review_key = contrib_types[issue_role][0] @@ -119,7 +129,6 @@ def main(): contribs[gh_user], "name" ) - print("Export") # Export to yaml contribs_ls = [model.model_dump() for model in contribs.values()] # Getting error dumping packages diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py deleted file mode 100644 index 92e39aa..0000000 --- a/src/pyosmeta/cli/update_reviews.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Script that parses metadata from na issue and adds it to a yml file for the -website. It also grabs some of the package metadata such as stars, -last commit, etc. - -Output: packages.yml file containing a list of - 1. all packages with accepted reviews - 2. information related to the review including reviewers, editors - 3. basic package stats including stars, etc. 
- -To run at the CLI: parse_issue_metadata -""" - -# TODO: if we export files we might want packages.yml and then under_review.yml -# thus we'd want to add a second input parameters which was file_name -# TODO: feature - Would be cool to create an "under review now" list as well - -# ideally this could be passed as a CLI argument with the label we want to -# search for - - -import argparse -import pickle - -from pydantic import ValidationError - -from pyosmeta import ProcessIssues, ReviewModel -from pyosmeta.file_io import load_website_yml - -# todo - dates are wrong in issues - -# date_accepted: 5-2023-7 -# created_at: 01-2023-03 -# updated_at: 27-2023-07 - - -def main(): - update_all = False - parser = argparse.ArgumentParser( - description="A CLI script to update pyOpenSci reviews" - ) - parser.add_argument( - "--update", - type=str, - help="Will force update review info from GitHub for every review", - ) - args = parser.parse_args() - - if args: - update_all = True - web_reviews_path = ( - "https://raw.githubusercontent.com/pyOpenSci/" - "pyopensci.github.io/main/_data/packages.yml" - ) - - process_review = ProcessIssues( - org="pyopensci", - repo_name="software-submission", - label_name="6/pyOS-approved 🚀🚀🚀", - ) - - # Open web yaml & return dict with package name as key - web_reviews = load_website_yml(key="package_name", url=web_reviews_path) - - # Get all issues for approved packages - issues = process_review.return_response() - accepted_reviews = process_review.parse_issue_header(issues, 15) - - # Parse through reviews, identify new ones, fix case - if update_all: - for key, meta in accepted_reviews.items(): - web_reviews[key.lower()] = meta - else: - for key, meta in accepted_reviews.items(): - if key.lower() not in web_reviews.keys(): - print("Yay - pyOS has a new package:", key) - web_reviews[key.lower()] = meta - - # Update gh metrics via api for all packages - # TODO: for some reason cardsort is missing gh_metadata - repo_endpoints = process_review.get_repo_endpoints(web_reviews) - web_reviews = process_review.get_gh_metrics(repo_endpoints, web_reviews) - - # Finally populate model objects with review data + metrics - # TODO: this is really close - it's erroring when populating date - # i suspect in the github metadata - all_reviews = {} - for key, review in web_reviews.items(): - # First add gh meta to each dict - print("Parsing & validating", key) - try: - all_reviews[key] = ReviewModel(**review) - except ValidationError as ve: - print(key, ":", ve) - - with open("all_reviews.pickle", "wb") as f: - pickle.dump(all_reviews, f) - - -if __name__ == "__main__": - main() diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index c2e798c..3e82e60 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -56,6 +56,24 @@ class PersonModel(BaseModel): location: Optional[str] = None email: Optional[str] = None + # # TODO - turn this into a validator for the user website + # def _check_url(self, url: str) -> bool: + # """Test a url and return true if it works, false if not + + # Parameters + # ---------- + # url : str + # String for a url to a website to test. 
+ + # """ + + # try: + # response = requests.get(url, timeout=6) + # return response.status_code == 200 + # except Exception: + # print("Oops, url", url, "is not valid, removing it") + # return False + @field_validator( "packages_reviewed", "packages_submitted", @@ -236,23 +254,26 @@ def check_contrib_type(self, json_file: str): contrib_type = "community" return contrib_type - def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None: - """Check to make sure user exists in the existing contrib data. If they - don't' exist, add them + # TODO possibly could repurpose this as a check in the code + # but it should return get_user_info + # def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None: + # """Check to make sure user exists in the existing contrib data. If + # they + # don't' exist, add them - Parameters - ---------- - gh_user : str - github username - contribs: dict - A dictionary containing contributors with gh user being the key + # Parameters + # ---------- + # gh_user : str + # github username + # contribs: dict + # A dictionary containing contributors with gh user being the key - This returns the updated dictionary with a new user at the end. + # This returns the updated dictionary with a new user at the end. - """ - if gh_user not in contribs.keys(): - print("Missing user", gh_user, "adding them now.") - return self.add_new_user(gh_user) + # """ + # if gh_user not in contribs.keys(): + # print("Missing user", gh_user, "adding them now.") + # return self.get_user_info(gh_user) def load_json(self, json_path: str) -> dict: """ @@ -440,34 +461,35 @@ def combine_users(self, repoDict: dict, webDict: dict) -> dict: webDict[gh_user] = repoDict[gh_user] return webDict - def add_new_user(self, gh_user: str) -> dict: - """Add a new user to the contrib file using gh username + # # TODO: i think i can remove this method + # def add_new_user(self, gh_user: str) -> dict: + # """Add a new user to the contrib file using gh username - This method does a few things. - 1. Adds a new template entry for the user w no values populated - 2. Gets user metadata from the user's github profile - 3. Updates their contrib entry with the gh data + # This method does a few things. + # 1. Adds a new template entry for the user w no values populated + # 2. Gets user metadata from the user's github profile + # 3. Updates their contrib entry with the gh data - Parameters - ---------- - gh_user : str - String representing the GitHub username + # Parameters + # ---------- + # gh_user : str + # String representing the GitHub username - Returns - ------- - Dict - Username is the key and the updated github profile info is - contained in the dict. + # Returns + # ------- + # Dict + # Username is the key and the updated github profile info is + # contained in the dict. 
- """ + # """ - new = {} - # Rather than this template i can use the person_model - new[gh_user] = self.create_contrib_template() - gh_data = self.get_gh_data([gh_user]) - # Update their metadata in the dict and return - updated_data = self.update_contrib_data(new, gh_data) - return updated_data + # new = {} + # # Rather than this template i can use the person_model + # new[gh_user] = self.create_contrib_template() + # gh_data = self.get_gh_data([gh_user]) + # # Update their metadata in the dict and return + # updated_data = self.update_contrib_data(new, gh_data) + # return updated_data def get_gh_data( self, contribs: Union[Dict[str, str], List] @@ -499,63 +521,47 @@ def get_gh_data( all_user_info[gh_user] = self.get_user_info(gh_user, aname) return all_user_info - def _check_url(self, url: str) -> bool: - """Test a url and return true if it works, false if not - - Parameters - ---------- - url : str - String for a url to a website to test. - - """ - - try: - response = requests.get(url, timeout=6) - return response.status_code == 200 - except Exception: - print("Oops, url", url, "is not valid, removing it") - return False - - def update_contrib_data(self, contrib_data: dict, gh_data: dict): - """Update contributor data from the GH API return. - - Use the GitHub API to grab user profile data such as twitter handle, - mastodon, website, email and location and update contributor - information. GitHub profile data is the source of truth source for - contributor metadata. - - Parameters - ---------- - contrib_data : dict - A dict containing contributor data to be updated - gh_data : dict - Updated contributor data pulled from github API - - Returns - ------- - dict - Dictionary containing updated contributor data. - """ - - for i, gh_name in enumerate(contrib_data.keys()): - print(i, gh_name) - # Update the key:value pairs for data pulled from GitHub - for akey in self.update_keys: - if akey == "website": - url = gh_data[gh_name][gh_name][akey] - # Fix the url format and check to see if it works online - url = self.format_url(url) - # It url is valid, add to dict - if self._check_url(url): - contrib_data[gh_name][akey] = url - else: - contrib_data[gh_name][akey] = "" - else: - contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][ - akey - ] - - return contrib_data + # Shouldn't need this anymore with pydantic + # def update_contrib_data(self, contrib_data: dict, gh_data: dict): + # """Update contributor data from the GH API return. + + # Use the GitHub API to grab user profile data such as twitter handle, + # mastodon, website, email and location and update contributor + # information. GitHub profile data is the source of truth source for + # contributor metadata. + + # Parameters + # ---------- + # contrib_data : dict + # A dict containing contributor data to be updated + # gh_data : dict + # Updated contributor data pulled from github API + + # Returns + # ------- + # dict + # Dictionary containing updated contributor data. 
+ # """ + + # for i, gh_name in enumerate(contrib_data.keys()): + # print(i, gh_name) + # # Update the key:value pairs for data pulled from GitHub + # for akey in self.update_keys: + # if akey == "website": + # url = gh_data[gh_name][gh_name][akey] + # # Fix the url format and check to see if it works online + # url = self.format_url(url) + # # It url is valid, add to dict + # if self._check_url(url): + # contrib_data[gh_name][akey] = url + # else: + # contrib_data[gh_name][akey] = "" + # else: + # contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][ + # akey + # ] + + # return contrib_data def format_url(self, url: str) -> str: """Append https to the beginning of URL if it doesn't exist diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index bce610e..8bea3ec 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -25,9 +25,9 @@ def clean_date(a_date: Optional[str]) -> str: if a_date is None or a_date == "missing": return "missing" - elif len(a_date) < 11: - new_date = a_date.replace("/", "-").split("-") - return f"{new_date[0]}-{new_date[1]}-{new_date[2]}" + # elif len(a_date) < 11: + # new_date = a_date.replace("/", "-").split("-") + # return f"{new_date[0]}-{new_date[1]}-{new_date[2]}" else: try: return ( @@ -95,10 +95,29 @@ class ReviewModel(BaseModel): updated_at: str = None closed_at: Optional[str] = None issue_link: str = None - gh_meta: GhMeta + joss: Optional[str] = None + gh_meta: Optional[GhMeta] = None @field_validator( "date_accepted", + mode="before", + ) + @classmethod + def clean_date_review(cls, a_date: Optional[str]) -> str: + """Clean a manually added datetime that is added to a review by an + editor when the review package is accepted. + + """ + if a_date is None or a_date in ["missing", "TBD"]: + return "missing" + else: + new_date = a_date.replace("/", "-").split("-") + if len(new_date[0]) == 4: + return f"{new_date[0]}-{new_date[1]}-{new_date[2]}" + else: + return f"{new_date[2]}-{new_date[0]}-{new_date[1]}" + + @field_validator( "created_at", "updated_at", "closed_at", @@ -114,6 +133,27 @@ def clean_date(cls, a_date: Optional[str]) -> str: return clean_date(a_date) + @field_validator( + "editor", + "reviewer_1", + "reviewer_2", + mode="before", + ) + @classmethod + def clean_gh_url(cls, user: dict[str, str]) -> dict[str, str]: + """Remove markdown link remnants from gh usernames and name. + + Sometimes editors and reviewers add names using github links. + Remove the link data. + """ + + user["github_username"] = user["github_username"].replace( + "https://github.com/", "" + ) + user["name"] = re.sub(r"\[|\]", "", user["name"]) + + return user + @dataclass class ProcessIssues: @@ -274,7 +314,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]: return meta def parse_issue_header( - self, issues: list[str], total_lines: int = 15 + self, issues: list[str], total_lines: int = 20 ) -> dict[str, str]: """ A function that parses through the header of an issue. @@ -287,7 +327,7 @@ def parse_issue_header( metadata at the top of each issue total_lines : int an integer representing the total number of lines to parse in the - issue header. Default = 15 + issue header. 

 @dataclass
 class ProcessIssues:
@@ -274,7 +314,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]:
         return meta

     def parse_issue_header(
-        self, issues: list[str], total_lines: int = 15
+        self, issues: list[str], total_lines: int = 20
     ) -> dict[str, str]:
         """
         A function that parses through the header of an issue.
@@ -287,7 +327,7 @@ def parse_issue_header(
             metadata at the top of each issue
         total_lines : int
             an integer representing the total number of lines to parse in the
-            issue header. Default = 15
+            issue header. Default = 20

         Returns
         -------
@@ -301,32 +341,29 @@ def parse_issue_header(
         review = {}
         for issue in issues:
-            package_name, body_data = self.parse_comment(issue)
-            if not package_name:
+            pkg_name, body_data = self.parse_comment(issue)
+            if not pkg_name:
                 continue
             # Index of 15 should include date accepted in the review meta
-            review[package_name] = self.get_issue_meta(body_data, total_lines)
-            # Add issue open and close date to package meta
-            # Created, opened & closed dates are in GitHub Issue response
+            review[pkg_name] = self.get_issue_meta(body_data, total_lines)
+            # Add issue open and close date to package meta from GH response
+            # Date cleaning happens via a pydantic validator, not here
             for a_date in meta_dates:
-                # TODO: this could become a validator
-                review[package_name][a_date] = issue[
-                    a_date
-                ]  # self._clean_date(issue[a_date])
+                review[pkg_name][a_date] = issue[a_date]
             # Get categories and issue review link
-            review[package_name]["categories"] = self.get_categories(body_data)
-            review[package_name]["issue_link"] = issue["url"].replace(
+            review[pkg_name]["categories"] = self.get_categories(body_data)
+            review[pkg_name]["issue_link"] = issue["url"].replace(
                 "https://api.github.com/repos/", "https://github.com/"
             )
             review_clean = {
                 key: value
-                for key, value in review[package_name].items()
+                for key, value in review[pkg_name].items()
                 if not key.startswith("##")
                 and not key.startswith("---")
                 and not key.startswith("-_[x]_i_agree")
             }
-            review[package_name] = review_clean
+            review[pkg_name] = review_clean
         # filtered = {}
         # for key, value in review.items():
         #     print(key)
@@ -346,7 +383,7 @@ def parse_issue_header(
         #         .replace("]", "")
         #     )

-        #     review[package_name] = issue_meta
+        #     review[pkg_name] = issue_meta

         return review

@@ -394,7 +431,7 @@ def get_repo_endpoints(
         Returns
         -------
         Dict
-            Containing package_name: endpoint for each review.
+            Containing pkg_name: endpoint for each review.

         """

@@ -424,7 +461,7 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]:

         Returns
         -------
-        package_name : str
+        pkg_name : str
             The name of the package
         comment : list
             A list containing the comment elements in order
@@ -454,9 +491,9 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]:
             None,
         )

-        package_name = body_data[name_index][1] if name_index else None
+        pkg_name = body_data[name_index][1] if name_index else None

-        return package_name, body_data
+        return pkg_name, body_data

     def get_gh_metrics(
         self,
@@ -559,14 +596,14 @@ def get_last_commit(self, repo: str) -> str:
         return date

     def get_categories(
-        self, issue_body_list: list[list[str]], fmt: bool = True
+        self, issue_list: list[list[str]], fmt: bool = True
     ) -> list[str]:
         """Parse through a pyOS review issue and grab categories associated
         with a package

         Parameters
         ----------
-        issue_body_list : list[list[str]]
+        issue_list : list[list[str]]
             The first comment from the issue split into lines and then the
             lines split as by self.parse_comment()

             required for the website.
         """
         # Find the starting index of the category section
-        start_index = None
-        for i in range(len(issue_body_list)):
-            if issue_body_list[i][0].startswith("- Please indicate which"):
-                start_index = i + 1
-                break
-        # NOTE - some issues have line after that startswith "Check out our"
-        # For those issues advance i += 1
-        if issue_body_list[start_index][0].startswith("Check out our"):
-            start_index += 1
-
-        if start_index is None:
-            # If we couldn't find the starting index, return an empty list
-            return []
-
-        # Iterate through lines and grab the relevant text
-        cat_matches = ["[x]", "[X]"]
-        categories: list[str] = []
-        for i in range(start_index, len(issue_body_list)):  # 30):
-            line = issue_body_list[i][0].strip()
-            checked = any([x in line for x in cat_matches])
-
-            if line.startswith("- [") and checked:
-                category = line[line.index("]") + 2]
-                categories.append(category)
-            elif not line.startswith("- ["):
-                break
-
-        if fmt:
-            categories = [c.lower().replace(" ", "-") for c in categories]
-        return categories
+        try:
+            index = next(
+                i
+                for i, sublist in enumerate(issue_list)
+                if "## Scope" in sublist
+            )
+            # Iterate from scope index to first line starting with "- ["
+            # To find the list of category check boxes
+            cat_index = None
+            for i in range(index + 1, len(issue_list)):
+                if issue_list[i] and issue_list[i][0].startswith("- ["):
+                    cat_index = i
+                    break
+        except StopIteration:
+            print("'## Scope' not found in the list.")
+            return []
+
+        # Guard against a scope section that has no checkbox lines below it
+        if cat_index is None:
+            return []
+
+        # Get checked categories for package
+        cat_list = issue_list[cat_index : cat_index + 10]
+        categories = [
+            re.sub(r"- \[[xX]\] ", "", item[0])
+            for item in cat_list
+            if re.search(r"- \[[xX]\] ", item[0])
+        ]
+
+        return [item.lower().replace("[^1]", "") for item in categories]
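To make the rewritten get_categories() above concrete, here is the same extraction run against a made-up issue body. The nested single-item lists mimic the shape that parse_comment() produces; the category names are invented for the example:

import re

issue_list = [
    ["## Scope"],
    ["- Please indicate which category or categories this package falls under:"],
    ["- [x] Data processing/munging"],
    ["- [ ] Data visualization"],
    ["- [X] Data retrieval"],
]

# Find "## Scope", then the first checkbox line after it
scope = next(i for i, sub in enumerate(issue_list) if "## Scope" in sub)
cat_index = next(
    i
    for i in range(scope + 1, len(issue_list))
    if issue_list[i] and issue_list[i][0].startswith("- [")
)
# Keep only checked boxes and strip the checkbox markup
categories = [
    re.sub(r"- \[[xX]\] ", "", item[0])
    for item in issue_list[cat_index : cat_index + 10]
    if re.search(r"- \[[xX]\] ", item[0])
]
print(categories)  # ['Data processing/munging', 'Data retrieval']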
""" # Find the starting index of the category section - start_index = None - for i in range(len(issue_body_list)): - if issue_body_list[i][0].startswith("- Please indicate which"): - start_index = i + 1 - break - # NOTE - some issues have line after that startswith "Check out our" - # For those issues advance i += 1 - if issue_body_list[start_index][0].startswith("Check out our"): - start_index += 1 - - if start_index is None: - # If we couldn't find the starting index, return an empty list - return [] - - # Iterate through lines and grab the relevant text - cat_matches = ["[x]", "[X]"] - categories: list[str] = [] - for i in range(start_index, len(issue_body_list)): # 30): - line = issue_body_list[i][0].strip() - checked = any([x in line for x in cat_matches]) - - if line.startswith("- [") and checked: - category = line[line.index("]") + 2] - categories.append(category) - elif not line.startswith("- ["): - break - - if fmt: - categories = [c.lower().replace(" ", "-") for c in categories] - return categories + try: + index = next( + i + for i, sublist in enumerate(issue_list) + if "## Scope" in sublist + ) + # Iterate from scope index to first line starting with " - [" + # To find list of category check boxes + for i in range(index + 1, len(issue_list)): + if issue_list[i] and issue_list[i][0].startswith("- ["): + cat_index = i + break + except StopIteration: + print("'## Scope' not found in the list.") + + # Get checked categories for package + cat_list = issue_list[cat_index : cat_index + 10] + categories = [ + re.sub(r"- \[[xX]\] ", "", item[0]) + for item in cat_list + if re.search(r"- \[[xX]\] ", item[0]) + ] + + return [item.lower().replace("[^1]", "") for item in categories] From 4fcb559e71b9c141102ab206ccb63aa534e93925 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Sun, 20 Aug 2023 12:17:12 -0600 Subject: [PATCH 11/12] Fix: add more url validators and cleanup --- .flake8 | 2 + pyproject.toml | 5 +- src/pyosmeta/cli/update_contributors.py | 15 +--- src/pyosmeta/cli/update_review_teams.py | 8 +- src/pyosmeta/contributors.py | 102 ++++++++++++++---------- src/pyosmeta/parse_issues.py | 5 +- 6 files changed, 71 insertions(+), 66 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8434b4b --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E203, W503 diff --git a/pyproject.toml b/pyproject.toml index f80ed4b..05a51da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,9 +75,10 @@ profile = "black" multi_line_output = 3 py_version = 27 +# Precommit ignores this config so i added a .flake8 file +# but why did it ignore it? 
[tool.flake8] -# List of error codes to ignore (comma-separated) -ignore = ["E203", "W503"] +extend-ignore = ["E203", "W503"] [tool.pdm.build] diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 356dc5c..4a3f13a 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -25,8 +25,9 @@ def main(): help="Force update contrib info from GitHub for every contributor", ) args = parser.parse_args() + update_value = args.update - if args: + if update_value: update_all = True repos = [ @@ -49,6 +50,7 @@ def main(): # Populate all existing contribs into model objects all_contribs = {} for a_contrib in web_contribs: + print(a_contrib) try: all_contribs[a_contrib["github_username"].lower()] = PersonModel( **a_contrib @@ -64,7 +66,6 @@ def main(): bot_all_contribs = process_contribs.combine_json_data() print("Updating contrib types and searching for new users now") - # bot_all_contribs: keys: contrib_type, value: ghuser contribs for key, users in bot_all_contribs.items(): for gh_user in users: # Find and populate data for any new contributors @@ -76,16 +77,6 @@ def main(): # Update contribution type list for all users all_contribs[gh_user].add_unique_value("contributor_type", key) - # existing_contribs = all_contribs[gh_user].contributor_type - # # TODO: i can move all of these update items to just use the - # # personmodel.add_unique_value then i can get rid of update - # # contrib list - # all_contribs[ - # gh_user - # ].contributor_type = process_contribs.update_contrib_list( - # existing_contribs, key - # ) - if update_all: for user in all_contribs.keys(): print("Updating all user info from github", user) diff --git a/src/pyosmeta/cli/update_review_teams.py b/src/pyosmeta/cli/update_review_teams.py index 31d72f7..b24f6d5 100644 --- a/src/pyosmeta/cli/update_review_teams.py +++ b/src/pyosmeta/cli/update_review_teams.py @@ -32,11 +32,9 @@ # - sevivi import os -from pyosmeta.contributors import ( - PersonModel, - ProcessContributors, - ValidationError, -) +from pydantic import ValidationError + +from pyosmeta.contributors import PersonModel, ProcessContributors from pyosmeta.file_io import clean_export_yml, load_pickle diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 3e82e60..712a98d 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -16,7 +16,59 @@ from typing import Dict, List, Optional, Set, Tuple, Union -class PersonModel(BaseModel): +class UrlValidatorMixin: + # Check fields is false because this is being inherited by two diff classes + @field_validator( + "website", "documentation", mode="before", check_fields=False + ) + @classmethod + def format_url(cls, url: str) -> str: + """Append https to the beginning of URL if it doesn't exist & cleanup + If the url doesn't have https add it + If the url starts with http change it to https + Else do nothing + + Parameters + ---------- + url : str + String representing the url grabbed from the GH api + + """ + + if not url: + return url # Returns empty string if url is empty + else: + if url.startswith("http://"): + print(f"{url} 'http://' replacing w 'https://'") + url = url.replace("http://", "https://") + elif not url.startswith("http"): + print("Oops, missing http") + url = "https://" + url + if cls._check_url(url=url): + return url + else: + return None + + @staticmethod + def _check_url(url: str) -> bool: + """Test url. 
Return true if there's a valid response, False if not + + Parameters + ---------- + url : str + String for a url to a website to test. + + """ + + try: + response = requests.get(url, timeout=6) + return response.status_code == 200 + except Exception: + print("Oops, url", url, "is not valid, removing it") + return False + + +class PersonModel(BaseModel, UrlValidatorMixin): # Make sure model populates both aliases and original attr name model_config = ConfigDict( populate_by_name=True, @@ -56,24 +108,6 @@ class PersonModel(BaseModel): location: Optional[str] = None email: Optional[str] = None - # # TODO - turn this into a validator for the user website - # def _check_url(self, url: str) -> bool: - # """Test a url and return true if it works, false if not - - # Parameters - # ---------- - # url : str - # String for a url to a website to test. - - # """ - - # try: - # response = requests.get(url, timeout=6) - # return response.status_code == 200 - # except Exception: - # print("Oops, url", url, "is not valid, removing it") - # return False - @field_validator( "packages_reviewed", "packages_submitted", @@ -81,6 +115,7 @@ class PersonModel(BaseModel): "contributor_type", mode="before", ) + @classmethod def convert_to_set(cls, value: list[str]): if isinstance(value, list): if not value: @@ -88,7 +123,7 @@ def convert_to_set(cls, value: list[str]): elif value[0] is None: return set() else: - value = [aval.lower() for aval in value] + value = [a_val.lower() for a_val in value] return set(value) elif value is None: return set() @@ -111,7 +146,9 @@ def add_unique_value(self, attr_name: str, values: Union[str, list[str]]): "contributor_type", ) def serialize_set(self, items: Set[str]): - return list(items) + """This is a serializer that runs on export. It ensures sets are + converted to lists""" + return sorted(list(items)) @field_validator("bio", mode="before") @classmethod @@ -562,26 +599,3 @@ def get_gh_data( # ] # return contrib_data - - def format_url(self, url: str) -> str: - """Append https to the beginning of URL if it doesn't exist - If the url doesn't have https add it - If the url starts with http change it to https - Else do nothing - - Parameters - ---------- - url : str - String representing the url grabbed from the GH api - - """ - if not url: - return url # returns empty string if url is empty - elif url.startswith("https://"): - return url - elif url.startswith("http://"): - print("Fixing", url, "https://" + url[7:]) - return "https://" + url[7:] - else: - print("Missing https://, adding to ", url) - return "https://" + url diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 8bea3ec..8267998 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -12,7 +12,7 @@ ) from typing import Any, Optional -from pyosmeta.contributors import ProcessContributors +from pyosmeta.contributors import ProcessContributors, UrlValidatorMixin def clean_date(a_date: Optional[str]) -> str: @@ -40,7 +40,7 @@ def clean_date(a_date: Optional[str]) -> str: return "missing" -class GhMeta(BaseModel): +class GhMeta(BaseModel, UrlValidatorMixin): name: str description: str created_at: str @@ -517,7 +517,6 @@ def get_gh_metrics( """ pkg_meta = {} for pkg_name, url in endpoints.items(): - print("Getting GitHub stats for", pkg_name) pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) From 2a1922797f52abfa73dd36c2a6290090cd01c84d Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Sun, 20 Aug 2023 
12:36:02 -0600 Subject: [PATCH 12/12] Fix: cleanup unused code and comments --- pyproject.toml | 2 +- src/pyosmeta/__init__.py | 2 + src/pyosmeta/cli/process_reviews.py | 51 +------- src/pyosmeta/cli/update_contributors.py | 2 +- src/pyosmeta/cli/update_review_teams.py | 12 -- src/pyosmeta/contributors.py | 150 +----------------------- 6 files changed, 8 insertions(+), 211 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 05a51da..6e8e431 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ license = { text = "MIT" } # for a user to run directly from the package. [project.scripts] # Optional update-contributors = "pyosmeta.cli.update_contributors:main" -process-reviews = "pyosmeta.cli.process_reviews:main" +update-reviews = "pyosmeta.cli.process_reviews:main" update-review-teams = "pyosmeta.cli.update_review_teams:main" diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py index 017c97d..e4d2c24 100644 --- a/src/pyosmeta/__init__.py +++ b/src/pyosmeta/__init__.py @@ -1,6 +1,8 @@ from .contributors import PersonModel, ProcessContributors from .parse_issues import ProcessIssues, ReviewModel +# Trick suggested by flake8 maintainer to ensure the imports above don't +# get flagged as being "unused" __all__ = ( "ProcessIssues", "ReviewModel", diff --git a/src/pyosmeta/cli/process_reviews.py b/src/pyosmeta/cli/process_reviews.py index a5862a8..6756c63 100644 --- a/src/pyosmeta/cli/process_reviews.py +++ b/src/pyosmeta/cli/process_reviews.py @@ -17,82 +17,33 @@ # ideally this could be passed as a CLI argument with the label we want to # search for - -# import argparse import pickle from pydantic import ValidationError from pyosmeta import ProcessIssues, ReviewModel -# from pyosmeta.file_io import load_website_yml - - # TODO: change the template to ask for date accepted format year-month-day def main(): - # update_all = False - # parser = argparse.ArgumentParser( - # description="A CLI script to update pyOpenSci reviews" - # ) - # parser.add_argument( - # "--update", - # type=str, - # help="Will force update review info from GitHub for every review", - # ) - # args = parser.parse_args() - - # if args: - # update_all = True - # web_reviews_path = ( - # "https://raw.githubusercontent.com/pyOpenSci/" - # "pyopensci.github.io/main/_data/packages.yml" - # ) - process_review = ProcessIssues( org="pyopensci", repo_name="software-submission", label_name="6/pyOS-approved 🚀🚀🚀", ) - # Open web yaml & return dict with package name as key - # web_reviews = load_website_yml(key="package_name", url=web_reviews_path) - # Get all issues for approved packages - load as dict issues = process_review.return_response() accepted_reviews = process_review.parse_issue_header(issues, 45) - # TODO: clean out extra fields from accepted reviews?? - - # Parse through reviews, identify new ones, fix case - # TODO - right now i've reverted back to always updating all reviews. - # Is there a use-case to only update a new package vs updating everything? 
- # if update_all: - # all_reviews = {} - # for key, meta in accepted_reviews.items(): - # try: - # all_reviews[key.lower()] = ReviewModel(**meta) - # except ValidationError as ve: - # print(ve) - - # else: - # for key, meta in accepted_reviews.items(): - # if key.lower() not in all_reviews.keys(): - # print("Yay - pyOS has a new package:", key) - # all_reviews[key.lower()] = ReviewModel(**meta) - # Update gh metrics via api for all packages - # TODO: this is working but above i made everything a model object - # do i want to do that above or just do it all at once below? repo_endpoints = process_review.get_repo_endpoints(accepted_reviews) all_reviews = process_review.get_gh_metrics( repo_endpoints, accepted_reviews ) - # Finally populate model objects with review data + metrics - # TODO: this is really close - it's erroring when populating date - # i suspect in the github metadata + # Populate model objects with review data + metrics final_reviews = {} for key, review in all_reviews.items(): # First add gh meta to each dict diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 4a3f13a..b0e1bb2 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -50,7 +50,7 @@ def main(): # Populate all existing contribs into model objects all_contribs = {} for a_contrib in web_contribs: - print(a_contrib) + print(a_contrib["github_username"]) try: all_contribs[a_contrib["github_username"].lower()] = PersonModel( **a_contrib diff --git a/src/pyosmeta/cli/update_review_teams.py b/src/pyosmeta/cli/update_review_teams.py index b24f6d5..5ff10b0 100644 --- a/src/pyosmeta/cli/update_review_teams.py +++ b/src/pyosmeta/cli/update_review_teams.py @@ -19,17 +19,8 @@ # TODO: package-wide feature: create no update flag for entries # TODO: make sure we can add a 3rd or 4th reviewer - crowsetta has this as # will biocypher -# TODO: make sure to add a current editor boolean to the current editors and -# emeritus ones. 
""" - -# TODO - Case sensitivity is an issue with my validation using set -# - jointly -# - Jointly -# - devicely -# - Devicely -# - sevivi import os from pydantic import ValidationError @@ -45,8 +36,6 @@ def get_clean_user(username: str) -> str: def main(): - # TODO: move refresh contribs and contribs dict attr to - # processContribs and remove this module altogether process_contribs = ProcessContributors([]) # Two pickle files are outputs of the two other scripts @@ -129,7 +118,6 @@ def main(): # Export to yaml contribs_ls = [model.model_dump() for model in contribs.values()] - # Getting error dumping packages pkgs_ls = [model.model_dump() for model in packages.values()] clean_export_yml(contribs_ls, os.path.join("_data", "contributors.yml")) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 712a98d..22e22aa 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -13,11 +13,11 @@ field_serializer, field_validator, ) -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union class UrlValidatorMixin: - # Check fields is false because this is being inherited by two diff classes + # Check fields is false given mixin is used by two diff classes @field_validator( "website", "documentation", mode="before", check_fields=False ) @@ -69,7 +69,6 @@ def _check_url(url: str) -> bool: class PersonModel(BaseModel, UrlValidatorMixin): - # Make sure model populates both aliases and original attr name model_config = ConfigDict( populate_by_name=True, str_strip_whitespace=True, @@ -158,7 +157,6 @@ def clean_strings(cls, string: str) -> str: """ if isinstance(string, str): - # Remove "\r\n" from the string value string = re.sub(r"[\r\n]", "", string) return string @@ -220,47 +218,6 @@ def get_token(self) -> str: load_dotenv() return os.environ["GITHUB_TOKEN"] - # TODO - This utility is used across all scripts. - def clean_list(self, a_list: Union[str, List[str]]) -> List[str]: - """Helper function that takes an input object as a list or string. - If it is a list containing none, it returns an empty list - if it is a string is returns the string as a list - removes 'None' if that is in the list. and returns - either an empty clean list of the list as is.""" - - if isinstance(a_list, str): - a_list = [a_list] - elif not a_list: - a_list = [] - # Remove None from list - a_list = list(filter(lambda x: x, a_list)) - return a_list - - # TODO - There is likely a better way to do this. If it returns an - # empty list then we know there are no new vals... so it likely can - # return a single thing - def unique_new_vals( - self, a_list: List[str], a_item: List[str] - ) -> Tuple[bool, Optional[List[str]]]: - """Checks two objects either a list and string or two lists - and evaluates whether there are differences between them. - - Returns - ------- - Tuple - Containing a boolean representing whether there are difference - or not and a list containing new value if there are differences. - - """ - - default = (False, None) - list_lower = [al.lower() for al in a_list] - item_lower = [ai.lower() for ai in a_item] - diff = list(set(item_lower) - set(list_lower)) - if len(diff) > 0: - default = (True, diff) - return default - def check_contrib_type(self, json_file: str): """ Determine the type of contribution the person @@ -323,6 +280,7 @@ def load_json(self, json_path: str) -> dict: print(ae) return json.loads(response.text) + # TODO: check is i'm using the contrib type part of this method ? 
@@ -497,105 +455,3 @@ def combine_users(self, repoDict: dict, webDict: dict) -> dict:
                     print("New user found. Adding: ", gh_user)
                     webDict[gh_user] = repoDict[gh_user]
         return webDict
-
-    # # TODO: i think i can remove this method
-    # def add_new_user(self, gh_user: str) -> dict:
-    #     """Add a new user to the contrib file using gh username
-
-    #     This method does a few things.
-    #     1. Adds a new template entry for the user w no values populated
-    #     2. Gets user metadata from the user's github profile
-    #     3. Updates their contrib entry with the gh data
-
-    #     Parameters
-    #     ----------
-    #     gh_user : str
-    #         String representing the GitHub username
-
-    #     Returns
-    #     -------
-    #     Dict
-    #         Username is the key and the updated github profile info is
-    #         contained in the dict.
-
-    #     """
-
-    #     new = {}
-    #     # Rather than this template i can use the person_model
-    #     new[gh_user] = self.create_contrib_template()
-    #     gh_data = self.get_gh_data([gh_user])
-    #     # Update their metadata in the dict and return
-    #     updated_data = self.update_contrib_data(new, gh_data)
-    #     return updated_data
-
-    def get_gh_data(
-        self, contribs: Union[Dict[str, str], List]
-    ) -> dict[str, str]:
-        """Parses through each GitHub username and hits the GitHub
-        API to grab user information.
-
-        Parameters
-        ----------
-        contribs : dict
-            Dict containing all current contrib info
-
-        Returns
-        -------
-        Dict
-            A dict of updated user data via a list of github usernames
-        """
-        all_user_info = {}
-        for gh_user in contribs:
-            print("Getting github data for: ", gh_user)
-            # If the user already has a name in the dict, don't update
-            # Important to allow us to update names to ensure correct spelling,
-            # etc on website
-            if isinstance(contribs, list):
-                aname = None
-            else:
-                aname = contribs[gh_user]["name"]
-
-            all_user_info[gh_user] = self.get_user_info(gh_user, aname)
-        return all_user_info
-
-    # Shouldn't need this anymore with pydantic
-    # def update_contrib_data(self, contrib_data: dict, gh_data: dict):
-    #     """Update contributor data from the GH API return.
-
-    #     Use the GitHub API to grab user profile data such as twitter handle,
-    #     mastodon, website, email and location and update contributor
-    #     information. GitHub profile data is the source of truth for
-    #     contributor metadata.
-
-    #     Parameters
-    #     ----------
-    #     contrib_data : dict
-    #         A dict containing contributor data to be updated
-    #     gh_data : dict
-    #         Updated contributor data pulled from github API
-
-    #     Returns
-    #     -------
-    #     dict
-    #         Dictionary containing updated contributor data.
-    #     """
-
-    #     for i, gh_name in enumerate(contrib_data.keys()):
-    #         print(i, gh_name)
-    #         # Update the key:value pairs for data pulled from GitHub
-    #         for akey in self.update_keys:
-    #             if akey == "website":
-    #                 url = gh_data[gh_name][gh_name][akey]
-    #                 # Fix the url format and check to see if it works online
-    #                 url = self.format_url(url)
-    #                 # If the url is valid, add it to the dict
-    #                 if self._check_url(url):
-    #                     contrib_data[gh_name][akey] = url
-    #                 else:
-    #                     contrib_data[gh_name][akey] = ""
-    #             else:
-    #                 contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][
-    #                     akey
-    #                 ]
-
-    #     return contrib_data
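Taken together, these removals collapse the old add-a-new-contributor flow (template dict, then get_gh_data, then update_contrib_data) into a single model construction per user. Roughly, as a hedged sketch: the code below assumes get_user_info() returns the raw GitHub profile dict, that a GITHUB_TOKEN is set in the environment, and that live API calls are acceptable; the username is hypothetical.

from pydantic import ValidationError

from pyosmeta.contributors import PersonModel, ProcessContributors

process_contribs = ProcessContributors([])
all_contribs = {}

for gh_user in ["some-new-user"]:  # hypothetical username for illustration
    try:
        # Raw profile data in, validated and cleaned PersonModel out; the
        # aliases and validators replace the old update_contrib_data loop
        all_contribs[gh_user] = PersonModel(
            **process_contribs.get_user_info(gh_user)
        )
    except ValidationError as ve:
        print(gh_user, ve)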