From a5b0d282502c56ace859d6fe142361d8f6c3cab9 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Sun, 13 Aug 2023 19:04:20 -0600 Subject: [PATCH 01/12] Refactor: move to pydantic for validation --- .../cli/personmodel-update-contribs.py | 119 ++++++++++ src/pyosmeta/contributors.py | 211 +++++++++++++----- src/pyosmeta/file_io.py | 9 +- 3 files changed, 279 insertions(+), 60 deletions(-) create mode 100644 src/pyosmeta/cli/personmodel-update-contribs.py diff --git a/src/pyosmeta/cli/personmodel-update-contribs.py b/src/pyosmeta/cli/personmodel-update-contribs.py new file mode 100644 index 0000000..8ff341e --- /dev/null +++ b/src/pyosmeta/cli/personmodel-update-contribs.py @@ -0,0 +1,119 @@ +import argparse +import os +import pickle + +import pydantic +from pydantic import ValidationError + +from pyosmeta.contributors import PersonModel, ProcessContributors +from pyosmeta.file_io import clean_export_yml, open_yml_file + +print(pydantic.__version__) +# TODO - fix the website by renaming packages-editor, packages-submitted: +# packages-reviewed: to use underscores. this will just make life easier + + +def main(): + parser = argparse.ArgumentParser( + description="A CLI script to update pyOpenSci contributors" + ) + parser.add_argument( + "--update", + type=str, + help="Will force update contrib info from GitHub for every contributor", + ) + args = parser.parse_args() + + if args: + update_all = True + + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + json_files = [base_url + repo + end_url for repo in repos] + + # Get existing contribs from pyopensci.github.io repo (website data) + web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml" + + web_contribs = open_yml_file(web_yaml_path) + + # Populate all existing contribs into model objects + all_contribs = {} + for a_contrib in web_contribs: + try: + if a_contrib["github_username"].lower() == "arianesasso": + print("pause") + all_contribs[a_contrib["github_username"].lower()] = PersonModel( + **a_contrib + ) + except ValidationError as ve: + print(a_contrib["github_username"]) + print(ve) + + print("Done processing all-contribs") + # TODO - maybe add these as an attr in the contribs class? + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + json_files = [base_url + repo + end_url for repo in repos] + + # Create a list of all contributors across repositories + process_contribs = ProcessContributors(json_files) + bot_all_contribs = process_contribs.combine_json_data() + + # TODO this is much slower than it should be + print("Updating contrib types and searching for new users now") + # bot_all contris is a dict of x contrib types with an associated list of + # users who contributed to that type. 
+ for key, users in bot_all_contribs.items(): + print(key) + for gh_user in users: + # Find and populate data for any new contributors + if gh_user not in all_contribs.keys(): + print("Missing", gh_user, "Adding them now") + new_contrib = process_contribs.get_user_info(gh_user) + all_contribs[gh_user] = PersonModel(**new_contrib) + + # Update contribution type list for all users + existing_contribs = all_contribs[gh_user].contributor_type + all_contribs[ + gh_user + ].contributor_type = process_contribs.update_contrib_list( + existing_contribs, key + ) + + if update_all: + for user in all_contribs.keys(): + print("Updating all user info from github", user) + new_contrib = process_contribs.get_user_info(user) + # Update person's data (should skip update for any text + # with # noupdate flag) + all_contribs[user] = all_contribs[user].update(new_contrib) + + # Export to pickle which supports updates after parsing reviews + with open("all_contribs.pickle", "wb") as f: + pickle.dump(all_contribs, f) + + alist = [] + for key, item in all_contribs.items(): + alist.append(item.model_dump()) + + # Test export + print(os.getcwd()) + clean_export_yml(alist, os.path.join("_data", "contribs.yml")) + + +if __name__ == "__main__": + main() diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 4944473..03502dd 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -1,16 +1,126 @@ import json import os +import re from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Literal, Optional, Tuple, Union import requests from dotenv import load_dotenv +from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, + field_validator) + + +class PersonModel(BaseModel): + # Make sure model populates both aliases and original attr name + model_config = ConfigDict(populate_by_name=True, anystr_strip_whitespace=True) + + name: Optional[str] = None + title: Optional[str] = None + sort: Optional[int] = None + bio: Optional[str] = None + organization: Optional[str] = Field(None, validation_alias=AliasChoices("company")) + github_username: str = Field(None, validation_alias=AliasChoices("login")) + github_image_id: int = Field(None, validation_alias=AliasChoices("id")) + deia_advisory: Optional[bool] = False + editorial_board: Optional[bool] = Field( + None, validation_alias=AliasChoices("editorial-board") + ) + advisory: Optional[bool] = False + twitter: Optional[str] = Field( + None, validation_alias=AliasChoices("twitter_username") + ) + mastodon: Optional[str] = Field( + None, validation_alias=AliasChoices("mastodon_username", "mastodon") + ) + orcidid: Optional[str] = None + website: Optional[str] = Field( + None, validation_alias=AliasChoices("blog", "website") + ) + board: Optional[bool] = False + contributor_type: Optional[list[str]] = [] + packages_editor: Optional[list[str | None]] = Field( + None, + validation_alias=AliasChoices("packages-editor"), + ) + packages_submitted: Optional[list[str | None]] = Field( + None, + validation_alias=AliasChoices("packages-submitted", "packages_submitted"), + ) + packages_reviewed: Optional[list[str | None]] = Field( + None, + validation_alias=AliasChoices("packages-reviewed", "packages_reviewed"), + ) + location: Optional[str] = None + email: Optional[str] = None + + # @field_validator("advisory", "deia_advisory", mode="before") + # def fix_bools(cls, value): + # value = "value" + # print(value) + # if value == "false": + # return False + # elif value == 
"true": + # return True + + @field_validator( + "packages_reviewed", + "packages_submitted", + "packages_editor", + mode="before", + ) + @classmethod + def string_to_list(cls, value): + """ + For fields such as packages-reviewed edited etc we want + a list of elements not just a single string. this will + fix that issue. + """ + # If the input value is a string, convert it to a list + if isinstance(value, list): + return value + if isinstance(value, str): + return [value] + # If the input value is None, return an empty list + elif value is None: + return [] + + @field_validator("bio", mode="before") + @classmethod + def clean_strings(cls, string: str) -> str: + """This is a cleaning step that will remove spurious + characters from string fields. + + """ + if isinstance(string, str): + # Remove "\r\n" from the string value + string = re.sub(r"[\r\n]", "", string) + return string + + def update(self, data: dict) -> "PersonModel": + """ + this doesn't currently validate the data - the discussion + below describes one way to do that but uses pydantic 1.x not + 2.x approach. + + https://github.com/pydantic/pydantic/discussions/3139#discussioncomment-4797649 + """ + + # Note that this will not validate new data :( + for aval in data.keys(): + if isinstance(getattr(self, aval), str) and "# noupdate" in getattr( + self, aval + ): + print("The", aval, "field has a noupdate flag. Skipping update.") + else: + setattr(self, aval, data[aval]) + return self @dataclass class ProcessContributors: - # When initializing how do you decide what should be an input - # attribute vs just something a method accepted when called? + """A class that contains some basic methods to support populating and + updating contributor data.""" + def __init__(self, json_files: List) -> None: """ Parameters @@ -87,29 +197,30 @@ def refresh_contribs(self, contribs: Dict, new_contribs, review_role): final_list = self.update_contrib_list(existing_contribs, new_contribs) return (contrib_key_yml, final_list) - def create_contrib_template(self) -> Dict: - """A small helper that creates a template for a new contributor - that we are adding to our contributor.yml file""" - - return { - "name": "", - "bio": "", - "organization": "", - "title": "", - "github_username": "", - "github_image_id": "", - "editorial-board": "", - "twitter": "", - "mastodon": "", - "orcidid": "", - "website": "", - "contributor_type": [], - "packages-editor": [], - "packages-submitted": [], - "packages-reviewed": [], - "location": "", - "email": "", - } + # TODO: this can go away now that i have a personmodel obj + # def create_contrib_template(self) -> Dict: + # """A small helper that creates a template for a new contributor + # that we are adding to our contributor.yml file""" + + # return { + # "name": "", + # "bio": "", + # "organization": "", + # "title": "", + # "github_username": "", + # "github_image_id": "", + # "editorial-board": "", + # "twitter": "", + # "mastodon": "", + # "orcidid": "", + # "website": "", + # "contributor_type": [], + # "packages-editor": [], + # "packages-submitted": [], + # "packages-reviewed": [], + # "location": "", + # "email": "", + # } # TODO - This utility is used across all scripts. 
    def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
@@ -212,7 +323,8 @@ def check_contrib_type(self, json_file: str):
         return contrib_type
 
     def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None:
-        """Check to make sure user exists and if not, add them
+        """Check to make sure user exists in the existing contrib data. If they
+        don't exist, add them
 
         Parameters
         ----------
@@ -278,6 +390,8 @@ def combine_json_data(self) -> dict:
         # Create an empty dictionary to hold the combined data
         combined_data = {}
 
+        # TODO: to make this faster, it might be better to return a dict
+        # with username : [contrib1, contrib2]
         for json_file in self.json_files:
             # Process the JSON file and add the data to the combined dictionary
             try:
@@ -287,20 +401,22 @@ def combine_json_data(self) -> dict:
                 print("Oops - can't process", json_file, e)
         return combined_data
 
-    def get_gh_usernames(self, contrib_data: List) -> List:
-        """Get a list of all gh usernames
+    # TODO: see if this is ever used. It seems completely unnecessary
+    # given we can use .keys()
+    # def get_gh_usernames(self, contrib_data: List) -> List:
+    #     """Get a list of all gh usernames
 
-        Parameters
-        ----------
-        contrib_data : list
-            Dict containing all of the contributor information for the website.
+    #     Parameters
+    #     ----------
+    #     contrib_data : list
+    #         Dict containing all of the contributor information for the website.
 
-        """
-        all_usernames = []
-        for item in contrib_data:
-            all_usernames.append(item["github_username"])
+    #     """
+    #     all_usernames = []
+    #     for item in contrib_data:
+    #         all_usernames.append(item["github_username"])
 
-        return all_usernames
+    #     return all_usernames
 
     def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
         """
@@ -327,7 +443,6 @@ def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
 
         # if message = Bad credentials
         response_json = response.json()
 
-        user_data = {}
         # TODO: make an attribute and call it here?
         update_keys = {
             "name": "name",
@@ -342,20 +457,9 @@ def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
             "github_username": "login",
         }
 
-        user_data[username] = {}
-        for akey in update_keys:
-            # If the key is name, check to see if there is name data
-            # already there. don't force update if there's a name!
-            if akey == "name":
-                if aname is None:
-                    user_data[username][akey] = response_json.get(
-                        update_keys[akey], None
-                    )
-                else:
-                    # Else just keep the original name
-                    user_data[username][akey] = aname
-            else:
-                user_data[username][akey] = response_json.get(update_keys[akey], None)
+        user_data = {}
+        for key in update_keys:
+            user_data[key] = response_json.get(update_keys[key], None)
 
         return user_data
 
@@ -456,6 +560,7 @@ def add_new_user(self, gh_user: str) -> dict:
         """
 
         new = {}
+        # Rather than this template i can use the person_model
         new[gh_user] = self.create_contrib_template()
         gh_data = self.get_gh_data([gh_user])
         # Update their metadata in the dict and return
diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py
index 9c7c7da..ef968ce 100644
--- a/src/pyosmeta/file_io.py
+++ b/src/pyosmeta/file_io.py
@@ -173,12 +173,7 @@ def clean_export_yml(a_dict: Dict[str, Union[str, List[str]]], filename: str) -
     None
        Outputs a yaml file with the input name containing the pyos meta
    """
-    # TODO: why doesn't .values() work here? it returns a representation
-    # error.
- # final_data = [] - # for key in a_dict: - # final_data.append(a_dict[key]) - # print("sdf") + # Export to yaml - export_yaml(filename, list(a_dict.values())) + export_yaml(filename, a_dict) clean_yaml_file(filename) From c9d99857327c411075021847594b30a3b66cae9f Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Tue, 15 Aug 2023 19:18:53 -0600 Subject: [PATCH 02/12] Fix: pyproject tomly flake8/black fix --- pyproject.toml | 14 +- src/pyosmeta/__init__.py | 14 +- src/pyosmeta/contributors.py | 91 +++++------- src/pyosmeta/file_io.py | 47 +++---- src/pyosmeta/parse_issues.py | 262 +++++++++++++++++++++++++---------- 5 files changed, 266 insertions(+), 162 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 63934bf..e646cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,16 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", # BE sure to specify that you use python 3.x "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", +version = "0.1.0" +description = "Tools for contributors" +authors = [{ name = "Leah Wasser", email = "leah@pyopensci.org" }] +dependencies = [ + "ruamel-yaml>=0.17.21", + "requests", + "python-dotenv", + "pydantic>=2.0", ] -dependencies = ["ruamel-yaml>=0.17.21", "requests", "python-dotenv", "pydantic"] # This is the metadata that pip reads to understand what versions your package supports requires-python = ">=3.10" readme = "README.md" @@ -59,6 +66,10 @@ update-reviewers = "pyosmeta.cli.update_review_contribs:main" # Right now i'm not using pdm to add dependencies. # Will explore that later # Below using dynamic versioning / which is setuptools scm like +[tool.flake8] +# List of error codes to ignore (comma-separated) +ignore = "E203, W503" + [tool.pdm] @@ -70,6 +81,7 @@ package-dir = "src" # Versioning is a backend feature - instructions are in pdm-backend docs # https://pdm-backend.fming.dev/metadata/ + [tool.pdm.version] # Note that you need to create the tag after all commits are created - otherwise # pdm adds dev info after the tag number which won't publish to pypi diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py index d745d01..5b35315 100644 --- a/src/pyosmeta/__init__.py +++ b/src/pyosmeta/__init__.py @@ -1,8 +1,16 @@ -# SPDX-FileCopyrightText: 2023-present Leah Wasser -# -# SPDX-License-Identifier: MIT +from .contributors import PersonModel, ProcessContributors +from .parse_issues import ProcessIssues, ReviewModel +<<<<<<< HEAD try: from ._version_generated import __version__ except ImportError: __version__ = "unreleased" +======= +__all__ = ( + "ProcessIssues", + "ReviewModel", + "PersonModel", + "ProcessContributors", +) +>>>>>>> 007299c (Fix: pyproject tomly flake8/black fix) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 03502dd..f178699 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -2,23 +2,30 @@ import os import re from dataclasses import dataclass -from typing import Dict, List, Literal, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import requests from dotenv import load_dotenv -from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, - field_validator) +from pydantic import ( + AliasChoices, + BaseModel, + ConfigDict, + Field, + field_validator, +) class PersonModel(BaseModel): # Make sure model populates both aliases and original attr name - model_config = ConfigDict(populate_by_name=True, anystr_strip_whitespace=True) + model_config = 
ConfigDict(populate_by_name=True, str_strip_whitespace=True)
 
     name: Optional[str] = None
-    title: Optional[str] = None
+    title: Optional[Union[list[str], str]] = None
     sort: Optional[int] = None
     bio: Optional[str] = None
-    organization: Optional[str] = Field(None, validation_alias=AliasChoices("company"))
+    organization: Optional[str] = Field(
+        None, validation_alias=AliasChoices("company")
+    )
     github_username: str = Field(None, validation_alias=AliasChoices("login"))
     github_image_id: int = Field(None, validation_alias=AliasChoices("id"))
     deia_advisory: Optional[bool] = False
@@ -44,23 +51,18 @@ class PersonModel(BaseModel):
     )
     packages_submitted: Optional[list[str | None]] = Field(
         None,
-        validation_alias=AliasChoices("packages-submitted", "packages_submitted"),
+        validation_alias=AliasChoices(
+            "packages-submitted", "packages_submitted"
+        ),
     )
     packages_reviewed: Optional[list[str | None]] = Field(
         None,
-        validation_alias=AliasChoices("packages-reviewed", "packages_reviewed"),
+        validation_alias=AliasChoices(
+            "packages-reviewed", "packages_reviewed"
+        ),
     )
     location: Optional[str] = None
-    email: Optional[str] = None
-
-    # @field_validator("advisory", "deia_advisory", mode="before")
-    # def fix_bools(cls, value):
-    #     value = "value"
-    #     print(value)
-    #     if value == "false":
-    #         return False
-    #     elif value == "true":
-    #         return True
+    email: Optional[str] = None
 
     @field_validator(
         "packages_reviewed",
         "packages_submitted",
         "packages_editor",
         mode="before",
     )
@@ -96,25 +98,6 @@ def clean_strings(cls, string: str) -> str:
             string = re.sub(r"[\r\n]", "", string)
         return string
 
-    def update(self, data: dict) -> "PersonModel":
-        """
-        this doesn't currently validate the data - the discussion
-        below describes one way to do that but uses pydantic 1.x not
-        2.x approach.
-
-        https://github.com/pydantic/pydantic/discussions/3139#discussioncomment-4797649
-        """
-
-        # Note that this will not validate new data :(
-        for aval in data.keys():
-            if isinstance(getattr(self, aval), str) and "# noupdate" in getattr(
-                self, aval
-            ):
-                print("The", aval, "field has a noupdate flag. Skipping update.")
-            else:
-                setattr(self, aval, data[aval])
-        return self
-
 
 @dataclass
 class ProcessContributors:
@@ -401,24 +384,9 @@ def combine_json_data(self) -> dict:
                 print("Oops - can't process", json_file, e)
         return combined_data
 
-    # TODO: see if this is ever used. It seems completely unnecessary
-    # given we can use .keys()
-    # def get_gh_usernames(self, contrib_data: List) -> List:
-    #     """Get a list of all gh usernames
-
-    #     Parameters
-    #     ----------
-    #     contrib_data : list
-    #         Dict containing all of the contributor information for the website.
-
-    #     """
-    #     all_usernames = []
-    #     for item in contrib_data:
-    #         all_usernames.append(item["github_username"])
-
-    #     return all_usernames
-
-    def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
+    def get_user_info(
+        self, username: str, aname: Optional[str] = None
+    ) -> dict:
         """
         Get a single user's information from their GitHub username using the
         GitHub API
@@ -430,6 +398,7 @@ def get_user_info(self, username: str, aname: Optional[str] = None) -> dict:
             Github username to retrieve data for
         aname : str default=None
             A user's name from the contributors.yml file.
+ https://docs.github.com/en/rest/users/users?apiVersion=2022-11-28#get-a-user Returns ------- @@ -527,7 +496,9 @@ def combine_users(self, repoDict: dict, webDict: dict) -> dict: if gh_user in webDict.keys(): # Return a list of updated contributor type keys and use it to # update the web dict - webDict[gh_user]["contributor_type"] = self._update_contrib_type( + webDict[gh_user][ + "contributor_type" + ] = self._update_contrib_type( webDict[gh_user]["contributor_type"], repoDict[gh_user]["contributor_type"], ) @@ -567,7 +538,9 @@ def add_new_user(self, gh_user: str) -> dict: updated_data = self.update_contrib_data(new, gh_data) return updated_data - def get_gh_data(self, contribs: Union[Dict[str, str], List]) -> dict[str, str]: + def get_gh_data( + self, contribs: Union[Dict[str, str], List] + ) -> dict[str, str]: """Parses through each GitHub username and hits the GitHub API to grab user information. @@ -647,7 +620,9 @@ def update_contrib_data(self, contrib_data: dict, gh_data: dict): else: contrib_data[gh_name][akey] = "" else: - contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][akey] + contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][ + akey + ] return contrib_data diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index ef968ce..5eb555d 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -1,6 +1,6 @@ import pickle import urllib.request -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Union import ruamel.yaml @@ -27,6 +27,25 @@ def _list_to_dict(a_list: List, a_key: str) -> Dict: return {a_dict[a_key].lower(): a_dict for a_dict in a_list} +def create_paths(repos: Union[list[str], str]) -> Union[list[str], str]: + """ """ + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + end_url = "/main/.all-contributorsrc" + repos = [ + "python-package-guide", + "software-peer-review", + "pyopensci.github.io", + "software-review", + "update-web-metadata", + ] + if isinstance(repos, list): + all_paths = [base_url + repo + end_url for repo in repos] + else: + all_paths = base_url + repos + end_url + + return all_paths + + def load_website_yml(key: str, url: str): """ This opens a website contrib yaml file and turns it in a @@ -37,28 +56,6 @@ def load_website_yml(key: str, url: str): return _list_to_dict(yml_list, key) -# def dict_to_list(pyos_meta: Dict[str, Union[str, List[str]]]) -> List[Dict]: -# """Turn dict into list for parsing to jekyll friendly yaml - -# Parameters -# ---------- -# pyos_meta : Dict -# A dictionary containing metadata for pyos contributors or review issues - -# Returns -# ------- -# List -# A list of dictionaries containing pyos metadata for contribs or reviews - -# """ -# print("a") -# # Turn dict into list for parsing -# return [pyos_meta[key] for key in pyos_meta] -# # for key in pyos_meta: -# # final_contribs.append(pyos_meta[key]) -# # return final_contribs - - def open_yml_file(file_path: str) -> dict: """Open & deserialize YAML file to dictionary. @@ -109,7 +106,7 @@ def export_yaml(filename: str, data_list: list): # function created def clean_string(astr: str) -> str: """ - Clean a string by removing occurrences of strings starting with "*id0" and "[]". + Clean - remove strings starting with "*id0" and "[]". 
Parameters ---------- @@ -139,7 +136,7 @@ def clean_yaml_file(filename): with open(filename, "r") as f: lines = f.readlines() - # TODO: regex would be cleaner here - https://stackoverflow.com/questions/27064964/python-replace-all-words-start-with + # TODO: regex would be cleaner here cleaned_lines = [] for i, line in enumerate(lines): if i == 0 and line.startswith(" "): diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 61d5953..5bb537d 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -1,13 +1,111 @@ from dataclasses import dataclass from datetime import datetime +from typing import Any, Optional import requests +from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, + field_validator) from pyosmeta.contributors import ProcessContributors -# main reason to use this is attributes .. avoiding them being changed -# in other instances... +def clean_date(a_date: Optional[str]) -> str: + """Cleans up a datetime from github and returns a date string + + In some cases the string is manually entered month-day-year and in + others it's a gh time stamp. finally sometimes it could be missing + or text. handle all of those cases with this validator. + """ + print(a_date) + if a_date is None or a_date == "missing": + return "missing" + elif len(a_date) < 11: + new_date = a_date.replace("/", "-").split("-") + return f"{new_date[2]}-{new_date[0]}-{new_date[1]}" + else: + try: + return ( + datetime.strptime(a_date, "%Y-%m-%dT%H:%M:%SZ") + .date() + .strftime("%Y-%m-%d") + ) + except: + print("Oops - missing data. Setting date to missing") + return "missing" + + +class GhMeta(BaseModel): + name: str + description: str + created_at: str + stargazers_count: int + watchers_count: int + forks: int + open_issues_count: int + forks_count: int + documentation: Optional[str] # Jointly is missing documentation + contrib_count: int + last_commit: str + + @field_validator( + "last_commit", + "created_at", + mode="before", + ) + @classmethod + def clean_date(cls, a_date: Optional[str]) -> str: + """Cleans up a datetime from github and returns a date string + + Runs the general clean_date function in this module as a validator. + """ + + return clean_date(a_date) + + +class ReviewModel(BaseModel): + # Make sure model populates both aliases and original attr name + model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True) + + package_name: Optional[str] = None + package_description: str = Field( + None, validation_alias=AliasChoices("one-line_description_of_package") + ) + submitting_author: dict[str, str] = None + all_current_maintainers: list[dict[str, str]] = None + repository_link: Optional[str] = None + version_submitted: Optional[str] = None + categories: Optional[str] = None + categories: list[str] = None + editor: dict[str, str] = None + reviewer_1: dict[str, str] = None + reviewer_2: dict[str, str] = None + archive: str = None + version_accepted: str = None + date_accepted: str = None + created_at: str = None + updated_at: str = None + closed_at: str = None + issue_link: str = None + gh_meta: GhMeta + + @field_validator( + "date_accepted", + "created_at", + "updated_at", + "closed_at", + mode="before", + ) + @classmethod + def clean_date(cls, a_date: Optional[str]) -> str: + """Cleans up a datetime from github and returns a date string + + Runs the general clean_date function in this module as a validator. 
+ + """ + + return clean_date(a_date) + + @dataclass class ProcessIssues: """ @@ -17,7 +115,6 @@ class ProcessIssues: """ - # TODO: turn file io into functions and remove inheritance here def __init__(self, org, repo_name, label_name): """ More here... @@ -41,6 +138,18 @@ def __init__(self, org, repo_name, label_name): self.GITHUB_TOKEN = self.contrib_instance.get_token() + gh_stats = [ + "name", + "description", + "homepage", + "created_at", + "stargazers_count", + "watchers_count", + "forks", + "open_issues_count", + "forks_count", + ] + @property def api_endpoint(self): return f"https://api.github.com/repos/{self.org}/{self.repo_name}/issues?labels={self.label_name}&state=all" @@ -174,26 +283,7 @@ def parse_issue_header( package name, description, review team, version submitted etc. See key_order below for the full list of keys. """ - # Reorder data - key_order = [ - "package_name", - "package_description", - "submitting_author", - "all_current_maintainers", - "repository_link", - "version_submitted", - "categories", - "editor", - "reviewer_1", - "reviewer_2", - "archive", - "version_accepted", - "date_accepted", - "created_at", - "updated_at", - "closed_at", - "issue_link", - ] + meta_dates = ["created_at", "updated_at", "closed_at"] review = {} @@ -202,50 +292,34 @@ def parse_issue_header( if not package_name: continue # Index of 15 should include date accepted in the review meta - issue_meta = self.get_issue_meta(body_data, total_lines) + review[package_name] = self.get_issue_meta(body_data, total_lines) # Add issue open and close date to package meta # Created, opened & closed dates are in GitHub Issue response for a_date in meta_dates: - issue_meta[a_date] = self._clean_date(issue[a_date]) - - # Date accepted is a manually added value. Fix format separately - # Using dashes because it's jekyll friendly - try: - the_date = issue_meta["date_accepted"].replace("/", "-").split("-") - if the_date[0] == "TBD": - continue - else: - issue_meta[ - "date_accepted" - ] = f"{the_date[2]}-{the_date[0]}-{the_date[1]}" - except KeyError as ke: - print("Oops,", package_name, "is missing date_accepted key.") - # Clean markdown url's from editor, and reviewer lines - types = ["editor", "reviewer_1", "reviewer_2"] - user_values = ["github_username", "name"] - for a_type in types: - for user_value in user_values: - issue_meta[a_type][user_value] = ( - issue_meta[a_type][user_value] - .replace("https://github.com/", "") - .replace("[", "") - .replace("]", "") - ) - - review[package_name] = issue_meta + # TODO: this could become a validator + review[package_name][a_date] = issue[ + a_date + ] # self._clean_date(issue[a_date]) + # Get categories and issue review link review[package_name]["categories"] = self.get_categories(body_data) review[package_name]["issue_link"] = issue["url"].replace( "https://api.github.com/repos/", "https://github.com/" ) - # Rename package description & reorder keys - review[package_name]["package_description"] = review[package_name].pop( - "one-line_description_of_package", "" - ) - review[package_name] = { - key: review[package_name][key] - for key in key_order - if review[package_name].get(key) - } + + # # Clean markdown url's from editor, and reviewer lines + # TODO - this could be a reviewer name cleanup validaotr + # types = ["editor", "reviewer_1", "reviewer_2"] + # user_values = ["github_username", "name"] + # for a_type in types: + # for user_value in user_values: + # issue_meta[a_type][user_value] = ( + # issue_meta[a_type][user_value] + # 
.replace("https://github.com/", "") + # .replace("[", "") + # .replace("]", "") + # ) + + # review[package_name] = issue_meta return review @@ -344,20 +418,56 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: return package_name, body_data - def _clean_date(self, date: str) -> str: - """Cleans up a datetime from github and returns a date string""" + # def _clean_date(self, date: str) -> str: + # """Cleans up a datetime from github and returns a date string""" + + # try: + # print(date) + # date_clean = ( + # datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") + # .date() + # .strftime("%Y-%m-%d") + # ) + # except: + # print( + # "date is this", date, "Oops - i need a string to process date" + # ) + # print("setting date to missing") + # date_clean = "missing" + # return date_clean + + def get_gh_metrics( + self, + endpoints: dict[str, str], + reviews: dict[str, dict[str, Any]], + ) -> dict[str, dict[str, Any]]: + """ + Get GitHub metrics for each review based on provided endpoints. - try: - date_clean = ( - datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") - .date() - .strftime("%Y-%m-%d") - ) - except: - print("Oops - i need a string to process date") - print("setting date to missing") - date_clean = "missing" - return date_clean + Parameters: + ---------- + endpoints : dict + A dictionary mapping package names to their GitHub URLs. + reviews : dict + A dictionary containing review data. + + Returns: + ------- + dict + Updated review data with GitHub metrics. + """ + pkg_meta = {} + for pkg_name, url in endpoints.items(): + print("Getting GitHub stats for", pkg_name) + + pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) + + pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) + pkg_meta[pkg_name]["last_commit"] = self.get_last_commit(url) + # Add github meta to review metadata + reviews[pkg_name]["gh_meta"] = pkg_meta[pkg_name] + + return reviews def get_repo_meta(self, url: str, stats_list: list) -> dict: """ @@ -387,7 +497,9 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: for astat in stats_list: stats_dict[astat] = data[astat] stats_dict["documentation"] = stats_dict.pop("homepage") - stats_dict["created_at"] = self._clean_date(stats_dict["created_at"]) + # stats_dict["created_at"] = self._clean_date( + # stats_dict["created_at"] + # ) return stats_dict @@ -428,7 +540,7 @@ def get_last_commit(self, repo: str) -> str: # else "1970-01-01T00:00:00Z" ) - return self._clean_date(date) + return date def get_categories( self, issue_body_list: list[list[str]], fmt: bool = True From d69e1007d2a4bd9cccf039abce687c207d841b16 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Tue, 15 Aug 2023 19:31:15 -0600 Subject: [PATCH 03/12] Fix: moving issues to pydantic and cleanup --- pyproject.toml | 9 ++ .../cli/personmodel-update-contribs.py | 119 ------------------ src/pyosmeta/cli/update_contributors.py | 114 ++++++++++------- src/pyosmeta/cli/update_review_contribs.py | 20 ++- src/pyosmeta/cli/update_reviews.py | 73 +++++------ src/pyosmeta/contributors.py | 14 +-- src/pyosmeta/file_io.py | 6 +- src/pyosmeta/parse_issues.py | 85 ++++++------- 8 files changed, 173 insertions(+), 267 deletions(-) delete mode 100644 src/pyosmeta/cli/personmodel-update-contribs.py diff --git a/pyproject.toml b/pyproject.toml index e646cd9..991b9d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,15 @@ update-reviewers = "pyosmeta.cli.update_review_contribs:main" # Right now i'm not using pdm to add dependencies. 
# Will explore that later # Below using dynamic versioning / which is setuptools scm like +[tool.black] +line-length = 79 +target-version = ['py310'] + +[tool.isort] +profile = "black" +multi_line_output = 3 +py_version = 27 + [tool.flake8] # List of error codes to ignore (comma-separated) ignore = "E203, W503" diff --git a/src/pyosmeta/cli/personmodel-update-contribs.py b/src/pyosmeta/cli/personmodel-update-contribs.py deleted file mode 100644 index 8ff341e..0000000 --- a/src/pyosmeta/cli/personmodel-update-contribs.py +++ /dev/null @@ -1,119 +0,0 @@ -import argparse -import os -import pickle - -import pydantic -from pydantic import ValidationError - -from pyosmeta.contributors import PersonModel, ProcessContributors -from pyosmeta.file_io import clean_export_yml, open_yml_file - -print(pydantic.__version__) -# TODO - fix the website by renaming packages-editor, packages-submitted: -# packages-reviewed: to use underscores. this will just make life easier - - -def main(): - parser = argparse.ArgumentParser( - description="A CLI script to update pyOpenSci contributors" - ) - parser.add_argument( - "--update", - type=str, - help="Will force update contrib info from GitHub for every contributor", - ) - args = parser.parse_args() - - if args: - update_all = True - - base_url = "https://raw.githubusercontent.com/pyOpenSci/" - end_url = "/main/.all-contributorsrc" - repos = [ - "python-package-guide", - "software-peer-review", - "pyopensci.github.io", - "software-review", - "update-web-metadata", - ] - json_files = [base_url + repo + end_url for repo in repos] - - # Get existing contribs from pyopensci.github.io repo (website data) - web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml" - - web_contribs = open_yml_file(web_yaml_path) - - # Populate all existing contribs into model objects - all_contribs = {} - for a_contrib in web_contribs: - try: - if a_contrib["github_username"].lower() == "arianesasso": - print("pause") - all_contribs[a_contrib["github_username"].lower()] = PersonModel( - **a_contrib - ) - except ValidationError as ve: - print(a_contrib["github_username"]) - print(ve) - - print("Done processing all-contribs") - # TODO - maybe add these as an attr in the contribs class? - base_url = "https://raw.githubusercontent.com/pyOpenSci/" - end_url = "/main/.all-contributorsrc" - repos = [ - "python-package-guide", - "software-peer-review", - "pyopensci.github.io", - "software-review", - "update-web-metadata", - ] - json_files = [base_url + repo + end_url for repo in repos] - - # Create a list of all contributors across repositories - process_contribs = ProcessContributors(json_files) - bot_all_contribs = process_contribs.combine_json_data() - - # TODO this is much slower than it should be - print("Updating contrib types and searching for new users now") - # bot_all contris is a dict of x contrib types with an associated list of - # users who contributed to that type. 
- for key, users in bot_all_contribs.items(): - print(key) - for gh_user in users: - # Find and populate data for any new contributors - if gh_user not in all_contribs.keys(): - print("Missing", gh_user, "Adding them now") - new_contrib = process_contribs.get_user_info(gh_user) - all_contribs[gh_user] = PersonModel(**new_contrib) - - # Update contribution type list for all users - existing_contribs = all_contribs[gh_user].contributor_type - all_contribs[ - gh_user - ].contributor_type = process_contribs.update_contrib_list( - existing_contribs, key - ) - - if update_all: - for user in all_contribs.keys(): - print("Updating all user info from github", user) - new_contrib = process_contribs.get_user_info(user) - # Update person's data (should skip update for any text - # with # noupdate flag) - all_contribs[user] = all_contribs[user].update(new_contrib) - - # Export to pickle which supports updates after parsing reviews - with open("all_contribs.pickle", "wb") as f: - pickle.dump(all_contribs, f) - - alist = [] - for key, item in all_contribs.items(): - alist.append(item.model_dump()) - - # Test export - print(os.getcwd()) - clean_export_yml(alist, os.path.join("_data", "contribs.yml")) - - -if __name__ == "__main__": - main() diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 31c29ac..5676033 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -1,35 +1,31 @@ import argparse import pickle -from pyosmeta.contributors import ProcessContributors -from pyosmeta.file_io import clean_export_yml, load_website_yml +import pydantic +from pydantic import ValidationError -# TODO: will this still run in gh actions?? -# TODO: add update=True like i did for update_reviews -# TODO: still need to add a flag to not update specific fields -# TODO: if i use composition and there are helpers in a class -# that are used in a method that i call via composition are the helpers -# still available? +from pyosmeta.contributors import PersonModel, ProcessContributors +from pyosmeta.file_io import create_paths, open_yml_file + +print(pydantic.__version__) +# TODO - fix the website by renaming packages-editor, packages-submitted: +# packages-reviewed: to use underscores. this will just make life easier def main(): - update_all = False parser = argparse.ArgumentParser( description="A CLI script to update pyOpenSci contributors" ) parser.add_argument( "--update", type=str, - help="Will force update contrib info from GitHub for every contributor", + help="Force update contrib info from GitHub for every contributor", ) args = parser.parse_args() if args: update_all = True - # TODO - maybe add these as an attr in the contribs class? 
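
One CLI detail worth a standalone illustration: the scripts in this series gate the refresh on "if args:", but an argparse.Namespace object is always truthy, so update_all gets set on every run whether or not --update was passed. A minimal sketch of the conventional boolean-flag pattern (same option name as the scripts, otherwise illustrative only):

    import argparse

    parser = argparse.ArgumentParser(
        description="A CLI script to update pyOpenSci contributors"
    )
    # store_true yields a real boolean instead of a string value
    parser.add_argument(
        "--update",
        action="store_true",
        help="Force update contrib info from GitHub for every contributor",
    )
    args = parser.parse_args()
    update_all = args.update  # False unless --update was passed
    print(update_all)
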
- base_url = "https://raw.githubusercontent.com/pyOpenSci/" - end_url = "/main/.all-contributorsrc" repos = [ "python-package-guide", "software-peer-review", @@ -37,48 +33,76 @@ def main(): "software-review", "update-web-metadata", ] - json_files = [base_url + repo + end_url for repo in repos] + json_files = create_paths(repos) # Get existing contribs from pyopensci.github.io repo (website data) - web_yaml_path = base_url + "pyopensci.github.io/main/_data/contributors.yml" + base_url = "https://raw.githubusercontent.com/pyOpenSci/" + web_yaml_path = ( + base_url + "pyopensci.github.io/main/_data/contributors.yml" + ) - process_contribs = ProcessContributors(json_files) + web_contribs = open_yml_file(web_yaml_path) + + # Populate all existing contribs into model objects + all_contribs = {} + for a_contrib in web_contribs: + try: + all_contribs[a_contrib["github_username"].lower()] = PersonModel( + **a_contrib + ) + except ValidationError as ve: + print(a_contrib["github_username"]) + print(ve) - # Returns a list of dict objects with gh usernames (lowercase) as keys - # TODO: File io module (could just be a function) - web_contribs = load_website_yml(url=web_yaml_path, key="github_username") - bot_all_contribs_dict = process_contribs.combine_json_data() + print("Done processing all-contribs") - # Parse through each user in the web yaml, if they don't exist, add them - # finally - update contrib types - for key, users in bot_all_contribs_dict.items(): + # Create a list of all contributors across repositories + process_contribs = ProcessContributors(json_files) + bot_all_contribs = process_contribs.combine_json_data() + + print("Updating contrib types and searching for new users now") + # bot_all_contribs: keys: contrib_type, value: ghuser contribs + for key, users in bot_all_contribs.items(): for gh_user in users: - # Add any new contributors - if gh_user not in web_contribs.keys(): - print("I found a new contributor! 
Adding:", gh_user)
-                web_contribs.update(
-                    # TODO: this is also used in the other 2 scripts
-                    # but add user info is in the contribs class - i do
-                    # think it belongs there
-                    process_contribs.check_add_user(gh_user, web_contribs)
-                )
-
-            # Update contrib type list
-            existing_contribs = web_contribs[gh_user]["contributor_type"]
-            # TODO: This helper is used in all three scripts but defined
-            # in the contribs class
-            web_contribs[gh_user][
-                "contributor_type"
-            ] = process_contribs.update_contrib_list(existing_contribs, key)
+            # Find and populate data for any new contributors
+            if gh_user not in all_contribs.keys():
+                print("Missing", gh_user, "Adding them now")
+                new_contrib = process_contribs.get_user_info(gh_user)
+                all_contribs[gh_user] = PersonModel(**new_contrib)
+
+            # Update contribution type list for all users
+            existing_contribs = all_contribs[gh_user].contributor_type
+            all_contribs[
+                gh_user
+            ].contributor_type = process_contribs.update_contrib_list(
+                existing_contribs, key
+            )
 
     if update_all:
-        gh_data = process_contribs.get_gh_data(web_contribs)
-        web_contribs = process_contribs.update_contrib_data(web_contribs, gh_data)
+        for user in all_contribs.keys():
+            print("Updating all user info from github", user)
+            new_gh_data = process_contribs.get_user_info(user)
+
+            # TODO: turn this into a small update method
+            existing = all_contribs[user].model_dump()
+
+            for key, item in new_gh_data.items():
+                if key == "mastodon":
+                    # Mastodon isn't available in the api yet
+                    continue
+                # Don't replace the value if there is a noupdate flag
+                # TODO: This approach doesn't work, ruamel-yaml doesn't
+                # preserve inline comments
+                if key == "name" and existing[key]:
+                    continue
+                else:
+                    existing[key] = item
+
+            all_contribs[user] = PersonModel(**existing)
 
-    # Export data
-    # Pickle supports updates after parsing reviews
+    # Export to pickle which supports updates after parsing reviews
     with open("all_contribs.pickle", "wb") as f:
-        pickle.dump(web_contribs, f)
+        pickle.dump(all_contribs, f)
 
 
 if __name__ == "__main__":
diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py
index 13d5413..c4c5aa1 100644
--- a/src/pyosmeta/cli/update_review_contribs.py
+++ b/src/pyosmeta/cli/update_review_contribs.py
@@ -53,12 +53,18 @@ def main():
             if issue_role == "all_current_maintainers":
                 if issue_role in issue_meta:
                     # Loop through each maintainer in the list
-                    for i, a_maintainer in enumerate(issue_meta.get(issue_role)):
-                        gh_user = get_clean_user(a_maintainer["github_username"])
+                    for i, a_maintainer in enumerate(
+                        issue_meta.get(issue_role)
+                    ):
+                        gh_user = get_clean_user(
+                            a_maintainer["github_username"]
+                        )
 
                         if gh_user not in contribs.keys():
                             contribs.update(
-                                updateContribs.check_add_user(gh_user, contribs)
+                                updateContribs.check_add_user(
+                                    gh_user, contribs
+                                )
                             )
 
             # Update contrib packages for peer review
@@ -100,7 +106,9 @@ def main():
 
             if gh_user not in contribs.keys():
                 # If they aren't already in contribs, add them
-                contribs.update(updateContribs.check_add_user(gh_user, contribs))
+                contribs.update(
+                    updateContribs.check_add_user(gh_user, contribs)
+                )
             # Update user package contributions
             (
                 contrib_key,
@@ -123,7 +131,9 @@ def main():
 
             # If user's name is missing in issue, populate from contribs dict
             if issue_meta[issue_role]["name"] == "":
-                packages[pkg_name][issue_role]["name"] = contribs[gh_user]["name"]
+                packages[pkg_name][issue_role]["name"] = contribs[gh_user][
+                    "name"
+                ]
 
     # Export to yaml
     clean_export_yml(contribs, os.path.join("_data", 
"contributors.yml")) diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index 8e632cd..634e7bf 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -16,12 +16,19 @@ # TODO: feature - Would be cool to create an "under review now" list as well - # ideally this could be passed as a CLI argument with the label we want to # search for +# TODO: 1. add gh metadata to the review object +# prior to parsing +# 2. work on update-all!! +# 3. i think package_description might not be parsing right? + import argparse import pickle -from pyosmeta import ProcessIssues -from pyosmeta.file_io import clean_export_yml, load_website_yml +from pydantic import ValidationError + +from pyosmeta import ProcessIssues, ReviewModel +from pyosmeta.file_io import load_website_yml def main(): @@ -37,11 +44,11 @@ def main(): args = parser.parse_args() if args: - update_all = True + update_all = False web_reviews_path = "https://raw.githubusercontent.com/pyOpenSci/pyopensci.github.io/main/_data/packages.yml" - issueProcess = ProcessIssues( + process_review = ProcessIssues( org="pyopensci", repo_name="software-submission", label_name="6/pyOS-approved 🚀🚀🚀", @@ -51,48 +58,34 @@ def main(): web_reviews = load_website_yml(key="package_name", url=web_reviews_path) # Get all issues for approved packages - issues = issueProcess.return_response() - all_accepted_reviews = issueProcess.parse_issue_header(issues, 15) + issues = process_review.return_response() + accepted_reviews = process_review.parse_issue_header(issues, 15) # Parse through reviews, identify new ones, fix case if update_all == True: - for review_key, review_meta in all_accepted_reviews.items(): - web_reviews[review_key.lower()] = review_meta + for key, meta in accepted_reviews.items(): + web_reviews[key.lower()] = meta else: - for review_key, review_meta in all_accepted_reviews.items(): - if review_key.lower() not in web_reviews.keys(): - print("Yay - pyOS has a new package:", review_key) - web_reviews[review_key.lower()] = review_meta + for key, meta in accepted_reviews.items(): + if key.lower() not in web_reviews.keys(): + print("Yay - pyOS has a new package:", key) + web_reviews[key.lower()] = meta # Update gh metrics via api for all packages - repo_endpoints = issueProcess.get_repo_endpoints(web_reviews) - gh_stats = [ - "name", - "description", - "homepage", - "created_at", - "stargazers_count", - "watchers_count", - "forks", - "open_issues_count", - "forks_count", - ] - - # Get gh metadata for each package submission - all_repo_meta = {} - for package_name in repo_endpoints.keys(): - print("Getting GitHub stats for", package_name) - package_api = repo_endpoints[package_name] - all_repo_meta[package_name] = issueProcess.get_repo_meta(package_api, gh_stats) - - all_repo_meta[package_name]["contrib_count"] = issueProcess.get_repo_contribs( - package_api - ) - all_repo_meta[package_name]["last_commit"] = issueProcess.get_last_commit( - package_api - ) - # Add github meta to review metadata - web_reviews[package_name]["gh_meta"] = all_repo_meta[package_name] + repo_endpoints = process_review.get_repo_endpoints(web_reviews) + web_reviews = process_review.get_gh_metrics(repo_endpoints, web_reviews) + + # Finally populate model objects with review data + metrics + # TODO: this is really close - it's erroring when populating date + # i suspect in the github metadata + all_reviews = {} + for key, review in web_reviews.items(): + # First add gh meta to each dict + print("Parsing & validating", key) + 
try: + all_reviews[key] = ReviewModel(**review) + except ValidationError as ve: + print(ve) with open("all_reviews.pickle", "wb") as f: pickle.dump(web_reviews, f) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index f178699..dcabaa8 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -1,18 +1,12 @@ import json import os import re -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union import requests +from dataclasses import dataclass from dotenv import load_dotenv -from pydantic import ( - AliasChoices, - BaseModel, - ConfigDict, - Field, - field_validator, -) +from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator +from typing import Dict, List, Optional, Tuple, Union class PersonModel(BaseModel): @@ -581,7 +575,7 @@ def _check_url(self, url: str) -> bool: try: response = requests.get(url, timeout=6) return response.status_code == 200 - except: + except Exception: print("Oops, url", url, "is not valid, removing it") return False diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index 5eb555d..6a773c1 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -1,8 +1,8 @@ import pickle import urllib.request -from typing import Dict, List, Union import ruamel.yaml +from typing import Dict, List, Union def load_pickle(filename): @@ -153,7 +153,9 @@ def clean_yaml_file(filename): f.write(cleaned_text) -def clean_export_yml(a_dict: Dict[str, Union[str, List[str]]], filename: str) -> None: +def clean_export_yml( + a_dict: Dict[str, Union[str, List[str]]], filename: str +) -> None: """Inputs a dictionary with keys - contribs or packages. It then converse to a list for export, and creates a cleaned YAML file that is jekyll friendly diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 5bb537d..d6ca4eb 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -1,10 +1,9 @@ -from dataclasses import dataclass from datetime import datetime -from typing import Any, Optional import requests -from pydantic import (AliasChoices, BaseModel, ConfigDict, Field, - field_validator) +from dataclasses import dataclass +from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator +from typing import Any, Optional from pyosmeta.contributors import ProcessContributors @@ -29,8 +28,8 @@ def clean_date(a_date: Optional[str]) -> str: .date() .strftime("%Y-%m-%d") ) - except: - print("Oops - missing data. Setting date to missing") + except TypeError as te: + print("Oops - missing data. Setting date to missing", te) return "missing" @@ -152,7 +151,11 @@ def __init__(self, org, repo_name, label_name): @property def api_endpoint(self): - return f"https://api.github.com/repos/{self.org}/{self.repo_name}/issues?labels={self.label_name}&state=all" + url = ( + f"https://api.github.com/repos/{self.org}/{self.repo_name}/" + f"issues?labels={self.label_name}&state=all" + ) + return url # Set up the API endpoint def _get_response(self): @@ -215,7 +218,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]: line_item : list A single list item representing a single line in the issue containing metadata for the review. - This comment is the metadata for the review that the author fills out. + This comment is metadata for the review that the author fills out. 
Returns ------- @@ -235,7 +238,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]: # Add each maintainer to the dict user = aname.split("@") # Clean - user = [self._clean_name(l) for l in user] + user = [self._clean_name(a_str) for a_str in user] a_maint = { "name": self._clean_name(user[0]), "github_username": self._clean_name(user[1]), @@ -334,11 +337,11 @@ def get_issue_meta( Parameters ---------- body_data : list - A list containing all of the body data for the top comment in an issue. + A list containing all body data for the top comment in an issue. end_range : int - The number of lines to parse at the top of the issue (this may change - over time so this variable allows us to have different processing - based upon the date of the issue being opened) + The number of lines to parse at the top of the issue (this may + change over time so this variable allows us to have different + processing based upon the date of the issue being opened) Returns ------- @@ -353,7 +356,9 @@ def get_issue_meta( return issue_meta - def get_repo_endpoints(self, review_issues: dict[str, str]) -> dict[str, str]: + def get_repo_endpoints( + self, review_issues: dict[str, str] + ) -> dict[str, str]: """ Returns a list of repository endpoints @@ -373,7 +378,9 @@ def get_repo_endpoints(self, review_issues: dict[str, str]) -> dict[str, str]: for a_package in review_issues.keys(): repo = review_issues[a_package]["repository_link"].strip("/") owner, repo = repo.split("/")[-2:] - all_repos[a_package] = f"https://api.github.com/repos/{owner}/{repo}" + all_repos[ + a_package + ] = f"https://api.github.com/repos/{owner}/{repo}" return all_repos def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: @@ -395,22 +402,27 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: A list containing the comment elements in order """ - # TODO: this var isn't used - comments_url = issue["comments_url"] body = issue["body"] - # Here sometimes the lines are split with \n, others \r\n - # To clean split on \n but may have to remove the \r + # Clean line breaks (could be done with a regex too) lines = body.split("\n") lines = [a_line.strip("\r").strip() for a_line in lines] # Some users decide to hold the issue titles. 
# For those, clean the markdown bold ** element - lines = [line.replace("**", "").strip() for line in lines if line.strip() != ""] + lines = [ + line.replace("**", "").strip() + for line in lines + if line.strip() != "" + ] # You need a space after : or else it will break https:// in two body_data = [line.split(": ") for line in lines if line.strip() != ""] # Loop through issue header and grab relevant review metadata name_index = next( - (i for i, sublist in enumerate(body_data) if sublist[0] == "Package Name"), + ( + i + for i, sublist in enumerate(body_data) + if sublist[0] == "Package Name" + ), None, ) @@ -418,24 +430,6 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]: return package_name, body_data - # def _clean_date(self, date: str) -> str: - # """Cleans up a datetime from github and returns a date string""" - - # try: - # print(date) - # date_clean = ( - # datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") - # .date() - # .strftime("%Y-%m-%d") - # ) - # except: - # print( - # "date is this", date, "Oops - i need a string to process date" - # ) - # print("setting date to missing") - # date_clean = "missing" - # return date_clean - def get_gh_metrics( self, endpoints: dict[str, str], @@ -475,7 +469,7 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: """ stats_dict = {} - # Small script to get the url (normally the docs) and description of a repo! + # get the url (normally the docs) and description of a repo! print(url) response = requests.get( url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"} @@ -551,10 +545,12 @@ def get_categories( Parameters ---------- issue_body_list : list[list[str]] - The first comment from the issue split into lines and then the lines split as by self.parse_comment() + The first comment from the issue split into lines and then the + lines split as by self.parse_comment() fmt : bool - Applies some formatting changes to the categories to match what is required for the website. + Applies some formatting changes to the categories to match what is + required for the website. 
""" # Find the starting index of the category section start_index = None @@ -571,7 +567,7 @@ def get_categories( # If we couldn't find the starting index, return an empty list return [] - # Iterate through the lines starting at the starting index and grab the relevant text + # Iterate through lines and grab the relevant text cat_matches = ["[x]", "[X]"] categories: list[str] = [] for i in range(start_index, len(issue_body_list)): # 30): @@ -587,6 +583,3 @@ def get_categories( if fmt: categories = [c.lower().replace(" ", "-") for c in categories] return categories - - -# https://api.github.com/repos/pyopensci/python-package-guide/commits From 91cab042861fe829fe836053fcc0ad1ef8bf7707 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Thu, 17 Aug 2023 09:21:41 -0600 Subject: [PATCH 04/12] Flake8 format --- src/pyosmeta/cli/update_reviews.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index 634e7bf..bfefc32 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -45,8 +45,10 @@ def main(): if args: update_all = False - - web_reviews_path = "https://raw.githubusercontent.com/pyOpenSci/pyopensci.github.io/main/_data/packages.yml" + web_reviews_path = ( + "htts://raw.githubusercontent.com/pyOpenSci/" + "pyopensci.github.io/main/_data/packages.yml" + ) process_review = ProcessIssues( org="pyopensci", @@ -62,7 +64,7 @@ def main(): accepted_reviews = process_review.parse_issue_header(issues, 15) # Parse through reviews, identify new ones, fix case - if update_all == True: + if update_all: for key, meta in accepted_reviews.items(): web_reviews[key.lower()] = meta else: From 91a61a2f26138f1a373bc39839276085f5a10526 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Thu, 17 Aug 2023 09:27:49 -0600 Subject: [PATCH 05/12] Flake8 format --- src/pyosmeta/contributors.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index dcabaa8..80d1635 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -5,7 +5,13 @@ import requests from dataclasses import dataclass from dotenv import load_dotenv -from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator +from pydantic import ( + AliasChoices, + BaseModel, + ConfigDict, + Field, + field_validator, +) from typing import Dict, List, Optional, Tuple, Union @@ -519,8 +525,8 @@ def add_new_user(self, gh_user: str) -> dict: Returns ------- Dict - Username is the key and the updated github profile info is contained - in the dict. + Username is the key and the updated github profile info is + contained in the dict. 
""" From 1724749ce7fa70a6bbc7f772b76aaf766a548f87 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Thu, 17 Aug 2023 16:44:43 -0600 Subject: [PATCH 06/12] Bug fixes --- .gitignore | 2 +- pyproject.toml | 6 +-- src/pyosmeta/__init__.py | 12 ++--- src/pyosmeta/cli/update_reviews.py | 5 +- src/pyosmeta/file_io.py | 7 ++- src/pyosmeta/parse_issues.py | 74 +++++++++++++++++++----------- 6 files changed, 63 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index 888ee0f..9dad33e 100644 --- a/.gitignore +++ b/.gitignore @@ -41,5 +41,5 @@ dmypy.json token.txt src/test-model.py - +src/pyosmeta/_version_generated.py .pdm-build/* diff --git a/pyproject.toml b/pyproject.toml index 991b9d0..5eb2518 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,9 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", # BE sure to specify that you use python 3.x "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", -version = "0.1.0" -description = "Tools for contributors" -authors = [{ name = "Leah Wasser", email = "leah@pyopensci.org" }] +] + + dependencies = [ "ruamel-yaml>=0.17.21", "requests", diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py index 5b35315..017c97d 100644 --- a/src/pyosmeta/__init__.py +++ b/src/pyosmeta/__init__.py @@ -1,16 +1,14 @@ from .contributors import PersonModel, ProcessContributors from .parse_issues import ProcessIssues, ReviewModel -<<<<<<< HEAD -try: - from ._version_generated import __version__ -except ImportError: - __version__ = "unreleased" -======= __all__ = ( "ProcessIssues", "ReviewModel", "PersonModel", "ProcessContributors", ) ->>>>>>> 007299c (Fix: pyproject tomly flake8/black fix) + +try: + from ._version_generated import __version__ +except ImportError: + __version__ = "unreleased" diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index bfefc32..bf3bbf6 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -46,7 +46,7 @@ def main(): if args: update_all = False web_reviews_path = ( - "htts://raw.githubusercontent.com/pyOpenSci/" + "https://raw.githubusercontent.com/pyOpenSci/" "pyopensci.github.io/main/_data/packages.yml" ) @@ -74,6 +74,7 @@ def main(): web_reviews[key.lower()] = meta # Update gh metrics via api for all packages + # TODO: for some reason cardsort is missing gh_metadata repo_endpoints = process_review.get_repo_endpoints(web_reviews) web_reviews = process_review.get_gh_metrics(repo_endpoints, web_reviews) @@ -87,7 +88,7 @@ def main(): try: all_reviews[key] = ReviewModel(**review) except ValidationError as ve: - print(ve) + print(key, ":", ve) with open("all_reviews.pickle", "wb") as f: pickle.dump(web_reviews, f) diff --git a/src/pyosmeta/file_io.py b/src/pyosmeta/file_io.py index 6a773c1..8521477 100644 --- a/src/pyosmeta/file_io.py +++ b/src/pyosmeta/file_io.py @@ -71,8 +71,11 @@ def open_yml_file(file_path: str) -> dict: # TODO: this used to be self.web_yml so i'll need to reorganized # the contrib class - with urllib.request.urlopen(file_path) as f: - return ruamel.yaml.safe_load(f) + try: + with urllib.request.urlopen(file_path) as f: + return ruamel.yaml.safe_load(f) + except urllib.error.URLError as url_error: + print("Oops - can find the url", file_path, url_error) def export_yaml(filename: str, data_list: list): diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index d6ca4eb..02291d0 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -1,8 
+import re
 from datetime import datetime

 import requests
 from dataclasses import dataclass
-from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator
+from pydantic import (
+    AliasChoices,
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_validator,
+)
 from typing import Any, Optional

 from pyosmeta.contributors import ProcessContributors

@@ -15,7 +22,7 @@ def clean_date(a_date: Optional[str]) -> str:
     others it's a gh time stamp. finally sometimes it could be missing
     or text. handle all of those cases with this validator.
     """
-    print(a_date)
+
     if a_date is None or a_date == "missing":
         return "missing"
     elif len(a_date) < 11:
@@ -28,8 +35,8 @@
                 .date()
                 .strftime("%Y-%m-%d")
             )
-        except TypeError as te:
-            print("Oops - missing data. Setting date to missing", te)
+        except TypeError as t_error:
+            print("Oops - missing data. Setting date to missing", t_error)
             return "missing"

@@ -65,25 +72,24 @@ class ReviewModel(BaseModel):
     # Make sure model populates both aliases and original attr name
     model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True)

-    package_name: Optional[str] = None
+    package_name: Optional[str] = ""
     package_description: str = Field(
-        None, validation_alias=AliasChoices("one-line_description_of_package")
+        "", validation_alias=AliasChoices("one-line_description_of_package")
     )
-    submitting_author: dict[str, str] = None
-    all_current_maintainers: list[dict[str, str]] = None
+    submitting_author: dict[str, Optional[str]] = {}
+    all_current_maintainers: list[dict[str, str | None]] = []
     repository_link: Optional[str] = None
     version_submitted: Optional[str] = None
-    categories: Optional[str] = None
-    categories: list[str] = None
-    editor: dict[str, str] = None
-    reviewer_1: dict[str, str] = None
-    reviewer_2: dict[str, str] = None
-    archive: str = None
-    version_accepted: str = None
-    date_accepted: str = None
+    categories: Optional[list[str]] = None
+    editor: dict[str, str | None] = {}
+    reviewer_1: dict[str, str | None] = {}
+    reviewer_2: dict[str, str | None] = {}
+    archive: Optional[str] = None
+    version_accepted: Optional[str] = None
+    date_accepted: Optional[str] = None
     created_at: str = None
     updated_at: str = None
-    closed_at: str = None
+    closed_at: Optional[str] = None
     issue_link: str = None
     gh_meta: GhMeta

@@ -309,6 +315,20 @@ def parse_issue_header(
                 "https://api.github.com/repos/", "https://github.com/"
             )

+            review_clean = {
+                key: value
+                for key, value in review[package_name].items()
+                if not key.startswith("##")
+                and not key.startswith("---")
+                and not key.startswith("-_[x]_i_agree")
+            }
+            review[package_name] = review_clean
+            # filtered = {}
+            # for key, value in review.items():
+            #     print(key)
+            #     if not key.startswith("##") and not key.startswith("-"):
+            #         filtered[key] = value
+
         # # Clean markdown url's from editor, and reviewer lines
         # TODO - this could be a reviewer name cleanup validator
         # types = ["editor", "reviewer_1", "reviewer_2"]
@@ -378,6 +398,10 @@ def get_repo_endpoints(
         for a_package in review_issues.keys():
             repo = review_issues[a_package]["repository_link"].strip("/")
             owner, repo = repo.split("/")[-2:]
+            # TODO: could be simpler code - Remove any link remnants
+            pattern = r"[\(\)\[\]?]"
+            owner = re.sub(pattern, "", owner)
+            repo = re.sub(pattern, "", repo)
             all_repos[
                 a_package
             ] = f"https://api.github.com/repos/{owner}/{repo}"
@@ -453,7 +477,6 @@ def get_gh_metrics(
         pkg_meta = {}
         for pkg_name, url in endpoints.items():
             print("Getting GitHub stats for", pkg_name)
-
pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) @@ -461,7 +484,7 @@ def get_gh_metrics( # Add github meta to review metadata reviews[pkg_name]["gh_meta"] = pkg_meta[pkg_name] - return reviews + return reviews def get_repo_meta(self, url: str, stats_list: list) -> dict: """ @@ -469,8 +492,7 @@ def get_repo_meta(self, url: str, stats_list: list) -> dict: """ stats_dict = {} - # get the url (normally the docs) and description of a repo! - print(url) + # Get the url (normally the docs) and description of a repo! response = requests.get( url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"} ) @@ -510,7 +532,7 @@ def get_repo_contribs(self, url: str) -> dict: ) if response.status_code == 404: - print("Can't find: ", url, ". Did the repo url change?") + print("Can't find: ", repo_contribs, ". Did the repo url change?") # Extract the description and homepage URL from the JSON response else: return len(response.json()) @@ -528,11 +550,7 @@ def get_last_commit(self, repo: str) -> str: response = requests.get( url, headers={"Authorization": f"token {self.GITHUB_TOKEN}"} ).json() - date = ( - response[0]["commit"]["author"]["date"] - # if 0 in response - # else "1970-01-01T00:00:00Z" - ) + date = response[0]["commit"]["author"]["date"] return date @@ -575,7 +593,7 @@ def get_categories( checked = any([x in line for x in cat_matches]) if line.startswith("- [") and checked: - category = line[line.index("]") + 2 :] + category = line[line.index("]") + 2] categories.append(category) elif not line.startswith("- ["): break From e8d694b9df75aac97b5886e088fc3cbbc3d45472 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Fri, 18 Aug 2023 15:39:35 -0600 Subject: [PATCH 07/12] Move all code to model based workflow --- src/pyosmeta/cli/update_contributors.py | 2 - src/pyosmeta/cli/update_review_contribs.py | 131 ++++++++++--------- src/pyosmeta/cli/update_reviews.py | 2 +- src/pyosmeta/contributors.py | 145 +++++++++++---------- src/pyosmeta/parse_issues.py | 6 +- 5 files changed, 151 insertions(+), 135 deletions(-) diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 5676033..922e346 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -8,8 +8,6 @@ from pyosmeta.file_io import create_paths, open_yml_file print(pydantic.__version__) -# TODO - fix the website by renaming packages-editor, packages-submitted: -# packages-reviewed: to use underscores. this will just make life easier def main(): diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py index c4c5aa1..79b7f7b 100644 --- a/src/pyosmeta/cli/update_review_contribs.py +++ b/src/pyosmeta/cli/update_review_contribs.py @@ -12,58 +12,59 @@ Rather than hit any api's it just updates information from the issues. To run: update_reviewers -# TODO - FEATURE we have some packages that were NOT approved but we had editors and reviewers. -# We need to acknowledge these people as well. maybe tag them with waiting on maintainer response?? -# TODO: package-wide feature: create a flag for entries that we do not want to update +# TODO - FEATURE we have some packages that were NOT approved but we had +# editors and reviewers. +# We need to acknowledge these people as well. maybe tag them with waiting on +# maintainer response?? 
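(An illustrative aside, not part of the patch: the GitHub API call pattern
that get_repo_meta and get_last_commit above rely on. The repo endpoint and
token placeholder here are assumptions for the example.)

    import requests

    # Endpoint form used throughout: api.github.com/repos/<owner>/<repo>/...
    url = "https://api.github.com/repos/pyOpenSci/pyosmeta/commits"
    response = requests.get(
        url, headers={"Authorization": "token <GITHUB_TOKEN>"}
    )
    # The first element of the response list is the most recent commit
    last_commit_date = response.json()[0]["commit"]["author"]["date"]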
+# TODO: package-wide feature: create no update flag for entries # TODO: make sure we can add a 3rd or 4th reviewer - crowsetta has this as # will biocypher # TODO: make sure to add a current editor boolean to the current editors and # emeritus ones. -# TODO - ?create a class for person types?? """ -import os - -from pyosmeta.contributors import ProcessContributors +# TODO - running into validation errors again here.. but making lots +# of progress!! +from pyosmeta.contributors import PersonModel, ProcessContributors from pyosmeta.file_io import clean_export_yml, load_pickle -def get_clean_user(username: str): +def get_clean_user(username: str) -> str: + """A small helper that removes whitespace and ensures username is + lower case""" return username.lower().strip() def main(): # TODO: move refresh contribs and contribs dict attr to # processContribs and remove this module altogether - updateContribs = ProcessContributors([]) + process_contribs = ProcessContributors([]) # Two pickle files are outputs of the two other scripts # use that data to limit web calls - contribs = load_pickle("all_contribs.pickle") - - # Output of update_reviews.py + all_contribs = load_pickle("all_contribs.pickle") packages = load_pickle("all_reviews.pickle") - contrib_types = updateContribs.contrib_types + contrib_types = process_contribs.contrib_types for pkg_name, issue_meta in packages.items(): print("Processing review team for:", pkg_name) for issue_role in contrib_types.keys(): if issue_role == "all_current_maintainers": - if issue_role in issue_meta: + if issue_meta.all_current_maintainers: # Loop through each maintainer in the list for i, a_maintainer in enumerate( - issue_meta.get(issue_role) + issue_meta.all_current_maintainers ): gh_user = get_clean_user( a_maintainer["github_username"] ) - if gh_user not in contribs.keys(): - contribs.update( - updateContribs.check_add_user( - gh_user, contribs + if gh_user not in all_contribs.keys(): + all_contribs.update( + process_contribs.check_add_user( + gh_user, all_contribs ) ) @@ -71,73 +72,79 @@ def main(): ( contrib_key, pkg_list, - ) = updateContribs.refresh_contribs( - contribs[gh_user], - pkg_name, # new contribs + ) = process_contribs.refresh_contribs( + all_contribs[gh_user], + pkg_name, # new all_contribs issue_role, ) # Update users contrib list - contribs[gh_user][contrib_key] = pkg_list + setattr(all_contribs[gh_user], contrib_key, pkg_list) - _, contrib_list = updateContribs.refresh_contribs( - contribs[gh_user], + _, contrib_list = process_contribs.refresh_contribs( + all_contribs[gh_user], None, issue_role, ) - contribs[gh_user]["contributor_type"] = contrib_list - # If name is missing in issue summary, populate from contribs - if a_maintainer["name"] == "": - packages[pkg_name]["all_current_maintainers"][i][ - "name" - ] = contribs[gh_user]["name"] + setattr( + all_contribs[gh_user], + "contributor_type", + contrib_list, + ) + # If name is missing in issue summary, populate from + # all_contribs + # TODO: this is currently not working as maintainer is + # a string object + if a_maintainer["name"] == "": + maintainer = getattr( + packages[pkg_name], "all_current_maintainers" + )[i]["name"] + setattr( + packages[pkg_name], + "all_current_maintainers", + getattr(all_contribs[gh_user], "name"), + ) else: print( - "All maintainers is missing in the review for ", + "All maintainers is missing in the review for:", pkg_name, ) else: # Else we are processing editors, reviewers... 
gh_user = get_clean_user( - packages[pkg_name][issue_role]["github_username"] + getattr(packages[pkg_name], issue_role)["github_username"] ) - if gh_user not in contribs.keys(): - # If they aren't already in contribs, add them - contribs.update( - updateContribs.check_add_user(gh_user, contribs) - ) - # Update user package contributions - ( - contrib_key, - pkg_list, - ) = updateContribs.refresh_contribs( - contribs[gh_user], - pkg_name, # new contribs - issue_role, - ) + if gh_user not in all_contribs.keys(): + # If they aren't already in all_contribs, add them + print("Found a new user!", gh_user) + new_contrib = process_contribs.get_user_info(gh_user) + all_contribs[gh_user] = PersonModel(**new_contrib) - # Update users contrib list - contribs[gh_user][contrib_key] = pkg_list - - _, contrib_list = updateContribs.refresh_contribs( - contribs[gh_user], - None, - issue_role, + # Update user package contributions + print(gh_user) + # Only add new contrib if it's unique + review_key = contrib_types[issue_role][0] + all_contribs[gh_user].add_unique_value(review_key, pkg_name) + + # Update user contrib list + review_roles = contrib_types[issue_role][1] + all_contribs[gh_user].add_unique_value( + "contributor_type", review_roles ) - contribs[gh_user]["contributor_type"] = contrib_list - # If users's name is missing in issue, populate from contribs dict - if issue_meta[issue_role]["name"] == "": - packages[pkg_name][issue_role]["name"] = contribs[gh_user][ - "name" - ] + # If users's name is missing in issue, populate from contribs + if getattr(issue_meta, issue_role)["name"] == "": + attribute_value = getattr(packages[pkg_name], issue_role) + attribute_value["name"] = getattr( + all_contribs[gh_user], "name" + ) # Export to yaml - clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) - clean_export_yml(packages, os.path.join("_data", "packages.yml")) + # clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) + # clean_export_yml(packages, os.path.join("_data", "packages.yml")) if __name__ == "__main__": diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index bf3bbf6..8909a1f 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -91,7 +91,7 @@ def main(): print(key, ":", ve) with open("all_reviews.pickle", "wb") as f: - pickle.dump(web_reviews, f) + pickle.dump(all_reviews, f) if __name__ == "__main__": diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 80d1635..1ad4a1b 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -12,12 +12,16 @@ Field, field_validator, ) -from typing import Dict, List, Optional, Tuple, Union +from typing import Set, Dict, List, Optional, Tuple, Union class PersonModel(BaseModel): # Make sure model populates both aliases and original attr name - model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True) + model_config = ConfigDict( + populate_by_name=True, + str_strip_whitespace=True, + validate_assignment=True, + ) name: Optional[str] = None title: Optional[Union[list[str], str]] = None @@ -44,47 +48,67 @@ class PersonModel(BaseModel): None, validation_alias=AliasChoices("blog", "website") ) board: Optional[bool] = False - contributor_type: Optional[list[str]] = [] - packages_editor: Optional[list[str | None]] = Field( - None, - validation_alias=AliasChoices("packages-editor"), - ) - packages_submitted: Optional[list[str | None]] = Field( - None, - validation_alias=AliasChoices( - 
"packages-submitted", "packages_submitted" - ), - ) - packages_reviewed: Optional[list[str | None]] = Field( - None, - validation_alias=AliasChoices( - "packages-reviewed", "packages_reviewed" - ), - ) + contributor_type: Set[str] = set() + packages_editor: Set[str] = set() + packages_submitted: Set[str] = set() + packages_reviewed: Set[str] = set() location: Optional[str] = None - email: Optional[str] = ModuleNotFoundError + email: Optional[str] = None @field_validator( "packages_reviewed", "packages_submitted", "packages_editor", + "contributor_type", mode="before", ) - @classmethod - def string_to_list(cls, value): - """ - For fields such as packages-reviewed edited etc we want - a list of elements not just a single string. this will - fix that issue. - """ - # If the input value is a string, convert it to a list + def convert_to_set(cls, value): if isinstance(value, list): - return value - if isinstance(value, str): - return [value] - # If the input value is None, return an empty list + if value[0] is None: + return set() + else: + return set(value) elif value is None: - return [] + return set() + return set(value) + + def add_unique_value(self, attr_name: str, values: Union[str, list[str]]): + """A helper that will add only unique values to an existing list""" + if isinstance(values, str): + values = [values] + attribute = getattr(self, attr_name) + if isinstance(attribute, set): + attribute.update(values) + else: + raise ValueError(f"{attr_name} is not a set attribute") + + # @field_validator( + # "packages_reviewed", + # "packages_submitted", + # "packages_editor", + # "contributor_type", + # mode="before", + # ) + # @classmethod + # def string_to_list(cls, value): + # """ + # For fields such as packages-reviewed edited etc we want + # a list of elements not just a single string. this will + # fix that issue. + # """ + # # If the input value is a string, convert it to a list + # print("the value is", value) + # if isinstance(value, list): + # print("removing duplicates now") + # return list(set(value)) + # if isinstance(value, str): + # print("Found a string, turning to list") + # return [value] + # # If the input value is None, return an empty list + # # This may never happen + # elif value is None: + # print("Found a none, returning empty list. Value is", value) + # return [] @field_validator("bio", mode="before") @classmethod @@ -131,15 +155,15 @@ def __init__(self, json_files: List) -> None: ] self.contrib_types = { - "reviewer_1": ["packages-reviewed", ["reviewer", "peer-review"]], - "reviewer_2": ["packages-reviewed", ["reviewer", "peer-review"]], - "editor": ["packages-editor", ["editor", "peer-review"]], + "reviewer_1": ["packages_reviewed", ["reviewer", "peer-review"]], + "reviewer_2": ["packages_reviewed", ["reviewer", "peer-review"]], + "editor": ["packages_editor", ["editor", "peer-review"]], "submitting_author": [ - "packages-submitted", + "packages_submitted", ["maintainer", "submitting-author", "peer-review"], ], "all_current_maintainers": [ - "packages-submitted", + "packages_submitted", ["maintainer", "peer-review"], ], } @@ -156,7 +180,14 @@ def get_token(self) -> str: load_dotenv() return os.environ["GITHUB_TOKEN"] - def refresh_contribs(self, contribs: Dict, new_contribs, review_role): + def refresh_contribs( + self, + person: PersonModel, + new_contribs: Optional[ + str + ], # I think this will always be a package name? if so rename is pkg_name + review_role: str, + ): """Need to add .... 
         Parameters
         ----------


         Returns
         -------
         """
         contrib_types = self.contrib_types
-        contrib_key_yml = ""
         # Contributor type will be updated which is a list of roles
+        # TODO rename contribs to person
         if new_contribs:
             contrib_key_yml = contrib_types[review_role][0]
-            existing_contribs = contribs[contrib_key_yml]
-        # Else this is a specific review role meant to update package list
+            existing_contribs = getattr(person, contrib_key_yml)
+
         else:
-            new_contribs = contrib_types[review_role][1]
-            existing_contribs = contribs["contributor_type"]
+            # Else update review role(s) in contrib_type attribute
+            contrib_key_yml = contrib_types[review_role][1]
+            existing_contribs = person.contributor_type

         final_list = self.update_contrib_list(existing_contribs, new_contribs)
         return (contrib_key_yml, final_list)

-    # TODO: this can go away now that i have a personmodel obj
-    # def create_contrib_template(self) -> Dict:
-    #     """A small helper that creates a template for a new contributor
-    #     that we are adding to our contributor.yml file"""
-
-    #     return {
-    #         "name": "",
-    #         "bio": "",
-    #         "organization": "",
-    #         "title": "",
-    #         "github_username": "",
-    #         "github_image_id": "",
-    #         "editorial-board": "",
-    #         "twitter": "",
-    #         "mastodon": "",
-    #         "orcidid": "",
-    #         "website": "",
-    #         "contributor_type": [],
-    #         "packages-editor": [],
-    #         "packages-submitted": [],
-    #         "packages-reviewed": [],
-    #         "location": "",
-    #         "email": "",
-    #     }
-
     # TODO - This utility is used across all scripts.
     def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
         """Helper function that takes an input object as a list or string.
diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index 02291d0..fdc50c7 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -70,7 +70,11 @@ def clean_date(cls, a_date: Optional[str]) -> str:

 class ReviewModel(BaseModel):
     # Make sure model populates both aliases and original attr name
-    model_config = ConfigDict(populate_by_name=True, str_strip_whitespace=True)
+    model_config = ConfigDict(
+        populate_by_name=True,
+        str_strip_whitespace=True,
+        validate_assignment=True,
+    )

     package_name: Optional[str] = ""
     package_description: str = Field(

From 68900cf0e38253cdc3c304a2918490bd2eaf9939 Mon Sep 17 00:00:00 2001
From: Leah Wasser
Date: Fri, 18 Aug 2023 18:11:36 -0600
Subject: [PATCH 08/12] Fixed validation and updates are working now

---
 src/pyosmeta/cli/update_contributors.py    |   3 +
 src/pyosmeta/cli/update_review_contribs.py | 118 +++++++++------------
 src/pyosmeta/cli/update_reviews.py         |  10 +-
 src/pyosmeta/contributors.py               |  38 ++-----
 4 files changed, 68 insertions(+), 101 deletions(-)

diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py
index 922e346..716d1fa 100644
--- a/src/pyosmeta/cli/update_contributors.py
+++ b/src/pyosmeta/cli/update_contributors.py
@@ -9,6 +9,9 @@

 print(pydantic.__version__)

+# TODO - https://stackoverflow.com/questions/55762673/how-to-parse-list-of-models-with-pydantic
+# I can use TypeAdapter to convert the json data to model objects!
+

 def main():
     parser = argparse.ArgumentParser(
diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py
index 79b7f7b..b111545 100644
--- a/src/pyosmeta/cli/update_review_contribs.py
+++ b/src/pyosmeta/cli/update_review_contribs.py
@@ -24,8 +24,14 @@

 """

-# TODO - running into validation errors again here.. but making lots
-# of progress!!
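(An illustrative aside, not part of the patch: the TypeAdapter idea from the
TODO added to update_contributors.py above. A minimal sketch, assuming `data`
is the list of contributor dicts loaded from the JSON/web sources.)

    from pydantic import TypeAdapter

    adapter = TypeAdapter(list[PersonModel])
    # Validates every dict in one call, aggregating errors,
    # instead of looping over PersonModel(**a_contrib)
    people = adapter.validate_python(data)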
+# TODO - Case sensitivity is an issue with my validation using set +# - jointly +# - Jointly +# - devicely +# - Devicely +# - sevivi +import os + from pyosmeta.contributors import PersonModel, ProcessContributors from pyosmeta.file_io import clean_export_yml, load_pickle @@ -43,7 +49,7 @@ def main(): # Two pickle files are outputs of the two other scripts # use that data to limit web calls - all_contribs = load_pickle("all_contribs.pickle") + contribs = load_pickle("all_contribs.pickle") packages = load_pickle("all_reviews.pickle") contrib_types = process_contribs.contrib_types @@ -51,87 +57,56 @@ def main(): for pkg_name, issue_meta in packages.items(): print("Processing review team for:", pkg_name) for issue_role in contrib_types.keys(): + # I wonder if there is a clever way to skip review if this is missing? if issue_role == "all_current_maintainers": - if issue_meta.all_current_maintainers: - # Loop through each maintainer in the list - for i, a_maintainer in enumerate( - issue_meta.all_current_maintainers - ): - gh_user = get_clean_user( - a_maintainer["github_username"] - ) - - if gh_user not in all_contribs.keys(): - all_contribs.update( - process_contribs.check_add_user( - gh_user, all_contribs - ) - ) - - # Update contrib packages for peer review - ( - contrib_key, - pkg_list, - ) = process_contribs.refresh_contribs( - all_contribs[gh_user], - pkg_name, # new all_contribs - issue_role, - ) - # Update users contrib list - setattr(all_contribs[gh_user], contrib_key, pkg_list) - - _, contrib_list = process_contribs.refresh_contribs( - all_contribs[gh_user], - None, - issue_role, + # if issue_meta.all_current_maintainers: + # Loop through each maintainer in the list + for i, a_maintainer in enumerate( + issue_meta.all_current_maintainers + ): + gh_user = get_clean_user(a_maintainer["github_username"]) + + if gh_user not in contribs.keys(): + contribs.update( + process_contribs.check_add_user(gh_user, contribs) ) - setattr( - all_contribs[gh_user], - "contributor_type", - contrib_list, - ) + # Update user package contributions (if it's unique) + review_key = contrib_types[issue_role][0] + contribs[gh_user].add_unique_value(review_key, pkg_name) - # If name is missing in issue summary, populate from - # all_contribs - # TODO: this is currently not working as maintainer is - # a string object - if a_maintainer["name"] == "": - maintainer = getattr( - packages[pkg_name], "all_current_maintainers" - )[i]["name"] - setattr( - packages[pkg_name], - "all_current_maintainers", - getattr(all_contribs[gh_user], "name"), - ) - else: - print( - "All maintainers is missing in the review for:", - pkg_name, + # Update user contrib list (if it's unique) + review_roles = contrib_types[issue_role][1] + contribs[gh_user].add_unique_value( + "contributor_type", review_roles ) + # If name is missing in issue, populate from contribs + if a_maintainer["name"] == "": + name = getattr(contribs[gh_user], "name") + packages[pkg_name].all_current_maintainers[i][ + "name" + ] = name + else: # Else we are processing editors, reviewers... 
gh_user = get_clean_user( getattr(packages[pkg_name], issue_role)["github_username"] ) - if gh_user not in all_contribs.keys(): - # If they aren't already in all_contribs, add them + if gh_user not in contribs.keys(): + # If they aren't already in contribs, add them print("Found a new user!", gh_user) new_contrib = process_contribs.get_user_info(gh_user) - all_contribs[gh_user] = PersonModel(**new_contrib) + contribs[gh_user] = PersonModel(**new_contrib) - # Update user package contributions - print(gh_user) - # Only add new contrib if it's unique + # Update user package contributions (if it's unique) review_key = contrib_types[issue_role][0] - all_contribs[gh_user].add_unique_value(review_key, pkg_name) + contribs[gh_user].add_unique_value(review_key, pkg_name) - # Update user contrib list + # Update user contrib list (if it's unique) review_roles = contrib_types[issue_role][1] - all_contribs[gh_user].add_unique_value( + contribs[gh_user].add_unique_value( "contributor_type", review_roles ) @@ -139,12 +114,17 @@ def main(): if getattr(issue_meta, issue_role)["name"] == "": attribute_value = getattr(packages[pkg_name], issue_role) attribute_value["name"] = getattr( - all_contribs[gh_user], "name" + contribs[gh_user], "name" ) + print("Export") # Export to yaml - # clean_export_yml(contribs, os.path.join("_data", "contributors.yml")) - # clean_export_yml(packages, os.path.join("_data", "packages.yml")) + contribs_ls = [model.model_dump() for model in contribs.values()] + # Getting error dumping packages + pkgs_ls = [model.model_dump() for model in packages.values()] + + clean_export_yml(contribs_ls, os.path.join("_data", "contributors.yml")) + clean_export_yml(pkgs_ls, os.path.join("_data", "packages.yml")) if __name__ == "__main__": diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py index 8909a1f..5196f86 100644 --- a/src/pyosmeta/cli/update_reviews.py +++ b/src/pyosmeta/cli/update_reviews.py @@ -16,10 +16,6 @@ # TODO: feature - Would be cool to create an "under review now" list as well - # ideally this could be passed as a CLI argument with the label we want to # search for -# TODO: 1. add gh metadata to the review object -# prior to parsing -# 2. work on update-all!! -# 3. i think package_description might not be parsing right? import argparse @@ -30,6 +26,12 @@ from pyosmeta import ProcessIssues, ReviewModel from pyosmeta.file_io import load_website_yml +# todo - dates are wrong in issues + +# date_accepted: 5-2023-7 +# created_at: 01-2023-03 +# updated_at: 27-2023-07 + def main(): update_all = False diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 1ad4a1b..65fe87f 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -10,9 +10,10 @@ BaseModel, ConfigDict, Field, + field_serializer, field_validator, ) -from typing import Set, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple, Union class PersonModel(BaseModel): @@ -82,33 +83,14 @@ def add_unique_value(self, attr_name: str, values: Union[str, list[str]]): else: raise ValueError(f"{attr_name} is not a set attribute") - # @field_validator( - # "packages_reviewed", - # "packages_submitted", - # "packages_editor", - # "contributor_type", - # mode="before", - # ) - # @classmethod - # def string_to_list(cls, value): - # """ - # For fields such as packages-reviewed edited etc we want - # a list of elements not just a single string. this will - # fix that issue. 
- # """ - # # If the input value is a string, convert it to a list - # print("the value is", value) - # if isinstance(value, list): - # print("removing duplicates now") - # return list(set(value)) - # if isinstance(value, str): - # print("Found a string, turning to list") - # return [value] - # # If the input value is None, return an empty list - # # This may never happen - # elif value is None: - # print("Found a none, returning empty list. Value is", value) - # return [] + @field_serializer( + "packages_reviewed", + "packages_submitted", + "packages_editor", + "contributor_type", + ) + def serialize_set(self, items: Set[str]): + return list(items) @field_validator("bio", mode="before") @classmethod From fd6c5c96dda8d7dec73f9d5b3a43d98a6e64b6af Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Fri, 18 Aug 2023 20:00:11 -0600 Subject: [PATCH 09/12] Fix: dates and deprecated methods --- src/pyosmeta/cli/update_contributors.py | 23 ++++--- src/pyosmeta/cli/update_review_contribs.py | 10 ++-- src/pyosmeta/cli/update_reviews.py | 2 +- src/pyosmeta/contributors.py | 70 ++-------------------- src/pyosmeta/parse_issues.py | 2 +- 5 files changed, 29 insertions(+), 78 deletions(-) diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 716d1fa..356dc5c 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -9,11 +9,13 @@ print(pydantic.__version__) -# TODO - https://stackoverflow.com/questions/55762673/how-to-parse-list-of-models-with-pydantic +# TODO - https://stackoverflow.com +# /questions/55762673/how-to-parse-list-of-models-with-pydantic # I can use TypeAdapter to convert the json data to model objects! def main(): + update_all = False parser = argparse.ArgumentParser( description="A CLI script to update pyOpenSci contributors" ) @@ -72,12 +74,17 @@ def main(): all_contribs[gh_user] = PersonModel(**new_contrib) # Update contribution type list for all users - existing_contribs = all_contribs[gh_user].contributor_type - all_contribs[ - gh_user - ].contributor_type = process_contribs.update_contrib_list( - existing_contribs, key - ) + all_contribs[gh_user].add_unique_value("contributor_type", key) + + # existing_contribs = all_contribs[gh_user].contributor_type + # # TODO: i can move all of these update items to just use the + # # personmodel.add_unique_value then i can get rid of update + # # contrib list + # all_contribs[ + # gh_user + # ].contributor_type = process_contribs.update_contrib_list( + # existing_contribs, key + # ) if update_all: for user in all_contribs.keys(): @@ -89,7 +96,7 @@ def main(): for key, item in new_gh_data.items(): if key == "mastodon": - # Mastodon isn't available in the api yet + # Mastodon isn't available in the GH api yet continue # Don't replace the value if there is a noupdate flag # TODO: This approach doesn't work, ruemal-yaml doesn't diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_contribs.py index b111545..d2faf63 100644 --- a/src/pyosmeta/cli/update_review_contribs.py +++ b/src/pyosmeta/cli/update_review_contribs.py @@ -57,9 +57,7 @@ def main(): for pkg_name, issue_meta in packages.items(): print("Processing review team for:", pkg_name) for issue_role in contrib_types.keys(): - # I wonder if there is a clever way to skip review if this is missing? 
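(An illustrative aside, not part of the patch: the set round trip that
PersonModel uses above, shown on a toy model. The before-mode validator
dedupes and lower-cases incoming lists; the serializer dumps the set back
to a list so the YAML export stays clean.)

    from pydantic import BaseModel, field_serializer, field_validator

    class Tags(BaseModel):  # hypothetical stand-in for PersonModel
        contributor_type: set[str] = set()

        @field_validator("contributor_type", mode="before")
        @classmethod
        def to_set(cls, value):
            # None -> empty set; lists are lower-cased and deduped
            if value is None:
                return set()
            return {v.lower() for v in value}

        @field_serializer("contributor_type")
        def as_list(self, items: set[str]):
            # Sets aren't YAML/JSON friendly; export a sorted list
            return sorted(items)

    Tags(contributor_type=["Editor", "editor"]).model_dump()
    # -> {'contributor_type': ['editor']}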
            if issue_role == "all_current_maintainers":
-                # if issue_meta.all_current_maintainers:
                 # Loop through each maintainer in the list
                 for i, a_maintainer in enumerate(
                     issue_meta.all_current_maintainers
@@ -73,7 +71,9 @@ def main():

                     # Update user package contributions (if it's unique)
                     review_key = contrib_types[issue_role][0]
-                    contribs[gh_user].add_unique_value(review_key, pkg_name)
+                    contribs[gh_user].add_unique_value(
+                        review_key, pkg_name.lower()
+                    )

                     # Update user contrib list (if it's unique)
                     review_roles = contrib_types[issue_role][1]
@@ -102,7 +102,9 @@ def main():

                 # Update user package contributions (if it's unique)
                 review_key = contrib_types[issue_role][0]
-                contribs[gh_user].add_unique_value(review_key, pkg_name)
+                contribs[gh_user].add_unique_value(
+                    review_key, pkg_name.lower()
+                )

                 # Update user contrib list (if it's unique)
                 review_roles = contrib_types[issue_role][1]
diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py
index 8909a1f..92e39aa 100644
--- a/src/pyosmeta/cli/update_reviews.py
+++ b/src/pyosmeta/cli/update_reviews.py
@@ -46,7 +46,7 @@ def main():
     args = parser.parse_args()

     if args:
-        update_all = False
+        update_all = True
     web_reviews_path = (
         "https://raw.githubusercontent.com/pyOpenSci/"
         "pyopensci.github.io/main/_data/packages.yml"
diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py
index 65fe87f..c2e798c 100644
--- a/src/pyosmeta/contributors.py
+++ b/src/pyosmeta/contributors.py
@@ -63,15 +63,18 @@ class PersonModel(BaseModel):
         "contributor_type",
         mode="before",
     )
-    def convert_to_set(cls, value):
+    def convert_to_set(cls, value: list[str]):
         if isinstance(value, list):
-            if value[0] is None:
+            if not value:
+                return set()
+            elif value[0] is None:
                 return set()
             else:
+                value = [aval.lower() for aval in value]
                 return set(value)
         elif value is None:
             return set()
-        return set(value)
+        return {value.lower()}

     def add_unique_value(self, attr_name: str, values: Union[str, list[str]]):
         """A helper that will add only unique values to an existing list"""
@@ -162,38 +165,6 @@ def get_token(self) -> str:
         load_dotenv()
         return os.environ["GITHUB_TOKEN"]

-    def refresh_contribs(
-        self,
-        person: PersonModel,
-        new_contribs: Optional[
-            str
-        ],  # I think this will always be a package name? if so rename is pkg_name
-        review_role: str,
-    ):
-        """Need to add ....
-
-        Parameters
-        ----------
-
-
-        Returns
-        -------
-        """
-        contrib_types = self.contrib_types
-        # Contributor type will be updated which is a list of roles
-        # TODO rename contribs to person
-        if new_contribs:
-            contrib_key_yml = contrib_types[review_role][0]
-            existing_contribs = getattr(person, contrib_key_yml)
-
-        else:
-            # Else update review role(s) in contrib_type attribute
-            contrib_key_yml = contrib_types[review_role][1]
-            existing_contribs = person.contributor_type
-
-        final_list = self.update_contrib_list(existing_contribs, new_contribs)
-        return (contrib_key_yml, final_list)
-
     # TODO - This utility is used across all scripts.
     def clean_list(self, a_list: Union[str, List[str]]) -> List[str]:
         """Helper function that takes an input object as a list or string.
@@ -235,35 +206,6 @@ def unique_new_vals(
             default = (True, diff)
         return default

-    # TODO - also a helper used by all scripts
-    def update_contrib_list(
-        self,
-        existing_contribs: Union[List, str],
-        new_contrib: Union[List, str],
-    ) -> List:
-        """Method that gets an existing list of contribs.
-        cleans the list and then checks the list against a
-        new contribution to see if it should be added.
-
-        Parameters
-        ----------
-        existing_contribs: list or str
-            A users existing contributions
-        new_contrib: list or str
-            a list or a single new contribution to be added
-
-        """
-
-        # Cleanup first
-        cleaned_list = self.clean_list(existing_contribs)
-        new_contrib = self.clean_list(new_contrib)
-
-        unique_vals, new_vals = self.unique_new_vals(cleaned_list, new_contrib)
-        if unique_vals:
-            cleaned_list += new_vals
-
-        return cleaned_list
-
     def check_contrib_type(self, json_file: str):
         """
         Determine the type of contribution the person
diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py
index fdc50c7..bce610e 100644
--- a/src/pyosmeta/parse_issues.py
+++ b/src/pyosmeta/parse_issues.py
@@ -27,7 +27,7 @@
         return "missing"
     elif len(a_date) < 11:
         new_date = a_date.replace("/", "-").split("-")
-        return f"{new_date[2]}-{new_date[0]}-{new_date[1]}"
+        return f"{new_date[0]}-{new_date[1]}-{new_date[2]}"
     else:
         try:
             return (

From 158141e8cd0442c341f28943aa2cf4a153ef118e Mon Sep 17 00:00:00 2001
From: Leah Wasser
Date: Sat, 19 Aug 2023 18:22:11 -0600
Subject: [PATCH 10/12] Fix: more bugs and cleanup

---
 pyproject.toml                                |   8 +-
 src/pyosmeta/cli/process_reviews.py           | 110 ++++++++++
 ...iew_contribs.py => update_review_teams.py} |  21 +-
 src/pyosmeta/cli/update_reviews.py            | 100 ---------
 src/pyosmeta/contributors.py                  | 194 +++++++++---------
 src/pyosmeta/parse_issues.py                  | 143 ++++++++-----
 6 files changed, 315 insertions(+), 261 deletions(-)
 create mode 100644 src/pyosmeta/cli/process_reviews.py
 rename src/pyosmeta/cli/{update_review_contribs.py => update_review_teams.py} (89%)
 delete mode 100644 src/pyosmeta/cli/update_reviews.py

diff --git a/pyproject.toml b/pyproject.toml
index 5eb2518..f80ed4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,8 +59,8 @@ license = { text = "MIT" }
 # for a user to run directly from the package.
 [project.scripts] # Optional
 update-contributors = "pyosmeta.cli.update_contributors:main"
-update-reviews = "pyosmeta.cli.update_reviews:main"
-update-reviewers = "pyosmeta.cli.update_review_contribs:main"
+process-reviews = "pyosmeta.cli.process_reviews:main"
+update-review-teams = "pyosmeta.cli.update_review_teams:main"


 # Right now i'm not using pdm to add dependencies.
@@ -77,9 +77,7 @@ py_version = 27

 [tool.flake8]
 # List of error codes to ignore (comma-separated)
-ignore = "E203, W503"
+ignore = ["E203", "W503"]

-[tool.pdm]

 [tool.pdm.build]
diff --git a/src/pyosmeta/cli/process_reviews.py b/src/pyosmeta/cli/process_reviews.py
new file mode 100644
index 0000000..a5862a8
--- /dev/null
+++ b/src/pyosmeta/cli/process_reviews.py
@@ -0,0 +1,110 @@
+"""
+Script that parses metadata from an issue and adds it to a yml file for the
+website. It also grabs some of the package metadata such as stars,
+last commit, etc.
+
+Output: packages.yml file containing a list of
+ 1. all packages with accepted reviews
+ 2. information related to the review including reviewers, editors
+ 3. basic package stats including stars, etc.
+ +To run at the CLI: parse_issue_metadata +""" + +# TODO: if we export files we might want packages.yml and then under_review.yml +# thus we'd want to add a second input parameters which was file_name +# TODO: feature - Would be cool to create an "under review now" list as well - +# ideally this could be passed as a CLI argument with the label we want to +# search for + + +# import argparse +import pickle + +from pydantic import ValidationError + +from pyosmeta import ProcessIssues, ReviewModel + +# from pyosmeta.file_io import load_website_yml + + +# TODO: change the template to ask for date accepted format year-month-day + + +def main(): + # update_all = False + # parser = argparse.ArgumentParser( + # description="A CLI script to update pyOpenSci reviews" + # ) + # parser.add_argument( + # "--update", + # type=str, + # help="Will force update review info from GitHub for every review", + # ) + # args = parser.parse_args() + + # if args: + # update_all = True + # web_reviews_path = ( + # "https://raw.githubusercontent.com/pyOpenSci/" + # "pyopensci.github.io/main/_data/packages.yml" + # ) + + process_review = ProcessIssues( + org="pyopensci", + repo_name="software-submission", + label_name="6/pyOS-approved 🚀🚀🚀", + ) + + # Open web yaml & return dict with package name as key + # web_reviews = load_website_yml(key="package_name", url=web_reviews_path) + + # Get all issues for approved packages - load as dict + issues = process_review.return_response() + accepted_reviews = process_review.parse_issue_header(issues, 45) + + # TODO: clean out extra fields from accepted reviews?? + + # Parse through reviews, identify new ones, fix case + # TODO - right now i've reverted back to always updating all reviews. + # Is there a use-case to only update a new package vs updating everything? + # if update_all: + # all_reviews = {} + # for key, meta in accepted_reviews.items(): + # try: + # all_reviews[key.lower()] = ReviewModel(**meta) + # except ValidationError as ve: + # print(ve) + + # else: + # for key, meta in accepted_reviews.items(): + # if key.lower() not in all_reviews.keys(): + # print("Yay - pyOS has a new package:", key) + # all_reviews[key.lower()] = ReviewModel(**meta) + + # Update gh metrics via api for all packages + # TODO: this is working but above i made everything a model object + # do i want to do that above or just do it all at once below? 
+ repo_endpoints = process_review.get_repo_endpoints(accepted_reviews) + all_reviews = process_review.get_gh_metrics( + repo_endpoints, accepted_reviews + ) + + # Finally populate model objects with review data + metrics + # TODO: this is really close - it's erroring when populating date + # i suspect in the github metadata + final_reviews = {} + for key, review in all_reviews.items(): + # First add gh meta to each dict + print("Parsing & validating", key) + try: + final_reviews[key] = ReviewModel(**review) + except ValidationError as ve: + print(key, ":", ve) + + with open("all_reviews.pickle", "wb") as f: + pickle.dump(final_reviews, f) + + +if __name__ == "__main__": + main() diff --git a/src/pyosmeta/cli/update_review_contribs.py b/src/pyosmeta/cli/update_review_teams.py similarity index 89% rename from src/pyosmeta/cli/update_review_contribs.py rename to src/pyosmeta/cli/update_review_teams.py index d2faf63..31d72f7 100644 --- a/src/pyosmeta/cli/update_review_contribs.py +++ b/src/pyosmeta/cli/update_review_teams.py @@ -32,7 +32,11 @@ # - sevivi import os -from pyosmeta.contributors import PersonModel, ProcessContributors +from pyosmeta.contributors import ( + PersonModel, + ProcessContributors, + ValidationError, +) from pyosmeta.file_io import clean_export_yml, load_pickle @@ -65,9 +69,12 @@ def main(): gh_user = get_clean_user(a_maintainer["github_username"]) if gh_user not in contribs.keys(): - contribs.update( - process_contribs.check_add_user(gh_user, contribs) - ) + print("Found a new user!", gh_user) + new_contrib = process_contribs.get_user_info(gh_user) + try: + contribs[gh_user] = PersonModel(**new_contrib) + except ValidationError as ve: + print(ve) # Update user package contributions (if it's unique) review_key = contrib_types[issue_role][0] @@ -98,7 +105,10 @@ def main(): # If they aren't already in contribs, add them print("Found a new user!", gh_user) new_contrib = process_contribs.get_user_info(gh_user) - contribs[gh_user] = PersonModel(**new_contrib) + try: + contribs[gh_user] = PersonModel(**new_contrib) + except ValidationError as ve: + print(ve) # Update user package contributions (if it's unique) review_key = contrib_types[issue_role][0] @@ -119,7 +129,6 @@ def main(): contribs[gh_user], "name" ) - print("Export") # Export to yaml contribs_ls = [model.model_dump() for model in contribs.values()] # Getting error dumping packages diff --git a/src/pyosmeta/cli/update_reviews.py b/src/pyosmeta/cli/update_reviews.py deleted file mode 100644 index 92e39aa..0000000 --- a/src/pyosmeta/cli/update_reviews.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Script that parses metadata from na issue and adds it to a yml file for the -website. It also grabs some of the package metadata such as stars, -last commit, etc. - -Output: packages.yml file containing a list of - 1. all packages with accepted reviews - 2. information related to the review including reviewers, editors - 3. basic package stats including stars, etc. 
- -To run at the CLI: parse_issue_metadata -""" - -# TODO: if we export files we might want packages.yml and then under_review.yml -# thus we'd want to add a second input parameters which was file_name -# TODO: feature - Would be cool to create an "under review now" list as well - -# ideally this could be passed as a CLI argument with the label we want to -# search for - - -import argparse -import pickle - -from pydantic import ValidationError - -from pyosmeta import ProcessIssues, ReviewModel -from pyosmeta.file_io import load_website_yml - -# todo - dates are wrong in issues - -# date_accepted: 5-2023-7 -# created_at: 01-2023-03 -# updated_at: 27-2023-07 - - -def main(): - update_all = False - parser = argparse.ArgumentParser( - description="A CLI script to update pyOpenSci reviews" - ) - parser.add_argument( - "--update", - type=str, - help="Will force update review info from GitHub for every review", - ) - args = parser.parse_args() - - if args: - update_all = True - web_reviews_path = ( - "https://raw.githubusercontent.com/pyOpenSci/" - "pyopensci.github.io/main/_data/packages.yml" - ) - - process_review = ProcessIssues( - org="pyopensci", - repo_name="software-submission", - label_name="6/pyOS-approved 🚀🚀🚀", - ) - - # Open web yaml & return dict with package name as key - web_reviews = load_website_yml(key="package_name", url=web_reviews_path) - - # Get all issues for approved packages - issues = process_review.return_response() - accepted_reviews = process_review.parse_issue_header(issues, 15) - - # Parse through reviews, identify new ones, fix case - if update_all: - for key, meta in accepted_reviews.items(): - web_reviews[key.lower()] = meta - else: - for key, meta in accepted_reviews.items(): - if key.lower() not in web_reviews.keys(): - print("Yay - pyOS has a new package:", key) - web_reviews[key.lower()] = meta - - # Update gh metrics via api for all packages - # TODO: for some reason cardsort is missing gh_metadata - repo_endpoints = process_review.get_repo_endpoints(web_reviews) - web_reviews = process_review.get_gh_metrics(repo_endpoints, web_reviews) - - # Finally populate model objects with review data + metrics - # TODO: this is really close - it's erroring when populating date - # i suspect in the github metadata - all_reviews = {} - for key, review in web_reviews.items(): - # First add gh meta to each dict - print("Parsing & validating", key) - try: - all_reviews[key] = ReviewModel(**review) - except ValidationError as ve: - print(key, ":", ve) - - with open("all_reviews.pickle", "wb") as f: - pickle.dump(all_reviews, f) - - -if __name__ == "__main__": - main() diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index c2e798c..3e82e60 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -56,6 +56,24 @@ class PersonModel(BaseModel): location: Optional[str] = None email: Optional[str] = None + # # TODO - turn this into a validator for the user website + # def _check_url(self, url: str) -> bool: + # """Test a url and return true if it works, false if not + + # Parameters + # ---------- + # url : str + # String for a url to a website to test. 
+ + # """ + + # try: + # response = requests.get(url, timeout=6) + # return response.status_code == 200 + # except Exception: + # print("Oops, url", url, "is not valid, removing it") + # return False + @field_validator( "packages_reviewed", "packages_submitted", @@ -236,23 +254,26 @@ def check_contrib_type(self, json_file: str): contrib_type = "community" return contrib_type - def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None: - """Check to make sure user exists in the existing contrib data. If they - don't' exist, add them + # TODO possibly could repurpose this as a check in the code + # but it should return get_user_info + # def check_add_user(self, gh_user: str, contribs: Dict[str, str]) -> None: + # """Check to make sure user exists in the existing contrib data. If + # they + # don't' exist, add them - Parameters - ---------- - gh_user : str - github username - contribs: dict - A dictionary containing contributors with gh user being the key + # Parameters + # ---------- + # gh_user : str + # github username + # contribs: dict + # A dictionary containing contributors with gh user being the key - This returns the updated dictionary with a new user at the end. + # This returns the updated dictionary with a new user at the end. - """ - if gh_user not in contribs.keys(): - print("Missing user", gh_user, "adding them now.") - return self.add_new_user(gh_user) + # """ + # if gh_user not in contribs.keys(): + # print("Missing user", gh_user, "adding them now.") + # return self.get_user_info(gh_user) def load_json(self, json_path: str) -> dict: """ @@ -440,34 +461,35 @@ def combine_users(self, repoDict: dict, webDict: dict) -> dict: webDict[gh_user] = repoDict[gh_user] return webDict - def add_new_user(self, gh_user: str) -> dict: - """Add a new user to the contrib file using gh username + # # TODO: i think i can remove this method + # def add_new_user(self, gh_user: str) -> dict: + # """Add a new user to the contrib file using gh username - This method does a few things. - 1. Adds a new template entry for the user w no values populated - 2. Gets user metadata from the user's github profile - 3. Updates their contrib entry with the gh data + # This method does a few things. + # 1. Adds a new template entry for the user w no values populated + # 2. Gets user metadata from the user's github profile + # 3. Updates their contrib entry with the gh data - Parameters - ---------- - gh_user : str - String representing the GitHub username + # Parameters + # ---------- + # gh_user : str + # String representing the GitHub username - Returns - ------- - Dict - Username is the key and the updated github profile info is - contained in the dict. + # Returns + # ------- + # Dict + # Username is the key and the updated github profile info is + # contained in the dict. 
- """ + # """ - new = {} - # Rather than this template i can use the person_model - new[gh_user] = self.create_contrib_template() - gh_data = self.get_gh_data([gh_user]) - # Update their metadata in the dict and return - updated_data = self.update_contrib_data(new, gh_data) - return updated_data + # new = {} + # # Rather than this template i can use the person_model + # new[gh_user] = self.create_contrib_template() + # gh_data = self.get_gh_data([gh_user]) + # # Update their metadata in the dict and return + # updated_data = self.update_contrib_data(new, gh_data) + # return updated_data def get_gh_data( self, contribs: Union[Dict[str, str], List] @@ -499,63 +521,47 @@ def get_gh_data( all_user_info[gh_user] = self.get_user_info(gh_user, aname) return all_user_info - def _check_url(self, url: str) -> bool: - """Test a url and return true if it works, false if not - - Parameters - ---------- - url : str - String for a url to a website to test. - - """ - - try: - response = requests.get(url, timeout=6) - return response.status_code == 200 - except Exception: - print("Oops, url", url, "is not valid, removing it") - return False - - def update_contrib_data(self, contrib_data: dict, gh_data: dict): - """Update contributor data from the GH API return. - - Use the GitHub API to grab user profile data such as twitter handle, - mastodon, website, email and location and update contributor - information. GitHub profile data is the source of truth source for - contributor metadata. - - Parameters - ---------- - contrib_data : dict - A dict containing contributor data to be updated - gh_data : dict - Updated contributor data pulled from github API - - Returns - ------- - dict - Dictionary containing updated contributor data. - """ - - for i, gh_name in enumerate(contrib_data.keys()): - print(i, gh_name) - # Update the key:value pairs for data pulled from GitHub - for akey in self.update_keys: - if akey == "website": - url = gh_data[gh_name][gh_name][akey] - # Fix the url format and check to see if it works online - url = self.format_url(url) - # It url is valid, add to dict - if self._check_url(url): - contrib_data[gh_name][akey] = url - else: - contrib_data[gh_name][akey] = "" - else: - contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][ - akey - ] - - return contrib_data + # Shouldn't need this anymore with pydantic + # def update_contrib_data(self, contrib_data: dict, gh_data: dict): + # """Update contributor data from the GH API return. + + # Use the GitHub API to grab user profile data such as twitter handle, + # mastodon, website, email and location and update contributor + # information. GitHub profile data is the source of truth source for + # contributor metadata. + + # Parameters + # ---------- + # contrib_data : dict + # A dict containing contributor data to be updated + # gh_data : dict + # Updated contributor data pulled from github API + + # Returns + # ------- + # dict + # Dictionary containing updated contributor data. 
+ # """ + + # for i, gh_name in enumerate(contrib_data.keys()): + # print(i, gh_name) + # # Update the key:value pairs for data pulled from GitHub + # for akey in self.update_keys: + # if akey == "website": + # url = gh_data[gh_name][gh_name][akey] + # # Fix the url format and check to see if it works online + # url = self.format_url(url) + # # It url is valid, add to dict + # if self._check_url(url): + # contrib_data[gh_name][akey] = url + # else: + # contrib_data[gh_name][akey] = "" + # else: + # contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][ + # akey + # ] + + # return contrib_data def format_url(self, url: str) -> str: """Append https to the beginning of URL if it doesn't exist diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index bce610e..8bea3ec 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -25,9 +25,9 @@ def clean_date(a_date: Optional[str]) -> str: if a_date is None or a_date == "missing": return "missing" - elif len(a_date) < 11: - new_date = a_date.replace("/", "-").split("-") - return f"{new_date[0]}-{new_date[1]}-{new_date[2]}" + # elif len(a_date) < 11: + # new_date = a_date.replace("/", "-").split("-") + # return f"{new_date[0]}-{new_date[1]}-{new_date[2]}" else: try: return ( @@ -95,10 +95,29 @@ class ReviewModel(BaseModel): updated_at: str = None closed_at: Optional[str] = None issue_link: str = None - gh_meta: GhMeta + joss: Optional[str] = None + gh_meta: Optional[GhMeta] = None @field_validator( "date_accepted", + mode="before", + ) + @classmethod + def clean_date_review(cls, a_date: Optional[str]) -> str: + """Clean a manually added datetime that is added to a review by an + editor when the review package is accepted. + + """ + if a_date is None or a_date in ["missing", "TBD"]: + return "missing" + else: + new_date = a_date.replace("/", "-").split("-") + if len(new_date[0]) == 4: + return f"{new_date[0]}-{new_date[1]}-{new_date[2]}" + else: + return f"{new_date[2]}-{new_date[0]}-{new_date[1]}" + + @field_validator( "created_at", "updated_at", "closed_at", @@ -114,6 +133,27 @@ def clean_date(cls, a_date: Optional[str]) -> str: return clean_date(a_date) + @field_validator( + "editor", + "reviewer_1", + "reviewer_2", + mode="before", + ) + @classmethod + def clean_gh_url(cls, user: dict[str, str]) -> dict[str, str]: + """Remove markdown link remnants from gh usernames and name. + + Sometimes editors and reviewers add names using github links. + Remove the link data. + """ + + user["github_username"] = user["github_username"].replace( + "https://github.com/", "" + ) + user["name"] = re.sub(r"\[|\]", "", user["name"]) + + return user + @dataclass class ProcessIssues: @@ -274,7 +314,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]: return meta def parse_issue_header( - self, issues: list[str], total_lines: int = 15 + self, issues: list[str], total_lines: int = 20 ) -> dict[str, str]: """ A function that parses through the header of an issue. @@ -287,7 +327,7 @@ def parse_issue_header( metadata at the top of each issue total_lines : int an integer representing the total number of lines to parse in the - issue header. Default = 15 + issue header. 

 @dataclass
 class ProcessIssues:
@@ -274,7 +314,7 @@ def _get_line_meta(self, line_item: list[str]) -> dict[str, object]:
         return meta

     def parse_issue_header(
-        self, issues: list[str], total_lines: int = 15
+        self, issues: list[str], total_lines: int = 20
     ) -> dict[str, str]:
         """
         A function that parses through the header of an issue.
@@ -287,7 +327,7 @@ def parse_issue_header(
             metadata at the top of each issue
         total_lines : int
             an integer representing the total number of lines to parse in the
-            issue header. Default = 15
+            issue header. Default = 20

         Returns
         -------
@@ -301,32 +341,29 @@ def parse_issue_header(
         review = {}
         for issue in issues:
-            package_name, body_data = self.parse_comment(issue)
-            if not package_name:
+            pkg_name, body_data = self.parse_comment(issue)
+            if not pkg_name:
                 continue
             # Index of 15 should include date accepted in the review meta
-            review[package_name] = self.get_issue_meta(body_data, total_lines)
-            # Add issue open and close date to package meta
-            # Created, opened & closed dates are in GitHub Issue response
+            review[pkg_name] = self.get_issue_meta(body_data, total_lines)
+            # Add issue open and close date to package meta from GH response
+            # Date cleaning happens via a pydantic validator, not here
             for a_date in meta_dates:
-                # TODO: this could become a validator
-                review[package_name][a_date] = issue[
-                    a_date
-                ]  # self._clean_date(issue[a_date])
+                review[pkg_name][a_date] = issue[a_date]
             # Get categories and issue review link
-            review[package_name]["categories"] = self.get_categories(body_data)
-            review[package_name]["issue_link"] = issue["url"].replace(
+            review[pkg_name]["categories"] = self.get_categories(body_data)
+            review[pkg_name]["issue_link"] = issue["url"].replace(
                 "https://api.github.com/repos/", "https://github.com/"
             )
             review_clean = {
                 key: value
-                for key, value in review[package_name].items()
+                for key, value in review[pkg_name].items()
                 if not key.startswith("##")
                 and not key.startswith("---")
                 and not key.startswith("-_[x]_i_agree")
             }
-            review[package_name] = review_clean
+            review[pkg_name] = review_clean
         # filtered = {}
         # for key, value in review.items():
         #     print(key)
@@ -346,7 +383,7 @@ def parse_issue_header(
         #         .replace("]", "")
         #     )

-        #     review[package_name] = issue_meta
+        #     review[pkg_name] = issue_meta

         return review

@@ -394,7 +431,7 @@ def get_repo_endpoints(
         Returns
         -------
         Dict
-            Containing package_name: endpoint for each review.
+            Containing pkg_name: endpoint for each review.

         """

@@ -424,7 +461,7 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]:

         Returns
         -------
-        package_name : str
+        pkg_name : str
             The name of the package
         comment : list
             A list containing the comment elements in order
@@ -454,9 +491,9 @@ def parse_comment(self, issue: dict[str, str]) -> tuple[str, list[str]]:
             None,
         )

-        package_name = body_data[name_index][1] if name_index else None
+        pkg_name = body_data[name_index][1] if name_index else None

-        return package_name, body_data
+        return pkg_name, body_data

     def get_gh_metrics(
         self,
@@ -559,14 +596,14 @@ def get_last_commit(self, repo: str) -> str:
         return date

     def get_categories(
-        self, issue_body_list: list[list[str]], fmt: bool = True
+        self, issue_list: list[list[str]], fmt: bool = True
     ) -> list[str]:
         """Parse through a pyOS review issue and grab categories associated
         with a package

         Parameters
         ----------
-        issue_body_list : list[list[str]]
+        issue_list : list[list[str]]
             The first comment from the issue split into lines and then the
             lines split as by self.parse_comment()

             required for the website.
         """
         # Find the starting index of the category section
-        start_index = None
-        for i in range(len(issue_body_list)):
-            if issue_body_list[i][0].startswith("- Please indicate which"):
-                start_index = i + 1
-                break
-        # NOTE - some issues have line after that startswith "Check out our"
-        # For those issues advance i += 1
-        if issue_body_list[start_index][0].startswith("Check out our"):
-            start_index += 1
-
-        if start_index is None:
-            # If we couldn't find the starting index, return an empty list
-            return []
-
-        # Iterate through lines and grab the relevant text
-        cat_matches = ["[x]", "[X]"]
-        categories: list[str] = []
-        for i in range(start_index, len(issue_body_list)):  # 30):
-            line = issue_body_list[i][0].strip()
-            checked = any([x in line for x in cat_matches])
-
-            if line.startswith("- [") and checked:
-                category = line[line.index("]") + 2]
-                categories.append(category)
-            elif not line.startswith("- ["):
-                break
-
-        if fmt:
-            categories = [c.lower().replace(" ", "-") for c in categories]
-        return categories
+        try:
+            index = next(
+                i
+                for i, sublist in enumerate(issue_list)
+                if "## Scope" in sublist
+            )
+            # Iterate from scope index to first line starting with "- ["
+            # To find the list of category check boxes
+            cat_index = None
+            for i in range(index + 1, len(issue_list)):
+                if issue_list[i] and issue_list[i][0].startswith("- ["):
+                    cat_index = i
+                    break
+        except StopIteration:
+            print("'## Scope' not found in the list.")
+            return []
+
+        # Guard against a scope section that has no checkbox lines below it
+        if cat_index is None:
+            return []
+
+        # Get checked categories for package
+        cat_list = issue_list[cat_index : cat_index + 10]
+        categories = [
+            re.sub(r"- \[[xX]\] ", "", item[0])
+            for item in cat_list
+            if re.search(r"- \[[xX]\] ", item[0])
+        ]
+
+        return [item.lower().replace("[^1]", "") for item in categories]
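To make the rewritten get_categories() above concrete, here is the same extraction run against a made-up issue body. The nested single-item lists mimic the shape that parse_comment() produces; the category names are invented for the example:

import re

issue_list = [
    ["## Scope"],
    ["- Please indicate which category or categories this package falls under:"],
    ["- [x] Data processing/munging"],
    ["- [ ] Data visualization"],
    ["- [X] Data retrieval"],
]

# Find "## Scope", then the first checkbox line after it
scope = next(i for i, sub in enumerate(issue_list) if "## Scope" in sub)
cat_index = next(
    i
    for i in range(scope + 1, len(issue_list))
    if issue_list[i] and issue_list[i][0].startswith("- [")
)
# Keep only checked boxes and strip the checkbox markup
categories = [
    re.sub(r"- \[[xX]\] ", "", item[0])
    for item in issue_list[cat_index : cat_index + 10]
    if re.search(r"- \[[xX]\] ", item[0])
]
print(categories)  # ['Data processing/munging', 'Data retrieval']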
""" # Find the starting index of the category section - start_index = None - for i in range(len(issue_body_list)): - if issue_body_list[i][0].startswith("- Please indicate which"): - start_index = i + 1 - break - # NOTE - some issues have line after that startswith "Check out our" - # For those issues advance i += 1 - if issue_body_list[start_index][0].startswith("Check out our"): - start_index += 1 - - if start_index is None: - # If we couldn't find the starting index, return an empty list - return [] - - # Iterate through lines and grab the relevant text - cat_matches = ["[x]", "[X]"] - categories: list[str] = [] - for i in range(start_index, len(issue_body_list)): # 30): - line = issue_body_list[i][0].strip() - checked = any([x in line for x in cat_matches]) - - if line.startswith("- [") and checked: - category = line[line.index("]") + 2] - categories.append(category) - elif not line.startswith("- ["): - break - - if fmt: - categories = [c.lower().replace(" ", "-") for c in categories] - return categories + try: + index = next( + i + for i, sublist in enumerate(issue_list) + if "## Scope" in sublist + ) + # Iterate from scope index to first line starting with " - [" + # To find list of category check boxes + for i in range(index + 1, len(issue_list)): + if issue_list[i] and issue_list[i][0].startswith("- ["): + cat_index = i + break + except StopIteration: + print("'## Scope' not found in the list.") + + # Get checked categories for package + cat_list = issue_list[cat_index : cat_index + 10] + categories = [ + re.sub(r"- \[[xX]\] ", "", item[0]) + for item in cat_list + if re.search(r"- \[[xX]\] ", item[0]) + ] + + return [item.lower().replace("[^1]", "") for item in categories] From 4fcb559e71b9c141102ab206ccb63aa534e93925 Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Sun, 20 Aug 2023 12:17:12 -0600 Subject: [PATCH 11/12] Fix: add more url validators and cleanup --- .flake8 | 2 + pyproject.toml | 5 +- src/pyosmeta/cli/update_contributors.py | 15 +--- src/pyosmeta/cli/update_review_teams.py | 8 +- src/pyosmeta/contributors.py | 102 ++++++++++++++---------- src/pyosmeta/parse_issues.py | 5 +- 6 files changed, 71 insertions(+), 66 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8434b4b --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E203, W503 diff --git a/pyproject.toml b/pyproject.toml index f80ed4b..05a51da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,9 +75,10 @@ profile = "black" multi_line_output = 3 py_version = 27 +# Precommit ignores this config so i added a .flake8 file +# but why did it ignore it? 
[tool.flake8] -# List of error codes to ignore (comma-separated) -ignore = ["E203", "W503"] +extend-ignore = ["E203", "W503"] [tool.pdm.build] diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 356dc5c..4a3f13a 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -25,8 +25,9 @@ def main(): help="Force update contrib info from GitHub for every contributor", ) args = parser.parse_args() + update_value = args.update - if args: + if update_value: update_all = True repos = [ @@ -49,6 +50,7 @@ def main(): # Populate all existing contribs into model objects all_contribs = {} for a_contrib in web_contribs: + print(a_contrib) try: all_contribs[a_contrib["github_username"].lower()] = PersonModel( **a_contrib @@ -64,7 +66,6 @@ def main(): bot_all_contribs = process_contribs.combine_json_data() print("Updating contrib types and searching for new users now") - # bot_all_contribs: keys: contrib_type, value: ghuser contribs for key, users in bot_all_contribs.items(): for gh_user in users: # Find and populate data for any new contributors @@ -76,16 +77,6 @@ def main(): # Update contribution type list for all users all_contribs[gh_user].add_unique_value("contributor_type", key) - # existing_contribs = all_contribs[gh_user].contributor_type - # # TODO: i can move all of these update items to just use the - # # personmodel.add_unique_value then i can get rid of update - # # contrib list - # all_contribs[ - # gh_user - # ].contributor_type = process_contribs.update_contrib_list( - # existing_contribs, key - # ) - if update_all: for user in all_contribs.keys(): print("Updating all user info from github", user) diff --git a/src/pyosmeta/cli/update_review_teams.py b/src/pyosmeta/cli/update_review_teams.py index 31d72f7..b24f6d5 100644 --- a/src/pyosmeta/cli/update_review_teams.py +++ b/src/pyosmeta/cli/update_review_teams.py @@ -32,11 +32,9 @@ # - sevivi import os -from pyosmeta.contributors import ( - PersonModel, - ProcessContributors, - ValidationError, -) +from pydantic import ValidationError + +from pyosmeta.contributors import PersonModel, ProcessContributors from pyosmeta.file_io import clean_export_yml, load_pickle diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 3e82e60..712a98d 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -16,7 +16,59 @@ from typing import Dict, List, Optional, Set, Tuple, Union -class PersonModel(BaseModel): +class UrlValidatorMixin: + # Check fields is false because this is being inherited by two diff classes + @field_validator( + "website", "documentation", mode="before", check_fields=False + ) + @classmethod + def format_url(cls, url: str) -> str: + """Append https to the beginning of URL if it doesn't exist & cleanup + If the url doesn't have https add it + If the url starts with http change it to https + Else do nothing + + Parameters + ---------- + url : str + String representing the url grabbed from the GH api + + """ + + if not url: + return url # Returns empty string if url is empty + else: + if url.startswith("http://"): + print(f"{url} 'http://' replacing w 'https://'") + url = url.replace("http://", "https://") + elif not url.startswith("http"): + print("Oops, missing http") + url = "https://" + url + if cls._check_url(url=url): + return url + else: + return None + + @staticmethod + def _check_url(url: str) -> bool: + """Test url. 
Return true if there's a valid response, False if not + + Parameters + ---------- + url : str + String for a url to a website to test. + + """ + + try: + response = requests.get(url, timeout=6) + return response.status_code == 200 + except Exception: + print("Oops, url", url, "is not valid, removing it") + return False + + +class PersonModel(BaseModel, UrlValidatorMixin): # Make sure model populates both aliases and original attr name model_config = ConfigDict( populate_by_name=True, @@ -56,24 +108,6 @@ class PersonModel(BaseModel): location: Optional[str] = None email: Optional[str] = None - # # TODO - turn this into a validator for the user website - # def _check_url(self, url: str) -> bool: - # """Test a url and return true if it works, false if not - - # Parameters - # ---------- - # url : str - # String for a url to a website to test. - - # """ - - # try: - # response = requests.get(url, timeout=6) - # return response.status_code == 200 - # except Exception: - # print("Oops, url", url, "is not valid, removing it") - # return False - @field_validator( "packages_reviewed", "packages_submitted", @@ -81,6 +115,7 @@ class PersonModel(BaseModel): "contributor_type", mode="before", ) + @classmethod def convert_to_set(cls, value: list[str]): if isinstance(value, list): if not value: @@ -88,7 +123,7 @@ def convert_to_set(cls, value: list[str]): elif value[0] is None: return set() else: - value = [aval.lower() for aval in value] + value = [a_val.lower() for a_val in value] return set(value) elif value is None: return set() @@ -111,7 +146,9 @@ def add_unique_value(self, attr_name: str, values: Union[str, list[str]]): "contributor_type", ) def serialize_set(self, items: Set[str]): - return list(items) + """This is a serializer that runs on export. It ensures sets are + converted to lists""" + return sorted(list(items)) @field_validator("bio", mode="before") @classmethod @@ -562,26 +599,3 @@ def get_gh_data( # ] # return contrib_data - - def format_url(self, url: str) -> str: - """Append https to the beginning of URL if it doesn't exist - If the url doesn't have https add it - If the url starts with http change it to https - Else do nothing - - Parameters - ---------- - url : str - String representing the url grabbed from the GH api - - """ - if not url: - return url # returns empty string if url is empty - elif url.startswith("https://"): - return url - elif url.startswith("http://"): - print("Fixing", url, "https://" + url[7:]) - return "https://" + url[7:] - else: - print("Missing https://, adding to ", url) - return "https://" + url diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 8bea3ec..8267998 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -12,7 +12,7 @@ ) from typing import Any, Optional -from pyosmeta.contributors import ProcessContributors +from pyosmeta.contributors import ProcessContributors, UrlValidatorMixin def clean_date(a_date: Optional[str]) -> str: @@ -40,7 +40,7 @@ def clean_date(a_date: Optional[str]) -> str: return "missing" -class GhMeta(BaseModel): +class GhMeta(BaseModel, UrlValidatorMixin): name: str description: str created_at: str @@ -517,7 +517,6 @@ def get_gh_metrics( """ pkg_meta = {} for pkg_name, url in endpoints.items(): - print("Getting GitHub stats for", pkg_name) pkg_meta[pkg_name] = self.get_repo_meta(url, self.gh_stats) pkg_meta[pkg_name]["contrib_count"] = self.get_repo_contribs(url) From 2a1922797f52abfa73dd36c2a6290090cd01c84d Mon Sep 17 00:00:00 2001 From: Leah Wasser Date: Sun, 20 Aug 2023 
12:36:02 -0600 Subject: [PATCH 12/12] Fix: cleanup unused code and comments --- pyproject.toml | 2 +- src/pyosmeta/__init__.py | 2 + src/pyosmeta/cli/process_reviews.py | 51 +------- src/pyosmeta/cli/update_contributors.py | 2 +- src/pyosmeta/cli/update_review_teams.py | 12 -- src/pyosmeta/contributors.py | 150 +----------------------- 6 files changed, 8 insertions(+), 211 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 05a51da..6e8e431 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ license = { text = "MIT" } # for a user to run directly from the package. [project.scripts] # Optional update-contributors = "pyosmeta.cli.update_contributors:main" -process-reviews = "pyosmeta.cli.process_reviews:main" +update-reviews = "pyosmeta.cli.process_reviews:main" update-review-teams = "pyosmeta.cli.update_review_teams:main" diff --git a/src/pyosmeta/__init__.py b/src/pyosmeta/__init__.py index 017c97d..e4d2c24 100644 --- a/src/pyosmeta/__init__.py +++ b/src/pyosmeta/__init__.py @@ -1,6 +1,8 @@ from .contributors import PersonModel, ProcessContributors from .parse_issues import ProcessIssues, ReviewModel +# Trick suggested by flake8 maintainer to ensure the imports above don't +# get flagged as being "unused" __all__ = ( "ProcessIssues", "ReviewModel", diff --git a/src/pyosmeta/cli/process_reviews.py b/src/pyosmeta/cli/process_reviews.py index a5862a8..6756c63 100644 --- a/src/pyosmeta/cli/process_reviews.py +++ b/src/pyosmeta/cli/process_reviews.py @@ -17,82 +17,33 @@ # ideally this could be passed as a CLI argument with the label we want to # search for - -# import argparse import pickle from pydantic import ValidationError from pyosmeta import ProcessIssues, ReviewModel -# from pyosmeta.file_io import load_website_yml - - # TODO: change the template to ask for date accepted format year-month-day def main(): - # update_all = False - # parser = argparse.ArgumentParser( - # description="A CLI script to update pyOpenSci reviews" - # ) - # parser.add_argument( - # "--update", - # type=str, - # help="Will force update review info from GitHub for every review", - # ) - # args = parser.parse_args() - - # if args: - # update_all = True - # web_reviews_path = ( - # "https://raw.githubusercontent.com/pyOpenSci/" - # "pyopensci.github.io/main/_data/packages.yml" - # ) - process_review = ProcessIssues( org="pyopensci", repo_name="software-submission", label_name="6/pyOS-approved 🚀🚀🚀", ) - # Open web yaml & return dict with package name as key - # web_reviews = load_website_yml(key="package_name", url=web_reviews_path) - # Get all issues for approved packages - load as dict issues = process_review.return_response() accepted_reviews = process_review.parse_issue_header(issues, 45) - # TODO: clean out extra fields from accepted reviews?? - - # Parse through reviews, identify new ones, fix case - # TODO - right now i've reverted back to always updating all reviews. - # Is there a use-case to only update a new package vs updating everything? 
- # if update_all: - # all_reviews = {} - # for key, meta in accepted_reviews.items(): - # try: - # all_reviews[key.lower()] = ReviewModel(**meta) - # except ValidationError as ve: - # print(ve) - - # else: - # for key, meta in accepted_reviews.items(): - # if key.lower() not in all_reviews.keys(): - # print("Yay - pyOS has a new package:", key) - # all_reviews[key.lower()] = ReviewModel(**meta) - # Update gh metrics via api for all packages - # TODO: this is working but above i made everything a model object - # do i want to do that above or just do it all at once below? repo_endpoints = process_review.get_repo_endpoints(accepted_reviews) all_reviews = process_review.get_gh_metrics( repo_endpoints, accepted_reviews ) - # Finally populate model objects with review data + metrics - # TODO: this is really close - it's erroring when populating date - # i suspect in the github metadata + # Populate model objects with review data + metrics final_reviews = {} for key, review in all_reviews.items(): # First add gh meta to each dict diff --git a/src/pyosmeta/cli/update_contributors.py b/src/pyosmeta/cli/update_contributors.py index 4a3f13a..b0e1bb2 100644 --- a/src/pyosmeta/cli/update_contributors.py +++ b/src/pyosmeta/cli/update_contributors.py @@ -50,7 +50,7 @@ def main(): # Populate all existing contribs into model objects all_contribs = {} for a_contrib in web_contribs: - print(a_contrib) + print(a_contrib["github_username"]) try: all_contribs[a_contrib["github_username"].lower()] = PersonModel( **a_contrib diff --git a/src/pyosmeta/cli/update_review_teams.py b/src/pyosmeta/cli/update_review_teams.py index b24f6d5..5ff10b0 100644 --- a/src/pyosmeta/cli/update_review_teams.py +++ b/src/pyosmeta/cli/update_review_teams.py @@ -19,17 +19,8 @@ # TODO: package-wide feature: create no update flag for entries # TODO: make sure we can add a 3rd or 4th reviewer - crowsetta has this as # will biocypher -# TODO: make sure to add a current editor boolean to the current editors and -# emeritus ones. 
""" - -# TODO - Case sensitivity is an issue with my validation using set -# - jointly -# - Jointly -# - devicely -# - Devicely -# - sevivi import os from pydantic import ValidationError @@ -45,8 +36,6 @@ def get_clean_user(username: str) -> str: def main(): - # TODO: move refresh contribs and contribs dict attr to - # processContribs and remove this module altogether process_contribs = ProcessContributors([]) # Two pickle files are outputs of the two other scripts @@ -129,7 +118,6 @@ def main(): # Export to yaml contribs_ls = [model.model_dump() for model in contribs.values()] - # Getting error dumping packages pkgs_ls = [model.model_dump() for model in packages.values()] clean_export_yml(contribs_ls, os.path.join("_data", "contributors.yml")) diff --git a/src/pyosmeta/contributors.py b/src/pyosmeta/contributors.py index 712a98d..22e22aa 100644 --- a/src/pyosmeta/contributors.py +++ b/src/pyosmeta/contributors.py @@ -13,11 +13,11 @@ field_serializer, field_validator, ) -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union class UrlValidatorMixin: - # Check fields is false because this is being inherited by two diff classes + # Check fields is false given mixin is used by two diff classes @field_validator( "website", "documentation", mode="before", check_fields=False ) @@ -69,7 +69,6 @@ def _check_url(url: str) -> bool: class PersonModel(BaseModel, UrlValidatorMixin): - # Make sure model populates both aliases and original attr name model_config = ConfigDict( populate_by_name=True, str_strip_whitespace=True, @@ -158,7 +157,6 @@ def clean_strings(cls, string: str) -> str: """ if isinstance(string, str): - # Remove "\r\n" from the string value string = re.sub(r"[\r\n]", "", string) return string @@ -220,47 +218,6 @@ def get_token(self) -> str: load_dotenv() return os.environ["GITHUB_TOKEN"] - # TODO - This utility is used across all scripts. - def clean_list(self, a_list: Union[str, List[str]]) -> List[str]: - """Helper function that takes an input object as a list or string. - If it is a list containing none, it returns an empty list - if it is a string is returns the string as a list - removes 'None' if that is in the list. and returns - either an empty clean list of the list as is.""" - - if isinstance(a_list, str): - a_list = [a_list] - elif not a_list: - a_list = [] - # Remove None from list - a_list = list(filter(lambda x: x, a_list)) - return a_list - - # TODO - There is likely a better way to do this. If it returns an - # empty list then we know there are no new vals... so it likely can - # return a single thing - def unique_new_vals( - self, a_list: List[str], a_item: List[str] - ) -> Tuple[bool, Optional[List[str]]]: - """Checks two objects either a list and string or two lists - and evaluates whether there are differences between them. - - Returns - ------- - Tuple - Containing a boolean representing whether there are difference - or not and a list containing new value if there are differences. - - """ - - default = (False, None) - list_lower = [al.lower() for al in a_list] - item_lower = [ai.lower() for ai in a_item] - diff = list(set(item_lower) - set(list_lower)) - if len(diff) > 0: - default = (True, diff) - return default - def check_contrib_type(self, json_file: str): """ Determine the type of contribution the person @@ -323,6 +280,7 @@ def load_json(self, json_path: str) -> dict: print(ae) return json.loads(response.text) + # TODO: check is i'm using the contrib type part of this method ? 
@@ -497,105 +455,3 @@ def combine_users(self, repoDict: dict, webDict: dict) -> dict:
                     print("New user found. Adding: ", gh_user)
                     webDict[gh_user] = repoDict[gh_user]
         return webDict
-
-    # # TODO: i think i can remove this method
-    # def add_new_user(self, gh_user: str) -> dict:
-    #     """Add a new user to the contrib file using gh username
-
-    #     This method does a few things.
-    #     1. Adds a new template entry for the user w no values populated
-    #     2. Gets user metadata from the user's github profile
-    #     3. Updates their contrib entry with the gh data
-
-    #     Parameters
-    #     ----------
-    #     gh_user : str
-    #         String representing the GitHub username
-
-    #     Returns
-    #     -------
-    #     Dict
-    #         Username is the key and the updated github profile info is
-    #         contained in the dict.
-
-    #     """
-
-    #     new = {}
-    #     # Rather than this template i can use the person_model
-    #     new[gh_user] = self.create_contrib_template()
-    #     gh_data = self.get_gh_data([gh_user])
-    #     # Update their metadata in the dict and return
-    #     updated_data = self.update_contrib_data(new, gh_data)
-    #     return updated_data
-
-    def get_gh_data(
-        self, contribs: Union[Dict[str, str], List]
-    ) -> dict[str, str]:
-        """Parses through each GitHub username and hits the GitHub
-        API to grab user information.
-
-        Parameters
-        ----------
-        contribs : dict
-            Dict containing all current contrib info
-
-        Returns
-        -------
-        Dict
-            A dict of updated user data via a list of github usernames
-        """
-        all_user_info = {}
-        for gh_user in contribs:
-            print("Getting github data for: ", gh_user)
-            # If the user already has a name in the dict, don't update
-            # Important to allow us to update names to ensure correct spelling,
-            # etc on website
-            if isinstance(contribs, list):
-                aname = None
-            else:
-                aname = contribs[gh_user]["name"]
-
-            all_user_info[gh_user] = self.get_user_info(gh_user, aname)
-        return all_user_info
-
-    # Shouldn't need this anymore with pydantic
-    # def update_contrib_data(self, contrib_data: dict, gh_data: dict):
-    #     """Update contributor data from the GH API return.
-
-    #     Use the GitHub API to grab user profile data such as twitter handle,
-    #     mastodon, website, email and location and update contributor
-    #     information. GitHub profile data is the source of truth for
-    #     contributor metadata.
-
-    #     Parameters
-    #     ----------
-    #     contrib_data : dict
-    #         A dict containing contributor data to be updated
-    #     gh_data : dict
-    #         Updated contributor data pulled from github API
-
-    #     Returns
-    #     -------
-    #     dict
-    #         Dictionary containing updated contributor data.
-    #     """
-
-    #     for i, gh_name in enumerate(contrib_data.keys()):
-    #         print(i, gh_name)
-    #         # Update the key:value pairs for data pulled from GitHub
-    #         for akey in self.update_keys:
-    #             if akey == "website":
-    #                 url = gh_data[gh_name][gh_name][akey]
-    #                 # Fix the url format and check to see if it works online
-    #                 url = self.format_url(url)
-    #                 # If the url is valid, add it to the dict
-    #                 if self._check_url(url):
-    #                     contrib_data[gh_name][akey] = url
-    #                 else:
-    #                     contrib_data[gh_name][akey] = ""
-    #             else:
-    #                 contrib_data[gh_name][akey] = gh_data[gh_name][gh_name][
-    #                     akey
-    #                 ]
-
-    #     return contrib_data
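Taken together, these removals collapse the old add-a-new-contributor flow (template dict, then get_gh_data, then update_contrib_data) into a single model construction per user. Roughly, as a hedged sketch: the code below assumes get_user_info() returns the raw GitHub profile dict, that a GITHUB_TOKEN is set in the environment, and that live API calls are acceptable; the username is hypothetical.

from pydantic import ValidationError

from pyosmeta.contributors import PersonModel, ProcessContributors

process_contribs = ProcessContributors([])
all_contribs = {}

for gh_user in ["some-new-user"]:  # hypothetical username for illustration
    try:
        # Raw profile data in, validated and cleaned PersonModel out; the
        # aliases and validators replace the old update_contrib_data loop
        all_contribs[gh_user] = PersonModel(
            **process_contribs.get_user_info(gh_user)
        )
    except ValidationError as ve:
        print(gh_user, ve)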