Add repo info helper script #667

Merged
merged 21 commits into from
Dec 7, 2022
21 changes: 21 additions & 0 deletions repo-info/README.md
@@ -0,0 +1,21 @@
# Repo Info Helper

This script scans a given .csv file (it works on both `pr-data.csv` and `py-data.csv`) and outputs another .csv file with three columns:

* Repo URL
* Months since the latest commit to the default branch (master/main)
* Number of stars

The rows are sorted first by months since the last commit (ascending), then by number of stars (descending).

The latter two values help in shortlisting a project to fix flaky tests in: the chances of your PR getting accepted are higher for a repository that is actively maintained and has a high number of stars. The script only scans URLs whose `Status` column is empty.
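The "months since latest commit" metric is a whole-calendar-month difference between today's date and the date of the latest commit. A minimal sketch of that calculation (the dates below are made up for illustration):

```python
import datetime

def get_diff_month(d1, d2):
    # Whole calendar months between d1 and d2; ignores the day of month.
    return (d1.year - d2.year) * 12 + d1.month - d2.month

now = datetime.datetime(2022, 11, 15)
last_commit = datetime.datetime(2021, 8, 3)
print(get_diff_month(now, last_commit))  # 15
```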

## To run:

* Requires a GitHub access token if more than 60 requests are made (i.e. the file contains more than 60 unique repositories), which is highly likely: both `pr-data.csv` and `py-data.csv` contain 300+ unique repositories each at the time of writing (Nov 2022).

* Run the script from the root directory with one of the commands below. Remember to pass a GitHub access token to overcome the rate limit:
  * For `pr-data.csv`: `python repo-info/get_repo_info.py -f pr-data.csv -c 'Project URL' -t <github-access-token>`
  * For `py-data.csv`: `python repo-info/get_repo_info.py -f py-data.csv -c 'Project URL' -t <github-access-token>`

The output file is saved as `repo-info.csv` inside the `repo-info` directory.
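The ordering described above corresponds to a two-key pandas sort: months since last commit ascending as the primary key, stars descending as the tiebreaker. A minimal sketch with made-up values:

```python
import pandas as pd

df = pd.DataFrame({
    'REPO_URL': ['r1', 'r2', 'r3'],
    'MONTHS_SINCE_LAST_COMMIT': [2, 0, 0],
    'STARS': [900, 150, 400],
})
# Primary key: months since last commit, ascending (most recently active first).
# Secondary key: stars, descending (ties broken by popularity).
df = df.sort_values(by=['MONTHS_SINCE_LAST_COMMIT', 'STARS'],
                    ascending=[True, False])
print(df['REPO_URL'].tolist())  # ['r3', 'r2', 'r1']
```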
62 changes: 62 additions & 0 deletions repo-info/get_repo_info.py
@@ -0,0 +1,62 @@
import os
import sys
import argparse
import datetime

import pandas as pd
from tqdm import tqdm
from github import Github

tqdm.pandas()

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--github_access_token', help='GitHub access token to overcome API rate limitations')
parser.add_argument('-f', '--filepath', help='Path of the .csv file containing repo data')
parser.add_argument('-c', '--colname', help='Column name in the CSV file containing the repo URL')
args = parser.parse_args()

GITHUB_API_RATE_LIMIT = 5000  # authenticated requests per hour
FILEPATH, COLNAME, GITHUB_ACCESS_TOKEN = args.filepath, args.colname, args.github_access_token

# Only scan rows whose Status column is empty.
data = pd.read_csv(FILEPATH)
data = data[data['Status'].isna()]
REPO_URLS = data[COLNAME].unique()
NUM_REPOS = REPO_URLS.shape[0]

# Reuse a single client instead of creating one per request.
github_client = Github(GITHUB_ACCESS_TOKEN)

def check_number_repos():
    if NUM_REPOS > GITHUB_API_RATE_LIMIT:
        print(f'You can only make {GITHUB_API_RATE_LIMIT} requests per hour. Your file has {NUM_REPOS} unique repositories. Exiting.')
        sys.exit(1)

def get_diff_month(d1, d2):
    """Whole calendar months between two dates (d1 >= d2)."""
    return (d1.year - d2.year) * 12 + d1.month - d2.month

def get_repo_object(repo_url):
    try:
        # 'https://github.com/owner/repo' -> 'owner/repo'
        repo_name = repo_url.split('github.com/')[1]
        return github_client.get_repo(repo_name)
    except Exception as e:
        print(e)
        return None

def get_months_since_last_commit(repo):
    if repo is None:
        return None
    try:
        default_branch = repo.get_branch(repo.default_branch)
        latest_commit_date = default_branch.commit.commit.author.date
        return get_diff_month(datetime.datetime.now(), latest_commit_date)
    except Exception as e:
        print(e)
        return None

def get_maintained_repos():
    check_number_repos()
    print(f'Analyzing {NUM_REPOS} repositories...')
    df = pd.DataFrame()
    df['REPO_URL'] = REPO_URLS
    df['REPO_OBJECT'] = df['REPO_URL'].progress_apply(get_repo_object)
    df['MONTHS_SINCE_LAST_COMMIT'] = df['REPO_OBJECT'].progress_apply(get_months_since_last_commit)
    df['STARS'] = df['REPO_OBJECT'].progress_apply(lambda repo: repo.stargazers_count if repo is not None else None)
    # Most recently active repositories first; ties broken by star count.
    df = df.sort_values(by=['MONTHS_SINCE_LAST_COMMIT', 'STARS'], ascending=[True, False]).drop(columns=['REPO_OBJECT'], errors='ignore')
    df.to_csv(f'{os.getcwd()}/repo-info/repo-info.csv', index=False)

if __name__ == '__main__':
    get_maintained_repos()
3 changes: 3 additions & 0 deletions repo-info/requirements.txt
@@ -0,0 +1,3 @@
pandas==1.5.2
PyGithub==1.57
tqdm==4.64.1