Commit

fix: rate limit and docstrings
AkshayDevkate committed Dec 3, 2024
1 parent 5b06fd8 commit 60fc468
Showing 1 changed file with 91 additions and 15 deletions.
@@ -1,21 +1,42 @@
"""
Python script to check if GitHub repositories have CONTRIBUTING.md and CODE_OF_CONDUCT.md.
- Checks root directory, `.github/`, and `docs/` folders.
- Uses ghapi and a GitHub token for authentication.
- Skips repositories on error or non-GitHub domains but retains all rows in the output CSV.
This script checks if GitHub repositories have `CONTRIBUTING.md` and
`CODE_OF_CONDUCT.md` files. It processes repositories listed in a CSV
file, checks the presence of these files in the root, `.github/`, and
`docs/` directories. It uses the GitHub API and handles rate limiting
by pausing execution for 20 minutes (or until rate limit resets) if
the rate limit is reached. The results are saved to an output CSV file.
Required environment variables:
- GITHUB_TOKEN: GitHub Personal Access Token with appropriate permissions.
Usage:
python script_name.py --input <input_file.csv> --output <output_file.csv>
"""

import os
import argparse
import pandas as pd
import time
from ghapi.all import GhApi
from dotenv import load_dotenv


def check_repository_files(api, repo_owner, repo_name):
def check_repository_files(api: GhApi, repo_owner: str, repo_name: str) -> bool:
"""
Check if the repository has CONTRIBUTING.md and CODE_OF_CONDUCT.md files.
Searches in the root directory, `.github/`, and `docs/` folders.
Check if a repository contains `CONTRIBUTING.md` and `CODE_OF_CONDUCT.md` files.
This function searches in the following directories:
- Root directory
- `.github/` folder
- `docs/` folder
Args:
api (GhApi): Authenticated GitHub API client.
repo_owner (str): The GitHub repository owner's username.
repo_name (str): The GitHub repository name.
Returns:
bool: True if at least one of the files is found, otherwise False.
"""
paths_to_check = [
"CONTRIBUTING.md",
@@ -34,13 +55,57 @@ def check_repository_files(api, repo_owner, repo_name):
     return False
 
 
-def process_repositories(input_csv, output_csv, token):
+def check_rate_limit(api: GhApi) -> bool:
     """
-    Iterate through repositories from input CSV and check for guidelines.
-    Save results to an output CSV.
+    Check if the GitHub API rate limit has been reached. If the rate limit
+    is exceeded, sleep until it resets.
+
+    Args:
+        api (GhApi): Authenticated GitHub API client.
+
+    Returns:
+        bool: True if the rate limit was reached and the function slept
+        until the reset, otherwise False.
     """
-    # Read the input CSV
-    df = pd.read_csv(input_csv)
+    rate_limit = api.rate_limit.get()
+    remaining = rate_limit.resources.core.remaining
+    reset_time = rate_limit.resources.core.reset  # UTC epoch seconds
+
+    if remaining == 0:
+        current_time = time.time()
+        # `reset` is already a Unix timestamp, so no conversion is needed
+        sleep_time = max(reset_time - current_time, 0) + 60  # one-minute buffer
+        print(f"Rate limit reached. Sleeping for {int(sleep_time // 60)} minutes...")
+        time.sleep(sleep_time)
+        return True
+    return False
+
+
+def process_repositories(input_csv: str, output_csv: str, token: str) -> None:
+    """
+    Process each GitHub repository from the input CSV file, check for the
+    presence of `CONTRIBUTING.md` and `CODE_OF_CONDUCT.md` files, and save
+    the results to an output CSV file. Handles rate limiting by sleeping
+    when the limit is exceeded.
+
+    Args:
+        input_csv (str): Path to the input CSV file containing repository URLs.
+        output_csv (str): Path to the output CSV file where results will be saved.
+        token (str): GitHub Personal Access Token for authentication.
+
+    Returns:
+        None
+    """
+    try:
+        # Try reading the CSV with a semicolon delimiter
+        df = pd.read_csv(input_csv, delimiter=';', encoding='utf-8')
+    except UnicodeDecodeError:
+        print(f"Error reading {input_csv} with UTF-8 encoding. Trying ISO-8859-1...")
+        df = pd.read_csv(input_csv, delimiter=';', encoding='ISO-8859-1')
+
+    # Check if 'html_url' column exists
+    if 'html_url' not in df.columns:
+        print("Error: The 'html_url' column is missing in the input CSV file.")
+        return
+
     # Add columns for results
     df['has_contributing'] = None
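
Note: the body of `check_repository_files` is collapsed above. For reference, a minimal sketch of the kind of lookup it performs, assuming ghapi's `repos.get_content` endpoint and an illustrative path list; the collapsed code may differ:

# Hedged sketch -- not necessarily the exact collapsed implementation.
def check_repository_files(api: GhApi, repo_owner: str, repo_name: str) -> bool:
    paths_to_check = [
        "CONTRIBUTING.md",
        "CODE_OF_CONDUCT.md",
        ".github/CONTRIBUTING.md",
        ".github/CODE_OF_CONDUCT.md",
        "docs/CONTRIBUTING.md",
        "docs/CODE_OF_CONDUCT.md",
    ]
    for path in paths_to_check:
        try:
            # ghapi exposes GitHub's "get repository content" endpoint as
            # repos.get_content; it raises on a 404 when the file is absent.
            api.repos.get_content(owner=repo_owner, repo=repo_name, path=path)
            return True  # at least one of the files exists
        except Exception:
            continue  # missing file or transient error: keep checking
    return False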
@@ -53,6 +118,10 @@ def process_repositories(input_csv, output_csv, token):

     for index, row in df.iterrows():
         try:
+            # Check rate limit before processing each repository
+            if check_rate_limit(api):
+                print("Rate limit reset; resuming processing.")
+
             # Skip non-GitHub URLs
             if "github.com" not in row['html_url']:
                 print(f"Skipping non-GitHub domain: {row['html_url']}")
@@ -79,9 +148,16 @@ def process_repositories(input_csv, output_csv, token):
print(f"Results saved to {output_csv}")


def main():
def main() -> None:
"""
Main function to parse arguments and execute the script.
Main function to parse command-line arguments and execute the script.
Parses the input CSV file path, output CSV file path, and GitHub token
from the environment variables or command-line arguments. It then processes
the repositories and checks for the required files.
Returns:
None
"""
# Get the directory of the current script
script_dir = os.path.dirname(os.path.realpath(__file__))
@@ -106,7 +182,7 @@ def main():
     args = parser.parse_args()
 
     # Process repositories
-    process_repositories(args.input_csv, args.output_csv, token)
+    process_repositories(args.input, args.output, token)
 
 
 if __name__ == "__main__":
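
The remaining lines of `main()` (the `.env` loading and argument-parser setup) are collapsed in this diff. A plausible sketch, assuming flag names `--input` and `--output` and a `GITHUB_TOKEN` read via python-dotenv; all details here are assumptions consistent with the visible code:

# Hedged sketch of the collapsed portion of main(); details are assumptions.
load_dotenv()  # load GITHUB_TOKEN from a .env file, if present
token = os.getenv("GITHUB_TOKEN")

parser = argparse.ArgumentParser(
    description="Check repositories for CONTRIBUTING.md and CODE_OF_CONDUCT.md."
)
parser.add_argument("--input", required=True, help="Path to the input CSV file")
parser.add_argument("--output", required=True, help="Path to the output CSV file")
args = parser.parse_args()

If the flags are indeed declared as `--input` and `--output`, argparse exposes them as `args.input` and `args.output`, so the old call `process_repositories(args.input_csv, args.output_csv, token)` would raise AttributeError; that is what the one-line change above fixes.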

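For reference, an end-to-end example of driving the script programmatically, equivalent to the documented CLI usage. The file names and repository URLs are illustrative only:

# Illustrative usage: build a tiny semicolon-delimited input CSV and run the checker.
import os
import pandas as pd

sample = pd.DataFrame({
    "html_url": [
        "https://github.com/octocat/Hello-World",  # hypothetical GitHub repository
        "https://gitlab.com/some/project",         # non-GitHub row: skipped, but kept in output
    ]
})
sample.to_csv("input.csv", sep=";", index=False, encoding="utf-8")

# Equivalent to: python script_name.py --input input.csv --output output.csv
token = os.getenv("GITHUB_TOKEN")  # must be set, per the module docstring
process_repositories("input.csv", "output.csv", token)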