fix: pylint warnings
AkshayDevkate committed Nov 7, 2024
1 parent 4f6376a commit 1fec8bb
Showing 1 changed file with 15 additions and 23 deletions.
38 changes: 15 additions & 23 deletions collect_variables/scripts/github_api/enrich_repo_data.py
@@ -6,15 +6,12 @@
pagination for contributors, manages API rate limits, and outputs the results
to a specified CSV file.
Usage:
python script.py --input input.csv --output output.csv
Dependencies:
- pandas
- ghapi
- python-dotenv
- argparse
- chardet
- pandas
- ghapi
- python-dotenv
- argparse
- chardet
The script manages API rate limit errors (HTTP 403) by waiting until the rate
limit is reset before retrying. If it cannot fetch data for some URLs, it leaves
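For orientation, here is a hypothetical way to prepare an input file for the usage line above: the script (further down in this diff) iterates over an 'html_url' column of GitHub repository URLs, so a minimal input CSV can be built as sketched below. The file name and example repository are illustrative, not taken from the commit.

import pandas as pd

# Build a toy input.csv with the 'html_url' column the script iterates over.
pd.DataFrame({
    'html_url': ['https://github.com/octocat/Hello-World'],
}).to_csv('input.csv', index=False)

# Then, per the docstring's usage line:
#   python enrich_repo_data.py --input input.csv --output output.csv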
@@ -24,14 +21,11 @@
import os
import time
import argparse
import logging
import pandas as pd
import requests
import chardet # Ensure chardet is imported
from dotenv import load_dotenv
from requests.exceptions import HTTPError
from ghapi.all import GhApi


# Load GitHub token from .env file
script_dir = os.path.dirname(os.path.realpath(__file__))
env_path = os.path.join(script_dir, '..', '..', '..', '.env')
@@ -66,8 +60,7 @@ def detect_encoding(file_path):
def handle_rate_limit():
"""
Handles the GitHub API rate limit by sleeping for 15 minutes when the limit
is reached. This function is triggered when an API rate limit error
(HTTP 403) occurs.
is reached.
"""
print('API rate limit exceeded. Sleeping for 15 minutes...')
time.sleep(15 * 60) # Sleep for 15 minutes
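The function above retries after a fixed 15-minute sleep. As a hedged alternative sketch (not what the commit implements), the reset timestamp reported by GitHub's /rate_limit endpoint could be used instead; the GITHUB_TOKEN variable name is an assumption, and requests is already imported by the script.

import os
import time
import requests

def wait_for_rate_limit_reset():
    """Sleep until GitHub's core rate limit resets, per the API's own clock."""
    response = requests.get(
        'https://api.github.com/rate_limit',
        headers={'Authorization': f"token {os.getenv('GITHUB_TOKEN')}"},
        timeout=30,
    )
    reset_at = response.json()['resources']['core']['reset']  # Unix timestamp
    time.sleep(max(reset_at - time.time(), 0) + 5)  # small safety margin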
@@ -119,11 +112,12 @@ def get_repo_metadata(repo_url):
total_downloads = sum(asset.download_count
for release in releases
for asset in release.assets)
except Exception:
except Exception as download_error:
print(f'Error fetching downloads: {download_error}')
total_downloads = 'N/A'

# Extract relevant metadata
metadata = {
return {
'html_url': repo_url, # Use 'html_url' as the key for merging
'Repository Name': repo_data.name,
'Full Name': repo_data.full_name,
@@ -144,9 +138,7 @@ def get_repo_metadata(repo_url):
'Total Downloads': total_downloads
}

return metadata

except HTTP403ForbiddenError as http_error:
except GhApi.HTTP403ForbiddenError as http_error:
# Handle rate limit errors and retry after sleeping
print(f'Error: {http_error}')
handle_rate_limit() # Sleep for 15 minutes
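The hunk above sums asset download counts over a repository's releases, and the docstring mentions handling pagination. Below is a minimal sketch of gathering that total across paginated release listings with ghapi; the paged helper, the repos.list_releases operation, the GITHUB_TOKEN variable name, and the owner/repo values are assumptions for illustration, not the commit's own fetching code (which is collapsed in this diff).

import os
from ghapi.all import GhApi, paged

api = GhApi(token=os.getenv('GITHUB_TOKEN'))  # token variable name assumed
total_downloads = 0
# Walk every page of releases and add up each asset's download counter.
for page in paged(api.repos.list_releases, owner='octocat', repo='Hello-World'):
    for release in page:
        total_downloads += sum(asset.download_count for asset in release.assets)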
@@ -172,19 +164,19 @@ def main(input_csv_path, output_csv_path):

# Try reading the input CSV file with detected encoding and comma delimiter
try:
df = pd.read_csv(input_csv_path, encoding=encoding, delimiter=',', on_bad_lines='skip')
input_df = pd.read_csv(input_csv_path, encoding=encoding, delimiter=',', on_bad_lines='skip')
except Exception as error:
print(f'Failed to read the CSV file: {error}')
return

# Print out the columns to debug
print("Columns in the original DataFrame (df):", df.columns)
print("Columns in the original DataFrame (input_df):", input_df.columns)

# List to store metadata dictionaries
metadata_list = []

# Iterate through URLs in the DataFrame, drop NaN values
for idx, url in df['html_url'].dropna().items():
for idx, url in input_df['html_url'].dropna().items():
# Ensure the URL is a string and check if it's a GitHub URL
if isinstance(url, str) and 'github.com' in url:
metadata = get_repo_metadata(url)
@@ -200,7 +192,7 @@ def main(input_csv_path, output_csv_path):
print("Columns in the metadata DataFrame (metadata_df):", metadata_df.columns)

# Merge the original dataframe with the new metadata based on 'html_url'
merged_df = pd.merge(df, metadata_df, on='html_url', how='left')
merged_df = pd.merge(input_df, metadata_df, on='html_url', how='left')

# Save the merged data to a new CSV file
merged_df.to_csv(output_csv_path, index=False)
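A small, self-contained illustration of the left merge used above (toy data, hypothetical URLs and column): rows whose URL produced no metadata are kept with NaN in the new columns, which matches the docstring's note that unfetched URLs are left as they are.

import pandas as pd

input_df = pd.DataFrame({'html_url': ['https://github.com/a/x', 'https://gitlab.com/b/y']})
metadata_df = pd.DataFrame({'html_url': ['https://github.com/a/x'], 'Stars': [42]})

# Left merge: every input row survives; the non-GitHub row gets NaN metadata.
merged_df = pd.merge(input_df, metadata_df, on='html_url', how='left')
print(merged_df)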