feat(dependency): program to check for .lock files in repositories for Python, R and C++
AkshayDevkate committed Dec 9, 2024
1 parent a101f35 commit b5310c0
Showing 1 changed file with 138 additions and 0 deletions.
"""
Retrives dependecy .lock files for Python, R, C++
"""

import argparse
import os
import time
import pandas as pd
from dotenv import load_dotenv
from github import Github, GithubException, RateLimitExceededException

# Get the directory of the current script
script_dir = os.path.dirname(os.path.realpath(__file__))

# Create the relative path to the .env file
env_path = os.path.join(script_dir, '..', '..', '..', '..', '.env')

# Load the .env file
load_dotenv(dotenv_path=env_path, override=True)

# Get the GITHUB_TOKEN from the .env file
token = os.getenv('GITHUB_TOKEN')

# Use the token to create a Github instance
g = Github(token)
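# Note: if GITHUB_TOKEN is unset, the client is unauthenticated and limited to
# 60 requests/hour; an authenticated client allows 5,000 requests/hour, so the
# token should be set when checking many repositories.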

def check_requirements(repository_url):
    """
    Check whether dependency requirements are made explicit in a GitHub repository.

    Args:
        repository_url (str): The GitHub repository URL.

    Returns:
        bool or None: True if a known lock file is found, False if none is
        found, None if the check failed with an error.
    """
    # Parse owner and repo from the URL
    url_parts = repository_url.rstrip('/').split('/')
    owner = url_parts[-2]
    repo = url_parts[-1]
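    # Note: this assumes a bare https://github.com/<owner>/<repo> URL; URLs with
    # extra path segments (e.g. .../tree/main) would yield the wrong owner/repo pair.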

    try:
        # Get repository details
        repository = g.get_repo(f"{owner}/{repo}")

        # Get the primary programming language of the repository
        repository_language = repository.language

        # Define the common .lock files per language
        common_dependency_files = {
            'Python': ['Pipfile.lock', 'poetry.lock', 'requirement.lock'],
            'R': ['renv.lock', 'packrat.lock'],
            'C++': ['vcpkg.lock', 'conan.lock', 'CMakeCache.txt']
        }

        # Check for each dependency file in the repository based on the language
        if repository_language in common_dependency_files:
            for dependency_file in common_dependency_files[repository_language]:
                try:
                    repository.get_contents(dependency_file)
                    return True
                except RateLimitExceededException:
                    # Re-raise so the outer handler sleeps and retries;
                    # otherwise the broader GithubException clause below
                    # would mistake a rate-limit error for a missing file.
                    raise
                except GithubException:
                    continue
        return False

    except RateLimitExceededException:
        # Handle the rate limit error and pause for 15 minutes
        print("GitHub API rate limit exceeded. Sleeping for 15 minutes...")
        time.sleep(15 * 60)  # Sleep for 15 minutes
        return check_requirements(repository_url)  # Retry the same repository

    except Exception as e:
        print(f"Failed to check requirements for {repository_url}: {str(e)}")
        return None  # Return None if there's an error
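# For example, check_requirements("https://github.com/owner/repo") returns True
# only when a matching lock file sits at the repository root, since
# get_contents() is called with a bare filename and no directory path.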

def is_github_url(url):
    """
    Check if a URL is a valid GitHub URL.

    Args:
        url (str or None): The URL to check.

    Returns:
        bool: True if the URL is a GitHub repository URL, False otherwise.
    """
    if isinstance(url, str):
        return url.startswith("https://github.com/")
    return False
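# For example: is_github_url("https://github.com/owner/repo") -> True, while
# is_github_url("https://gitlab.com/owner/repo") and is_github_url(None) -> False.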

if __name__ == "__main__":
# Create an ArgumentParser object
argument_parser = argparse.ArgumentParser(
description='Check requirements of GitHub repositories listed in a CSV file.'
)

# Add command-line arguments
argument_parser.add_argument(
'--input',
type=str,
default='../collect_repositories/results/repositories_filtered.csv',
help='Input CSV file containing GitHub repository URLs'
)

argument_parser.add_argument(
'--output',
type=str,
default='results/soft_dev_pract.csv',
help='Output CSV file to save results'
)

# Parse command-line arguments
command_line_arguments = argument_parser.parse_args()

# Read the input CSV file using pandas
input_data = pd.read_csv(command_line_arguments.input, sep=',')

# Add a new column for the results
input_data['dependency_config_files'] = ''

    # Loop through each row of the DataFrame
    for index, row in input_data.iterrows():
        repo_url = row['html_url']

        # Skip rows with missing or non-string GitHub URLs
        if not is_github_url(repo_url):
            print(f"Skipping invalid or non-GitHub URL: {repo_url}")
            input_data.at[index, 'dependency_config_files'] = None  # Leave the field empty
            continue

        # Check requirements for each repository using 'repo_url'
        result = check_requirements(repo_url)

        # Set the result in the 'dependency_config_files' column created above
        # (None if an error occurred)
        input_data.at[index, 'dependency_config_files'] = result

    # Write the updated DataFrame to the output CSV file, keeping all original columns
    input_data.to_csv(command_line_arguments.output, index=False)

    print(f"Results saved to {command_line_arguments.output}")
