forked from UtrechtUniversity/SWORDS-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(dependency): program to check for .lock files in repositories for…
… Python, R and C++
- Loading branch information
1 parent
a101f35
commit b5310c0
Showing
1 changed file
with
138 additions
and
0 deletions.
There are no files selected for viewing
138 changes: 138 additions & 0 deletions
138
collect_variables/scripts/soft_dev_pract/dependency_practices/dependency_lock_files.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
"""
Retrieve dependency .lock files for Python, R and C++ repositories.
"""

import argparse
import os
import time

import pandas as pd
from dotenv import load_dotenv
from github import Github, GithubException, RateLimitExceededException

# Directory that contains this script, resolved to an absolute path.
script_dir = os.path.dirname(os.path.realpath(__file__))

# The project-level .env file lives four directories above this script.
env_path = os.path.join(script_dir, '..', '..', '..', '..', '.env')

# Load environment variables, letting the .env file override existing ones.
load_dotenv(dotenv_path=env_path, override=True)

# GitHub API token read from the environment.
token = os.getenv('GITHUB_TOKEN')

# Authenticated GitHub client shared by the functions below.
g = Github(token)
|
||
def check_requirements(repository_url):
    """
    Check whether a GitHub repository pins its dependencies with a lock file.

    Looks for the lock files conventionally used by the repository's
    primary language (Python, R or C++).

    Args:
        repository_url (str): The GitHub repository URL,
            e.g. "https://github.com/owner/repo".

    Returns:
        bool: True if a known lock file exists, False if none was found
            (or the language has no known lock files).
        None: If the repository could not be inspected (API error).
    """
    # Parse owner and repo from the URL ("https://github.com/owner/repo").
    url_parts = repository_url.rstrip('/').split('/')
    owner = url_parts[-2]
    repo = url_parts[-1]

    # Common .lock files, keyed by the repository's primary language.
    common_dependency_files = {
        'Python': ['Pipfile.lock', 'poetry.lock', 'requirement.lock'],
        'R': ['renv.lock', 'packrat.lock'],
        'C++': ['vcpkg.lock', 'conan.lock', 'CMakeCache.txt']
    }

    # Retry loop instead of recursion so repeated rate limiting cannot
    # grow the call stack without bound.
    while True:
        try:
            repository = g.get_repo(f"{owner}/{repo}")
            repository_language = repository.language

            # Probe each candidate lock file for the detected language.
            if repository_language in common_dependency_files:
                for dependency_file in common_dependency_files[repository_language]:
                    try:
                        repository.get_contents(dependency_file)
                        return True
                    except RateLimitExceededException:
                        # RateLimitExceededException subclasses
                        # GithubException; without this re-raise it would
                        # be swallowed below and misread as "file absent".
                        raise
                    except GithubException:
                        continue
            return False

        except RateLimitExceededException:
            # Wait out the rate limit, then retry the same repository.
            print("GitHub API rate limit exceeded. Sleeping for 15 minutes...")
            time.sleep(15 * 60)  # Sleep for 15 minutes

        except Exception as e:  # pylint: disable=broad-except
            # Best-effort boundary: report the failure and mark this
            # repository as unchecked rather than aborting the whole run.
            print(f"Failed to check requirements for {repository_url}: {str(e)}")
            return None  # Return None if there's an error
|
||
def is_github_url(url):
    """
    Determine whether a value is a link to a GitHub repository.

    Args:
        url (str or None): The value to test; non-strings are rejected.

    Returns:
        bool: True if the value is a string beginning with
        "https://github.com/", False otherwise.
    """
    return isinstance(url, str) and url.startswith("https://github.com/")
|
||
if __name__ == "__main__":
    # CLI entry point: read repository URLs from a CSV, check each for
    # dependency lock files, and write the augmented CSV back out.
    argument_parser = argparse.ArgumentParser(
        description='Check requirements of GitHub repositories listed in a CSV file.'
    )

    # Add command-line arguments
    argument_parser.add_argument(
        '--input',
        type=str,
        default='../collect_repositories/results/repositories_filtered.csv',
        help='Input CSV file containing GitHub repository URLs'
    )

    argument_parser.add_argument(
        '--output',
        type=str,
        default='results/soft_dev_pract.csv',
        help='Output CSV file to save results'
    )

    # Parse command-line arguments
    command_line_arguments = argument_parser.parse_args()

    # Read the input CSV file using pandas
    input_data = pd.read_csv(command_line_arguments.input, sep=',')

    # New column for the per-repository result (True/False/None).
    input_data['dependency_config_files'] = ''

    # Loop through each row of the DataFrame
    for index, row in input_data.iterrows():
        repo_url = row['html_url']

        # Skip rows with missing or non-string GitHub URLs
        if not is_github_url(repo_url):
            print(f"Skipping invalid or non-GitHub URL: {repo_url}")
            input_data.at[index, 'dependency_config_files'] = None  # Leave the field empty
            continue

        # Check requirements for each repository using 'repo_url'
        result = check_requirements(repo_url)

        # Bug fix: results were previously written to a stray
        # 'requirements_defined' column instead of the
        # 'dependency_config_files' column created above, which stayed empty.
        input_data.at[index, 'dependency_config_files'] = result

    # Ensure the output directory exists before writing.
    output_dir = os.path.dirname(command_line_arguments.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the updated DataFrame to the output CSV file, keeping all original columns
    input_data.to_csv(command_line_arguments.output, index=False)

    print(f"Results saved to {command_line_arguments.output}")