-
-
Notifications
You must be signed in to change notification settings - Fork 162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open Source Sorting Hat #3318
Draft
SahilDhillon21
wants to merge
8
commits into
OWASP-BLT:main
Choose a base branch
from
SahilDhillon21:ossh
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Open Source Sorting Hat #3318
Changes from 3 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
20b27f5
Basic setup
SahilDhillon21 78a3387
OpenSourceRepo model and command to fetch repos
SahilDhillon21 3a88545
Merge branch 'main' into ossh
SahilDhillon21 a7c5f7e
Merge branch 'main' into ossh
SahilDhillon21 f718b16
Use Repo model only and create ossh community model along with reddit…
SahilDhillon21 315809e
merge
SahilDhillon21 fd75880
Create Discussion Channel Model and command to fetch discord servers
SahilDhillon21 aff7d82
Merge branch 'main' into ossh
DonnieBLT File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
import logging | ||
import time | ||
from datetime import datetime | ||
from datetime import timezone as dt_timezone | ||
|
||
import requests | ||
from django.conf import settings | ||
from django.core.management.base import BaseCommand | ||
from django.db import transaction | ||
from django.utils import timezone | ||
from django.utils.text import slugify | ||
|
||
from website.models import OpenSourceRepo, Tag | ||
|
||
# ANSI terminal escape sequences used to color the log messages in this module.
COLOR_RED = "\033[91m"  # errors and failed quality checks
COLOR_GREEN = "\033[92m"  # successful outcomes
COLOR_YELLOW = "\033[93m"  # warnings
COLOR_BLUE = "\033[94m"  # progress information
COLOR_RESET = "\033[0m"  # switch back to the terminal's default color

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
class Command(BaseCommand): | ||
help = "Fetches and updates open source repository data from GitHub" | ||
|
||
def handle(self, *args, **options):
    """Entry point of the command: set up the GitHub session and thresholds, then fetch.

    Creates an authenticated ``requests.Session`` (token taken from
    ``settings.GITHUB_TOKEN``), stores the quality thresholds on the
    instance, and runs ``fetch_repositories``.

    Args:
        *args: Positional arguments passed by Django (unused).
        **options: Parsed command-line options passed by Django (unused).
    """
    self.session = requests.Session()
    self.session.headers.update(
        {"Authorization": f"token {settings.GITHUB_TOKEN}", "Accept": "application/vnd.github.v3+json"}
    )

    # Thresholds read by fetch_repositories() and is_good_repository().
    self.MIN_STARS = 10
    self.MIN_FORKS = 5
    self.MAX_REPOS = 500  # Could go upwards of 100k in production
    self.MIN_CONTRIBUTORS, self.MIN_COMMITS = 5, 20
    self.MAX_DAYS_SINCE_UPDATE = 60

    logger.info(f"{COLOR_BLUE}Starting the fetch_repositories process.{COLOR_RESET}")
    try:
        self.fetch_repositories()
    finally:
        # Release the session's pooled connections even if the fetch raised.
        self.session.close()
    logger.info(f"{COLOR_BLUE}Finished the fetch_repositories process.{COLOR_RESET}")
|
||
def has_code_files(self, repo_full_name):
    """Check if the repo contains at least one source code file.

    Looks up the repository's default branch, fetches that branch's
    recursive git tree, and reports whether any *file* entry ends with one
    of the known source-code extensions.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.

    Returns:
        True if a matching file is found; False if none is found, if either
        API request does not return HTTP 200, or if any error occurs.
    """
    # A tuple, because str.endswith accepts a tuple of suffixes directly.
    code_extensions = (".py", ".js", ".java", ".cpp", ".c", ".ts", ".rb", ".go", ".rs", ".swift")
    try:
        details_response = self.session.get(f"https://api.github.com/repos/{repo_full_name}")
        if details_response.status_code != 200:
            # An error payload has no "default_branch"; stop here rather than
            # guessing a branch and spending a second request on it.
            logger.warning(
                f"{COLOR_YELLOW}Failed to fetch repository details for {repo_full_name}: "
                f"{details_response.status_code}{COLOR_RESET}"
            )
            return False
        default_branch = details_response.json().get("default_branch") or "main"

        response = self.session.get(
            f"https://api.github.com/repos/{repo_full_name}/git/trees/{default_branch}?recursive=1"
        )
        if response.status_code != 200:
            logger.warning(
                f"{COLOR_YELLOW}Failed to fetch file tree for {repo_full_name}: {response.status_code}{COLOR_RESET}"
            )
            return False

        # Only "blob" entries are files; directories ("tree") and submodules
        # ("commit") must not count even if their name looks like a source file.
        return any(
            entry.get("type") == "blob" and entry.get("path", "").endswith(code_extensions)
            for entry in response.json().get("tree", [])
        )

    except Exception as e:
        # Deliberately best-effort: any failure means "no code files found".
        logger.error(f"{COLOR_RED}Error checking code files for {repo_full_name}: {str(e)}{COLOR_RESET}")
        return False
|
||
def get_commit_count(self, repo_full_name):
    """Fetches the total commit count efficiently.

    Requests the commit list with ``per_page=1`` so that the page number of
    the ``last`` pagination link equals the number of commits; no commit
    pages beyond the first are downloaded.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.

    Returns:
        The commit count. When there is no ``last`` link, the number of
        items in the single returned page. 0 on a non-200 response or on
        any error.
    """
    from urllib.parse import parse_qs, urlparse

    url = f"https://api.github.com/repos/{repo_full_name}/commits?per_page=1"
    try:
        response = self.session.get(url)
        if response.status_code != 200:
            return 0

        last_link = response.links.get("last")
        if not last_link:
            # Everything fit on one page, so the page length is the count.
            return len(response.json())

        # Read the "page" parameter by name: its position in the query string
        # is not guaranteed, and "per_page=" also contains the text "page=".
        query = parse_qs(urlparse(last_link["url"]).query)
        return int(query["page"][0])
    except Exception as e:
        logger.error(f"{COLOR_RED}Error fetching commit count for {repo_full_name}: {str(e)}{COLOR_RESET}")
        return 0
|
||
def get_contributors_count(self, repo_full_name):
    """Fetches the number of contributors.

    Requests the contributor list with ``per_page=1`` (and ``anon=true``) so
    that the page number of the ``last`` pagination link equals the number
    of contributors.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.

    Returns:
        The contributor count. When there is no ``last`` link, the number of
        items in the single returned page. 0 on a non-200 response or on
        any error.
    """
    from urllib.parse import parse_qs, urlparse

    url = f"https://api.github.com/repos/{repo_full_name}/contributors?per_page=1&anon=true"
    try:
        response = self.session.get(url)
        if response.status_code != 200:
            return 0

        last_link = response.links.get("last")
        if not last_link:
            # Everything fit on one page, so the page length is the count.
            return len(response.json())

        # Read the "page" parameter by name: its position in the query string
        # is not guaranteed, and "per_page=" also contains the text "page=".
        query = parse_qs(urlparse(last_link["url"]).query)
        return int(query["page"][0])
    except Exception as e:
        logger.error(f"{COLOR_RED}Error fetching contributors for {repo_full_name}: {str(e)}{COLOR_RESET}")
        return 0
|
||
def is_good_repository(self, repo_data):
    """Checks for repository quality.

    The checks that only need the search-result payload (push date, stars,
    forks, archived flag, license, size) run first. The checks that cost
    extra GitHub API requests (code files, contributor count, commit count)
    run only when all of the free checks pass, so no API quota is spent on
    a repository that is already disqualified. As a consequence, the logged
    failure list for such a repository contains only the metadata failures.

    Args:
        repo_data: One repository item from the GitHub search response.

    Returns:
        True if every check passes, otherwise False (the reasons are logged).
    """
    full_name = repo_data["full_name"]
    failure_messages = []

    # --- Free checks: use only fields already present in repo_data. ---
    pushed_at = repo_data.get("pushed_at")
    if pushed_at:
        last_push_date = datetime.strptime(pushed_at, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=dt_timezone.utc)
        days_since_last_push = (datetime.now(dt_timezone.utc) - last_push_date).days
        if days_since_last_push > self.MAX_DAYS_SINCE_UPDATE:
            failure_messages.append(f"{COLOR_RED}Last push > {self.MAX_DAYS_SINCE_UPDATE} days ago{COLOR_RESET}")
    else:
        # A missing/null push date cannot be parsed; treat it as a failed check.
        failure_messages.append(f"{COLOR_RED}No push date available{COLOR_RESET}")

    if repo_data["stargazers_count"] < self.MIN_STARS:
        failure_messages.append(f"{COLOR_RED}Stars < {self.MIN_STARS}{COLOR_RESET}")
    if repo_data["forks_count"] < self.MIN_FORKS:
        failure_messages.append(f"{COLOR_RED}Forks < {self.MIN_FORKS}{COLOR_RESET}")
    if repo_data["archived"]:
        failure_messages.append(f"{COLOR_RED}Repository is archived{COLOR_RESET}")
    if repo_data.get("license") is None:
        failure_messages.append(f"{COLOR_RED}No license found{COLOR_RESET}")
    if repo_data.get("size", 0) <= 100:
        failure_messages.append(f"{COLOR_RED}Size <= 100 KB{COLOR_RESET}")

    # --- Costly checks: each one issues GitHub API requests. ---
    if not failure_messages:
        if not self.has_code_files(full_name):
            failure_messages.append(f"{COLOR_RED}No code files found{COLOR_RESET}")

        if self.get_contributors_count(full_name) < self.MIN_CONTRIBUTORS:
            failure_messages.append(f"{COLOR_RED}Contributors < {self.MIN_CONTRIBUTORS}{COLOR_RESET}")

        if self.get_commit_count(full_name) < self.MIN_COMMITS:
            failure_messages.append(f"{COLOR_RED}Commits < {self.MIN_COMMITS}{COLOR_RESET}")

    if failure_messages:
        logger.warning(
            f"{COLOR_YELLOW}Repository {full_name} failed checks: {', '.join(failure_messages)}{COLOR_RESET}"
        )
        return False

    logger.info(f"{COLOR_GREEN}Repository {full_name} meets all criteria.{COLOR_RESET}")
    return True
|
||
def fetch_repositories(self):
    """Page through GitHub's repository search and process each page of results.

    Builds a search query from the configured star/fork thresholds plus a
    list of excluded topics, requests results 100 at a time ordered by
    stars (descending), and hands every page to ``process_repositories``.

    The loop ends when ``self.MAX_REPOS`` results have been processed, when
    a page comes back empty, when the API answers with an unexpected status
    code, or after several consecutive failed attempts (rate limiting or
    exceptions) so that a persistent failure cannot loop forever.
    """
    # Upper bound on back-to-back failed attempts for the same page.
    max_consecutive_failures = 5

    query = " ".join(
        [
            "is:public",
            f"stars:>={self.MIN_STARS}",
            f"forks:>={self.MIN_FORKS}",
            "archived:false",
            "has:license",
            "size:>100",
            "-topic:awesome",
            "-topic:list",
            "-topic:resource",
            "-topic:resources",
            "-topic:questions",
            "-topic:cheatsheet",
            "-topic:roadmap",
            "-topic:guide",
            "-topic:collection",
            "-topic:interview",
            "-topic:coding-interview",
            "-topic:notes",
            "-topic:tutorials",
        ]
    )
    page, repos_processed, consecutive_failures = 1, 0, 0

    while repos_processed < self.MAX_REPOS:
        if consecutive_failures >= max_consecutive_failures:
            logger.error(
                f"{COLOR_RED}Giving up after {consecutive_failures} consecutive failed attempts.{COLOR_RESET}"
            )
            break

        try:
            logger.info(f"{COLOR_BLUE}Fetching repositories from GitHub API (Page {page}).{COLOR_RESET}")
            response = self.session.get(
                "https://api.github.com/search/repositories",
                params={"q": query, "sort": "stars", "order": "desc", "page": page, "per_page": 100},
            )

            if response.status_code in (403, 429):
                # Treated as rate limiting: wait, then retry the same page.
                consecutive_failures += 1
                logger.warning(
                    f"{COLOR_YELLOW}Reached GitHub API rate limit. Sleeping for 60 seconds.{COLOR_RESET}"
                )
                time.sleep(60)
                continue
            elif response.status_code != 200:
                logger.error(f"{COLOR_RED}Error fetching repositories: {response.status_code}{COLOR_RESET}")
                break

            repos = response.json().get("items", [])
            if not repos:
                logger.info(f"{COLOR_BLUE}No more repositories found. Exiting loop.{COLOR_RESET}")
                break

            self.process_repositories(repos)
            repos_processed += len(repos)
            logger.info(f"{COLOR_BLUE}Processed {repos_processed} repositories so far.{COLOR_RESET}")
            page += 1
            consecutive_failures = 0  # a successful page resets the retry budget
            time.sleep(1)
        except Exception as e:
            # The page counter is not advanced, so the same page is retried.
            consecutive_failures += 1
            logger.error(f"{COLOR_RED}Error fetching repositories: {str(e)}{COLOR_RESET}")
            time.sleep(5)
||
def process_repositories(self, repos):
    """Persist every repository in ``repos`` that passes the quality checks.

    For each item, ``is_good_repository`` decides whether it is kept. Kept
    items are created or updated as ``OpenSourceRepo`` rows keyed by their
    ``html_url``, and their topics are stored as ``Tag`` rows linked to the
    repository. A failure on one repository is logged and does not stop the
    remaining ones from being processed.

    Args:
        repos: Iterable of repository items from the GitHub search response.
    """

    def parse_timestamp(value):
        """Turn a ``YYYY-MM-DDTHH:MM:SSZ`` string into an aware UTC datetime."""
        return timezone.make_aware(datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ"), dt_timezone.utc)

    for repo_data in repos:
        try:
            if not self.is_good_repository(repo_data):
                continue

            # "license" may be present but null, and "spdx_id" may be null too;
            # normalize both cases to an empty string.
            license_info = repo_data.get("license") or {}

            # NOTE(review): the original indentation was lost in extraction; tag
            # handling is assumed to belong inside the transaction — confirm.
            with transaction.atomic():
                repo, created = OpenSourceRepo.objects.update_or_create(
                    url=repo_data["html_url"],
                    defaults={
                        "name": repo_data["name"],
                        "owner": repo_data["owner"]["login"],
                        "description": repo_data["description"] or "",
                        "primary_language": repo_data["language"] or "",
                        "last_updated": parse_timestamp(repo_data["updated_at"]),
                        "stars": repo_data["stargazers_count"],
                        "forks": repo_data["forks_count"],
                        "open_issues": repo_data["open_issues_count"],
                        "last_pushed": parse_timestamp(repo_data["pushed_at"]),
                        "license": license_info.get("spdx_id") or "",
                    },
                )

                topics = repo_data.get("topics")
                if topics:
                    tags = []
                    for topic in topics:
                        tag, _ = Tag.objects.get_or_create(slug=slugify(topic), defaults={"name": topic})
                        tags.append(tag)
                    repo.tags.set(tags)

            logger.info(
                f"{COLOR_GREEN}{'Created' if created else 'Updated'} repository: {repo.name}{COLOR_RESET}"
            )
        except Exception as e:
            # .get() keeps the handler itself from raising when the item has no
            # "full_name", which would otherwise abort the rest of the batch.
            logger.error(
                f"{COLOR_RED}Error processing repository {repo_data.get('full_name', '<unknown>')}: "
                f"{str(e)}{COLOR_RESET}"
            )
            continue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# Generated by Django 5.1.4 on 2025-02-02 23:06 | ||
|
||
import django.db.models.deletion | ||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Schema migration that creates the ``GitHubReview`` and ``OpenSourceRepo`` models."""

    # Must be applied after this migration of the same app.
    dependencies = [("website", "0187_baconearning")]

    operations = [
        migrations.CreateModel(
            name="GitHubReview",
            fields=[
                ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                ("review_id", models.IntegerField(unique=True)),
                ("body", models.TextField(blank=True, null=True)),
                ("state", models.CharField(max_length=50)),
                ("submitted_at", models.DateTimeField()),
                ("url", models.URLField()),
                # Both relations cascade: deleting the referenced row deletes the review.
                (
                    "pull_request",
                    models.ForeignKey(
                        to="website.githubissue",
                        related_name="reviews",
                        on_delete=django.db.models.deletion.CASCADE,
                    ),
                ),
                (
                    "reviewer",
                    models.ForeignKey(
                        to="website.userprofile",
                        related_name="reviews_made",
                        on_delete=django.db.models.deletion.CASCADE,
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="OpenSourceRepo",
            fields=[
                ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                ("name", models.CharField(max_length=255)),
                ("owner", models.CharField(max_length=255)),
                ("url", models.URLField()),
                ("description", models.TextField(blank=True)),
                ("primary_language", models.CharField(max_length=50)),
                ("last_updated", models.DateTimeField()),
                ("stars", models.PositiveIntegerField(default=0)),
                ("forks", models.PositiveIntegerField(default=0)),
                ("open_issues", models.PositiveIntegerField(default=0)),
                ("last_pushed", models.DateTimeField()),
                ("license", models.CharField(blank=True, max_length=100)),
                ("tags", models.ManyToManyField(related_name="repositories", to="website.tag")),
            ],
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about we use our existing repo model?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Definitely, but the project field would have to be made nullable. Would that be okay?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes