Skip to content

Commit

Permalink
Update method of readme collection; also, split API calls when result size = 1000
Browse files: browse the repository at this point in the history
  • Loading branch information
jmelot committed Sep 7, 2023
1 parent a35e910 commit 446c1d3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 7 deletions.
11 changes: 5 additions & 6 deletions scripts/retrieve_repo_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,11 @@ def add_scraped_meta(repo_record: dict) -> None:
for title, key, _ in link_title_key_default:
if title in link.text:
repo_record[key] = link.find("span")["title"]
- default_branch = repo_record["full_metadata"]["default_branch"]
- readme_resp = requests.get(
- f"https://raw.githubusercontent.com/{owner_name}/{repo_name}/{default_branch}/README.md"
- )
- repo_record["readme_text"] = readme_resp.text
- repo_record["homepage_text"] = repo_homepage.text
+ readme_container = soup.find("div", {"id": "readme"})
+ readme_text = None
+ if readme_container:
+ readme_text = readme_container.find(class_="Box-body").text
+ repo_record["readme_text"] = readme_text
except Exception as e:
print(e)

Expand Down
2 changes: 1 addition & 1 deletion scripts/retrieve_repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def get_size_partitions(self, topic: str) -> list:
else:
result_count = repo_resp_js.get("total_count", 0)
print(f"Found {result_count} repos in {fmt_size_range}")
- if result_count > 1000:
+ if result_count >= 1000:
if size_range[0] == size_range[1]:
print(
f"ERROR: too many repos ({result_count}) for {topic} in {fmt_size_range}"
Expand Down

0 comments on commit 446c1d3

Please sign in to comment.