Skip to content

Commit

Permalink
only get data for new matches
Browse files Browse the repository at this point in the history
  • Loading branch information
jack89roberts committed Oct 15, 2021
1 parent ff24e13 commit 6fa1d01
Showing 1 changed file with 30 additions and 8 deletions.
38 changes: 30 additions & 8 deletions airsenal/scraper/scrape_understat.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def parse_match(match_info: dict):
return result


def get_season_info(season: str, result: "dict | None" = None):
    """Get statistics for a whole season.

    Gets the list of matches for ``season`` from ``get_matches_info`` and
    parses every match whose ID is not already present in ``result``.

    Parameters
    ----------
    season: str
        The season for which the statistics need to be
        reported.
    result: dict, optional
        Previously saved match results, keyed by match ID. No new data is
        obtained for any match ID already present. The dict is updated in
        place. By default None, which starts from a fresh, empty dict.

    Returns
    -------
    dict
        ``result`` with an entry added for every newly parsed match, keyed by
        the match's ``"id"`` value.
    """
    # Create the dict per call: a ``{}`` default is built once at definition
    # time, so entries added here would leak into every later call that
    # omits ``result`` (e.g. a different season).
    if result is None:
        result = {}

    matches_info = get_matches_info(season)

    for match in tqdm(matches_info):
        match_id = match.get("id")
        # NOTE(review): results reloaded from JSON always have string keys, so
        # this skip only works if the scraped match IDs are strings - confirm.
        if match_id in result:
            continue
        parsed_match = parse_match(match)
        # Matches that parse to a falsy value are left out of the results.
        if parsed_match:
            # Store the value already parsed instead of parsing a second time.
            result[match_id] = parsed_match

    return result


def main():
    """Scrape understat data for one season and save it as JSON.

    Command-line options:

    --season
        Season to scrape (required); must be one of the keys of ``base_url``.
    --overwrite
        Ignore any previously saved file and get data for every match again.

    Unless ``--overwrite`` is given, an existing
    ``../data/goals_subs_data_<season>.json`` file (relative to this module)
    is loaded first and passed to ``get_season_info`` so that only matches
    missing from it are obtained. The combined results are then written back
    to the same path.
    """
    arg_parser = argparse.ArgumentParser(description="Scrape understat archives")
    arg_parser.add_argument(
        "--season",
        help="Season to scrape data for",
        choices=list(base_url.keys()),
        required=True,
    )
    arg_parser.add_argument(
        "--overwrite",
        help="Force overwriting previously saved data if set",
        action="store_true",
    )
    cli_args = arg_parser.parse_args()
    season = cli_args.season

    # Results live next to the package, in its sibling ``data`` directory.
    module_dir = os.path.dirname(__file__)
    save_path = os.path.join(module_dir, f"../data/goals_subs_data_{season}.json")

    # Start from the saved results unless a full re-download was requested.
    previous = {}
    if not cli_args.overwrite and os.path.exists(save_path):
        print(
            f"Data for {season} season already exists. Will only get data for new "
            "matches. To re-download data for all matches use --overwrite."
        )
        with open(save_path, "r") as saved_file:
            previous = json.load(saved_file)

    goal_subs_data = get_season_info(season, result=previous)

    with open(save_path, "w") as out_file:
        json.dump(goal_subs_data, out_file, indent=4)


if __name__ == "__main__":
    main()

0 comments on commit 6fa1d01

Please sign in to comment.