
Commit

Apply suggestions from code review
nabobalis committed Oct 25, 2023
1 parent 925ec26 commit a20acc2
Showing 4 changed files with 24 additions and 25 deletions.
15 changes: 7 additions & 8 deletions .github/workflows/run_and_deploy.yml
@@ -16,9 +16,6 @@ jobs:
check_style:
runs-on: ubuntu-latest
steps:
- name: Get current date
id: date
run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
- name: Checkout Repository
uses: actions/checkout@v4
- name: Set up Python 3.11
@@ -77,16 +74,18 @@ jobs:
uses: actions/download-artifact@v3
with:
name: files
path: files
- name: Upload files to release
path: ~/files
- name: Display structure of downloaded files
run: ls -R
working-directory: /home/runner/files
- name: Upload files then create tag and release
uses: svenstaro/upload-release-action@v2
with:
tag: ${{ steps.date.outputs.date }}
file: files/timeline*
file: /home/runner/files/timeline*
overwrite: true
body: "Latest version of the timeline"
- name: Push index.html to branch page
if: ${{ github.ref }} == 'refs/heads/main'
uses: JamesIves/github-pages-deploy-action@v4
with:
folder: ./files
folder: /home/runner/files
6 changes: 1 addition & 5 deletions pyproject.toml
@@ -80,12 +80,8 @@ fixable = [
]
extend-ignore = [
"E501",
"T201",
]
"BLE001",

[tool.ruff.per-file-ignores]
"examples/*.py" = [
"T201", # We need print in our examples
]

[tool.ruff.pydocstyle]
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ itables==1.6.2
nbconvert==7.9.2
pandas==2.1.1
requests==2.31.0
loguru==0.7.2
27 changes: 15 additions & 12 deletions scraper.py
@@ -1,11 +1,12 @@
import os
from datetime import datetime
from itertools import product
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from loguru import logger

from config import DATASETS, MAP_4, TIME_FORMATS

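This commit swaps the bare print calls in scraper.py for loguru's logger. A minimal sketch of how that logger is typically used, assuming the default stderr sink (the commit itself does not add any explicit logger configuration):

import sys

from loguru import logger

# Optional: replace the default sink to choose the minimum level that is emitted.
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# loguru accepts brace-style formatting with positional arguments.
logger.debug("Processing {}", "https://example.invalid/data.txt")
logger.info("Scraping {}: {} rows so far", "some_dataset", 0)
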
@@ -53,7 +54,7 @@ def _format_date(date: str, year: Optional[str], _hack: Optional[datetime] = Non
return new_date


def _clean_date(date: str, extra_replace: bool = False) -> str:
def _clean_date(date: str, *, extra_replace: bool = False) -> str:
"""
Removes any non-numeric characters from the date.
@@ -103,7 +104,9 @@ def _process_time(data: pd.DataFrame, column: int = 0) -> pd.DataFrame:
"""
for time_format in TIME_FORMATS:
try:
data.iloc[:, column] = data.iloc[:, column].apply(lambda x: datetime.strptime(x, time_format))
data.iloc[:, column] = data.iloc[:, column].apply(
lambda x, time_format=time_format: datetime.strptime(x, time_format),
)
return data
except Exception:
pass
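
The rewritten lambda binds time_format through a default argument, the usual way to pin a loop variable inside a closure; here the lambda is applied immediately, so the change mainly makes the capture explicit and satisfies linters that flag loop variables used in closures. A toy sketch with made-up format strings, showing the difference once closures are called after the loop has finished:

# Late binding: every closure sees the final value of the loop variable.
late = [lambda: fmt for fmt in ("%Y-%m-%d", "%Y/%m/%d")]
print([f() for f in late])    # ['%Y/%m/%d', '%Y/%m/%d']

# Default-argument binding: each closure captures the value at definition time.
bound = [lambda fmt=fmt: fmt for fmt in ("%Y-%m-%d", "%Y/%m/%d")]
print([f() for f in bound])   # ['%Y-%m-%d', '%Y/%m/%d']
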
@@ -217,6 +220,7 @@ def process_txt(filepath: str, skip_rows: Optional[list], data: pd.DataFrame) ->
Dataframe with the data from the text file.
"""
if "http" in filepath:
logger.debug(f"Processing {filepath}")
new_data = pd.read_fwf(
filepath,
header=None if "sdo_spacecraft_night" in filepath else 0,
@@ -231,7 +235,7 @@ def process_txt(filepath: str, skip_rows: Optional[list], data: pd.DataFrame) ->
if len(new_data.columns) in [2, 3]:
new_data = _process_data(new_data, filepath)
elif len(new_data.columns) > 3:
print(f"Unexpected number of columns for {filepath}, dropping all but first two")
logger.debug(f"Unexpected number of columns for {filepath}, dropping all but first two")
new_data = new_data.iloc[:, [0, 1]]
new_data.columns = ["Start Time", "End Time"]
try:
@@ -290,7 +294,6 @@ def process_html(url: str, data: pd.DataFrame) -> pd.DataFrame:
new_rows = rows[0].text.split("\n\n")
# Time is one single element whereas each event text is a separate element
dates, text = new_rows[0].strip().split("\n"), new_rows[1:-1]
# new_rows = [f"{date} {comment}" for date, comment in zip(dates, text)]
instrument = ["HMI" if "HMI" in new_row else "AIA" if "AIA" in new_row else "SDO" for new_row in text]
comment = [new_row.replace("\n", " ") for new_row in text]
start_dates = [(_format_date(_clean_date(date), year)) for date in dates]
@@ -341,7 +344,7 @@ def scrape_url(url: str) -> list:
list
List of all the urls scraped.
"""
base_url = os.path.dirname(url)
base_url = str(Path(url).parent).replace("https:/", "https://")
request = requests.get(url)
soup = BeautifulSoup(request.text, "html.parser")
urls = []
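
The new base_url expression works around pathlib collapsing the double slash in the URL scheme: Path treats the string as a POSIX path, so "https://" comes back as "https:/" and the .replace() restores it. A small sketch with a placeholder URL, assuming a POSIX-style path as on the Linux CI runner:

from pathlib import Path

url = "https://example.invalid/archive/list.txt"
print(str(Path(url).parent))
# https:/example.invalid/archive   (the empty path component is dropped)
print(str(Path(url).parent).replace("https:/", "https://"))
# https://example.invalid/archive
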
@@ -406,8 +409,8 @@ def drop_duplicates(data: pd.DataFrame) -> pd.DataFrame:
if __name__ == "__main__":
final_timeline = pd.DataFrame(columns=["Start Time", "End Time", "Instrument", "Source", "Comment"])
for dataset_name, block in DATASETS.items():
print(f"Scraping {dataset_name}")
print(f"{len(final_timeline.index)} rows so far")
logger.info(f"Scraping {dataset_name}")
logger.info(f"{len(final_timeline.index)} rows so far")
urls = [block.get("URL")]
if block.get("SCRAPE"):
urls = scrape_url(block["URL"])
@@ -419,23 +422,23 @@ def drop_duplicates(data: pd.DataFrame) -> pd.DataFrame:
else:
urls = [block["fURL"].format(f"20{i:02}") for i in block["RANGE"]]
for url in sorted(urls):
print(f"Parsing {url}")
logger.info(f"Parsing {url}")
if "txt" in url:
final_timeline = process_txt(url, block.get("SKIP_ROWS"), final_timeline)
elif "html" in url:
final_timeline = process_html(url, final_timeline)
else:
raise ValueError(f"Unknown file type for {url}")

print(f"{len(final_timeline.index)} rows in total")
logger.info(f"{len(final_timeline.index)} rows in total")
final_timeline = final_timeline.sort_values("Start Time")
final_timeline = final_timeline.reset_index(drop=True)
final_timeline["End Time"] = final_timeline["End Time"].fillna("Unknown")
final_timeline["Instrument"] = final_timeline["Instrument"].fillna("SDO")
final_timeline["Comment"] = final_timeline["Comment"].fillna("No Comment")
final_timeline = drop_duplicates(final_timeline)
print(f"{len(final_timeline.index)} rows in after deduplication")
logger.info(f"{len(final_timeline.index)} rows in after deduplication")
today_date = pd.Timestamp("today").strftime("%Y%m%d")
final_timeline.to_csv(f"timeline_{today_date}.csv", index=False)
final_timeline.to_csv(f"timeline_{today_date}.txt", sep="\t", index=False)
print(f"Files were saved to {os.getcwd()}")
logger.info(f"Files were saved to {Path.cwd()}")
