Skip to content

Commit

Permalink
Move VA to JS-aware browser; add UTF-8 compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
stucka committed Dec 22, 2024
1 parent 546d8f9 commit aad3a2b
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 26 deletions.
2 changes: 1 addition & 1 deletion warn/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def read_csv(self, name):
"""
path = Path(self.path, name)
logger.debug(f"Reading CSV from cache {path}")
with open(path) as fh:
with open(path, encoding="utf-8") as fh:
return list(csv.reader(fh))

def download(
Expand Down
69 changes: 46 additions & 23 deletions warn/scrapers/va.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import logging
import os
from glob import glob
from pathlib import Path
from shutil import copyfile
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from .. import utils
from ..cache import Cache

# from bs4 import BeautifulSoup, Tag


__authors__ = ["zstumgoren", "Dilcia19", "shallotly"]
__authors__ = ["zstumgoren", "Dilcia19", "shallotly", "stucka"]
__tags__ = ["html", "csv"]
__source__ = {
"name": "Virginia Employment Commission",
Expand Down Expand Up @@ -35,32 +41,49 @@ def scrape(

# This may break again, but this revised attempt has far fewer moving parts and actually fetches the complete data set.
# Blame Stucka in December 2024.
# And it broke again in December 2024, but not even Stucka will blame Stucka for this mess.

# Get the WARN page
# url = "https://www.vec.virginia.gov/warn-notices"
# url = "https://vec.virginia.gov/warn-notices?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
# r = utils.get_url(url, verify=True)
# html = r.text

# Save it to the cache
cache = Cache(cache_dir)
# cache.write("va/source.html", html)
# csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"

csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"

# driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
logger.debug("Attempting to launch Chrome")
chromeoptionsholder = ChromeOptions()
chrome_install = ChromeDriverManager().install()
folder = os.path.dirname(chrome_install)
chromedriver_path = os.path.join(folder, "chromedriver.exe")
service = ChromeService(chromedriver_path)
driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
logger.debug(f"Attempting to fetch {csv_url}")
driver.get(csv_url)

sleep(25)

logger.debug(driver.page_source)

# get the user download folder (NOTE: relies on USERPROFILE, which only exists on Windows;
# on macOS/Linux os.getenv returns None and os.path.join will raise TypeError)
downLoadFolder = os.path.join(os.getenv("USERPROFILE"), "Downloads") # type: ignore
# get the list of files
list_of_files = glob(downLoadFolder + "/*.csv")
# get the latest file name
latest_file = max(list_of_files, key=os.path.getctime)
# print the latest file name
logger.debug(f"CSV saved to {latest_file}")

target_filename = cache_dir / "va" / "source.csv"

utils.create_directory(path=cache_dir / "va", is_file=False)

# Parse out the CSV download link
# soup = BeautifulSoup(html, "html.parser")
# csv_link = soup.find("a", text="Download")
# if isinstance(csv_link, Tag):
# csv_href = csv_link["href"]
# else:
# raise ValueError("Could not find CSV link")
logger.debug(f"Saving file to {target_filename}")

# csv_href = "/warn-notices-csv.csv?"
# csv_url = f"https://www.vec.virginia.gov{csv_href}"
copyfile(latest_file, target_filename)

csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
driver.quit()

# Download it to the cache
cache.download("va/source.csv", csv_url, verify=True)
# cache.download("va/source.csv", csv_url, verify=True)

# Open it up as a list of rows
csv_rows = cache.read_csv("va/source.csv")
Expand Down
4 changes: 2 additions & 2 deletions warn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def save_if_good_url(filename, url, **kwargs):
success_flag = False
content = False
else:
with open(filename, "wb") as outfile:
with open(filename, "wb") as outfile:  # binary mode takes no encoding= (ValueError if passed); response.content is bytes
outfile.write(response.content)
success_flag = True
content = response.content
Expand All @@ -104,7 +104,7 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
"""
create_directory(output_path, is_file=True)
logger.debug(f"Writing {len(rows)} rows to {output_path}")
with open(output_path, mode, newline="") as f:
with open(output_path, mode, newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerows(rows)

Expand Down

0 comments on commit aad3a2b

Please sign in to comment.