Quick patch CO, rework VA
stucka committed Dec 5, 2024
1 parent af15c7a commit ea0796b
Showing 2 changed files with 27 additions and 14 deletions.
1 change: 1 addition & 0 deletions warn/scrapers/co.py
@@ -123,6 +123,7 @@ def scrape(
"Occupations Impacted": "occupations",
"Occupations": "occupations",
"Select the workforce area": "workforce_area",
"Total CO": "jobs",
"Total number of permanent layoffs": "permanent_job_losses",
"Total number of temporary layoffs": "temporary_job_losses",
"Total number of furloughs": "furloughs",
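
For context, the Colorado change adds one more source header ("Total CO") to the scraper's header-mapping dictionary. Below is a minimal sketch of how such a map is typically applied to normalize raw column names into the project's standardized field names; the names header_map and normalize_row are illustrative, not taken from the repository.

# A minimal sketch (not the scraper's actual code) of applying a header map:
# each raw column name from the source sheet is translated to the project's
# standardized field name before rows are emitted.
header_map = {
    "Occupations Impacted": "occupations",
    "Occupations": "occupations",
    "Select the workforce area": "workforce_area",
    "Total CO": "jobs",  # newly mapped header variant from the Colorado source
    "Total number of permanent layoffs": "permanent_job_losses",
}


def normalize_row(raw_row: dict) -> dict:
    """Rename a source row's columns using the header map, keeping unknown columns as-is."""
    return {header_map.get(key, key): value for key, value in raw_row.items()}


print(normalize_row({"Total CO": 120, "Occupations": "Manufacturing"}))
# -> {'jobs': 120, 'occupations': 'Manufacturing'}
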
40 changes: 26 additions & 14 deletions warn/scrapers/va.py
@@ -1,11 +1,12 @@
import logging
from pathlib import Path

from bs4 import BeautifulSoup, Tag

from .. import utils
from ..cache import Cache

# from bs4 import BeautifulSoup, Tag


__authors__ = ["zstumgoren", "Dilcia19", "shallotly"]
__tags__ = ["html", "csv"]
__source__ = {
@@ -29,26 +30,37 @@ def scrape(
Returns: the Path where the file is written
"""
# This scraper initially tried to get a CSV download link that was only for the most recent entries. The scraping part of that broke.
# It's now hard-coded to a particular download link with parameters that should get the full thing.

# This may break again, but this revised attempt has far fewer moving parts and actually fetches the complete data set.
# Blame Stucka in December 2024.

# Get the WARN page
url = "https://www.vec.virginia.gov/warn-notices"
r = utils.get_url(url, verify=False)
html = r.text
# url = "https://www.vec.virginia.gov/warn-notices"
# url = "https://vec.virginia.gov/warn-notices?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
# r = utils.get_url(url, verify=True)
# html = r.text

# Save it to the cache
cache = Cache(cache_dir)
cache.write("va/source.html", html)
# cache.write("va/source.html", html)

# Parse out the CSV download link
soup = BeautifulSoup(html, "html.parser")
csv_link = soup.find("a", text="Download")
if isinstance(csv_link, Tag):
csv_href = csv_link["href"]
else:
raise ValueError("Could not find CSV link")
csv_url = f"https://www.vec.virginia.gov{csv_href}"
# soup = BeautifulSoup(html, "html.parser")
# csv_link = soup.find("a", text="Download")
# if isinstance(csv_link, Tag):
# csv_href = csv_link["href"]
# else:
# raise ValueError("Could not find CSV link")

# csv_href = "/warn-notices-csv.csv?"
# csv_url = f"https://www.vec.virginia.gov{csv_href}"

csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"

# Download it to the cache
cache.download("va/source.csv", csv_url, verify=False)
cache.download("va/source.csv", csv_url, verify=True)

# Open it up as a list of rows
csv_rows = cache.read_csv("va/source.csv")
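
To illustrate the reworked Virginia flow, here is a standalone sketch that assumes only the hard-coded CSV export URL shown in the diff. The project's own Cache.download and cache.read_csv helpers are replaced with requests and the standard csv module so the example runs on its own; everything else is illustrative.

# A standalone sketch of the revised VA flow: fetch the full historical CSV
# export directly, then parse it into rows. This approximates what the
# scraper's cache.download("va/source.csv", csv_url) and cache.read_csv() do.
import csv
import io

import requests

CSV_URL = (
    "https://vec.virginia.gov/warn-notices-csv.csv"
    "?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990"
    "&field_notice_date_value%5Bmax%5D%5Bdate%5D="
    "&field_region_warn_tid=All"
)

response = requests.get(CSV_URL, timeout=60)  # certificate verification on by default
response.raise_for_status()

# Parse the download into a list of rows, mirroring cache.read_csv().
rows = list(csv.reader(io.StringIO(response.text)))
print(f"Downloaded {len(rows) - 1} WARN notices")
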
