Johnson Flu A: hack together a rough P2RA estimate

naobservatory · Mar 15, 2024 · c969331 · c969331
1 parent 1e78084
commit c969331
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 6,562 deletions.
diff --git a/fit.py b/fit.py
@@ -37,7 +37,7 @@ def start(num_samples: int, plot: bool) -> None:
     ) in predictors_by_taxid():
         taxids_str = "_".join(str(t) for t in taxids)
         for study, bioprojects in target_bioprojects.items():
-            enrichment = None if study == "brinch" else Enrichment.VIRAL
+            enrichment = None
             model = stats.build_model(
                 mgs_data,
                 bioprojects,

diff --git a/fits_summary.tsv b/fits_summary.tsv
diff --git a/input.tsv b/input.tsv
diff --git a/mgs.py b/mgs.py
@@ -1,3 +1,4 @@
+import os
 import json
 import urllib.request
 from collections import Counter
@@ -14,7 +15,7 @@
 
 MGS_REPO_DEFAULTS = {
     "user": "naobservatory",
-    "repo": "mgs-pipeline",
+    "repo": "mgs-restricted",
     "ref": "data-2023-07-21",
 }
 
@@ -23,10 +24,8 @@
 
 
 target_bioprojects = {
-    "crits_christoph": [BioProject("PRJNA661613")],
-    "rothman": [BioProject("PRJNA729801")],
-    "spurbeck": [BioProject("PRJNA924011")],
-    "brinch": [BioProject("PRJEB13832"), BioProject("PRJEB34633")],
+    "johnson": [BioProject("MJ-2024-02-12"),
+                BioProject("MJ-2024-03-04")],
 }
 
 
@@ -37,18 +36,9 @@ class GitHubRepo:
     ref: str
 
     def get_file(self, path: str) -> str:
-        file_url = (
-            f"https://raw.githubusercontent.com/"
-            f"{self.user}/{self.repo}/{self.ref}/{path}"
-        )
-        with urllib.request.urlopen(file_url) as response:
-            if response.status == 200:
-                return response.read()
-            else:
-                raise ValueError(
-                    f"Failed to download {file_url}. "
-                    f"Response status code: {response.status}"
-                )
+        with open(os.path.expanduser(
+                f"~/code/{self.repo}/{path}")) as inf:
+            return inf.read()
 
 
 def load_bioprojects(repo: GitHubRepo) -> dict[BioProject, list[Sample]]:
@@ -68,20 +58,33 @@ class SampleAttributes(BaseModel):
     country: str
     state: Optional[str] = None
     county: Optional[str] = None
-    location: str
+    location: Optional[str] = None
     fine_location: Optional[str] = None
     # Fixme: Not all the dates are real dates
     date: date | str
     reads: int
+    edta_treated: Optional[bool|str] = False
+    readlengths: Optional[dict] = None
+    ribofrac: Optional[float] = None
+    nuclease_treated: Optional[bool|str] = False
+    sample_volume_ml: Optional[str] = None
     enrichment: Optional[Enrichment] = None
+    concentration_method: Optional[str] = None
     method: Optional[str] = None
+    airport: Optional[str] = None
+    collection: Optional[str] = None
+
 
 
 def load_sample_attributes(repo: GitHubRepo) -> dict[Sample, SampleAttributes]:
     data = json.loads(repo.get_file("dashboard/metadata_samples.json"))
-    return {
-        Sample(s): SampleAttributes(**attribs) for s, attribs in data.items()
-    }
+    sa = {Sample(s): SampleAttributes(**attribs)
+          for s, attribs in data.items()}
+    for a in sa.values():
+        a.location = "n/a"
+        a.fine_location = "n/a"
+
+    return sa
 
 
 SampleCounts = dict[TaxID, dict[Sample, int]]
@@ -153,14 +156,14 @@ def sample_attributes(
         samples = {
             s: self.sample_attrs[s] for s in self.bioprojects[bioproject]
         }
-        if enrichment:
-            return {
-                s: attrs
-                for s, attrs in samples.items()
-                if attrs.enrichment == enrichment
-            }
-        else:
-            return samples
+        return {
+            s: attrs
+            for s, attrs in samples.items()
+            # remove them from the second run when it was True/False, but not
+            # the first run when it was Yes/No.  The second run it was much to
+            # agressive.
+            if not attrs.edta_treated == True
+        }
 
     def total_reads(self, bioproject: BioProject) -> dict[Sample, int]:
         return {

diff --git a/pathogens/influenza.py b/pathogens/influenza.py
@@ -140,6 +140,19 @@ def load_weekly_data() -> WeeklyData:
     source="https://www.cdc.gov/flu/about/burden/2021-2022.htm#:~:text=The%20overall%20burden%20of%20influenza%20(flu)%20for%20the%202021%2D2022%20season%20was%20an%20estimated%C2%A09%20million%C2%A0flu%20illnesses",
 )
 
+infections_2023_2024 = IncidenceAbsolute(
+    annual_infections=41_500_000,
+    confidence_interval=(29_000_000, 54_000_000),
+    # The estimate is only part of the year, but the code that works with these
+    # understands start and end dates.
+    start_date="2023-10-01",
+    end_date="2024-03-09",
+    tag="us-2023-2024",
+    country="United States",
+    # "CDC estimates that, from October 1, 2023 through March 9, 2024, there
+    # have been: 29 – 54 million flu illnesses"
+    source="https://www.cdc.gov/flu/about/burden/preliminary-in-season-estimates.htm"
+)
 
 def compare_incidence_to_positive_tests(
     official_incidence: IncidenceAbsolute, weekly_data: WeeklyData
@@ -190,6 +203,9 @@ def estimate_incidences() -> list[IncidenceRate]:
     underreporting_2021_2022 = compare_incidence_to_positive_tests(
         infections_2021_2022, weekly_data
     )
+    underreporting_2023_2024 = compare_incidence_to_positive_tests(
+        infections_2023_2024, weekly_data
+    )
 
     # The CDC didn't estimate an annual infections for 2020-2021, but assume
     # underreporting is the average of the two years we do have.
@@ -220,11 +236,16 @@ def get_underreporting(
             and start <= infections_2021_2022.parsed_start
         ):
             return underreporting_2020_2021
+        elif (
+            start >= infections_2023_2024.parsed_start
+            and start <= infections_2023_2024.parsed_end
+        ):
+            return underreporting_2023_2024
         else:
             return None
 
     incidences = []
-    for state in ["California", "Ohio"]:
+    for state in ["Missouri"]:
         for parsed_start in weekly_data[state]:
             positive_a, positive_b = weekly_data[state][parsed_start]
             parsed_end = parsed_start + datetime.timedelta(weeks=1)

diff --git a/populations.py b/populations.py
@@ -1,3 +1,5 @@
+import datetime
+import dataclasses
 from typing import Optional
 
 from pathogen_properties import Population, prevalence_data_filename
@@ -29,9 +31,17 @@ def load_location_populations():
 def us_population(
     year: int, county: Optional[str] = None, state: Optional[str] = None
 ) -> Population:
+
+    if year in [2023, 2024]:
+        pop_2022 = us_population(2022, county, state)
+        return dataclasses.replace(
+            pop_2022,
+            parsed_start=datetime.date(year, 7, 1),
+            parsed_end=datetime.date(year, 7, 1))
+
     if year not in [2020, 2021, 2022]:
         raise Exception("Unsupported year: %s" % year)
-
+    
     total_people = 0
     # All estimates are July 1st, specifically.
     pop_date = "%s-07-01" % year