Skip to content

Commit

Permalink
Johnson Flu A: hack together a rough P2RA estimate
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffkaufman committed Mar 15, 2024
1 parent 1e78084 commit c969331
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 6,562 deletions.
2 changes: 1 addition & 1 deletion fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def start(num_samples: int, plot: bool) -> None:
) in predictors_by_taxid():
taxids_str = "_".join(str(t) for t in taxids)
for study, bioprojects in target_bioprojects.items():
enrichment = None if study == "brinch" else Enrichment.VIRAL
enrichment = None
model = stats.build_model(
mgs_data,
bioprojects,
Expand Down
448 changes: 4 additions & 444 deletions fits_summary.tsv

Large diffs are not rendered by default.

6,114 changes: 28 additions & 6,086 deletions input.tsv

Large diffs are not rendered by default.

61 changes: 32 additions & 29 deletions mgs.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import json
import urllib.request
from collections import Counter
Expand All @@ -14,7 +15,7 @@

MGS_REPO_DEFAULTS = {
"user": "naobservatory",
"repo": "mgs-pipeline",
"repo": "mgs-restricted",
"ref": "data-2023-07-21",
}

Expand All @@ -23,10 +24,8 @@


target_bioprojects = {
"crits_christoph": [BioProject("PRJNA661613")],
"rothman": [BioProject("PRJNA729801")],
"spurbeck": [BioProject("PRJNA924011")],
"brinch": [BioProject("PRJEB13832"), BioProject("PRJEB34633")],
"johnson": [BioProject("MJ-2024-02-12"),
BioProject("MJ-2024-03-04")],
}


Expand All @@ -37,18 +36,9 @@ class GitHubRepo:
ref: str

def get_file(self, path: str) -> str:
file_url = (
f"https://raw.githubusercontent.com/"
f"{self.user}/{self.repo}/{self.ref}/{path}"
)
with urllib.request.urlopen(file_url) as response:
if response.status == 200:
return response.read()
else:
raise ValueError(
f"Failed to download {file_url}. "
f"Response status code: {response.status}"
)
with open(os.path.expanduser(
f"~/code/{self.repo}/{path}")) as inf:
return inf.read()


def load_bioprojects(repo: GitHubRepo) -> dict[BioProject, list[Sample]]:
Expand All @@ -68,20 +58,33 @@ class SampleAttributes(BaseModel):
country: str
state: Optional[str] = None
county: Optional[str] = None
location: str
location: Optional[str] = None
fine_location: Optional[str] = None
# Fixme: Not all the dates are real dates
date: date | str
reads: int
edta_treated: Optional[bool|str] = False
readlengths: Optional[dict] = None
ribofrac: Optional[float] = None
nuclease_treated: Optional[bool|str] = False
sample_volume_ml: Optional[str] = None
enrichment: Optional[Enrichment] = None
concentration_method: Optional[str] = None
method: Optional[str] = None
airport: Optional[str] = None
collection: Optional[str] = None



def load_sample_attributes(repo: GitHubRepo) -> dict[Sample, SampleAttributes]:
data = json.loads(repo.get_file("dashboard/metadata_samples.json"))
return {
Sample(s): SampleAttributes(**attribs) for s, attribs in data.items()
}
sa = {Sample(s): SampleAttributes(**attribs)
for s, attribs in data.items()}
for a in sa.values():
a.location = "n/a"
a.fine_location = "n/a"

return sa


SampleCounts = dict[TaxID, dict[Sample, int]]
Expand Down Expand Up @@ -153,14 +156,14 @@ def sample_attributes(
samples = {
s: self.sample_attrs[s] for s in self.bioprojects[bioproject]
}
if enrichment:
return {
s: attrs
for s, attrs in samples.items()
if attrs.enrichment == enrichment
}
else:
return samples
return {
s: attrs
for s, attrs in samples.items()
# remove them from the second run when it was True/False, but not
# the first run when it was Yes/No. The second run it was much to
# agressive.
if not attrs.edta_treated == True
}

def total_reads(self, bioproject: BioProject) -> dict[Sample, int]:
return {
Expand Down
23 changes: 22 additions & 1 deletion pathogens/influenza.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,19 @@ def load_weekly_data() -> WeeklyData:
source="https://www.cdc.gov/flu/about/burden/2021-2022.htm#:~:text=The%20overall%20burden%20of%20influenza%20(flu)%20for%20the%202021%2D2022%20season%20was%20an%20estimated%C2%A09%20million%C2%A0flu%20illnesses",
)

infections_2023_2024 = IncidenceAbsolute(
annual_infections=41_500_000,
confidence_interval=(29_000_000, 54_000_000),
# The estimate is only part of the year, but the code that works with these
# understands start and end dates.
start_date="2023-10-01",
end_date="2024-03-09",
tag="us-2023-2024",
country="United States",
# "CDC estimates that, from October 1, 2023 through March 9, 2024, there
# have been: 29 – 54 million flu illnesses"
source="https://www.cdc.gov/flu/about/burden/preliminary-in-season-estimates.htm"
)

def compare_incidence_to_positive_tests(
official_incidence: IncidenceAbsolute, weekly_data: WeeklyData
Expand Down Expand Up @@ -190,6 +203,9 @@ def estimate_incidences() -> list[IncidenceRate]:
underreporting_2021_2022 = compare_incidence_to_positive_tests(
infections_2021_2022, weekly_data
)
underreporting_2023_2024 = compare_incidence_to_positive_tests(
infections_2023_2024, weekly_data
)

# The CDC didn't estimate an annual infections for 2020-2021, but assume
# underreporting is the average of the two years we do have.
Expand Down Expand Up @@ -220,11 +236,16 @@ def get_underreporting(
and start <= infections_2021_2022.parsed_start
):
return underreporting_2020_2021
elif (
start >= infections_2023_2024.parsed_start
and start <= infections_2023_2024.parsed_end
):
return underreporting_2023_2024
else:
return None

incidences = []
for state in ["California", "Ohio"]:
for state in ["Missouri"]:
for parsed_start in weekly_data[state]:
positive_a, positive_b = weekly_data[state][parsed_start]
parsed_end = parsed_start + datetime.timedelta(weeks=1)
Expand Down
12 changes: 11 additions & 1 deletion populations.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import datetime
import dataclasses
from typing import Optional

from pathogen_properties import Population, prevalence_data_filename
Expand Down Expand Up @@ -29,9 +31,17 @@ def load_location_populations():
def us_population(
year: int, county: Optional[str] = None, state: Optional[str] = None
) -> Population:

if year in [2023, 2024]:
pop_2022 = us_population(2022, county, state)
return dataclasses.replace(
pop_2022,
parsed_start=datetime.date(year, 7, 1),
parsed_end=datetime.date(year, 7, 1))

if year not in [2020, 2021, 2022]:
raise Exception("Unsupported year: %s" % year)

total_people = 0
# All estimates are July 1st, specifically.
pop_date = "%s-07-01" % year
Expand Down

0 comments on commit c969331

Please sign in to comment.