Commit 691a5ff

Merge pull request #436 from ahernank/add-cohort-group-metadata
Access cohort group data
ahernank authored Mar 5, 2024
2 parents 0978276 + 65c7fc5 commit 691a5ff
Showing 20 changed files with 989 additions and 353 deletions.
1 change: 1 addition & 0 deletions docs/source/Af1.rst
@@ -54,6 +54,7 @@ Sample metadata access
     plot_samples_bar
     plot_samples_interactive_map
     wgs_data_catalog
+    cohorts
 
 SNP data access
 ---------------
1 change: 1 addition & 0 deletions docs/source/Ag3.rst
@@ -54,6 +54,7 @@ Sample metadata access
     plot_samples_bar
     plot_samples_interactive_map
     wgs_data_catalog
+    cohorts
 
 SNP data access
 ---------------
34 changes: 34 additions & 0 deletions malariagen_data/anoph/sample_metadata.py
@@ -1058,6 +1058,40 @@ def _setup_cohort_queries(
 
         return cohort_queries_checked
 
+    @check_types
+    @doc(
+        summary="""
+            Read data for a specific cohort set, including cohort size,
+            country code, taxon, administrative unit name, ISO code, geoBoundaries
+            shape ID and representative latitude and longitude points.
+        """,
+        parameters=dict(
+            cohort_set="""
+                A cohort set name. Accepted values are:
+                "admin1_month", "admin1_quarter", "admin1_year",
+                "admin2_month", "admin2_quarter", "admin2_year".
+            """
+        ),
+        returns="A dataframe of cohort data, one row per cohort.",
+    )
+    def cohorts(
+        self,
+        cohort_set: base_params.cohorts,
+    ) -> pd.DataFrame:
+        major_version_path = self._major_version_path
+        cohorts_analysis = self._cohorts_analysis
+
+        path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
+
+        # Read the cohorts CSV into a pandas dataframe.
+        with self.open_file(path) as f:
+            df_cohorts = pd.read_csv(f, sep=",", na_values="")
+
+        # Ensure all column names are lower case.
+        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]
+
+        return df_cohorts
+
 
 def locate_cohorts(*, cohorts, data):
     # Build cohort dictionary where key=cohort_id, value=loc_coh.
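The new cohorts() method is inherited by both the Ag3 and Af1 data resources, which is why it is added to both docs pages above. For Ag3, whose major version path is "v3", the method reads e.g. v3_cohorts/cohorts_20230516/cohorts_admin1_month.csv, the same layout the test fixture below mimics. A minimal usage sketch, assuming default cloud data access via the public Ag3 resource:

# Minimal usage sketch for the new cohorts() method in this PR;
# assumes default cloud data access via the public Ag3 resource.
import malariagen_data

ag3 = malariagen_data.Ag3()
df_cohorts = ag3.cohorts(cohort_set="admin1_month")

# One row per cohort; column names are lower-cased by the method.
print(len(df_cohorts))
print(df_cohorts.columns.tolist())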
25 changes: 22 additions & 3 deletions tests/anoph/conftest.py
@@ -1099,7 +1099,7 @@ def init_config(self):
             "SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/agamp4/Anopheles-gambiae-PEST_SEQANNOTATION_AgamP4.12.zarr",
             "DEFAULT_AIM_ANALYSIS": "20220528",
             "DEFAULT_SITE_FILTERS_ANALYSIS": "dt_20200416",
-            "DEFAULT_COHORTS_ANALYSIS": "20230223",
+            "DEFAULT_COHORTS_ANALYSIS": "20230516",
             "SITE_MASK_IDS": ["gamb_colu_arab", "gamb_colu", "arab"],
             "PHASING_ANALYSIS_IDS": ["gamb_colu_arab", "gamb_colu", "arab"],
             "COVERAGE_CALLS_ANALYSIS_IDS": ["gamb_colu", "arab"],
@@ -1258,7 +1258,7 @@ def write_metadata(self, release, release_path, sample_set, aim=True, cohorts=Tr
             / "vo_agam_release"
             / release_path
             / "metadata"
-            / "cohorts_20230223"
+            / "cohorts_20230516"
             / sample_set
             / "samples.cohorts.csv"
         )
@@ -1268,13 +1268,32 @@
             self.bucket_path
             / release_path
             / "metadata"
-            / "cohorts_20230223"
+            / "cohorts_20230516"
             / sample_set
             / "samples.cohorts.csv"
         )
         dst_path.parent.mkdir(parents=True, exist_ok=True)
         df_coh_ds.to_csv(dst_path, index=False)
 
+        # Create cohorts data by sampling from some real files.
+        src_path = (
+            self.fixture_dir
+            / "vo_agam_release"
+            / "v3_cohorts"
+            / "cohorts_20230516"
+            / "cohorts_admin1_month.csv"
+        )
+        dst_path = (
+            self.bucket_path
+            / "v3_cohorts"
+            / "cohorts_20230516"
+            / "cohorts_admin1_month.csv"
+        )
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(src_path, mode="r") as src, open(dst_path, mode="w") as dst:
+            for line in src.readlines()[:5]:
+                print(line, end="", file=dst)  # lines already include "\n"
+
         # Create data catalog by sampling from some real metadata files.
         src_path = (
             self.fixture_dir
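Since only the first five lines of the real CSV are copied, the fixture holds the header plus four data rows. A hypothetical sanity check; the local path is an assumption mirroring dst_path above:

# Hypothetical check of the truncated cohorts fixture; the path is an
# assumption mirroring dst_path in the diff above.
import pandas as pd

df = pd.read_csv("bucket/v3_cohorts/cohorts_20230516/cohorts_admin1_month.csv")
assert len(df) == 4  # 5 copied lines = 1 header + 4 data rows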
