Commit 691a5ff

Merge pull request #436 from ahernank/add-cohort-group-metadata
Access cohort group data
ahernank authored Mar 5, 2024
2 parents 0978276 + 65c7fc5 commit 691a5ff
Showing 20 changed files with 989 additions and 353 deletions.
1 change: 1 addition & 0 deletions docs/source/Af1.rst
@@ -54,6 +54,7 @@ Sample metadata access
     plot_samples_bar
     plot_samples_interactive_map
     wgs_data_catalog
+    cohorts
 
 SNP data access
 ---------------
1 change: 1 addition & 0 deletions docs/source/Ag3.rst
@@ -54,6 +54,7 @@ Sample metadata access
     plot_samples_bar
     plot_samples_interactive_map
     wgs_data_catalog
+    cohorts
 
 SNP data access
 ---------------
34 changes: 34 additions & 0 deletions malariagen_data/anoph/sample_metadata.py
@@ -1058,6 +1058,40 @@ def _setup_cohort_queries(
 
         return cohort_queries_checked
 
+    @check_types
+    @doc(
+        summary="""
+            Read data for a specific cohort set, including cohort size,
+            country code, taxon, administrative unit name, ISO code, geoBoundaries
+            shape ID and representative latitude and longitude points.
+        """,
+        parameters=dict(
+            cohort_set="""
+                A cohort set name. Accepted values are:
+                "admin1_month", "admin1_quarter", "admin1_year",
+                "admin2_month", "admin2_quarter", "admin2_year".
+            """
+        ),
+        returns="A dataframe of cohort data, one row per cohort.",
+    )
+    def cohorts(
+        self,
+        cohort_set: base_params.cohorts,
+    ) -> pd.DataFrame:
+        major_version_path = self._major_version_path
+        cohorts_analysis = self._cohorts_analysis
+
+        path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
+
+        # Read the cohorts CSV into a pandas dataframe.
+        with self.open_file(path) as f:
+            df_cohorts = pd.read_csv(f, sep=",", na_values="")
+
+        # Ensure all column names are lower case.
+        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]
+
+        return df_cohorts
+
 
 def locate_cohorts(*, cohorts, data):
     # Build cohort dictionary where key=cohort_id, value=loc_coh.
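The new cohorts() method is inherited by both the Ag3 and Af1 data resources, which is why it is added to both docs pages above. For Ag3, whose major version path is "v3", the method reads e.g. v3_cohorts/cohorts_20230516/cohorts_admin1_month.csv, the same layout the test fixture below mimics. A minimal usage sketch, assuming default cloud data access via the public Ag3 resource:

# Minimal usage sketch for the new cohorts() method in this PR;
# assumes default cloud data access via the public Ag3 resource.
import malariagen_data

ag3 = malariagen_data.Ag3()
df_cohorts = ag3.cohorts(cohort_set="admin1_month")

# One row per cohort; column names are lower-cased by the method.
print(len(df_cohorts))
print(df_cohorts.columns.tolist())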
25 changes: 22 additions & 3 deletions tests/anoph/conftest.py
@@ -1099,7 +1099,7 @@ def init_config(self):
             "SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/agamp4/Anopheles-gambiae-PEST_SEQANNOTATION_AgamP4.12.zarr",
             "DEFAULT_AIM_ANALYSIS": "20220528",
             "DEFAULT_SITE_FILTERS_ANALYSIS": "dt_20200416",
-            "DEFAULT_COHORTS_ANALYSIS": "20230223",
+            "DEFAULT_COHORTS_ANALYSIS": "20230516",
             "SITE_MASK_IDS": ["gamb_colu_arab", "gamb_colu", "arab"],
             "PHASING_ANALYSIS_IDS": ["gamb_colu_arab", "gamb_colu", "arab"],
             "COVERAGE_CALLS_ANALYSIS_IDS": ["gamb_colu", "arab"],
@@ -1258,7 +1258,7 @@ def write_metadata(self, release, release_path, sample_set, aim=True, cohorts=Tr
             / "vo_agam_release"
             / release_path
             / "metadata"
-            / "cohorts_20230223"
+            / "cohorts_20230516"
             / sample_set
             / "samples.cohorts.csv"
         )
@@ -1268,13 +1268,32 @@
             self.bucket_path
             / release_path
             / "metadata"
-            / "cohorts_20230223"
+            / "cohorts_20230516"
             / sample_set
             / "samples.cohorts.csv"
         )
         dst_path.parent.mkdir(parents=True, exist_ok=True)
         df_coh_ds.to_csv(dst_path, index=False)
 
+        # Create cohorts data by sampling from some real files.
+        src_path = (
+            self.fixture_dir
+            / "vo_agam_release"
+            / "v3_cohorts"
+            / "cohorts_20230516"
+            / "cohorts_admin1_month.csv"
+        )
+        dst_path = (
+            self.bucket_path
+            / "v3_cohorts"
+            / "cohorts_20230516"
+            / "cohorts_admin1_month.csv"
+        )
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(src_path, mode="r") as src, open(dst_path, mode="w") as dst:
+            for line in src.readlines()[:5]:
+                print(line, end="", file=dst)  # lines already include "\n"
+
         # Create data catalog by sampling from some real metadata files.
         src_path = (
             self.fixture_dir
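Since only the first five lines of the real CSV are copied, the fixture holds the header plus four data rows. A hypothetical sanity check; the local path is an assumption mirroring dst_path above:

# Hypothetical check of the truncated cohorts fixture; the path is an
# assumption mirroring dst_path in the diff above.
import pandas as pd

df = pd.read_csv("bucket/v3_cohorts/cohorts_20230516/cohorts_admin1_month.csv")
assert len(df) == 4  # 5 copied lines = 1 header + 4 data rows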
