change naming of dataset to samples
This semantically makes more sense, since the BIDS "datasets" contain SPIM
samples. Note that we still use the word "dataset" in other local contexts,
e.g. for BigStitcher (dataset.xml, fuse_dataset) or the OME-Zarr multiscales
dataset. But at least this resolves the naming conflict in the direct inputs
and outputs of the workflow.
akhanf committed Oct 8, 2024
1 parent ee43d44 commit 77eed24
Showing 19 changed files with 85 additions and 80 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -3,3 +3,7 @@ resources/**
logs/**
.snakemake
.snakemake/**
__pycache__
.ipynb_checkpoints
benchmarks
*.swp
2 changes: 1 addition & 1 deletion README.md
@@ -40,7 +40,7 @@ python3.11 -m venv venv
source venv/bin/activate
```

3. Update the `config/datasets.tsv` spreadsheet to point to your dataset(s). Each dataset's tif files should be in its own folder or tar file, with no other tif files. Enter the path to each dataset in the `dataset_path` column. The first three columns identify the subject, sample, and acquisition, which become part of the resulting filenames (BIDS naming). The `stain_0` and `stain_1` columns identify which stains were used for each channel. Use `autof` to indicate the autofluorescence channel. If you have a different number of stains you can add or remove these columns. If your samples have different numbers of stains, you can leave values blank or use `n/a` to indicate that a sample does not have a particular stain.
3. Update the `config/samples.tsv` spreadsheet to point to your sample(s). Each sample's tif files should be in its own folder or tar file, with no other tif files. Enter the path to each sample in the `sample_path` column. The first three columns identify the subject, sample, and acquisition, which become part of the resulting filenames (BIDS naming). The `stain_0` and `stain_1` columns identify which stains were used for each channel. Use `autof` to indicate the autofluorescence channel. If you have a different number of stains you can add or remove these columns. If your samples have different numbers of stains, you can leave values blank or use `n/a` to indicate that a sample does not have a particular stain.

Note: The acquisition value must contain either `blaze` or `prestitched`, and defines which workflow will be used. For example, for LifeCanvas data that is already stitched, you need to include `prestitched` in the acquisition flag.

2 changes: 1 addition & 1 deletion config/config.yml
@@ -1,4 +1,4 @@
datasets: 'config/datasets.tsv'
samples: 'config/samples.tsv'

root: 'bids' # can use a s3:// or gcs:// prefix to write output to cloud storage
work: 'work'
3 changes: 0 additions & 3 deletions config/datasets.tsv

This file was deleted.

3 changes: 3 additions & 0 deletions config/samples.tsv
@@ -0,0 +1,3 @@
subject sample acq stain_0 stain_1 stain_2 sample_path
mouse1 brain blaze Lectin PI Abeta .test/dryrun/data
lifecanvas1 brain prestitched PI Abeta n/a .test/dryrun/data
14 changes: 8 additions & 6 deletions pyproject.toml
@@ -29,12 +29,14 @@ xmltodict = "^0.13.0"
tifffile = "^2024.5.10"

[tool.poe.tasks]
test_localin_gcsout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root='gcs://khanlab-lightsheet/data/test_bids'"
test_localin_localout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root=bids"
test_gcsin_gcsout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_gcs.tsv root='gcs://khanlab-lightsheet/data/test_bids'"
test_gcsin_localout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_gcs.tsv root=bids"
test_localin_localout_zipstore = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root=bids use_zipstore=True"
test_localin_gcsout_zipstore = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root='gcs://khanlab-lightsheet/data/test_bids' use_zipstore=True"
test_localin_gcsout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root='gcs://khanlab-lightsheet/data/test_bids'"
test_localin_localout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root=bids"
test_gcsin_gcsout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_gcs.tsv root='gcs://khanlab-lightsheet/data/test_bids'"
test_gcsin_localout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_gcs.tsv root=bids"
test_localin_localout_zipstore = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root=bids use_zipstore=True"
test_localin_gcsout_zipstore = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root='gcs://khanlab-lightsheet/data/test_bids' use_zipstore=True"
test_gcsout=["test_localin_gcsout", "test_gcsin_gcsout", "test_localin_gcsout_zipstore"]
test_localout=["test_localin_localout", "test_gcsin_localout", "test_localin_localout_zipstore"]

[build-system]
requires = ["poetry-core"]
2 changes: 0 additions & 2 deletions testing/dryrun_tests/datasets_local.tsv

This file was deleted.

2 changes: 1 addition & 1 deletion testing/dryrun_tests/datasets_gcs.tsv → testing/dryrun_tests/samples_gcs.tsv
@@ -1,2 +1,2 @@
subject sample acq stain_0 stain_1 stain_2 dataset_path
subject sample acq stain_0 stain_1 stain_2 sample_path
mouse1 brain blaze Lectin PI Abeta gcs://my-bucket/test_data
2 changes: 2 additions & 0 deletions testing/dryrun_tests/samples_local.tsv
@@ -0,0 +1,2 @@
subject sample acq stain_0 stain_1 stain_2 sample_path
mouse1 brain blaze Lectin PI Abeta .test/dryrun/data
4 changes: 2 additions & 2 deletions workflow/Snakefile
@@ -21,9 +21,9 @@ resampled = Path(root) / "derivatives" / "resampled"
# this is needed to use the latest bids spec with the pre-release snakebids
set_bids_spec("v0_11_0")

# read datasets tsv
# read samples tsv
dtype = defaultdict(lambda: str, num_tiles=int)
datasets = pd.read_csv(config["datasets"], sep="\t", dtype=dtype)
samples = pd.read_csv(config["samples"], sep="\t", dtype=dtype)


include: "rules/common.smk"
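For context on the dtype used here: passing a `defaultdict` means every column of the samples TSV is read as a string unless explicitly overridden (only `num_tiles` is parsed as an integer), so subject IDs such as `01` keep their leading zeros. A minimal standalone sketch of that behaviour, assuming pandas ≥ 1.1 (which supports `defaultdict` dtypes); the inline rows are hypothetical:

```python
# Sketch only -- illustrates the dtype=defaultdict(...) pattern used in the Snakefile.
from collections import defaultdict
from io import StringIO

import pandas as pd

dtype = defaultdict(lambda: str, num_tiles=int)  # default to str, num_tiles to int

tsv = StringIO("subject\tsample\tnum_tiles\n01\tbrain\t4\n")  # hypothetical rows
samples = pd.read_csv(tsv, sep="\t", dtype=dtype)

print(samples.dtypes)              # subject/sample -> object (str), num_tiles -> int64
print(samples.loc[0, "subject"])   # '01' -- leading zero preserved because it's a string
```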
2 changes: 1 addition & 1 deletion workflow/rules/bids.smk
@@ -53,7 +53,7 @@ rule bids_samples_json:

rule create_samples_tsv:
params:
datasets_df=datasets,
samples_df=samples,
output:
tsv=bids_toplevel(root, "samples.tsv"),
log:
83 changes: 41 additions & 42 deletions workflow/rules/common.smk
@@ -85,7 +85,7 @@ def get_extension_ome_zarr():
# targets
def get_all_targets():
targets = []
for i in range(len(datasets)):
for i in range(len(samples)):
targets.extend(
expand_bids(
root=root,
@@ -95,9 +95,9 @@ def get_all_targets():
acq="{acq}",
suffix="SPIM.{extension}",
expand_kwargs=dict(
subject=datasets.loc[i, "subject"],
sample=datasets.loc[i, "sample"],
acq=datasets.loc[i, "acq"],
subject=samples.loc[i, "subject"],
sample=samples.loc[i, "sample"],
acq=samples.loc[i, "acq"],
extension=[get_extension_ome_zarr(), "json"],
),
)
@@ -113,9 +113,9 @@ def get_all_targets():
stain="{stain}",
suffix="SPIM.nii",
expand_kwargs=dict(
subject=datasets.loc[i, "subject"],
sample=datasets.loc[i, "sample"],
acq=datasets.loc[i, "acq"],
subject=samples.loc[i, "subject"],
sample=samples.loc[i, "sample"],
acq=samples.loc[i, "acq"],
level=config["nifti"]["levels"],
stain=get_stains_by_row(i),
),
@@ -127,12 +127,12 @@
def get_all_subj_html(wildcards):
htmls = []

for i in range(len(datasets)):
for i in range(len(samples)):
html = "{root}/qc/sub-{subject}_sample-{sample}_acq-{acq}/subject.html".format(
root=root,
subject=datasets.loc[i, "subject"],
sample=datasets.loc[i, "sample"],
acq=datasets.loc[i, "acq"],
subject=samples.loc[i, "subject"],
sample=samples.loc[i, "sample"],
acq=samples.loc[i, "acq"],
)
htmls.append(remote_file(html))

@@ -157,51 +157,50 @@ def get_qc_targets():
return targets


def dataset_is_remote(wildcards):
return is_remote_gcs(Path(get_dataset_path(wildcards)))
def sample_is_remote(wildcards):
return is_remote_gcs(Path(get_sample_path(wildcards)))


def get_input_dataset(wildcards):
"""returns path to extracted dataset or path to provided input folder"""
dataset_path = Path(get_dataset_path(wildcards))
suffix = dataset_path.suffix
def get_input_sample(wildcards):
"""returns path to extracted sample or path to provided input folder"""
sample_path = Path(get_sample_path(wildcards))

if is_remote_gcs(dataset_path):
if is_remote_gcs(sample_path):
return rules.cp_from_gcs.output.ome_dir.format(**wildcards)

if dataset_path.is_dir():
return get_dataset_path_remote(wildcards)
if sample_path.is_dir():
return get_sample_path_remote(wildcards)

elif tarfile.is_tarfile(dataset_path):
# dataset was a tar file, so point to the extracted folder
return rules.extract_dataset.output.ome_dir.format(**wildcards)
elif tarfile.is_tarfile(sample_path):
# sample was a tar file, so point to the extracted folder
return rules.extract_sample.output.ome_dir.format(**wildcards)

else:
print(f"unsupported input: {dataset_path}")
print(f"unsupported input: {sample_path}")


def get_metadata_json(wildcards):
"""returns path to metadata, extracted from local or gcs"""
dataset_path = Path(get_dataset_path(wildcards))
sample_path = Path(get_sample_path(wildcards))

if is_remote_gcs(dataset_path):
if is_remote_gcs(sample_path):
return rules.blaze_to_metadata_gcs.output.metadata_json.format(**wildcards)
else:
return rules.blaze_to_metadata.output.metadata_json.format(**wildcards)


# import
def cmd_extract_dataset(wildcards, input, output):
def cmd_extract_sample(wildcards, input, output):
cmds = []

# supports tar, tar.gz, tgz, or folder name
dataset_path = Path(input.dataset_path)
suffix = dataset_path.suffix
if dataset_path.is_dir():
sample_path = Path(input.sample_path)
suffix = sample_path.suffix
if sample_path.is_dir():
# we have a directory
print("input directory not copied/extracted by this rule")

elif tarfile.is_tarfile(dataset_path):
elif tarfile.is_tarfile(sample_path):
# we have a tar file
# check if gzipped:
cmds.append(f"mkdir -p {output}")
@@ -211,41 +210,41 @@ def cmd_extract_dataset(wildcards, input, output):
cmds.append(f"tar -xf {input} -C {output}")

else:
print(f"unsupported input: {dataset_path}")
print(f"unsupported input: {sample_path}")

return " && ".join(cmds)


def get_dataset_path_remote(wildcards):
path = get_dataset_path(wildcards)
def get_sample_path_remote(wildcards):
path = get_sample_path(wildcards)
if is_remote(path):
return storage(path)
else:
return path


def get_dataset_path_gs(wildcards):
path = Path(get_dataset_path(wildcards)).path
def get_sample_path_gs(wildcards):
path = Path(get_sample_path(wildcards)).path
return f"gs://{path}"


def get_dataset_path(wildcards):
df = datasets.query(
def get_sample_path(wildcards):
df = samples.query(
f"subject=='{wildcards.subject}' and sample=='{wildcards.sample}' and acq=='{wildcards.acq}'"
)
return df.dataset_path.to_list()[0]
return df.sample_path.to_list()[0]


def get_stains_by_row(i):
# Select columns that match the pattern 'stain_'
stain_columns = datasets.filter(like="stain_").columns
stain_columns = samples.filter(like="stain_").columns

# Select values for a given row
return datasets.loc[i, stain_columns].dropna().tolist()
return samples.loc[i, stain_columns].dropna().tolist()


def get_stains(wildcards):
df = datasets.query(
df = samples.query(
f"subject=='{wildcards.subject}' and sample=='{wildcards.sample}' and acq=='{wildcards.acq}'"
)

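To make the renamed helpers concrete, here is a minimal standalone sketch (hypothetical data mirroring `config/samples.tsv`) of what `get_sample_path()` and `get_stains_by_row()` compute from the `samples` table: the first filters rows by the subject/sample/acq wildcards, the second keeps only the `stain_*` columns and drops missing stains:

```python
# Sketch only -- mimics the pandas lookups in common.smk with a hypothetical table.
import pandas as pd

samples = pd.DataFrame(
    {
        "subject": ["mouse1", "lifecanvas1"],
        "sample": ["brain", "brain"],
        "acq": ["blaze", "prestitched"],
        "stain_0": ["Lectin", "PI"],
        "stain_1": ["PI", "Abeta"],
        "stain_2": ["Abeta", None],  # missing stain (blank / n/a in the TSV)
        "sample_path": [".test/dryrun/data", ".test/dryrun/data"],
    }
)

# get_sample_path(): select the row matching the subject/sample/acq wildcards
row = samples.query("subject=='mouse1' and sample=='brain' and acq=='blaze'")
print(row.sample_path.to_list()[0])  # '.test/dryrun/data'

# get_stains_by_row(): keep only stain_* columns for a row and drop missing values
stain_columns = samples.filter(like="stain_").columns
print(samples.loc[1, stain_columns].dropna().tolist())  # ['PI', 'Abeta']
```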
4 changes: 2 additions & 2 deletions workflow/rules/flatfield_corr.smk
@@ -8,7 +8,7 @@ rule fit_basic_flatfield_corr:
datatype="micr",
sample="{sample}",
acq="{acq}",
desc="rawfromgcs" if dataset_is_remote(wildcards) else "raw",
desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
suffix="SPIM.zarr",
).format(**wildcards),
params:
@@ -70,7 +70,7 @@ rule apply_basic_flatfield_corr:
datatype="micr",
sample="{sample}",
acq="{acq}",
desc="rawfromgcs" if dataset_is_remote(wildcards) else "raw",
desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
suffix="SPIM.zarr",
).format(**wildcards),
model_dirs=lambda wildcards: expand(
18 changes: 9 additions & 9 deletions workflow/rules/import.smk
@@ -1,8 +1,8 @@
rule extract_dataset:
rule extract_sample:
input:
dataset_path=get_dataset_path_remote,
sample_path=get_sample_path_remote,
params:
cmd=cmd_extract_dataset,
cmd=cmd_extract_sample,
output:
ome_dir=temp(
directory(
@@ -23,7 +23,7 @@ rule extract_dataset:
bids(
root="logs",
subject="{subject}",
datatype="extract_dataset",
datatype="extract_sample",
sample="{sample}",
acq="{acq}",
desc="raw",
@@ -37,7 +37,7 @@ rule blaze_to_metadata_gcs:
input:
creds=os.path.expanduser(config["remote_creds"]),
params:
dataset_path=get_dataset_path_gs,
sample_path=get_sample_path_gs,
in_tif_pattern=lambda wildcards: config["import_blaze"]["raw_tif_pattern"],
storage_provider_settings=workflow.storage_provider_settings,
output:
@@ -78,7 +78,7 @@ rule blaze_to_metadata_gcs:

rule blaze_to_metadata:
input:
ome_dir=get_input_dataset,
ome_dir=get_input_sample,
output:
metadata_json=temp(
bids(
@@ -144,7 +144,7 @@ rule copy_blaze_metadata:

rule prestitched_to_metadata:
input:
ome_dir=get_input_dataset,
ome_dir=get_input_sample,
params:
physical_size_x_um=config["import_prestitched"]["physical_size_x_um"],
physical_size_y_um=config["import_prestitched"]["physical_size_y_um"],
@@ -189,7 +189,7 @@ rule tif_to_zarr:
output shape is (tiles,channels,z,y,x), with the 2d
images as the chunks"""
input:
ome_dir=get_input_dataset,
ome_dir=get_input_sample,
metadata_json=rules.copy_blaze_metadata.output.metadata_json,
params:
intensity_rescaling=config["import_blaze"]["intensity_rescaling"],
@@ -244,7 +244,7 @@ rule tif_to_zarr_gcs:
metadata_json=rules.copy_blaze_metadata.output.metadata_json,
creds=os.path.expanduser(config["remote_creds"]),
params:
dataset_path=get_dataset_path_gs,
sample_path=get_sample_path_gs,
in_tif_pattern=lambda wildcards: config["import_blaze"]["raw_tif_pattern"],
intensity_rescaling=config["import_blaze"]["intensity_rescaling"],
storage_provider_settings=workflow.storage_provider_settings,
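As a quick illustration of what the renamed `cmd_extract_sample` helper hands to the `extract_sample` rule's shell: for a plain `.tar` input it joins a `mkdir` and a `tar -xf` into one command string. The paths below are hypothetical, and the gzipped-tarball branch is elided in the diff above:

```python
# Sketch only -- the command string cmd_extract_sample builds for a plain .tar input.
cmds = []
output = "work/sub-mouse1_sample-brain_acq-blaze_desc-raw_SPIM"  # hypothetical output dir
input_tar = "/data/mouse1_brain.tar"                             # hypothetical sample_path

cmds.append(f"mkdir -p {output}")
cmds.append(f"tar -xf {input_tar} -C {output}")

print(" && ".join(cmds))
# mkdir -p work/... && tar -xf /data/mouse1_brain.tar -C work/...
```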
2 changes: 1 addition & 1 deletion workflow/rules/ome_zarr.smk
@@ -52,7 +52,7 @@ rule zarr_to_ome_zarr:
rule tif_stacks_to_ome_zarr:
input:
**get_storage_creds(),
tif_dir=get_input_dataset,
tif_dir=get_input_sample,
metadata_json=rules.prestitched_to_metadata.output.metadata_json,
params:
in_tif_glob=lambda wildcards, input: os.path.join(
4 changes: 2 additions & 2 deletions workflow/rules/qc.smk
@@ -20,7 +20,7 @@ rule generate_flatfield_qc:
datatype="micr",
sample="{sample}",
acq="{acq}",
desc="rawfromgcs" if dataset_is_remote(wildcards) else "raw",
desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
suffix="SPIM.zarr",
).format(**wildcards),
corr=bids(
@@ -190,7 +190,7 @@ rule generate_aggregate_qc:
report_html=config["report"]["resources"]["report_html"],
subj_htmls=get_all_subj_html,
params:
datasets=datasets,
samples=samples,
output:
total_html=remote_file(Path(root) / "qc" / "qc_report.html"),
log:
4 changes: 2 additions & 2 deletions workflow/scripts/blaze_to_metadata_gcs.py
@@ -9,13 +9,13 @@
import gcsfs
from lib.cloud_io import get_fsspec

dataset_uri = snakemake.params.dataset_path
sample_uri = snakemake.params.sample_path

gcsfs_opts={'project': snakemake.params.storage_provider_settings['gcs'].get_settings().project,
'token': snakemake.input.creds}
fs = gcsfs.GCSFileSystem(**gcsfs_opts)

tifs = fs.glob(f"{dataset_uri}/*.tif")
tifs = fs.glob(f"{sample_uri}/*.tif")

#check first tif file to see if it is zstack or not:
if 'xyz-Table Z' in Path(tifs[0]).name:
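The `sample_uri` above comes from `get_sample_path_gs()`, which turns the `gcs://` prefix used in `samples.tsv` into the `gs://` form that gcsfs expects. A rough sketch of that hand-off (bucket, project, and credential paths are hypothetical, and the glob needs real GCS access to return anything):

```python
# Sketch only -- hypothetical gcs:// sample_path mapped to a gs:// URI and globbed.
import gcsfs

sample_path = "gcs://my-bucket/test_data"               # as written in samples.tsv
sample_uri = "gs://" + sample_path.split("://", 1)[1]   # 'gs://my-bucket/test_data'

fs = gcsfs.GCSFileSystem(project="my-project", token="/path/to/creds.json")
tifs = fs.glob(f"{sample_uri}/*.tif")  # object names like 'my-bucket/test_data/...tif'
```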
2 changes: 1 addition & 1 deletion workflow/scripts/create_samples_tsv.py
@@ -1,6 +1,6 @@
import pandas as pd

df = snakemake.params.datasets_df
df = snakemake.params.samples_df

df['participant_id'] = 'sub-' + df['subject']
df['sample_id'] = 'sample-' + df['sample']
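For completeness, a tiny sketch (hypothetical row) of what `create_samples_tsv.py` derives from the renamed `samples_df` param: the BIDS-style `participant_id` and `sample_id` columns written to the top-level `samples.tsv`:

```python
# Sketch only -- how subject/sample become BIDS participant_id/sample_id.
import pandas as pd

df = pd.DataFrame({"subject": ["mouse1"], "sample": ["brain"]})

df["participant_id"] = "sub-" + df["subject"]   # 'sub-mouse1'
df["sample_id"] = "sample-" + df["sample"]      # 'sample-brain'

print(df[["participant_id", "sample_id"]].to_csv(sep="\t", index=False))
```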
(remaining changed files not shown)