From 9de492cf26b351d93ec1c0b37b1847f4c8e5d3a2 Mon Sep 17 00:00:00 2001 From: bethac07 Date: Thu, 29 Jun 2023 08:12:08 -0400 Subject: [PATCH 1/2] Add extra args to collate to handle edge cases --- pycytominer/cyto_utils/collate.py | 63 ++++++++++++++++++++++++--- pycytominer/cyto_utils/collate_cmd.py | 36 +++++++++++++++ 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/pycytominer/cyto_utils/collate.py b/pycytominer/cyto_utils/collate.py index 58532dbb..d7f13d1f 100644 --- a/pycytominer/cyto_utils/collate.py +++ b/pycytominer/cyto_utils/collate.py @@ -32,6 +32,10 @@ def collate( add_image_features=True, image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"], printtoscreen=True, + append_metadata=False, + overwrite_metadata=False, + download_flags=[], + upload_flags=[], ): """Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database @@ -49,22 +53,30 @@ def collate( An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists munge : bool, default False Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table - csv_dir : str, default 'analysis' + csv_dir : str, default "analysis" The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis" aws_remote : str, optional, default None A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run aggregate_only : bool, default False Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps - tmp_dir: str, default '/tmp' + tmp_dir: str, default "/tmp" The temporary directory to be used by cytominer-databases for output overwrite: bool, optional, default False Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists add_image_features: bool, optional, default True Whether or not to add the image features to the profiles - image_feature_categories: list, optional, default ['Granularity','Texture','ImageQuality','Count','Threshold'] + image_feature_categories: list, optional, default ["Granularity","Texture","ImageQuality","Count","Threshold"] The list of image feature groups to be used by add_image_features during aggregation printtoscreen: bool, optional, default True Whether or not to print output to the terminal + append_metadata: bool, optional, default False + TODO + overwrite_metadata: bool, optional, default False + TODO + download_flags: list, optional, default [] + TODO + upload_flags: list, optional, default [] + TODO """ from pycytominer.cyto_utils.cells import SingleCells @@ -98,11 +110,16 @@ def collate( remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv" - sync_cmd = f"aws s3 sync --exclude * --include */Cells.csv --include */Nuclei.csv --include */Cytoplasm.csv --include */Image.csv {remote_input_dir} {input_dir}" + sync_cmd = ["aws", "s3", "sync", "--exclude", "*", "--include", "*/Cells.csv", "--include", + "*/Nuclei.csv", "--include", "*/Cytoplasm.csv", "--include", "*/Image.csv", remote_input_dir, + input_dir] + download_flags if printtoscreen: print(f"Downloading CSVs from {remote_input_dir} to {input_dir}") run_check_errors(sync_cmd) + if (overwrite_metadata or append_metadata): + find_and_fix_metadata(input_dir,overwrite=overwrite_metadata) + ingest_cmd = [ "cytominer-database", "ingest", @@ -159,7 +176,7 @@ def collate( if aws_remote: if printtoscreen: print(f"Uploading {cache_backend_file} to {remote_backend_file}") - cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file] + cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file] + upload_flags run_check_errors(cp_cmd) if printtoscreen: @@ -182,7 +199,7 @@ def collate( remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv" - cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file] + cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file] + download_flags if printtoscreen: print( f"Downloading SQLite files from {remote_backend_file} to {backend_file}" @@ -208,7 +225,7 @@ def collate( if aws_remote: if printtoscreen: print(f"Uploading {aggregated_file} to {remote_aggregated_file}") - csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file] + csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file] + upload_flags run_check_errors(csv_cp_cmd) if printtoscreen: @@ -216,3 +233,35 @@ def collate( import shutil shutil.rmtree(backend_dir) + +def find_and_fix_metadata(path_to_plate_folder,overwrite=False): + site_list = os.listdir(path_to_plate_folder) + for eachsite in site_list: + image_csv = os.path.join(path_to_plate_folder,eachsite,"Image.csv") + if os.path.exists(image_csv): + append_metadata(image_csv,overwrite) + + +def append_metadata(path_to_csv,overwrite=False): + import pandas as pd + all_meta = path_to_csv.split("/")[-2] + plate = "-".join(all_meta.split("-")[:-2]) + well = all_meta.split("-")[-2] + site = all_meta.split("-")[-1] + df = pd.read_csv(path_to_csv) + edited=False + if overwrite: + df.drop(columns=["Metadata_Plate","Metadata_Well","Metadata_Site"],inplace=True,errors="ignore") + edited=True + insertion_index=list(df.columns).index("ModuleError_01LoadData") + if "Metadata_Plate" not in list(df.columns): + df.insert(insertion_index,"Metadata_Plate",plate) + edited=True + if "Metadata_Well" not in list(df.columns): + df.insert(insertion_index,"Metadata_Well",well) + edited=True + if "Metadata_Site" not in list(df.columns): + df.insert(insertion_index,"Metadata_Site",site) + edited=True + if edited: + df.to_csv(path_to_csv,index=False) \ No newline at end of file diff --git a/pycytominer/cyto_utils/collate_cmd.py b/pycytominer/cyto_utils/collate_cmd.py index 0ccfcb9c..7e78b6fb 100644 --- a/pycytominer/cyto_utils/collate_cmd.py +++ b/pycytominer/cyto_utils/collate_cmd.py @@ -77,6 +77,38 @@ help="Whether to print status updates", ) + parser.add_argument( + "--append-metadata", + dest="append_metadata", + action="store_true", + default=False, + help="Whether or not to add imputed plate, well, and/or site metadata if it's missing", + ) + + parser.add_argument( + "--overwrite-metadata", + dest="overwrite_metadata", + action="store_true", + default=False, + help="Whether or not to add imputed plate, well, and/or site metadata, overwriting what's already there", + ) + + parser.add_argument( + "--download-flags", + dest="download_flags", + type=lambda s: [] if "," not in s else [item for item in s.split(",")], + default="", + help="Extra flags to pass to aws download commands. Multiple values can be passed in if comma separated with no spaces between them", + ) + + parser.add_argument( + "--upload-flags", + dest="upload_flags", + type=lambda s: [] if "," not in s else [item for item in s.split(",")], + default="", + help="Extra flags to pass to aws upload commands. Multiple values can be passed in if comma separated with no spaces between them", + ) + args = parser.parse_args() collate( @@ -94,4 +126,8 @@ add_image_features=args.add_image_features, image_feature_categories=args.image_feature_categories, printtoscreen=args.printtoscreen, + append_metadata=args.append_metadata, + overwrite_metadata=args.overwrite_metadata, + download_flags=args.download_flags, + upload_flags=args.upload_flags ) From d4147cd01be14db1c488db5139cf9ebf14df501a Mon Sep 17 00:00:00 2001 From: bethac07 Date: Thu, 29 Jun 2023 08:26:00 -0400 Subject: [PATCH 2/2] black --- pycytominer/cyto_utils/collate.py | 82 +++++++++++++++++++-------- pycytominer/cyto_utils/collate_cmd.py | 2 +- 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/pycytominer/cyto_utils/collate.py b/pycytominer/cyto_utils/collate.py index d7f13d1f..43086924 100644 --- a/pycytominer/cyto_utils/collate.py +++ b/pycytominer/cyto_utils/collate.py @@ -35,7 +35,7 @@ def collate( append_metadata=False, overwrite_metadata=False, download_flags=[], - upload_flags=[], + upload_flags=[], ): """Collate the CellProfiler-created CSVs into a single SQLite file by calling cytominer-database @@ -74,9 +74,9 @@ def collate( overwrite_metadata: bool, optional, default False TODO download_flags: list, optional, default [] - TODO + TODO upload_flags: list, optional, default [] - TODO + TODO """ from pycytominer.cyto_utils.cells import SingleCells @@ -110,15 +110,29 @@ def collate( remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv" - sync_cmd = ["aws", "s3", "sync", "--exclude", "*", "--include", "*/Cells.csv", "--include", - "*/Nuclei.csv", "--include", "*/Cytoplasm.csv", "--include", "*/Image.csv", remote_input_dir, - input_dir] + download_flags + sync_cmd = [ + "aws", + "s3", + "sync", + "--exclude", + "*", + "--include", + "*/Cells.csv", + "--include", + "*/Nuclei.csv", + "--include", + "*/Cytoplasm.csv", + "--include", + "*/Image.csv", + remote_input_dir, + input_dir, + ] + download_flags if printtoscreen: print(f"Downloading CSVs from {remote_input_dir} to {input_dir}") run_check_errors(sync_cmd) - if (overwrite_metadata or append_metadata): - find_and_fix_metadata(input_dir,overwrite=overwrite_metadata) + if overwrite_metadata or append_metadata: + find_and_fix_metadata(input_dir, overwrite=overwrite_metadata) ingest_cmd = [ "cytominer-database", @@ -176,7 +190,13 @@ def collate( if aws_remote: if printtoscreen: print(f"Uploading {cache_backend_file} to {remote_backend_file}") - cp_cmd = ["aws", "s3", "cp", cache_backend_file, remote_backend_file] + upload_flags + cp_cmd = [ + "aws", + "s3", + "cp", + cache_backend_file, + remote_backend_file, + ] + upload_flags run_check_errors(cp_cmd) if printtoscreen: @@ -225,7 +245,13 @@ def collate( if aws_remote: if printtoscreen: print(f"Uploading {aggregated_file} to {remote_aggregated_file}") - csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file] + upload_flags + csv_cp_cmd = [ + "aws", + "s3", + "cp", + aggregated_file, + remote_aggregated_file, + ] + upload_flags run_check_errors(csv_cp_cmd) if printtoscreen: @@ -234,34 +260,40 @@ def collate( shutil.rmtree(backend_dir) -def find_and_fix_metadata(path_to_plate_folder,overwrite=False): + +def find_and_fix_metadata(path_to_plate_folder, overwrite=False): site_list = os.listdir(path_to_plate_folder) for eachsite in site_list: - image_csv = os.path.join(path_to_plate_folder,eachsite,"Image.csv") + image_csv = os.path.join(path_to_plate_folder, eachsite, "Image.csv") if os.path.exists(image_csv): - append_metadata(image_csv,overwrite) + append_metadata(image_csv, overwrite) -def append_metadata(path_to_csv,overwrite=False): +def append_metadata(path_to_csv, overwrite=False): import pandas as pd + all_meta = path_to_csv.split("/")[-2] plate = "-".join(all_meta.split("-")[:-2]) well = all_meta.split("-")[-2] site = all_meta.split("-")[-1] df = pd.read_csv(path_to_csv) - edited=False + edited = False if overwrite: - df.drop(columns=["Metadata_Plate","Metadata_Well","Metadata_Site"],inplace=True,errors="ignore") - edited=True - insertion_index=list(df.columns).index("ModuleError_01LoadData") + df.drop( + columns=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + inplace=True, + errors="ignore", + ) + edited = True + insertion_index = list(df.columns).index("ModuleError_01LoadData") if "Metadata_Plate" not in list(df.columns): - df.insert(insertion_index,"Metadata_Plate",plate) - edited=True + df.insert(insertion_index, "Metadata_Plate", plate) + edited = True if "Metadata_Well" not in list(df.columns): - df.insert(insertion_index,"Metadata_Well",well) - edited=True + df.insert(insertion_index, "Metadata_Well", well) + edited = True if "Metadata_Site" not in list(df.columns): - df.insert(insertion_index,"Metadata_Site",site) - edited=True + df.insert(insertion_index, "Metadata_Site", site) + edited = True if edited: - df.to_csv(path_to_csv,index=False) \ No newline at end of file + df.to_csv(path_to_csv, index=False) diff --git a/pycytominer/cyto_utils/collate_cmd.py b/pycytominer/cyto_utils/collate_cmd.py index 7e78b6fb..b7e7a377 100644 --- a/pycytominer/cyto_utils/collate_cmd.py +++ b/pycytominer/cyto_utils/collate_cmd.py @@ -129,5 +129,5 @@ append_metadata=args.append_metadata, overwrite_metadata=args.overwrite_metadata, download_flags=args.download_flags, - upload_flags=args.upload_flags + upload_flags=args.upload_flags, )