From c608ea08db936389f448b2c7afb8d0fe21f74c16 Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 20 Aug 2024 14:37:56 +0800 Subject: [PATCH 1/7] :bug: fix validation: file_size --- .../modules/validation/check_manifest.py | 38 ++++++++++++++----- data/example_manifest.csv | 2 +- data/validation_rules.json | 4 +- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index ffe44f4..c625a75 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -1,6 +1,7 @@ import json import argparse import pandas as pd +import re # Define a function to perform validation def validate_row(row, rules): @@ -18,7 +19,11 @@ def validate_row(row, rules): op_value = consequence.get("equals") is_empty = consequence.get("empty") - cell_value = str(row.get(col)).lower() + cell_value = row.get(col) + if is_empty and pd.isna(cell_value): + error_messages.append(f"*{col}*: cannot be empty.") + else: + cell_value = str(cell_value).lower() if op_value != "" and op_value is not None: allowed_values = op_value.split(",") @@ -29,9 +34,6 @@ def validate_row(row, rules): if cell_value != op_value.lower(): error_messages.append(f"*{col}*: must be {op_value}.") - if is_empty and not cell_value: - error_messages.append(f"*{col}*: cannot be empty.") - # Check if file_name ends with a valid extension if col == "file_name" and "ends_with" in consequence: format = conditions[0].get("equals") @@ -43,14 +45,30 @@ def validate_row(row, rules): if col == "file_size" and row.get("file_format", "").lower() in ["fastq", "bam", "cram"]: greater_than_value = consequence.get("greater_than") if greater_than_value: - try: - file_size_in_gb = float(row.get(col, 0)) / (1024 * 1024 * 1024) # Convert to GB - if file_size_in_gb <= float(greater_than_value.rstrip(" GB")): - error_messages.append(f"Warning: *{col}* less than {greater_than_value}") - except ValueError: - error_messages.append(f"*{col}* is not a valid numeric value") + experiment = row.get("experiment_strategy", "").lower() + if experiment in ["wgs", "wxs", "wes"]: + greater_than_value = "1 GB" + minum_value = 1_000_000_000 # WGS/WXS should be greater than 1G + else: + greater_than_value = consequence.get("greater_than") + minum_value = float(greater_than_value.rstrip("M"))*1000_000 # Other experimental strategy should be greater than the specified value. + + if pd.notna(cell_value): + fize_size_str = str(cell_value).lower() + # support file size formats in bytes, megabytes, and gigabytes: 100, 100M, 100MB, 10G, 10GB + try: + if 'm' in fize_size_str: + size_byte = float(re.sub('[^0-9.]', '', fize_size_str)) * 1_000_000 + elif 'g' in fize_size_str: + size_byte = float(re.sub('[^0-9.]', '', fize_size_str)) * 1_000_000_000 + else: + size_byte = float(fize_size_str) + if size_byte < minum_value: + error_messages.append(f"Warning: *{col}* less than {greater_than_value}") + except ValueError: + error_messages.append(f"*{col}*: {fize_size_str} is not a valid numeric value") if error_messages: return False, error_messages # Return all error messages for this row diff --git a/data/example_manifest.csv b/data/example_manifest.csv index 298afdf..1ce13f3 100644 --- a/data/example_manifest.csv +++ b/data/example_manifest.csv @@ -1,3 +1,3 @@ sample_id,aliquot_id,tissue_type,file_name,file_format,file_size,file_hash_type,file_hash_value,sequencing_center,platform,instrument_model,experiment_strategy,library_selection,library_strand,target_capture_kit_name,target_capture_kit_link,is_paired_end,read_pair_number,flow_cell_barcode,lane_number,is_adapter_trimmed,adapter_sequencing,total_reads,mean_coverage,reference_genome -test1,1549608,Tumor,test1.fq.gz,FASTQ,30000000000,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,H0164ALXX140820,1,FALSE,AGATCGGAAGAGC,1000000,30X, +test1,1549608,Tumor,test1.fq.gz,FASTQ,300M,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,H0164ALXX140820,1,FALSE,AGATCGGAAGAGC,1000000,30X, test2,15480,Tumor,test2.bam,BAM,1100000001,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,,,TRUE,,1000000,30X, \ No newline at end of file diff --git a/data/validation_rules.json b/data/validation_rules.json index d24d7fd..772e584 100644 --- a/data/validation_rules.json +++ b/data/validation_rules.json @@ -125,7 +125,7 @@ "consequences": [ { "column": "file_size", - "greater_than": "1 GB", + "greater_than": "200M", "valid": true } ] @@ -377,7 +377,7 @@ "consequences": [ { "column": "file_size", - "greater_than": "1 GB", + "greater_than": "200M", "valid": true } ] From 0352fb789d6c74e176d7c8ad7b15b845c86b4662 Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 20 Aug 2024 15:01:25 +0800 Subject: [PATCH 2/7] :pencil2: update doc --- d3b_dff_cli/modules/validation/check_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index c625a75..43bfb25 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -68,7 +68,7 @@ def validate_row(row, rules): error_messages.append(f"Warning: *{col}* less than {greater_than_value}") except ValueError: - error_messages.append(f"*{col}*: {fize_size_str} is not a valid numeric value") + error_messages.append(f"*{col}*: {fize_size_str} is not a valid value") if error_messages: return False, error_messages # Return all error messages for this row From 2e56e1e8e462e9d7f06642269c9d3716b40f6b4d Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Fri, 23 Aug 2024 22:07:32 +0800 Subject: [PATCH 3/7] :rewind: revert file size in Bytes --- d3b_dff_cli/modules/validation/check_manifest.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index 43bfb25..0183359 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -54,21 +54,13 @@ def validate_row(row, rules): minum_value = float(greater_than_value.rstrip("M"))*1000_000 # Other experimental strategy should be greater than the specified value. if pd.notna(cell_value): - fize_size_str = str(cell_value).lower() - - # support file size formats in bytes, megabytes, and gigabytes: 100, 100M, 100MB, 10G, 10GB + size_byte = float(cell_value) try: - if 'm' in fize_size_str: - size_byte = float(re.sub('[^0-9.]', '', fize_size_str)) * 1_000_000 - elif 'g' in fize_size_str: - size_byte = float(re.sub('[^0-9.]', '', fize_size_str)) * 1_000_000_000 - else: - size_byte = float(fize_size_str) if size_byte < minum_value: error_messages.append(f"Warning: *{col}* less than {greater_than_value}") except ValueError: - error_messages.append(f"*{col}*: {fize_size_str} is not a valid value") + error_messages.append(f"*{col}*: {size_byte} is not a valid value") if error_messages: return False, error_messages # Return all error messages for this row From 8c9b87f8c628406b8d3736d4a5d52448b7200e5e Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Fri, 23 Aug 2024 22:13:48 +0800 Subject: [PATCH 4/7] :pencil2: fix typo --- d3b_dff_cli/modules/validation/check_manifest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index 0183359..2e9269d 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -54,13 +54,13 @@ def validate_row(row, rules): minum_value = float(greater_than_value.rstrip("M"))*1000_000 # Other experimental strategy should be greater than the specified value. if pd.notna(cell_value): - size_byte = float(cell_value) try: + size_byte = float(cell_value) if size_byte < minum_value: error_messages.append(f"Warning: *{col}* less than {greater_than_value}") except ValueError: - error_messages.append(f"*{col}*: {size_byte} is not a valid value") + error_messages.append(f"*{col}*: {cell_value} is not a valid value") if error_messages: return False, error_messages # Return all error messages for this row From 5abce080a69039626a00752120fe813007f85f2d Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Fri, 23 Aug 2024 22:52:47 +0800 Subject: [PATCH 5/7] :zap: variable threshold --- d3b_dff_cli/modules/validation/check_manifest.py | 15 +++++++-------- data/validation_rules.json | 8 ++++++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index 2e9269d..f5411a2 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -43,21 +43,20 @@ def validate_row(row, rules): # Check if file_format is "FASTQ," "BAM," or "CRAM" and file_size > specified value if col == "file_size" and row.get("file_format", "").lower() in ["fastq", "bam", "cram"]: - greater_than_value = consequence.get("greater_than") - if greater_than_value: + general_cutoff = consequence.get("general_byte_cutoff") + wgs_wxs_cutoff = consequence.get("wgs_wxs_byte_cutoff") + if general_cutoff: experiment = row.get("experiment_strategy", "").lower() if experiment in ["wgs", "wxs", "wes"]: - greater_than_value = "1 GB" - minum_value = 1_000_000_000 # WGS/WXS should be greater than 1G + minum_value = float(wgs_wxs_cutoff) else: - greater_than_value = consequence.get("greater_than") - minum_value = float(greater_than_value.rstrip("M"))*1000_000 # Other experimental strategy should be greater than the specified value. - + minum_value = float(general_cutoff) + if pd.notna(cell_value): try: size_byte = float(cell_value) if size_byte < minum_value: - error_messages.append(f"Warning: *{col}* less than {greater_than_value}") + error_messages.append(f"Warning: *{col}* less than {minum_value}") except ValueError: error_messages.append(f"*{col}*: {cell_value} is not a valid value") diff --git a/data/validation_rules.json b/data/validation_rules.json index 772e584..74a180b 100644 --- a/data/validation_rules.json +++ b/data/validation_rules.json @@ -125,8 +125,11 @@ "consequences": [ { "column": "file_size", - "greater_than": "200M", + "_comment": "The threshold for WGS/WXS is 1GB, and 200MB for others.", + "general_byte_cutoff": "200000000", + "wgs_wxs_byte_cutoff": "1000000000", "valid": true + } ] } @@ -377,7 +380,8 @@ "consequences": [ { "column": "file_size", - "greater_than": "200M", + "general_byte_cutoff": "200,000,000", + "wgs_wxs_byte_cutoff": "1,000,000,000", "valid": true } ] From 5a5574c10a9bb08147c5bae9506fb747c35e569c Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Fri, 23 Aug 2024 22:55:32 +0800 Subject: [PATCH 6/7] :rewind: revert example --- data/example_manifest.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/example_manifest.csv b/data/example_manifest.csv index 1ce13f3..298afdf 100644 --- a/data/example_manifest.csv +++ b/data/example_manifest.csv @@ -1,3 +1,3 @@ sample_id,aliquot_id,tissue_type,file_name,file_format,file_size,file_hash_type,file_hash_value,sequencing_center,platform,instrument_model,experiment_strategy,library_selection,library_strand,target_capture_kit_name,target_capture_kit_link,is_paired_end,read_pair_number,flow_cell_barcode,lane_number,is_adapter_trimmed,adapter_sequencing,total_reads,mean_coverage,reference_genome -test1,1549608,Tumor,test1.fq.gz,FASTQ,300M,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,H0164ALXX140820,1,FALSE,AGATCGGAAGAGC,1000000,30X, +test1,1549608,Tumor,test1.fq.gz,FASTQ,30000000000,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,H0164ALXX140820,1,FALSE,AGATCGGAAGAGC,1000000,30X, test2,15480,Tumor,test2.bam,BAM,1100000001,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,,,TRUE,,1000000,30X, \ No newline at end of file From 79a62c7d3da6b26799dbdd79b3755f9d26eabc5d Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Fri, 23 Aug 2024 22:57:14 +0800 Subject: [PATCH 7/7] :bulb: add comments in rules --- data/validation_rules.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/validation_rules.json b/data/validation_rules.json index 74a180b..9edd17f 100644 --- a/data/validation_rules.json +++ b/data/validation_rules.json @@ -380,8 +380,9 @@ "consequences": [ { "column": "file_size", - "general_byte_cutoff": "200,000,000", - "wgs_wxs_byte_cutoff": "1,000,000,000", + "_comment": "The threshold for WGS/WXS is 1GB, and 200MB for others.", + "general_byte_cutoff": "200000000", + "wgs_wxs_byte_cutoff": "1000000000", "valid": true } ]