d3b-center · HuangXiaoyan0106 · Aug 27, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 23, 2024
diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py
@@ -1,6 +1,7 @@
 import json
 import argparse
 import pandas as pd
+import re
 
 # Define a function to perform validation
 def validate_row(row, rules):
@@ -18,7 +19,11 @@ def validate_row(row, rules):
                 op_value = consequence.get("equals")
                 is_empty = consequence.get("empty")
 
-                cell_value = str(row.get(col)).lower()
+                cell_value = row.get(col)
+                if is_empty and pd.isna(cell_value):
+                    error_messages.append(f"*{col}*: cannot be empty.")
+                else:
+                    cell_value = str(cell_value).lower()
 
                 if op_value != "" and op_value is not None:
                     allowed_values = op_value.split(",")
@@ -29,9 +34,6 @@ def validate_row(row, rules):
                         if cell_value != op_value.lower():
                             error_messages.append(f"*{col}*: must be {op_value}.")
 
-                if is_empty and not cell_value:
-                    error_messages.append(f"*{col}*: cannot be empty.")
-
                 # Check if file_name ends with a valid extension
                 if col == "file_name" and "ends_with" in consequence:
                     format = conditions[0].get("equals")
@@ -43,14 +45,30 @@ def validate_row(row, rules):
                 if col == "file_size" and row.get("file_format", "").lower() in ["fastq", "bam", "cram"]:
                     greater_than_value = consequence.get("greater_than")
                     if greater_than_value:
-                        try:
-                            file_size_in_gb = float(row.get(col, 0)) / (1024 * 1024 * 1024)  # Convert to GB
-                            if file_size_in_gb <= float(greater_than_value.rstrip(" GB")):
-                                error_messages.append(f"Warning: *{col}* less than {greater_than_value}")
-                        except ValueError:
-                            error_messages.append(f"*{col}* is not a valid numeric value")
+                        experiment = row.get("experiment_strategy", "").lower()
+                        if experiment in ["wgs", "wxs", "wes"]:
+                            greater_than_value = "1 GB"
+                            minum_value = 1_000_000_000 # WGS/WXS/WES files should be at least 1 GB (1,000,000,000 bytes)
+                        else:
+                            greater_than_value = consequence.get("greater_than")
+                            minum_value = float(greater_than_value.rstrip("M"))*1000_000 # Other experiment strategies should be at least the threshold from the rules, given in megabytes (e.g. "200M").
+
+                        if pd.notna(cell_value):
+                            fize_size_str = str(cell_value).lower()
 
+                            # Supported file size formats: plain bytes, megabytes, or gigabytes (e.g. 100, 100M, 100MB, 10G, 10GB).
+                            try:
+                                if 'm' in fize_size_str:
+                                    size_byte = float(re.sub('[^0-9.]', '', fize_size_str)) * 1_000_000
+                                elif 'g' in fize_size_str:
+                                    size_byte = float(re.sub('[^0-9.]', '', fize_size_str)) * 1_000_000_000
+                                else:
+                                    size_byte = float(fize_size_str)
+                                if size_byte < minum_value:
+                                    error_messages.append(f"Warning: *{col}* less than {greater_than_value}")
 
+                            except ValueError:
+                                error_messages.append(f"*{col}*: {fize_size_str} is not a valid value")
 
     if error_messages:
         return False, error_messages  # Return all error messages for this row

diff --git a/data/example_manifest.csv b/data/example_manifest.csv
@@ -1,3 +1,3 @@
 sample_id,aliquot_id,tissue_type,file_name,file_format,file_size,file_hash_type,file_hash_value,sequencing_center,platform,instrument_model,experiment_strategy,library_selection,library_strand,target_capture_kit_name,target_capture_kit_link,is_paired_end,read_pair_number,flow_cell_barcode,lane_number,is_adapter_trimmed,adapter_sequencing,total_reads,mean_coverage,reference_genome
-test1,1549608,Tumor,test1.fq.gz,FASTQ,30000000000,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,H0164ALXX140820,1,FALSE,AGATCGGAAGAGC,1000000,30X,
+test1,1549608,Tumor,test1.fq.gz,FASTQ,300M,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,H0164ALXX140820,1,FALSE,AGATCGGAAGAGC,1000000,30X,
 test2,15480,Tumor,test2.bam,BAM,1100000001,md5,284799447,Broad,complete genomics,Illumina HiSeq X Ten,wxs,Hybrid Selection,Not Applicable,,,TRUE,R1,,,TRUE,,1000000,30X,
diff --git a/data/validation_rules.json b/data/validation_rules.json
@@ -125,7 +125,7 @@
       "consequences": [
         {
           "column": "file_size",
-          "greater_than": "1 GB",
+          "greater_than": "200M",
           "valid": true
         }
       ]
@@ -377,7 +377,7 @@
       "consequences": [
         {
           "column": "file_size",
-          "greater_than": "1 GB",
+          "greater_than": "200M",
           "valid": true
         }
       ]