From 5c46e0c519bfd4107f19c60da4a59d5fb573608a Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 10 Sep 2024 10:14:24 +0800 Subject: [PATCH 1/7] :truck: update cerberus validation --- MANIFEST.in | 1 + d3b_dff_cli/cli.py | 8 - .../validation/cerberus_custom_checks.py | 110 +++++ .../modules/validation/check_manifest.py | 184 ++++---- .../validation/validation_rules_schema.json | 430 ++++++++++++++++++ data/validation_rules_schema.json | 430 ++++++++++++++++++ setup.py | 1 + 7 files changed, 1067 insertions(+), 97 deletions(-) create mode 100644 MANIFEST.in create mode 100644 d3b_dff_cli/modules/validation/cerberus_custom_checks.py create mode 100644 d3b_dff_cli/modules/validation/validation_rules_schema.json create mode 100644 data/validation_rules_schema.json diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..2ce67ac --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include d3b_dff_cli/modules/validation/validation_rules_schema.json \ No newline at end of file diff --git a/d3b_dff_cli/cli.py b/d3b_dff_cli/cli.py index 443d74b..2c5ad9e 100644 --- a/d3b_dff_cli/cli.py +++ b/d3b_dff_cli/cli.py @@ -76,14 +76,6 @@ def create_parser(): manifest_parser = validation_subparsers.add_parser( "manifest", help="Manifest validation based on defined rules." 
import numbers
import warnings

import pandas as pd
from cerberus import Validator

# Suppress specific UserWarnings from Cerberus.
# NOTE(review): these warnings are presumably raised while the class below is
# being defined (its ``_validate_*`` methods carry no rule-argument schema in
# their docstrings), so keep this call above the class - confirm.
warnings.filterwarnings("ignore", category=UserWarning, module="cerberus.validator")

# Lower-cased experiment strategies that use the WGS/WXS file-size cutoff.
_WGS_WXS_STRATEGIES = ("wgs", "wxs", "wes")


def _is_missing(value):
    """Return True when *value* is None, NaN/NA or an empty string."""
    return value is None or pd.isna(value) or value == ''


def _is_integer(value):
    """Return True for integral numbers, including integer-valued floats.

    ``numbers.Integral`` is used instead of ``int`` so that NumPy integer
    scalars coming out of a DataFrame are accepted as well.
    """
    if isinstance(value, numbers.Integral):
        return True
    return isinstance(value, float) and value.is_integer()


class CustomValidator(Validator):
    """Validator for one manifest row, driven by the JSON rule schema.

    ``validate`` is overridden and walks the schema itself, applying the
    ``required``, ``allowed``, ``type`` and ``dependencies`` keys plus the
    custom file-name / file-size rules passed in as ``rules``.

    Args:
        schema: Mapping of field name to its rule dictionary.
        rules: Optional mapping with the custom rules that are read here:
            ``file_name_extensions`` and ``file_size_byte_cutoff``.
    """

    def __init__(self, schema, rules=None, *args, **kwargs):
        super().__init__(schema, *args, **kwargs)
        # NOTE(review): Cerberus appears to expose a class-level ``rules``
        # attribute too; this instance attribute shadows it, so it is set
        # only after the base initialiser has run - confirm this is intended.
        self.rules = rules if rules else {}

    def _validate_dependencies(self, dependencies, field, value):
        """
        Check if the field's dependencies are met. If not, skip validation for this field.

        Each dependency maps another field of the current document to either
        a single required value or a list of accepted values.

        Returns:
            True when every dependency is satisfied, False otherwise.
        """
        for dependency_field, allowed_values in dependencies.items():
            dependency_value = self.document.get(dependency_field)

            if isinstance(allowed_values, list):
                if dependency_value not in allowed_values:
                    return False  # Dependencies not met
            elif dependency_value != allowed_values:
                return False  # Dependencies not met

        # Dependencies are met, perform validation
        return True

    def _validate_field(self, field, value, field_schema):
        """
        Validate the field value based on its schema.

        Checks run in this order and stop at the first failure: required,
        allowed values, type, then the custom file_name / file_size rules.

        Args:
            field: Name of the field being validated.
            value: Value of that field in the current document.
            field_schema: Rule dictionary for the field.

        Returns:
            True when the value passes, False after recording an error.
        """
        field_type = field_schema.get('type')
        allowed_values = field_schema.get('allowed')

        # Validate required fields
        if _is_missing(value):
            if field_schema.get('required'):
                self._error(field, f"{field} is required.")
                return False
            # An optional field that is left empty has nothing to validate;
            # running the type checks on it would report a spurious error.
            return True

        # Handle validation for allowed values
        if allowed_values and value not in allowed_values:
            self._error(field, f"{field} must be one of {allowed_values}.")
            return False

        # Validate type
        if field_type == 'boolean':
            # Manifest values are compared as lower-cased strings.
            if value not in ['true', 'false']:
                self._error(field, f"{field} must be a boolean value.")
                return False
        elif field_type == 'integer':
            if not _is_integer(value):
                self._error(field, f"{field} must be of integer type.")
                return False
        elif field_type == 'string':
            if not isinstance(value, str):
                self._error(field, f"{field} must be of string type.")
                return False

        # Apply additional validation based on custom rules
        if field == 'file_name':
            file_format = self.document.get('file_format')
            # ``rules`` may be empty (it defaults to {}), so fall back to an
            # empty mapping instead of calling .get on None.
            extensions = self.rules.get('file_name_extensions') or {}
            expected_extension = (
                extensions.get(file_format) if isinstance(file_format, str) else None
            )
            if (
                expected_extension
                and isinstance(value, str)
                and not value.lower().endswith(expected_extension)
            ):
                self._error(field, f"{field} must end with {expected_extension} for file_format '{file_format}'.")
                return False

        if field == 'file_size':
            file_format = self.document.get('file_format')
            experiment = self.document.get("experiment_strategy")

            cutoff_rules = self.rules.get('file_size_byte_cutoff') or {}
            dependencies_format = (cutoff_rules.get('dependencies') or {}).get('file_format') or []

            if experiment in _WGS_WXS_STRATEGIES:
                minimum_value = cutoff_rules.get('wgs_wxs_cutoff')
            else:
                minimum_value = cutoff_rules.get('general_cutoff')

            # Only the listed file formats are size-checked, and only when a
            # cutoff is actually configured.
            if (
                minimum_value is not None
                and file_format in dependencies_format
                and value < minimum_value
            ):
                self._error(field, f"Warning: *{field}* must be at least {minimum_value} for file_format '{file_format}'.")
                return False
        return True

    def validate(self, document, *args, **kwargs):
        """
        Override validate method to ensure dependencies are respected.

        Every field of the schema is checked against *document*; a field
        whose ``dependencies`` are not met is skipped entirely. Extra
        positional and keyword arguments are accepted but not used.

        NOTE(review): nothing here clears errors recorded by an earlier
        call, so presumably a fresh instance should be used per document.

        Returns:
            True when no field failed, False otherwise.
        """
        self.document = document
        is_valid = True

        for field, field_schema in self.schema.items():
            value = document.get(field)
            dependencies = field_schema.get('dependencies')

            if dependencies and not self._validate_dependencies(dependencies, field, value):
                continue  # Skip validation if dependencies are not met

            # Perform validation if dependencies are met
            if not self._validate_field(field, value, field_schema):
                is_valid = False

        return is_valid
import json
import os

import pandas as pd

# The bundled rule schema lives next to this module.
wk_dir = os.path.dirname(os.path.abspath(__file__))
validation_schema = os.path.join(wk_dir, "validation_rules_schema.json")

# Lower-cased ``experiment_strategy`` value -> section of the rule schema.
# The legacy misspellings that earlier versions accepted are kept so such
# rows still get a per-field report instead of aborting the whole run.
_RULE_TYPE_BY_STRATEGY = {
    "wgs": "DNAseq_rules",
    "wxs": "DNAseq_rules",
    "wes": "DNAseq_rules",
    "targeted sequencing": "DNAseq_rules",
    "target sequencing": "DNAseq_rules",
    "panel": "DNAseq_rules",
    "target": "DNAseq_rules",
    "rna-seq": "RNAseq_rules",
    "rnaseq": "RNAseq_rules",
    "mirna-seq": "RNAseq_rules",
    "mirnaseq": "RNAseq_rules",
    "scrna-seq": "single_cell_rules",
    "snrna-seq": "single_cell_rules",
    "scrnaseq": "single_cell_rules",
    "snrnaseq": "single_cell_rules",
    "snran-seq": "single_cell_rules",  # legacy misspelling
    "snranseq": "single_cell_rules",  # legacy misspelling
    "methylation": "methylation_rules",
    "methylation microarray": "methylation_rules",
    "methtlation": "methylation_rules",  # legacy misspelling
}


def _lowercase_text_column(col):
    """Lower-case the text/boolean values of one column, keeping NaN as NaN."""
    types = pd.api.types
    if not (
        types.is_bool_dtype(col)
        or types.is_object_dtype(col)
        or types.is_string_dtype(col)
    ):
        return col
    # ``astype(str)`` would turn a missing cell into the literal string
    # 'nan', which then slips past "required" checks; map cell by cell.
    return col.map(lambda cell: cell if pd.isna(cell) else str(cell).lower())


def load_data(manifest_file):
    """Read a manifest into a DataFrame with lower-cased text values.

    Args:
        manifest_file: Path to a ``.csv``, ``.tsv``, ``.xls`` or ``.xlsx``
            file. A workbook with several sheets must contain a sheet
            named ``Genomics_Manifest``.

    Returns:
        The manifest as a DataFrame. Text and boolean columns are converted
        to lower-case strings; missing cells stay missing and numeric
        columns are left untouched.

    Raises:
        ValueError: For an unsupported extension, or a multi-sheet workbook
            without a ``Genomics_Manifest`` sheet.
    """
    extension = os.path.splitext(os.fspath(manifest_file))[1].lstrip(".").lower()

    if extension == "csv":
        manifest_data = pd.read_csv(manifest_file)
    elif extension == "tsv":
        manifest_data = pd.read_csv(manifest_file, delimiter="\t")
    elif extension in ("xls", "xlsx"):
        # The context manager closes the workbook handle when done.
        with pd.ExcelFile(manifest_file) as xlsx:
            if len(xlsx.sheet_names) == 1:
                manifest_data = pd.read_excel(xlsx)
            elif "Genomics_Manifest" in xlsx.sheet_names:
                manifest_data = pd.read_excel(xlsx, sheet_name="Genomics_Manifest")
            else:
                raise ValueError(f"Sheet 'Genomics_Manifest' not found in {manifest_file}")
    else:
        raise ValueError("Unsupported file format. Please provide a CSV, TSV, or Excel file.")

    # Convert manifest to lowercase so it compares equal to the lower-cased schema.
    return manifest_data.apply(_lowercase_text_column)


def convert_schema_to_lowercase(schema):
    """Lower-case every string value of *schema* in place and return it.

    Nested dictionaries (also those inside lists) are processed
    recursively. Keys are left unchanged; non-string values are kept.
    """
    for key, value in schema.items():
        if isinstance(value, dict):
            convert_schema_to_lowercase(value)
        elif isinstance(value, list):
            schema[key] = [
                convert_schema_to_lowercase(item) if isinstance(item, dict)
                else item.lower() if isinstance(item, str)
                else item
                for item in value
            ]
        elif isinstance(value, str):
            schema[key] = value.lower()
    return schema


def get_rule_type(platform, experiment_strategy):
    """Return the schema section to use for a row, or None if unsupported.

    Both arguments are expected in lower case. A ``pacbio`` platform always
    selects the long-read rules, whatever the experiment strategy is.
    """
    if platform == "pacbio":
        return "pacbio_longread_rules"
    return _RULE_TYPE_BY_STRATEGY.get(experiment_strategy)


def validate_data(df, schema_json):
    """Validate every manifest row against the rule set matching its type.

    Args:
        df: Manifest as returned by ``load_data``; it must provide the
            ``platform`` and ``experiment_strategy`` columns.
        schema_json: Lower-cased rule schema with a ``custom_rules`` entry
            and one entry per rule type.

    Returns:
        ``(valid, errors)`` where ``errors`` is a list of
        ``{'row': <index + 1>, 'errors': <validator errors>}`` dictionaries,
        one per failing row.

    Raises:
        ValueError: When a row's experiment_strategy maps to no rule set.
    """
    # Imported here so the pure helpers of this module stay importable
    # without the Cerberus-based validator.
    from .cerberus_custom_checks import CustomValidator

    valid = True
    errors = []
    custom_rules = schema_json['custom_rules']

    for index, row in df.iterrows():
        # Check the manifest type and select the appropriate validation rules
        rule_type = get_rule_type(row["platform"], row["experiment_strategy"])
        if rule_type is None:
            raise ValueError("Unsupported experiment_strategy for Row {0}".format(index + 1))
        schema = schema_json[rule_type]

        # Filter out fields not in the schema fields
        document = {k: v for k, v in row.to_dict().items() if k in schema}

        validator = CustomValidator(schema, custom_rules)
        if not validator.validate(document):
            valid = False
            errors.append({
                'row': index + 1,
                'errors': validator.errors
            })

    return valid, errors


def main(args):
    """Validate ``args.manifest_file`` and print a report to stdout."""
    with open(validation_schema, 'r', encoding='utf-8') as f:
        schema_json = convert_schema_to_lowercase(json.load(f))

    # Load and preprocess the data
    df = load_data(args.manifest_file)

    # Validate the data
    valid, errors = validate_data(df, schema_json)

    # Print validation report
    if valid:
        print("====Validation Passed====\n All rows are valid.")
        return

    # The run only counts as "warnings" when every message is a warning.
    only_warnings = all(
        "Warning" in message
        for error in errors
        for messages in error['errors'].values()
        for message in messages
    )
    if only_warnings:
        print("====Validation Warnings====")
    else:
        print("====Validation Failed====")

    for error in errors:
        print(f"Row {error['row']}:")
        for messages in error['errors'].values():
            for message in messages:
                print(f"  {message}")
__name__ == "__main__": parser = argparse.ArgumentParser(description="Validate a manifest based on defined rules.") - parser.add_argument("-rules", help="Formatted JSON file defining validation rules.", required=True) - parser.add_argument("-rule_type", help="Specific type of validation rule defined in the json rule file.") - parser.add_argument("-manifest_file", help="Manifest based on the d3b genomics manifest template.") + parser.add_argument("-manifest_file", required=True, help="Path to the manifest file (CSV/Excel).") args = parser.parse_args() main(args) \ No newline at end of file diff --git a/d3b_dff_cli/modules/validation/validation_rules_schema.json b/d3b_dff_cli/modules/validation/validation_rules_schema.json new file mode 100644 index 0000000..e28f795 --- /dev/null +++ b/d3b_dff_cli/modules/validation/validation_rules_schema.json @@ -0,0 +1,430 @@ +{ + "custom_rules":{ + "_comment": "Add custom fields that Cerberus doesn't recognize as valid schema rules.", + "file_size_byte_cutoff":{ + "_comment": "The threshold of file_size for WGS/WXS is 1GB, and 200MB for others.", + "type": "integer", + "general_cutoff": 200000000, + "wgs_wxs_cutoff": 200000000, + "dependencies": {"file_format": ["FASTQ", "BAM", "CRAM"]} + }, + "file_name_extensions": { + "bam": ".bam", + "bai": ".bai", + "cram": ".cram", + "crai": ".crai" + } + }, + "DNAseq_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + 
"required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Complete Genomics", "Illumina", "Ion Torrent", "LS454", "SOLiD", "ONT", "DNBSEQ", "Other"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["WGS", "WXS", "Targeted Sequencing", "Panel"] + }, + "total_reads": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + }, + "target_capture_kit_name": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "target_capture_kit_link": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "mean_coverage": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["WGS", "WXS", "WES", "Targeted Sequencing", "Panel"]} + } + }, + "RNAseq_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": 
"string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Complete Genomics", "Illumina", "Ion Torrent", "LS454", "SOLiD", "ONT", "DNBSEQ", "Other"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["RNA-Seq","RNAseq","miRNA-Seq"] + }, + "library_selection": { + "type": "string", + "required": true, + "allowed": ["Affinity Enrichment","Hybrid Selection","miRNA Size Fractionation","PCR","Poly-T Enrichment","Random","rRNA Depletion","Other"] + }, + "library_strand": { + "type": "string", + "required": true, + "allowed": ["Stranded","Unstranded","First Stranded","Second Stranded","Not Applicable"] + }, + "total_reads": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": 
true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + } + }, + "single_cell_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "organism": {"required": true}, + "cell_entity": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Complete Genomics", "Illumina", "Ion Torrent", "LS454", "SOLiD", "ONT", "DNBSEQ", "Other"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["scRNA-Seq", "snRNA-Seq"] + }, + "end_bias": { + "type": "string", + "required": true, + "allowed": ["3'-end", "5'-end", "Full-length"] + }, + "library_selection": { + "type": "string", + "required": true, + "allowed": ["Affinity Enrichment","Hybrid Selection","miRNA Size Fractionation","PCR","Poly-T Enrichment","Random","rRNA Depletion","Other"] + }, + "library_strand": { + "type": "string", + "required": true, + "allowed": ["Stranded","Unstranded","First Stranded","Second Stranded","Not 
Applicable"] + }, + "library_construction": { + "type": "string", + "required": true + }, + "UMI_barcode_read": { + "type": "string", + "required": true, + "allowed": ["index1","index2","read1","read2","Not Applicable"] + }, + "UMI_barcode_offset": {"type": "integer", "required": true}, + "cell_barcode_read": { + "type": "string", + "required": true, + "allowed": ["index1","index2","read1","read2","Not Applicable"] + }, + "UMI_barcode_size": {"type": "integer", "required": true}, + "cell_barcode_offset": {"type": "integer", "required": true}, + "cell_barcode_size": {"type": "integer", "required": true}, + "cDNA_read": { + "type": "string", + "required": true, + "allowed": ["index1","index2","read1","read2","Not Applicable"] + }, + "cDNA_read_offset": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + } + }, + "pacbio_longread_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", 
"BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["PacBio"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["WGS", "WXS", "RNA-Seq", "Targeted Sequencing", "Panel"] + }, + "library_selection": { + "type": "string", + "required": true, + "allowed": ["Affinity Enrichment","Hybrid Selection","miRNA Size Fractionation","PCR","Poly-T Enrichment","Random","rRNA Depletion","Other"] + }, + "library_strand": { + "type": "string", + "required": true, + "allowed": ["Stranded","Unstranded","First Stranded","Second Stranded","Not Applicable"] + }, + "sequencing_mode": { + "type": "string", + "required": true, + "allowed": ["CLR","CCS"] + }, + "total_reads": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": 
true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + }, + "target_capture_kit_name": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "target_capture_kit_link": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "mean_coverage": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["WGS", "WXS", "WES", "Targeted Sequencing", "Panel"]} + } + }, + "methylation_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Illumina Infinium HumanMethylation450","Illumina Infinium HumanMethylationEPIC","Illumina Infinium HumanMethylationEPICv2","Illumina Infinium HumanMethylation27k","Roche NimbleGen MethylationSeq","Agilent SurePrint Methyl-Seq"] + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["Methylation", "Methylation Microarray"] + } + } +} \ No newline at end of file diff --git a/data/validation_rules_schema.json b/data/validation_rules_schema.json new file mode 100644 index 0000000..e28f795 --- /dev/null +++ 
b/data/validation_rules_schema.json @@ -0,0 +1,430 @@ +{ + "custom_rules":{ + "_comment": "Add custom fields that Cerberus doesn't recognize as valid schema rules.", + "file_size_byte_cutoff":{ + "_comment": "The threshold of file_size for WGS/WXS is 1GB, and 200MB for others.", + "type": "integer", + "general_cutoff": 200000000, + "wgs_wxs_cutoff": 200000000, + "dependencies": {"file_format": ["FASTQ", "BAM", "CRAM"]} + }, + "file_name_extensions": { + "bam": ".bam", + "bai": ".bai", + "cram": ".cram", + "crai": ".crai" + } + }, + "DNAseq_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Complete Genomics", "Illumina", "Ion Torrent", "LS454", "SOLiD", "ONT", "DNBSEQ", "Other"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["WGS", "WXS", "Targeted Sequencing", "Panel"] + }, + "total_reads": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + 
"allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + }, + "target_capture_kit_name": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "target_capture_kit_link": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "mean_coverage": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["WGS", "WXS", "WES", "Targeted Sequencing", "Panel"]} + } + }, + "RNAseq_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Complete Genomics", "Illumina", "Ion Torrent", "LS454", 
"SOLiD", "ONT", "DNBSEQ", "Other"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["RNA-Seq","RNAseq","miRNA-Seq"] + }, + "library_selection": { + "type": "string", + "required": true, + "allowed": ["Affinity Enrichment","Hybrid Selection","miRNA Size Fractionation","PCR","Poly-T Enrichment","Random","rRNA Depletion","Other"] + }, + "library_strand": { + "type": "string", + "required": true, + "allowed": ["Stranded","Unstranded","First Stranded","Second Stranded","Not Applicable"] + }, + "total_reads": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + } + }, + "single_cell_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "organism": {"required": true}, + "cell_entity": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", 
"IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Complete Genomics", "Illumina", "Ion Torrent", "LS454", "SOLiD", "ONT", "DNBSEQ", "Other"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["scRNA-Seq", "snRNA-Seq"] + }, + "end_bias": { + "type": "string", + "required": true, + "allowed": ["3'-end", "5'-end", "Full-length"] + }, + "library_selection": { + "type": "string", + "required": true, + "allowed": ["Affinity Enrichment","Hybrid Selection","miRNA Size Fractionation","PCR","Poly-T Enrichment","Random","rRNA Depletion","Other"] + }, + "library_strand": { + "type": "string", + "required": true, + "allowed": ["Stranded","Unstranded","First Stranded","Second Stranded","Not Applicable"] + }, + "library_construction": { + "type": "string", + "required": true + }, + "UMI_barcode_read": { + "type": "string", + "required": true, + "allowed": ["index1","index2","read1","read2","Not Applicable"] + }, + "UMI_barcode_offset": {"type": "integer", "required": true}, + "cell_barcode_read": { + "type": "string", + "required": true, + "allowed": ["index1","index2","read1","read2","Not Applicable"] + }, + "UMI_barcode_size": {"type": "integer", "required": true}, + "cell_barcode_offset": {"type": "integer", "required": true}, + "cell_barcode_size": {"type": "integer", "required": true}, + "cDNA_read": { + "type": "string", + "required": true, + "allowed": ["index1","index2","read1","read2","Not Applicable"] + }, + "cDNA_read_offset": {"type": "integer", "required": true}, + "read_pair_number": { + 
"type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + } + }, + "pacbio_longread_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["PacBio"] + }, + "instrument_model": { + "type": "string", + "required": true + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["WGS", "WXS", "RNA-Seq", "Targeted Sequencing", "Panel"] + }, + "library_selection": { + "type": "string", + "required": true, + "allowed": ["Affinity 
Enrichment","Hybrid Selection","miRNA Size Fractionation","PCR","Poly-T Enrichment","Random","rRNA Depletion","Other"] + }, + "library_strand": { + "type": "string", + "required": true, + "allowed": ["Stranded","Unstranded","First Stranded","Second Stranded","Not Applicable"] + }, + "sequencing_mode": { + "type": "string", + "required": true, + "allowed": ["CLR","CCS"] + }, + "total_reads": {"type": "integer", "required": true}, + "read_pair_number": { + "type": "string", + "required": true, + "allowed": ["R1", "R2", "Not Applicable"], + "dependencies": {"file_format": "FASTQ"} + }, + "is_paired_end": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ", "BAM"]} + }, + "is_adapter_trimmed": { + "type": "boolean", + "required": true, + "allowed": ["TRUE", "FALSE"], + "dependencies": {"file_format": ["FASTQ"]} + }, + "flow_cell_barcode": { + "type": "string", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "lane_number": { + "type": "integer", + "required": true, + "dependencies": {"file_format": ["FASTQ"]} + }, + "adapter_sequencing": { + "type": "string", + "required": true, + "dependencies": {"is_adapter_trimmed": "FALSE"} + }, + "target_capture_kit_name": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "target_capture_kit_link": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["Targeted Sequencing", "Panel"]} + }, + "mean_coverage": { + "type": "string", + "required": true, + "dependencies": {"experiment_strategy": ["WGS", "WXS", "WES", "Targeted Sequencing", "Panel"]} + } + }, + "methylation_rules":{ + "sample_id": {"required": true}, + "aliquot_id": {"required": true}, + "tissue_type": { + "type": "string", + "required": true, + "allowed": ["Tumor", "Normal", "Abnormal", "Peritumoral", "Unknown", "Not Reported"] + }, + "file_name": {"type": "string", "required": 
true}, + "file_format": { + "type": "string", + "required": true, + "allowed": ["FASTQ", "BAM", "BAI", "CRAM", "CRAI", "GVCF", "VCF", "TBI", "MAF", "PDF", "HTML", "DCM", "IDAT", "SVS", "GPR", "CNS", "TXT", "PNG", "CSV", "PED", "SEG", "TAR", "TSV"] + }, + "file_size": {"type": "integer", "required": true}, + "file_hash_type": { + "type": "string", + "required": true, + "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] + }, + "file_hash_value": {"required": true}, + "sequencing_center": {"type": "string", "required": true}, + "platform": { + "type": "string", + "required": true, + "allowed": ["Illumina Infinium HumanMethylation450","Illumina Infinium HumanMethylationEPIC","Illumina Infinium HumanMethylationEPICv2","Illumina Infinium HumanMethylation27k","Roche NimbleGen MethylationSeq","Agilent SurePrint Methyl-Seq"] + }, + "experiment_strategy": { + "type": "string", + "required": true, + "allowed": ["Methylation", "Methylation Microarray"] + } + } +} \ No newline at end of file diff --git a/setup.py b/setup.py index 7fc2ee2..fbd80b6 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ name='d3b-dff-cli', version=__version__, packages=find_packages(), + include_package_data=True, entry_points={ 'console_scripts': [ 'd3b=d3b_dff_cli.cli:main', From 88c12914de70078008958afd40843f78398ae68b Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 10 Sep 2024 10:40:31 +0800 Subject: [PATCH 2/7] :lipstick: add lib --- requirements.txt | 1 + setup.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9a35e83..ff5383c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ typing_extensions==4.10.0 tzdata==2023.4 urllib3==2.1.0 yarl==1.9.4 +cerberus==1.3.5 diff --git a/setup.py b/setup.py index fbd80b6..bcff3a2 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ +import os from setuptools import setup, find_packages from d3b_dff_cli.version import __version__ -install_requires 
= [ - 'pandas', - 'argparse', - 'requests', -] +root_dir = os.path.dirname(os.path.abspath(__file__)) +req_file = os.path.join(root_dir, "requirements.txt") +with open(req_file) as f: + requirements = f.read().splitlines() setup( name='d3b-dff-cli', @@ -17,7 +17,7 @@ 'd3b=d3b_dff_cli.cli:main', ], }, - install_requires=install_requires, + install_requires=requirements, python_requires='>=3.8', author='Xiaoyan Huang', author_email='huangx@chop.edu', From 444cbda288878ea20444ac4b658b3f5a92a1b547 Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 10 Sep 2024 18:05:42 +0800 Subject: [PATCH 3/7] :technologist: update rules --- .../validation/cerberus_custom_checks.py | 108 +++++++----------- .../modules/validation/check_manifest.py | 53 +++++---- .../validation/validation_rules_schema.json | 95 ++++++++++----- 3 files changed, 139 insertions(+), 117 deletions(-) diff --git a/d3b_dff_cli/modules/validation/cerberus_custom_checks.py b/d3b_dff_cli/modules/validation/cerberus_custom_checks.py index f2428a6..0bf71d9 100644 --- a/d3b_dff_cli/modules/validation/cerberus_custom_checks.py +++ b/d3b_dff_cli/modules/validation/cerberus_custom_checks.py @@ -1,69 +1,40 @@ from cerberus import Validator -import pandas as pd import warnings # Suppress specific UserWarnings from Cerberus warnings.filterwarnings("ignore", category=UserWarning, module="cerberus.validator") + class CustomValidator(Validator): def __init__(self, schema, rules=None, *args, **kwargs): + """ + Initialize the CustomValidator with schema and optional custom rules. + """ super().__init__(schema, *args, **kwargs) - self.rules = rules if rules else {} + self.custom_rules = rules or {} - def _validate_dependencies(self, dependencies, field, value): + def _check_dependencies(self, field, document): """ - Check if the field's dependencies are met. If not, skip validation for this field. + Check if the field's dependencies are met. 
""" + dependencies = self.schema.get(field, {}).get('dependencies', {}) for dependency_field, allowed_values in dependencies.items(): - dependency_value = self.document.get(dependency_field) - - # Check if the allowed_values is a list or a single value + dependency_value = document.get(dependency_field) if isinstance(allowed_values, list): if dependency_value not in allowed_values: - return False # Dependencies not met + return False else: if dependency_value != allowed_values: - return False # Dependencies not met - - # Dependencies are met, perform validation + return False return True - def _validate_field(self, field, value, field_schema): + def _validate_custom_rules(self, field, value): """ - Validate the field value based on its schema. + Apply custom validation rules that are beyond the default Cerberus validation. """ - field_type = field_schema.get('type') - allowed_values = field_schema.get('allowed') - - # Validate required fields - if field_schema.get('required') and (value is None or pd.isna(value) or value == ''): - self._error(field, f"{field} is required.") - return False - - # Handle validation for allowed values - if allowed_values and value not in allowed_values: - self._error(field, f"{field} must be one of {allowed_values}.") - return False - - # Validate type - if field_type: - if field_type == 'boolean': - if value not in ['true', 'false']: - self._error(field, f"{field} must be a boolean value.") - return False - elif field_type == 'integer': - if not (isinstance(value, int) or (isinstance(value, float) and value.is_integer())): - self._error(field, f"{field} must be of integer type.") - return False - elif field_type == 'string': - if not isinstance(value, str): - self._error(field, f"{field} must be of string type.") - return False - - # Apply additional validation based on custom rules if field == 'file_name': file_format = self.document.get('file_format') if file_format: - extensions = self.rules.get('file_name_extensions') + extensions 
= self.custom_rules.get('file_name_extensions', {}) expected_extension = extensions.get(file_format) if expected_extension and not value.lower().endswith(expected_extension): self._error(field, f"{field} must end with {expected_extension} for file_format '{file_format}'.") @@ -72,39 +43,42 @@ def _validate_field(self, field, value, field_schema): if field == 'file_size': file_format = self.document.get('file_format') experiment = self.document.get("experiment_strategy") - - byte_cutoff_general = self.rules.get('file_size_byte_cutoff').get('general_cutoff') - byte_cutoff_wgs_wxs = self.rules.get('file_size_byte_cutoff').get('wgs_wxs_cutoff') - dependencies_format = self.rules.get('file_size_byte_cutoff').get('dependencies').get('file_format') + byte_cutoff_general = self.custom_rules.get('file_size_byte_cutoff', {}).get('general_cutoff', 0) + byte_cutoff_wgs_wxs = self.custom_rules.get('file_size_byte_cutoff', {}).get('wgs_wxs_cutoff', 0) + dependencies_format = self.custom_rules.get('file_size_byte_cutoff', {}).get('dependencies', {}).get('file_format', []) + + minum_value = byte_cutoff_wgs_wxs if experiment in ["wgs", "wxs", "wes"] else byte_cutoff_general - if experiment in ["wgs", "wxs", "wes"]: - minum_value = byte_cutoff_wgs_wxs - else: - minum_value = byte_cutoff_general - if file_format in dependencies_format: if value < minum_value: - self._error(field, f"Warning: *{field}* must be at least {minum_value} for file_format '{file_format}'.") + self._error(field, f"[Warning] must be at least {minum_value} for file_format '{file_format}'.") return False + return True + def validate(self, document, *args, **kwargs): """ - Override validate method to ensure dependencies are respected. + Override validate method to first check dependencies, then apply default and custom validation. 
""" self.document = document - is_valid = True - - for field, field_schema in self.schema.items(): - value = document.get(field) - dependencies = field_schema.get('dependencies') - - if dependencies: - if not self._validate_dependencies(dependencies, field, value): - continue # Skip validation if dependencies are not met - # Perform validation if dependencies are met - if not self._validate_field(field, value, field_schema): - is_valid = False + # Prepare filtered document with fields that meet dependencies + filtered_document = {} + for field in self.schema: + if self._check_dependencies(field, document): + filtered_document[field] = document.get(field) + + # Perform default validation + super().validate(filtered_document, *args, **kwargs) + + for field, value in filtered_document.items(): + self._validate_custom_rules(field, value) - return is_valid \ No newline at end of file + # Filter and return errors + print_errors = {field: errors for field, errors in self.errors.items() if field in filtered_document} + + # Determine overall validity based on errors + is_valid = not bool(print_errors) + + return is_valid, print_errors \ No newline at end of file diff --git a/d3b_dff_cli/modules/validation/check_manifest.py b/d3b_dff_cli/modules/validation/check_manifest.py index 5cadc9d..16ceb90 100644 --- a/d3b_dff_cli/modules/validation/check_manifest.py +++ b/d3b_dff_cli/modules/validation/check_manifest.py @@ -8,7 +8,9 @@ validation_schema = os.path.join(wk_dir, "validation_rules_schema.json") def load_data(manifest_file): - # Load data based on file extension + """ + Load data from a manifest file and convert strings to lowercase. + """ file_extension = manifest_file.split('.')[-1].lower() if file_extension == 'csv': manifest_data = pd.read_csv(manifest_file) @@ -25,12 +27,13 @@ def load_data(manifest_file): else: raise ValueError("Unsupported file format. 
Please provide a CSV, TSV, or Excel file.") - # Convert manifest to lowercase - manifest_data = manifest_data.apply(lambda col: col.astype(str).str.lower() if col.dtype.name in ['object', 'bool'] else col) - + manifest_data = manifest_data.apply(lambda col: col.astype(str).str.lower() if col.dtype.name in ['object'] else col) return manifest_data def convert_schema_to_lowercase(schema): + """ + Convert all string values in the schema to lowercase. + """ for k, v in schema.items(): if isinstance(v, dict): convert_schema_to_lowercase(v) @@ -41,48 +44,56 @@ def convert_schema_to_lowercase(schema): return schema def validate_data(df, schema_json): + """ + Validate the DataFrame against the schema. + """ valid = True errors = [] - custom_rules = schema_json['custom_rules'] + custom_rules = schema_json.get('custom_rules', {}) for index, row in df.iterrows(): - - # Check the manifest type and select the appropriate validation rules - experiment_strategy = row["experiment_strategy"] - platform = row["platform"] + experiment_strategy = row.get("experiment_strategy", "").lower() + platform = row.get("platform", "").lower() + if platform == "pacbio": rule_type = "pacbio_longread_rules" else: - if experiment_strategy in ["wgs","wxs","wes","target sequencing","panel","target"]: + if experiment_strategy in ["wgs", "wxs", "wes", "target sequencing", "panel", "target"]: rule_type = "DNAseq_rules" - elif experiment_strategy in ["rna-seq","rnaseq","mirna-seq","mirnaseq"]: + elif experiment_strategy in ["rna-seq", "rnaseq", "mirna-seq", "mirnaseq"]: rule_type = "RNAseq_rules" - elif experiment_strategy in ["scrna-seq","snran-seq","scrnaseq","snranseq"]: + elif experiment_strategy in ["scrna-seq", "snran-seq", "scrnaseq", "snranseq"]: rule_type = "single_cell_rules" - elif experiment_strategy in ["methtlation","methylation microarray"]: + elif experiment_strategy in ["methtlation", "methylation microarray"]: rule_type = "methylation_rules" else: - raise ValueError("Unsupported 
experiment_strategy for Row {0}".format(index + 1)) - schema = schema_json[rule_type] + raise ValueError(f"Unsupported experiment_strategy for Row {index + 1}") + + schema = schema_json.get(rule_type, {}) - # Filter out fields not in the schema fields + # Filter out fields not in the schema fields for combined manifest row_dict = row.to_dict() filtered_row_dict = {k: v for k, v in row_dict.items() if k in schema} v = CustomValidator(schema, custom_rules) - is_valid = v.validate(filtered_row_dict) + (is_valid, out_error) = v.validate(filtered_row_dict) + if not is_valid: valid = False errors.append({ 'row': index + 1, - 'errors': v.errors + 'errors': out_error }) return valid, errors def main(args): + """ + Main function to load schema, validate data, and print the validation report. + """ with open(validation_schema, 'r') as f: schema = json.load(f) + schema_json = convert_schema_to_lowercase(schema) # Load and preprocess the data @@ -90,7 +101,7 @@ def main(args): # Validate the data valid, errors = validate_data(df, schema_json) - + # Print validation report if valid: print("====Validation Passed====\n All rows are valid.") @@ -98,7 +109,6 @@ def main(args): # Check if all warning messages only_warnings = all("Warning" in field_error for error in errors for field_errors in error['errors'].values() for field_error in field_errors) - # Print appropriate message if only_warnings: print("====Validation Warnings====") else: @@ -107,7 +117,8 @@ def main(args): for error in errors: print(f"Row {error['row']}:") for field, field_errors in error['errors'].items(): - print(f" {field_errors[0]}") + for field_error in field_errors: + print(f" {field}: {field_error}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Validate a manifest based on defined rules.") diff --git a/d3b_dff_cli/modules/validation/validation_rules_schema.json b/d3b_dff_cli/modules/validation/validation_rules_schema.json index e28f795..9144b51 100644 --- 
a/d3b_dff_cli/modules/validation/validation_rules_schema.json +++ b/d3b_dff_cli/modules/validation/validation_rules_schema.json @@ -16,8 +16,14 @@ } }, "DNAseq_rules":{ - "sample_id": {"required": true}, - "aliquot_id": {"required": true}, + "sample_id": { + "type": ["string","integer"], + "required": true + }, + "aliquot_id": { + "type": ["string","integer"], + "required": true + }, "tissue_type": { "type": "string", "required": true, @@ -35,7 +41,10 @@ "required": true, "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] }, - "file_hash_value": {"required": true}, + "file_hash_value": { + "type": ["string","integer"], + "required": true + }, "sequencing_center": {"type": "string", "required": true}, "platform": { "type": "string", @@ -61,13 +70,11 @@ "is_paired_end": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ", "BAM"]} }, "is_adapter_trimmed": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ"]} }, "flow_cell_barcode": { @@ -83,7 +90,7 @@ "adapter_sequencing": { "type": "string", "required": true, - "dependencies": {"is_adapter_trimmed": "FALSE"} + "dependencies": {"is_adapter_trimmed": false} }, "target_capture_kit_name": { "type": "string", @@ -102,8 +109,14 @@ } }, "RNAseq_rules":{ - "sample_id": {"required": true}, - "aliquot_id": {"required": true}, + "sample_id": { + "type": ["string","integer"], + "required": true + }, + "aliquot_id": { + "type": ["string","integer"], + "required": true + }, "tissue_type": { "type": "string", "required": true, @@ -121,7 +134,7 @@ "required": true, "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] }, - "file_hash_value": {"required": true}, + "file_hash_value": {"type": ["string","integer"], "required": true}, "sequencing_center": {"type": "string", "required": true}, "platform": { "type": "string", @@ -157,13 +170,11 @@ "is_paired_end": { "type": "boolean", "required": true, - "allowed": 
["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ", "BAM"]} }, "is_adapter_trimmed": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ"]} }, "flow_cell_barcode": { @@ -179,14 +190,26 @@ "adapter_sequencing": { "type": "string", "required": true, - "dependencies": {"is_adapter_trimmed": "FALSE"} + "dependencies": {"is_adapter_trimmed": false} } }, "single_cell_rules":{ - "sample_id": {"required": true}, - "aliquot_id": {"required": true}, - "organism": {"required": true}, - "cell_entity": {"required": true}, + "sample_id": { + "type": ["string","integer"], + "required": true + }, + "aliquot_id": { + "type": ["string","integer"], + "required": true + }, + "organism": { + "type": ["string","integer"], + "required": true + }, + "cell_entity": { + "type": ["string","integer"], + "required": true + }, "tissue_type": { "type": "string", "required": true, @@ -204,7 +227,7 @@ "required": true, "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] }, - "file_hash_value": {"required": true}, + "file_hash_value": {"type": ["string","integer"], "required": true}, "sequencing_center": {"type": "string", "required": true}, "platform": { "type": "string", @@ -268,13 +291,11 @@ "is_paired_end": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ", "BAM"]} }, "is_adapter_trimmed": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ"]} }, "flow_cell_barcode": { @@ -290,12 +311,18 @@ "adapter_sequencing": { "type": "string", "required": true, - "dependencies": {"is_adapter_trimmed": "FALSE"} + "dependencies": {"is_adapter_trimmed": false} } }, "pacbio_longread_rules":{ - "sample_id": {"required": true}, - "aliquot_id": {"required": true}, + "sample_id": { + "type": ["string","integer"], + "required": true + }, + "aliquot_id": { + "type": ["string","integer"], + "required": true + }, "tissue_type": { 
"type": "string", "required": true, @@ -313,7 +340,10 @@ "required": true, "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] }, - "file_hash_value": {"required": true}, + "file_hash_value": { + "type": ["string","integer"], + "required": true + }, "sequencing_center": {"type": "string", "required": true}, "platform": { "type": "string", @@ -354,13 +384,11 @@ "is_paired_end": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ", "BAM"]} }, "is_adapter_trimmed": { "type": "boolean", "required": true, - "allowed": ["TRUE", "FALSE"], "dependencies": {"file_format": ["FASTQ"]} }, "flow_cell_barcode": { @@ -376,7 +404,7 @@ "adapter_sequencing": { "type": "string", "required": true, - "dependencies": {"is_adapter_trimmed": "FALSE"} + "dependencies": {"is_adapter_trimmed": false} }, "target_capture_kit_name": { "type": "string", @@ -395,8 +423,14 @@ } }, "methylation_rules":{ - "sample_id": {"required": true}, - "aliquot_id": {"required": true}, + "sample_id": { + "type": ["string","integer"], + "required": true + }, + "aliquot_id": { + "type": ["string","integer"], + "required": true + }, "tissue_type": { "type": "string", "required": true, @@ -414,7 +448,10 @@ "required": true, "allowed": ["MD5", "SHA1", "SHA256", "SHA512", "ETag"] }, - "file_hash_value": {"required": true}, + "file_hash_value": { + "type": ["string","integer"], + "required": true + }, "sequencing_center": {"type": "string", "required": true}, "platform": { "type": "string", From eeb03e964b5d5912bede45b9e87041745a168a5e Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Wed, 11 Sep 2024 00:23:47 +0800 Subject: [PATCH 4/7] :white_check_mark: update custom rule validation --- d3b_dff_cli/modules/validation/cerberus_custom_checks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/d3b_dff_cli/modules/validation/cerberus_custom_checks.py b/d3b_dff_cli/modules/validation/cerberus_custom_checks.py index 0bf71d9..545d069 
100644 --- a/d3b_dff_cli/modules/validation/cerberus_custom_checks.py +++ b/d3b_dff_cli/modules/validation/cerberus_custom_checks.py @@ -34,7 +34,7 @@ def _validate_custom_rules(self, field, value): if field == 'file_name': file_format = self.document.get('file_format') if file_format: - extensions = self.custom_rules.get('file_name_extensions', {}) + extensions = self.custom_rules.get('file_name_extensions') expected_extension = extensions.get(file_format) if expected_extension and not value.lower().endswith(expected_extension): self._error(field, f"{field} must end with {expected_extension} for file_format '{file_format}'.") @@ -43,9 +43,9 @@ def _validate_custom_rules(self, field, value): if field == 'file_size': file_format = self.document.get('file_format') experiment = self.document.get("experiment_strategy") - byte_cutoff_general = self.custom_rules.get('file_size_byte_cutoff', {}).get('general_cutoff', 0) - byte_cutoff_wgs_wxs = self.custom_rules.get('file_size_byte_cutoff', {}).get('wgs_wxs_cutoff', 0) - dependencies_format = self.custom_rules.get('file_size_byte_cutoff', {}).get('dependencies', {}).get('file_format', []) + byte_cutoff_general = self.custom_rules.get('file_size_byte_cutoff').get('general_cutoff') + byte_cutoff_wgs_wxs = self.custom_rules.get('file_size_byte_cutoff').get('wgs_wxs_cutoff') + dependencies_format = self.custom_rules.get('file_size_byte_cutoff').get('dependencies').get('file_format') minum_value = byte_cutoff_wgs_wxs if experiment in ["wgs", "wxs", "wes"] else byte_cutoff_general From 05cb87064cfc02b538a9f9544b0aeacf79857928 Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Wed, 11 Sep 2024 00:26:57 +0800 Subject: [PATCH 5/7] :white_check_mark: update custom rule validation --- d3b_dff_cli/modules/validation/validation_rules_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/d3b_dff_cli/modules/validation/validation_rules_schema.json b/d3b_dff_cli/modules/validation/validation_rules_schema.json 
index 9144b51..48e3f8c 100644 --- a/d3b_dff_cli/modules/validation/validation_rules_schema.json +++ b/d3b_dff_cli/modules/validation/validation_rules_schema.json @@ -5,7 +5,7 @@ "_comment": "The threshold of file_size for WGS/WXS is 1GB, and 200MB for others.", "type": "integer", "general_cutoff": 200000000, - "wgs_wxs_cutoff": 200000000, + "wgs_wxs_cutoff": 1000000000, "dependencies": {"file_format": ["FASTQ", "BAM", "CRAM"]} }, "file_name_extensions": { From 69dd979bbd79179dc637032e29d8f2b7f0199436 Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 17 Sep 2024 00:17:49 +0800 Subject: [PATCH 6/7] :bookmark: increase version --- d3b_dff_cli/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/d3b_dff_cli/version.py b/d3b_dff_cli/version.py index 3f39079..feea50a 100644 --- a/d3b_dff_cli/version.py +++ b/d3b_dff_cli/version.py @@ -1 +1 @@ -__version__ = '2.0.1' +__version__ = 'v3.0.0' From 3b40f080d8af1215a420c3f7463c717c27e8bce1 Mon Sep 17 00:00:00 2001 From: HuangXiaoyan0106 Date: Tue, 17 Sep 2024 00:46:29 +0800 Subject: [PATCH 7/7] :mute: update version --- d3b_dff_cli/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/d3b_dff_cli/version.py b/d3b_dff_cli/version.py index feea50a..4eb28e3 100644 --- a/d3b_dff_cli/version.py +++ b/d3b_dff_cli/version.py @@ -1 +1 @@ -__version__ = 'v3.0.0' +__version__ = '3.0.0'