Skip to content

Commit

Permalink
Merge pull request #205 from include-dcc/validator-fixes
Browse files Browse the repository at this point in the history
Handling different character encodings and multi-value slot updates
  • Loading branch information
madanucd authored Sep 3, 2024
2 parents 9d8f4a6 + 6fa6c47 commit 29f9642
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 27 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ linkml-runtime = "^1.1.24"
ontodev-cogs = "^0.3.3"
schemasheets = "^0.1.13"
typer = "^0.7.0"
chardet = "^5.2.0"

[tool.poetry.dev-dependencies]
linkml = "^1.2.6"
Expand Down
14 changes: 7 additions & 7 deletions src/data_validation/validate_study.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def validate_study_entry(row):
instance = Study(
studyCode = row['study code'],
studyTitle = handle_nan(row['study title']),
program = row['program'].split('|'),
program = row['program'].split('|') if handle_nan(row['program']) else [],
studyDescription = handle_nan(row['study description']),
principalInvestigatorName = handle_nan(row['principal investigator name']).split('|') if handle_nan(row['principal investigator name']) else [],
studyContactName = handle_nan(row['study contact name']).split('|') if handle_nan(row['study contact name']) else [],
Expand All @@ -17,12 +17,12 @@ def validate_study_entry(row):
vbrEmail = handle_nan(row['vbr email']),
vbrUrl = handle_nan(row['vbr url']),
vbrReadme = handle_nan(row['vbr readme']),
researchDomain = row['research domain'].split('|'),
participantLifespanStage = row['participant lifespan stage'].split('|'),
researchDomain = row['research domain'].split('|') if handle_nan(row['research domain']) else [],
participantLifespanStage = row['participant lifespan stage'].split('|') if handle_nan(row['participant lifespan stage']) else [],
selectionCriteria = handle_nan(row['selection criteria']),
studyDesign = handle_nan(row['study design']),
clinicalDataSourceType = row['clinical data source type'].split('|'),
dataCategory = row['data category'].split('|'),
clinicalDataSourceType = row['clinical data source type'].split('|') if handle_nan(row['clinical data source type']) else [],
dataCategory = row['data category'].split('|') if handle_nan(row['data category']) else [],
studyWebsite = handle_nan(row['study website']),
dbgap = row['dbgap'].split('|') if handle_nan(row['dbgap']) else [],
publication = str(row['publication']).split('|') if handle_nan(row['publication']) else [],
Expand All @@ -35,5 +35,5 @@ def validate_study_entry(row):
return True, None
except ValidationError as e:
# Validation failed
error_details = (str(row['Study Code']), e)
return False, error_details
error_details = (str(row['study code']), e)
return False, error_details
8 changes: 7 additions & 1 deletion src/data_validation/validation_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import os
from datetime import datetime
import chardet


def clean_string(value):
Expand Down Expand Up @@ -49,9 +50,14 @@ def save_validation_results(validation_results, input_file_name, output_path):
file.write('\n'.join(validation_results_str))
return output_file_path

def detect_encoding(file_path):
    """Guess the text encoding of the file at ``file_path``.

    The file is opened in binary mode, read in full, and the bytes are
    handed to ``chardet.detect``. The ``'encoding'`` entry of the result
    is returned unchanged.

    NOTE(review): chardet may report no encoding for input it cannot
    classify -- confirm callers tolerate a ``None`` return.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

def read_csv_file(file_path):
    """Load a CSV file into a DataFrame with lower-cased column names.

    The encoding is detected with ``detect_encoding`` before parsing and
    passed to ``pd.read_csv``, so the file is parsed exactly once and a
    non-default encoding does not cause a decode error from an earlier
    read with the default encoding.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The parsed DataFrame, with every column name converted to lower
        case.
    """
    encoding = detect_encoding(file_path)
    df = pd.read_csv(file_path, encoding=encoding)
    df.columns = df.columns.str.lower()  # Convert column names to lower case
    return df

Expand Down
38 changes: 20 additions & 18 deletions src/include_linkml/include_pydantic.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from __future__ import annotations
from __future__ import annotations
from datetime import (
datetime,
date
)
from decimal import Decimal
from enum import Enum
from decimal import Decimal
from enum import Enum
import re
import sys
from typing import (
Expand All @@ -15,7 +15,7 @@
Optional,
Union
)
from pydantic.version import VERSION as PYDANTIC_VERSION
from pydantic.version import VERSION as PYDANTIC_VERSION
if int(PYDANTIC_VERSION[0])>=2:
from pydantic import (
BaseModel,
Expand Down Expand Up @@ -161,6 +161,7 @@ class EnumProgram(str, Enum):


class EnumStudyCode(str, Enum):
AADSC = "aadsc"
ABC_DS = "abc_ds"
ADS = "ads"
AECOM_DS = "aecom_ds"
Expand All @@ -183,6 +184,7 @@ class EnumStudyCode(str, Enum):
ECODS = "ecods"
EXcEEDS = "exceeds"
HTP = "htp"
OPTimal = "optimal"
TEAM_DS = "team_ds"
TRC_DS = "trc_ds"
X01_deSmith = "x01_desmith"
Expand Down Expand Up @@ -215,7 +217,7 @@ class EnumClinicalDataSourceType(str, Enum):
Medical_Record = "medical_record"
# Data obtained by examination, interview, etc. with investigator
Investigator_Assessment = "investigator_assessment"
# Data obtained from survey, questionnaire, etc.
# Data obtained from survey, questionnaire, etc. filled out by participant or caregiver
Participant_or_Caregiver_Report = "participant_or_caregiver_report"
Other = "other"
Unknown = "unknown"
Expand Down Expand Up @@ -257,7 +259,7 @@ class Biospecimen(Thing):
"""
A Biospecimen Collected from A Participant
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
participantGlobalId: str = Field(..., title="Participant Global ID", description="""Unique INCLUDE global identifier for the participant, assigned by DCC""")
participantExternalId: str = Field(..., title="Participant External ID", description="""Unique, de-identified identifier for the participant, assigned by data contributor. External IDs must be two steps removed from personal information in the study records.""")
sampleGlobalId: str = Field(..., title="Sample Global ID", description="""INCLUDE global identifier for sample, assigned by DCC""")
Expand Down Expand Up @@ -286,7 +288,7 @@ class DataFile(Thing):
"""
Metadata about Data Files
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
participantGlobalId: str = Field(..., title="Participant Global ID", description="""Unique INCLUDE global identifier for the participant, assigned by DCC""")
participantExternalId: str = Field(..., title="Participant External ID", description="""Unique, de-identified identifier for the participant, assigned by data contributor. External IDs must be two steps removed from personal information in the study records.""")
sampleGlobalId: str = Field(..., title="Sample Global ID", description="""INCLUDE global identifier for sample, assigned by DCC""")
Expand All @@ -311,7 +313,7 @@ class Participant(Thing):
"""
Demographic and clinical information about the participant
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
participantGlobalId: str = Field(..., title="Participant Global ID", description="""Unique INCLUDE global identifier for the participant, assigned by DCC""")
participantExternalId: str = Field(..., title="Participant External ID", description="""Unique, de-identified identifier for the participant, assigned by data contributor. External IDs must be two steps removed from personal information in the study records.""")
familyId: Optional[str] = Field(None, title="Family ID", description="""Unique identifer for family to which Participant belongs, assigned by data contributor""")
Expand All @@ -335,7 +337,7 @@ class Condition(Thing):
"""
Co-occurring conditions and other observations for the participant
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
participantGlobalId: str = Field(..., title="Participant Global ID", description="""Unique INCLUDE global identifier for the participant, assigned by DCC""")
participantExternalId: str = Field(..., title="Participant External ID", description="""Unique, de-identified identifier for the participant, assigned by data contributor. External IDs must be two steps removed from personal information in the study records.""")
eventId: Optional[str] = Field(None, title="Event ID", description="""Identifier for event (Visit, Survey completion, Sample collection, etc.) to which the Condition data are linked, if applicable. There may be multiple events linked to a Participant.""")
Expand All @@ -361,23 +363,23 @@ class Study(Thing):
"""
General information about the study
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
studyTitle: str = Field(..., title="Study Title", description="""Full title of the study""")
program: List[EnumProgram] = Field(default_factory=list, title="Program", description="""Funding source(s) for the study (pipe-separated if multiple)""")
studyDescription: str = Field(..., title="Study Description", description="""Brief description of the study (2-4 sentences)""")
principalInvestigatorName: List[str] = Field(default_factory=list, title="Principal Investigator Name", description="""Name(s) of Principal Investigator(s) of this study; pipe-separated if multiple""")
studyContactName: List[str] = Field(default_factory=list, title="Study Contact Name", description="""Name of contact person for this study; pipe-separated if multiple""")
studyContactInstitution: List[str] = Field(default_factory=list, title="Study Contact Institution", description="""Institution of contact person for this study; pipe-separated if multiple""")
studyContactEmail: List[str] = Field(default_factory=list, title="Study Contact Email", description="""Email address of contact person for this study; pipe-separated if multiple""")
vbrEmail: Optional[str] = Field(None, title="VBR Email", description="""Email address for Virtual Biorepository requests/inquiries""")
vbrUrl: Optional[str] = Field(None, title="VBR URL", description="""Link to Virtual Biorepository request form""")
vbrReadme: Optional[str] = Field(None, title="VBR Readme", description="""Instructions for contacting or requesting samples from Virtual Biorepository""")
vbrEmail: Optional[str] = Field(None, title="VBR Email", description="""Email address for Virtual Biorepository requests/inquiries, if participating""")
vbrUrl: Optional[str] = Field(None, title="VBR URL", description="""Link to Virtual Biorepository request form, if participating""")
vbrReadme: Optional[str] = Field(None, title="VBR Readme", description="""Instructions for contacting or requesting samples from Virtual Biorepository, if participating""")
researchDomain: List[EnumResearchDomain] = Field(default_factory=list, title="Research Domain", description="""Main research domain(s) of the study, other than Down syndrome; pipe-separated if multiple""")
participantLifespanStage: List[EnumParticipantLifespanStage] = Field(default_factory=list, title="Participant Lifespan Stage", description="""Focus age group(s) of the study population; pipe-separated if multiple""")
selectionCriteria: Optional[str] = Field(None, title="Selection Criteria", description="""Brief description of inclusion and/or exclusion criteria for the study""")
studyDesign: str = Field(..., title="Study Design", description="""Overall design of study, including whether it is longitudinal and whether family members/unrelated controls are also enrolled""")
clinicalDataSourceType: List[EnumClinicalDataSourceType] = Field(default_factory=list, title="Clinical Data Source Type", description="""Source(s) of data collected from study participants; pipe-separated if multiple""")
dataCategory: EnumDataCategory = Field(..., title="Data Category", description="""Categories of data expected to be collected in this study""")
dataCategory: List[EnumDataCategory] = Field(default_factory=list, title="Data Category", description="""Categories of data expected to be collected in this study""")
studyWebsite: Optional[str] = Field(None, title="Study Website", description="""Website for the study""")
dbgap: Optional[List[str]] = Field(default_factory=list, title="dbGaP", description="""dbGaP \"phs\" accession code(s) associated with this Study, either for access or informational purposes (pipe-separated if multiple)""")
publication: Optional[List[str]] = Field(default_factory=list, title="Publication", description="""URL for publication(s) describing the study's rationale and methodology (PubMed Central preferred but not required; pipe-separated if multiple)""")
Expand All @@ -391,7 +393,7 @@ class Dataset(Thing):
"""
Information about a specific grouping of data files
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
datasetName: str = Field(..., title="Dataset Name", description="""Full name of the dataset, provided by contributor""")
datasetDescription: Optional[str] = Field(None, title="Dataset Description", description="""Brief additional notes about the dataset (1-3 sentences) that are not already captured in the other fields""")
datasetGlobalId: Optional[str] = Field(None, title="Dataset Global ID", description="""Unique Global ID for dataset, generated by DCC""")
Expand All @@ -400,8 +402,8 @@ class Dataset(Thing):
expectedNumberOfFiles: Optional[int] = Field(None, title="Expected Number of Files", description="""Expected number of files associated with this dataset, including dictionaries. If additional explanation is needed, please add to Dataset Description field.""")
dataCollectionStartYear: Optional[str] = Field(None, title="Data Collection Start Year", description="""Year that data collection started""")
dataCollectionEndYear: Optional[str] = Field(None, title="Data Collection End Year", description="""Year that data collection ended""")
dataCategory: EnumDataCategory = Field(..., title="Data Category", description="""General category of data in Dataset; pipe-separated if multiple""")
dataType: Optional[str] = Field(None, title="Data Type", description="""Specific type of data contained in Dataset; pipe-separated if multiple (e.g. Preprocessed metabolite relative abundance, Absolute protein concentration, Aligned reads, Simple nucleotide variations, GVCF, Gene expression quantifications, Gene fusions, Somatic copy number variations, Somatic structural variations)""")
dataCategory: List[EnumDataCategory] = Field(default_factory=list, title="Data Category", description="""General category of data in Dataset; pipe-separated if multiple""")
dataType: Optional[List[str]] = Field(default_factory=list, title="Data Type", description="""Specific type of data contained in Dataset; pipe-separated if multiple (e.g. Preprocessed metabolite relative abundance, Absolute protein concentration, Aligned reads, Simple nucleotide variations, GVCF, Gene expression quantifications, Gene fusions, Somatic copy number variations, Somatic structural variations)""")
experimentalStrategy: Optional[List[str]] = Field(default_factory=list, title="Experimental Strategy", description="""Experimental method used to obtain data in Dataset; pipe-separated if multiple (e.g. Whole genome sequencing, RNAseq, Multiplex immunoassay, Mass spec metabolomics)""")
experimentalPlatform: Optional[List[str]] = Field(default_factory=list, title="Experimental Platform", description="""Specific platform used to perform experiment; pipe-separated if multiple (e.g. SOMAscan, MSD, Luminex, Illumina)""")
publication: Optional[List[str]] = Field(default_factory=list, title="Publication", description="""URL for publication(s) describing the Dataset's rationale and methodology (PubMed Central preferred but not required; pipe-separated if multiple)""")
Expand All @@ -417,7 +419,7 @@ class DatasetManifest(Thing):
"""
Mapping information for files in Dataset
"""
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifer for the study (generally a short acronym)""")
studyCode: EnumStudyCode = Field(..., title="Study Code", description="""Unique identifier for the study (generally a short acronym)""")
datasetName: str = Field(..., title="Dataset Name", description="""Full name of the dataset, provided by contributor""")
datasetGlobalId: Optional[str] = Field(None, title="Dataset Global ID", description="""Unique Global ID for dataset, generated by DCC""")
datasetExternalId: Optional[str] = Field(None, title="Dataset External ID", description="""Unique identifier or code for dataset, if provided by contributor""")
Expand Down
5 changes: 4 additions & 1 deletion src/linkml/include_study.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ classes:
slot_usage:
dataCategory:
description: Categories of data expected to be collected in this study
multivalued: true
dbgap:
description: dbGaP "phs" accession code(s) associated with this Study, either for access or informational purposes (pipe-separated if multiple)
publication:
Expand Down Expand Up @@ -95,6 +96,7 @@ classes:
slot_usage:
dataCategory:
description: General category of data in Dataset; pipe-separated if multiple
multivalued: true
dbgap:
description: dbGaP "phs" accession code(s) required to access the files in this Dataset, if applicable (pipe-separated if multiple)
publication:
Expand All @@ -103,7 +105,8 @@ classes:
description: Expected number of participants in this Dataset. If additional explanation is needed, please add to Dataset Description field.
dataType:
description: Specific type of data contained in Dataset; pipe-separated if multiple (e.g. Preprocessed metabolite relative abundance, Absolute protein concentration, Aligned reads, Simple nucleotide variations, GVCF, Gene expression quantifications, Gene fusions, Somatic copy number variations, Somatic structural variations)
experimentalStrategy:
multivalued: true
experimentalStrategy:
description: Experimental method used to obtain data in Dataset; pipe-separated if multiple (e.g. Whole genome sequencing, RNAseq, Multiplex immunoassay, Mass spec metabolomics)

DatasetManifest:
Expand Down

0 comments on commit 29f9642

Please sign in to comment.