diff --git a/docs/conf.py b/docs/conf.py index d5190276f..d010519a5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -406,7 +406,8 @@ def parse_schema(spec, schema): fields = ['Name', 'Type', 'Attributes', 'Definition'] tables = ['Repertoire', 'Study', 'Subject', 'Diagnosis', 'Sample', 'CellProcessing', 'NucleicAcidProcessing', 'PCRTarget', 'SequencingRun', 'RawSequenceData', 'DataProcessing', - 'Rearrangement', 'Alignment', 'Clone', 'Tree', 'Node', 'Cell'] + 'Rearrangement', 'Alignment', 'Clone', 'Tree', 'Node', 'Cell', + 'RearrangedSequence', 'GermlineSequence', 'GeneDelineationV', 'GeneDescription', 'GermlineSet'] for spec in tables: with open(os.path.join(download_path, '%s.tsv' % spec), 'w') as f: writer = csv.DictWriter(f, fieldnames=fields, dialect='excel-tab', extrasaction='ignore') diff --git a/docs/datarep/germline.rst b/docs/datarep/germline.rst index 939230c6d..ffdf4ea41 100644 --- a/docs/datarep/germline.rst +++ b/docs/datarep/germline.rst @@ -3,15 +3,216 @@ Germline Schema (Experimental) ============================== -A ``Germline`` is a collection of ``GeneDescriptions`` for germline IG -or TR genes that are used for V(D)J assignment and other analyses. +Motivation +---------- + +Understanding and cataloguing receptor germline genes and allele sequences is critical to the analysis of AIRR data. +While the human set is relatively well understood in outline, although probably still far from complete, those of other +species, even those that are relatively closely studied, are at a much earlier stage. There is an urgent need to define a +standardised format for listing such genes, so that they can be shared between researchers and easily consumed by software +tools. + +Receptor Germline Schema +------------------------ + +The receptor germline schema defines the data elements necessary to describe one or more receptor germline genes, together +with supporting evidence. 
The fundamental object is the ``GeneDescription``, which describes a single gene or allele, containing +the necessary details for the annotation of a rearranged sequence such as the location of CDRs (in the case of a V-gene) and +framing information (in the case of a J-gene). ``GeneDescription`` also contains fields to delineate RSS, and the leader regions +of V-genes, should those be covered by the sequence provided. + +Evidence supporting the gene or allele can be provided in linked ``GermlineSequence`` and ``RearrangedSequence`` objects. Information +represented in these objects will typically be stored in a repository: either an INSDC repository such as GenBank or SRA, or +a lower-tier repository such as OGRDB. Please note that the key distinction between these object types is whether the V(D)J +genes have rearranged, rather than the origin of the material, as mature B and T cells carry rearranged sequences in chromosomal +DNA. It is most likely that supporting sequences will be GermlineSequences, i.e. prior to rearrangement. In the case of a +germline inference from a repertoire, the inferred germline sequence should be provided as a ``GermlineSequence``, if the evidence +has been deposited in a repository. + +For V-genes, an IMGT-gapped sequence (i.e., a sequence delineated in accordance with the +`IMGT numbering scheme `_) is provided in +``GeneDescription``. Other delineations, such as `Chothia `_ and +`Kabat `_, can be provided via linked ``GeneDelineationV`` objects. +A ``GermlineSet`` brings together multiple ``GeneDescriptions`` from the same locus to form a curated set. The schema assumes that germline +sets will be published by multiple repositories. A germline set may be uniquely referenced by means of the ``germline_set_ref``: +this is a composite field containing the repository id, germline set label, and version. 
+ +Gene and Allele Naming +---------------------- + +The International Union of Immunological Societies allocates gene symbols for receptor genes. GeneDescription contains a gene_symbol +field, but it is optional, recognising that a symbol may not have been assigned. Gene symbols are long-lasting, but the underlying +sequence may be revised over time. GeneDescription contains a mandatory coding_sequence_identifier, which will be updated should the +sequence change. It is anticipated that publishers of gene sets will provide mechanisms to issue these identifiers, and to allow +researchers to review change history of GeneDescriptions and GermlineSets. In the interests of consistency and transparency, when +referring to a gene or allele, the gene_symbol should be used wherever possible; however, coding_sequence_identifier provides a fallback +where a gene symbol has not been assigned. + +Genotypes +--------- + +A ``ReceptorGenotype`` describes the specific alleles found in an individual, and also identifies genes that are not found (deleted). +Depending on the data available and the inference method used, genotypes may contain haplotyping information, which may be full or partial. +As an example of partial haplotyping, the genotype may have been determined from genomic sequencing in which the sequence of the locus was +assembled into contigs, but could not be fully assembled. In this case the co-location of alleles in each contig has been established, but +the co-location across the entire locus cannot be. Co-location is therefore indicated by means of the ``phasing`` parameter, which in this +case would be assigned a different value for alleles on each contig. File Format Specification +------------------------- + +The file format has not been specified yet. + +.. _GermlineSetFields: + +GermlineSet Fields +----------------------------- + +:download:`Download as TSV <../_downloads/GermlineSet.tsv>` + +.. 
list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in GermlineSet_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} + +.. _GeneDescriptionFields: + +GeneDescription Fields +----------------------------- + +:download:`Download as TSV <../_downloads/GeneDescription.tsv>` + +.. list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in GeneDescription_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} + +.. _RearrangedSequenceFields: + +RearrangedSequence Fields ----------------------------- -Germline files are YAML/JSON with a structure defined below. Files should be -encoded as UTF-8. Identifiers are case-sensitive. Files should have the -extension ``.yaml``, ``.yml``, or ``.json``. +:download:`Download as TSV <../_downloads/RearrangedSequence.tsv>` -Fields +.. list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in RearrangedSequence_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} + +.. _GermlineSequenceFields: + +GermlineSequence Fields +----------------------------- + +:download:`Download as TSV <../_downloads/GermlineSequence.tsv>` + +.. list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in GermlineSequence_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} + +.. _GeneDelineationVFields: + +GeneDelineationV Fields +----------------------------- + +:download:`Download as TSV <../_downloads/GeneDelineationV.tsv>` + +.. 
list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in GeneDelineationV_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} + +.. _ReceptorGenotypeFields: + +ReceptorGenotype Fields +----------------------------- + +:download:`Download as TSV <../_downloads/ReceptorGenotype.tsv>` + +.. list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in ReceptorGenotype_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} + +.. _MHCGenotypeFields: + +MHCGenotype Fields ----------------------------- + +:download:`Download as TSV <../_downloads/MHCGenotype.tsv>` + +.. list-table:: + :widths: 20, 15, 15, 50 + :header-rows: 1 + + * - Name + - Type + - Attributes + - Definition + {%- for field in MHCGenotype_schema %} + * - ``{{ field.Name }}`` + - {{ field.Type }} + - {{ field.Attributes }} + - {{ field.Definition | trim }} + {%- endfor %} diff --git a/docs/datarep/overview.rst b/docs/datarep/overview.rst index 7d2e04de8..78e9af1ef 100644 --- a/docs/datarep/overview.rst +++ b/docs/datarep/overview.rst @@ -203,3 +203,4 @@ Schema Definitions Alignment Schema (Experimental) Clone and Lineage Tree Schema (Experimental) Cell Schema (Experimental) + Germline Schema (Experimental) diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml index bcceb7ac1..b664c06eb 100644 --- a/lang/R/inst/extdata/airr-schema.yaml +++ b/lang/R/inst/extdata/airr-schema.yaml @@ -175,6 +175,20 @@ DataFile: $ref: '#/RepertoireGroup' x-airr: nullable: false + AlleleDescription: + type: array + description: List of allele descriptions + items: + $ref: '#/AlleleDescription' + x-airr: + nullable: false + GermlineSet: + type: array + description: List of germline sets + items: + 
$ref: '#/GermlineSet' + x-airr: + nullable: false DataProcessing: type: array description: List of data processing workflows @@ -289,26 +303,658 @@ TimePoint: id: UO:0000003 label: time unit +# +# General objects +# TODO: link to global schema with JSON-LD? +# + +# An individual +Acknowledgement: + discriminator: AIRR + description: Individual whose contribution to this work should be acknowledged + type: object + required: + - acknowledgement_id + - name + - institution_name + properties: + acknowledgement_id: + type: string + description: unique identifier of this Acknowledgement within the file + x-airr: + nullable: false + name: + type: string + description: Full name of individual + institution_name: + type: string + description: Individual's department and institution name + ORCID_id: + type: string + description: Individual's ORCID Id, if available + # # Germline gene schema # -# The GeneDescription object will be introduced at a later point here. Until -# then the term "gene description" below can be considered to be equivalent -# to "gene symbol". 
+ +# Rearranged and genomic germline sequences +RearrangedSequence: + discriminator: AIRR + description: Details of a directly observed rearranged sequence or an inference from rearranged sequences contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - derivation + - observation_type + - repository_name + - repository_id + - deposited_version + - seq_start + - seq_end + properties: + sequence_id: + type: string + description: Unique identifier of this RearrangedSequence within the file + x-airr: + nullable: false + sequence: + type: string + description: nucleotide sequence + x-airr: + nullable: false + derivation: + type: string + enum: + - DNA + - RNA + description: The class of nucleic acid that was used as primary starting material + x-airr: + nullable: false + observation_type: + type: string + description: The type of observation from which this sequence was drawn, e.g. direct sequencing, inference from repertoire + enum: + - direct sequencing + - inference from repertoire + x-airr: + nullable: false + notes: + type: string + description: Notes + repository_name: + type: string + description: Name of the repository in which the sequence has been deposited + x-airr: + nullable: false + repository_id: + type: string + description: Id or serial number of the sequence within the repository + x-airr: + nullable: false + deposited_version: + type: string + description: Version number of the sequence within the repository + x-airr: + nullable: false + seq_start: + type: integer + description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited + x-airr: + nullable: false + seq_end: + type: integer + description: End co-ordinate of the sequence detailed in this record, within the sequence deposited + x-airr: + nullable: false + +UnrearrangedSequence: + discriminator: AIRR + description: Details of an unrearranged sequence contributing support for a gene or allele + type: object + 
required: + - sequence_id + - sequence + - repository_name + - assembly_id + - gff_seqid + - gff_start + - gff_end + - strand + properties: + sequence_id: + type: string + description: unique identifier of this UnrearrangedSequence within the file + x-airr: + nullable: false + sequence: + type: string + description: Sequence of interest described in this record (typically this will include gene and promoter region) + x-airr: + nullable: false + notes: + type: string + description: Notes + repository_name: + type: string + description: Name of the repository in which the assembly or contig is deposited + x-airr: + nullable: false + assembly_id: + type: string + description: Identifier of the assembly or contig within the repository + x-airr: + nullable: false + patch_no: + type: string + description: Genome assembly patch number in which this gene was determined + gff_seqid: + type: string + description: Sequence (from the assembly) of a window including the gene and preferably also the promoter region + gff_start: + type: integer + description: Genomic co-ordinates of the start of the sequence of interest described in this record, in Ensembl GFF version 3 + gff_end: + type: integer + description: Genomic co-ordinates of the end of the sequence of interest described in this record, in Ensembl GFF version 3 + strand: + type: string + enum: + - + + - "-" + description: sense (+ or -) + +# V gene delineation +SequenceDelineationV: + discriminator: AIRR + description: Delineation of a V-gene in a particular system + type: object + required: + - sequence_delineation_id + - delineation_scheme + - fwr1_start + - fwr1_end + - cdr1_start + - cdr1_end + - fwr2_start + - fwr2_end + - cdr2_start + - cdr2_end + - fwr3_start + - fwr3_end + - cdr3_start + properties: + sequence_delineation_id: + type: string + description: Unique identifier of this SequenceDelineationV within the file + x-airr: + nullable: false + delineation_scheme: + type: string + description: Name of the 
delineation scheme + example: Chothia + x-airr: + nullable: false + fwr1_start: + type: integer + description: FWR1 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr1_end: + type: integer + description: FWR1 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr1_start: + type: integer + description: CDR1 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr1_end: + type: integer + description: CDR1 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr2_start: + type: integer + description: FWR2 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr2_end: + type: integer + description: FWR2 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr2_start: + type: integer + description: CDR2 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr2_end: + type: integer + description: CDR2 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr3_start: + type: integer + description: FWR3 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr3_end: + type: integer + description: FWR3 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr3_start: + type: integer + description: CDR3 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + alignment: + type: array + items: + type: string + description: one string for each codon in the fields v_start to cdr3_start indicating the label of that codon according to the numbering of the delineation scheme + +# The Gene Description +AlleleDescription: + discriminator: AIRR + description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations + type: object + required: + - allele_description_id + - maintainer + - 
lab_address + - release_version + - release_date + - release_description + - sequence + - coding_sequence + - coding_sequence_identifier + - locus + - sequence_type + - functional + - inference_type + - species + properties: + allele_description_id: + type: string + description: Unique identifier of this AlleleDescription within the file + x-airr: + nullable: false + maintainer: + type: string + description: Maintainer of this sequence record + x-airr: + nullable: false + acknowledgements: + type: array + description: List of individuals whose contribution to the gene description should be acknowledged + items: + $ref: '#/Acknowledgement' + lab_address: + type: string + description: Institution and full address of corresponding author + x-airr: + nullable: false + release_version: + type: integer + description: Version number of this record, updated whenever a revised version is published or released + x-airr: + nullable: false + release_date: + type: string + description: Date of this release + x-airr: + nullable: false + release_description: + type: string + description: Brief descriptive notes of the reason for this release and the changes embodied + x-airr: + nullable: false + label: + type: string + description: The accepted name for this gene or allele, if any + example: IGHV1-69*01 + sequence: + type: string + description: nt sequence of the gene. 
This should cover the full length that is available, including where possible RSS, and 5' UTR and lead-in for V-gene sequences + x-airr: + nullable: false + coding_sequence: + type: string + description: nucleotide sequence of the core region of the gene (V-, D-, J- or C-REGION), aligned, in the case of the V-REGION, with the IMGT numbering scheme + x-airr: + nullable: false + aliases: + type: array + items: + type: string + description: Alternative names for this sequence + locus: + type: string + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + nullable: false + chromosome: + type: integer + description: chromosome on which the gene is located + sequence_type: + type: string + enum: + - V + - D + - J + - C + description: Sequence type (V, D, J, C) + x-airr: + nullable: false + functional: + type: boolean + description: True if the gene is functional, false if it is a pseudogene + x-airr: + nullable: false + inference_type: + type: string + enum: + - Genomic and rearranged + - Genomic only + - Rearranged only + description: Type of inference(s) from which this gene sequence was inferred + x-airr: + nullable: false + species: + type: string + description: Binomial designation of subject's species + example: Mus musculus + x-airr: + nullable: false + species_subgroup: + type: string + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: string + enum: + - breed + - strain + - inbred + - outbred + - locational + status: + type: string + enum: + - active + - draft + - retired + - withdrawn + description: Status of record, assumed active if the field is not present + gene_subgroup: + type: string + description: Gene subgroup or clade, as (and if) identified for this species and gene + subgroup_designation: + type: string + description: Gene designation within this subgroup, if identified + allele_designation: + type: string + description: 
Allele designation, if identified + j_codon_frame: + type: integer + enum: + - 1 + - 2 + - 3 + description: Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. Not used for V or D genes. ('1' means the sequence is in-frame, '2' means that the first bp is missing from the first codon, '3' means that the first 2 bp are missing) + gene_start: + type: integer + description: Co-ordinate (in the sequence field) of the first nucleotide in the coding_sequence field + gene_end: + type: integer + description: Co-ordinate (in the sequence field) of the last gene-coding nucleotide in the coding_sequence field + utr_5_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + utr_5_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + leader_1_start: + type: integer + description: Start co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_1_end: + type: integer + description: End co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_2_start: + type: integer + description: Start co-ordinate (in the sequence field) of L-PART2 (V-genes only) + leader_2_end: + type: integer + description: End co-ordinate (in the sequence field) of L-PART2 (V-genes only) + v_rs_start: + type: integer + description: Start co-ordinate (in the sequence field) of V recombination site (V-genes only) + v_rs_end: + type: integer + description: End co-ordinate (in the sequence field) of V recombination site (V-genes only) + d_rs_3_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_3_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_5_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 5 prime D 
recombination site (D-genes only) + d_rs_5_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + j_cdr3_end: + type: integer + description: In the case of a J-gene, the co-ordinate (in the sequence field) of the first nucleotide of the conserved PHE or TRP (IMGT codon position 118) + j_rs_start: + type: integer + description: Start co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_rs_end: + type: integer + description: End co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_donor_splice: + type: integer + description: Co-ordinate (in the sequence field) of the 3' splice donor site (J-genes only) + v_gene_delineations: + type: array + items: + $ref: '#/SequenceDelineationV' + unrearranged_support: + type: array + items: + $ref: '#/UnrearrangedSequence' + rearranged_support: + type: array + items: + $ref: '#/RearrangedSequence' + paralogs: + type: array + items: + type: string + description: Gene symbols of any paralogs + notes: + type: string + description: Notes + curational_tags: + type: array + items: + type: string + enum: + - likely_truncated + - likely_full_length + description: Controlled-vocabulary tags applied to this description + +# Collection of gene descriptions into a germline set +GermlineSet: + discriminator: AIRR + description: Details of a 'germline set' bringing together multiple AlleleDescriptions from the same strain or species. All genes in a GermlineSet should be from a single locus. 
+ type: object + required: + - germline_set_id + - author + - lab_name + - lab_address + - release_version + - release_description + - release_date + - germline_set_name + - germline_set_ref + - species + - locus + - allele_descriptions + properties: + germline_set_id: + type: string + description: Unique identifier of the GermlineSet within this file + x-airr: + nullable: false + author: + type: string + description: Corresponding author + x-airr: + nullable: false + lab_name: + type: string + description: Department of corresponding author + x-airr: + nullable: false + lab_address: + type: string + description: Institutional address of corresponding author + x-airr: + nullable: false + acknowledgements: + type: array + description: List of individuals whose contribution to the germline set should be acknowledged + items: + $ref: '#/Acknowledgement' + release_version: + type: number + description: Version number of this record, allocated automatically + x-airr: + nullable: false + release_description: + type: string + description: Brief descriptive notes of the reason for this release and the changes embodied + x-airr: + nullable: false + release_date: + type: string + description: Date of this release + x-airr: + nullable: false + germline_set_name: + type: string + description: descriptive name of this germline set + x-airr: + nullable: false + germline_set_ref: + type: string + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + nullable: false + pub_ids: + type: string + description: Publications describing the germline set + example: "PMID:85642,PMID:12345" + species: + type: string + description: Binomial designation of subject's species + example: Mus musculus + x-airr: + nullable: false + species_subgroup: + type: string + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: 
string + enum: + - breed + - strain + - inbred + - outbred + - locational + locus: + type: string + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + nullable: false + allele_descriptions: + type: array + items: + $ref: '#/AlleleDescription' + description: list of allele_descriptions in the germline set + x-airr: + nullable: false + notes: + type: string + description: Notes + # -# Gene descriptions from the same class organized together as a set -GermlineClassSet: +# Genotype schema +# + +# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject + +GenotypeSet: + discriminator: AIRR + type: object + required: + - receptor_genotype_set_id + properties: + receptor_genotype_set_id: + type: string + description: A unique identifier for this Receptor Genotype Set. + x-airr: + nullable: false + genotype_class_list: + description: List of Genotypes included in this Receptor Genotype Set. + type: array + items: + $ref: '#/Genotype' + +# This enumerates the alleles and gene deletions inferred in a single subject. Included alleles may either be listed by reference to a GermlineSet, or +# listed as 'undocumented', in which case the inferred sequence is provided + +# Genotype of adaptive immune receptors +Genotype: discriminator: AIRR type: object + required: + - receptor_genotype_id + - locus properties: - germline_class_id: + receptor_genotype_id: type: string - description: A unique identifier for this Germline Class Set. 
- germline_class: + description: A unique identifier for this Receptor Genotype + x-airr: + nullable: false + locus: type: string enum: - IGH - - IGI - IGK - IGL - TRA @@ -317,57 +963,129 @@ GermlineClassSet: - TRG example: IGH x-airr: - nullable: true + nullable: false adc-query-support: true format: controlled vocabulary - germline_alleles: + documented_alleles: type: array - description: Array of gene descriptions + description: Array of alleles inferred to be present which are documented in GermlineSets items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB). + type: object + properties: + label: + type: string + description: The accepted name for this allele, if any, taken from the GermlineSet + x-airr: + nullable: false + coding_sequence_identifier: + type: string + description: Unique identifier of the coding sequence, as allocated by an identified repository + x-airr: + nullable: false + germline_set_ref: + type: string + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. 
Alleles with the same value are inferred to be located on the same chromosome + x-airr: + nullable: true + adc-query-support: true + undocumented_alleles: + type: array + description: Array of alleles inferred to be present and not documented in an identified GermlineSet + items: + type: object + properties: + allele_name: + type: string + description: Allele name as allocated by the inference pipeline + x-airr: + nullable: false + sequence: + type: string + description: nt sequence of the allele, as provided by the inference pipeline + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome + x-airr: + nullable: true + adc-query-support: true + deleted_genes: + type: array + description: Array of genes identified as being deleted in this genotype + items: + type: object + properties: + gene_symbol: + type: string + description: The accepted name for this gene, taken from the GermlineSet + x-airr: + nullable: false + germline_set_ref: + type: string + description: GermlineSet from which it was taken (issuer/name/version) + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome x-airr: nullable: true adc-query-support: true - germline_process: + inference_process: type: string enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. 
+ title: Genotype acquisition process example: repertoire_sequencing x-airr: nullable: true adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe +# List of MHCGenotypes used for analysis to describe # a subject's genotype -GermlineSet: +MHCGenotypeSet: discriminator: AIRR type: object + required: + - mhc_genotype_set_id properties: - germline_set_id: + mhc_genotype_set_id: type: string - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. + description: A unique identifier for this MHC Genotype Set. + x-airr: + nullable: false + mhc_genotype_class_list: + description: List of classes included in this MHC Genotype set. type: array items: - $ref: '#/GermlineClassSet' + $ref: '#/MHCGenotype' -MHCGermlineClassSet: +# Genotype of major histocompatibility complex (MHC) class I and II receptors +MHCGenotype: discriminator: AIRR type: object + required: + - mhc_genotype_id properties: - germline_class_id: + mhc_genotype_id: type: string - description: A unique identifier for this Germline Set, assumed to be unique in the context of the study. - germline_class: + description: A unique identifier for this MHC Genotype, assumed to be unique in the context of the study. + x-airr: + nullable: false + genotype_class: type: string enum: - - MHC + - MHC # do we need this enum?? example: MHC x-airr: nullable: true @@ -377,39 +1095,34 @@ MHCGermlineClassSet: type: array description: Array of gene descriptions items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., HLA‐C*07:29). 
+ type: object + properties: + gene_symbol: + type: string + description: The accepted name for this gene + x-airr: + nullable: false + germline_set_ref: + type: string + description: Repository and list from which it was taken (issuer/name/version) + x-airr: + nullable: false x-airr: nullable: true adc-query-support: true - germline_process: + genotype_process: type: string enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. + title: Genotype acquisition process example: repertoire_sequencing x-airr: nullable: true adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe -# a subject's genotype -MHCGermlineSet: - discriminator: AIRR - type: object - properties: - germline_set_id: - type: string - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. - type: array - items: - $ref: '#/MHCGermlineClassSet' - # # Repertoire metadata schema # @@ -871,16 +1584,16 @@ Subject: x-airr: nullable: false adc-query-support: true - germline: + genotype: type: object - description: Germline for this subject, if known, by germline class. + description: Genotype for this subject, if known properties: - receptor_germline: - $ref: '#/GermlineSet' - description: Immune receptor germline set for this subject. - mhc_germline: - $ref: '#/MHCGermlineSet' - description: MHC germline set for this subject. + receptor_genotype_set: + $ref: '#/GenotypeSet' + description: Immune receptor genotype set for this subject. + mhc_genotype_set: + $ref: '#/MHCGenotypeSet' + description: MHC genotype set for this subject. 
# 1-to-n relationship between a subject and its diagnoses Diagnosis: @@ -1892,16 +2605,6 @@ DataProcessing: nullable: true adc-query-support: true name: Processed data file names - data_processing_germline: - type: object - description: Germline used for this data processing process - properties: - receptor_germline: - $ref: '#/GermlineSet' - description: Immune receptor germline set for this data processing process. - mhc_germline: - $ref: '#/MHCGermlineSet' - description: MHC germline set for this data processing process. germline_database: type: string description: Source of germline V(D)J genes with version number or date accessed. diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml index bcceb7ac1..b664c06eb 100644 --- a/lang/python/airr/specs/airr-schema.yaml +++ b/lang/python/airr/specs/airr-schema.yaml @@ -175,6 +175,20 @@ DataFile: $ref: '#/RepertoireGroup' x-airr: nullable: false + AlleleDescription: + type: array + description: List of allele descriptions + items: + $ref: '#/AlleleDescription' + x-airr: + nullable: false + GermlineSet: + type: array + description: List of germline sets + items: + $ref: '#/GermlineSet' + x-airr: + nullable: false DataProcessing: type: array description: List of data processing workflows @@ -289,26 +303,658 @@ TimePoint: id: UO:0000003 label: time unit +# +# General objects +# TODO: link to global schema with JSON-LD? 
+# + +# An individual +Acknowledgement: + discriminator: AIRR + description: Individual whose contribution to this work should be acknowledged + type: object + required: + - acknowledgement_id + - name + - institution_name + properties: + acknowledgement_id: + type: string + description: unique identifier of this Acknowledgement within the file + x-airr: + nullable: false + name: + type: string + description: Full name of individual + institution_name: + type: string + description: Individual's department and institution name + ORCID_id: + type: string + description: Individual's ORCID Id, if available + # # Germline gene schema # -# The GeneDescription object will be introduced at a later point here. Until -# then the term "gene description" below can be considered to be equivalent -# to "gene symbol". + +# Rearranged and genomic germline sequences +RearrangedSequence: + discriminator: AIRR + description: Details of a directly observed rearranged sequence or an inference from rearranged sequences contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - derivation + - observation_type + - repository_name + - repository_id + - deposited_version + - seq_start + - seq_end + properties: + sequence_id: + type: string + description: Unique identifier of this RearrangedSequence within the file + x-airr: + nullable: false + sequence: + type: string + description: nucleotide sequence + x-airr: + nullable: false + derivation: + type: string + enum: + - DNA + - RNA + description: The class of nucleic acid that was used as primary starting material + x-airr: + nullable: false + observation_type: + type: string + description: The type of observation from which this sequence was drawn, e.g. 
direct sequencing, inference from repertoire + enum: + - direct sequencing + - inference from repertoire + x-airr: + nullable: false + notes: + type: string + description: Notes + repository_name: + type: string + description: Name of the repository in which the sequence has been deposited + x-airr: + nullable: false + repository_id: + type: string + description: Id or serial number of the sequence within the repository + x-airr: + nullable: false + deposited_version: + type: string + description: Version number of the sequence within the repository + x-airr: + nullable: false + seq_start: + type: integer + description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited + x-airr: + nullable: false + seq_end: + type: integer + description: End co-ordinate of the sequence detailed in this record, within the sequence deposited + x-airr: + nullable: false + +UnrearrangedSequence: + discriminator: AIRR + description: Details of an unrearranged sequence contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - repository_name + - assembly_id + - gff_seqid + - gff_start + - gff_end + - strand + properties: + sequence_id: + type: string + description: unique identifier of this UnrearrangedSequence within the file + x-airr: + nullable: false + sequence: + type: string + description: Sequence of interest described in this record (typically this will include gene and promoter region) + x-airr: + nullable: false + notes: + type: string + description: Notes + repository_name: + type: string + description: Name of the repository in which the assembly or contig is deposited + x-airr: + nullable: false + assembly_id: + type: string + description: Identifier of the assembly or contig within the repository + x-airr: + nullable: false + patch_no: + type: string + description: Genome assembly patch number in which this gene was determined + gff_seqid: + type: string + description: Sequence (from the 
assembly) of a window including the gene and preferably also the promoter region + gff_start: + type: integer + description: Genomic co-ordinates of the start of the sequence of interest described in this record, in Ensembl GFF version 3 + gff_end: + type: integer + description: Genomic co-ordinates of the end of the sequence of interest described in this record, in Ensembl GFF version 3 + strand: + type: string + enum: + - + + - "-" + description: sense (+ or -) + +# V gene delineation +SequenceDelineationV: + discriminator: AIRR + description: Delineation of a V-gene in a particular system + type: object + required: + - sequence_delineation_id + - delineation_scheme + - fwr1_start + - fwr1_end + - cdr1_start + - cdr1_end + - fwr2_start + - fwr2_end + - cdr2_start + - cdr2_end + - fwr3_start + - fwr3_end + - cdr3_start + properties: + sequence_delineation_id: + type: string + description: Unique identifier of this SequenceDelineationV within the file + x-airr: + nullable: false + delineation_scheme: + type: string + description: Name of the delineation scheme + example: Chothia + x-airr: + nullable: false + fwr1_start: + type: integer + description: FWR1 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr1_end: + type: integer + description: FWR1 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr1_start: + type: integer + description: CDR1 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr1_end: + type: integer + description: CDR1 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr2_start: + type: integer + description: FWR2 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr2_end: + type: integer + description: FWR2 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr2_start: + type: integer + description: CDR2 start co-ordinate in Gene
Description 'alignment' field + x-airr: + nullable: false + cdr2_end: + type: integer + description: CDR2 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr3_start: + type: integer + description: FWR3 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr3_end: + type: integer + description: FWR3 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr3_start: + type: integer + description: CDR3 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + alignment: + type: array + items: + type: string + description: one string for each codon in the fields v_start to cdr3_start indicating the label of that codon according to the numbering of the delineation scheme + +# The Gene Description +AlleleDescription: + discriminator: AIRR + description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations + type: object + required: + - allele_description_id + - maintainer + - lab_address + - release_version + - release_date + - release_description + - sequence + - coding_sequence + - coding_sequence_identifier + - locus + - sequence_type + - functional + - inference_type + - species + properties: + allele_description_id: + type: string + description: Unique identifier of this AlleleDescription within the file + x-airr: + nullable: false + maintainer: + type: string + description: Maintainer of this sequence record + x-airr: + nullable: false + acknowledgements: + type: array + description: List of individuals whose contribution to the gene description should be acknowledged + items: + $ref: '#/Acknowledgement' + lab_address: + type: string + description: Institution and full address of corresponding author + x-airr: + nullable: false + release_version: + type: integer + description: Version number of this record, updated whenever a revised version is published or released + x-airr: + nullable: false + 
release_date: + type: string + description: Date of this release + x-airr: + nullable: false + release_description: + type: string + description: Brief descriptive notes of the reason for this release and the changes embodied + x-airr: + nullable: false + label: + type: string + description: The accepted name for this gene or allele, if any + example: IGHV1-69*01 + sequence: + type: string + description: nt sequence of the gene. This should cover the full length that is available, including where possible RSS, and 5' UTR and lead-in for V-gene sequences + x-airr: + nullable: false + coding_sequence: + type: string + description: nucleotide sequence of the core region of the gene (V-, D-, J- or C-REGION), aligned, in the case of the V-REGION, with the IMGT numbering scheme + x-airr: + nullable: false + aliases: + type: array + items: + type: string + description: Alternative names for this sequence + locus: + type: string + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + nullable: false + chromosome: + type: integer + description: chromosome on which the gene is located + sequence_type: + type: string + enum: + - V + - D + - J + - C + description: Sequence type (V, D, J, C) + x-airr: + nullable: false + functional: + type: boolean + description: True if the gene is functional, false if it is a pseudogene + x-airr: + nullable: false + inference_type: + type: string + enum: + - Genomic and rearranged + - Genomic only + - Rearranged only + description: Type of inference(s) from which this gene sequence was inferred + x-airr: + nullable: false + species: + type: string + description: Binomial designation of subject's species + example: Mus musculus + x-airr: + nullable: false + species_subgroup: + type: string + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: string + enum: + - breed + - strain + - inbred + - outbred + - locational + 
status: + type: string + enum: + - active + - draft + - retired + - withdrawn + description: Status of record, assumed active if the field is not present + gene_subgroup: + type: string + description: Gene subgroup or clade, as (and if) identified for this species and gene + subgroup_designation: + type: string + description: Gene designation within this subgroup, if identified + allele_designation: + type: string + description: Allele designation, if identified + j_codon_frame: + type: integer + enum: + - 1 + - 2 + - 3 + description: Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. Not used for V or D genes. ('1' means the sequence is in-frame, '2' means that the first bp is missing from the first codon, '3' means that the first 2 bp are missing) + gene_start: + type: integer + description: Co-ordinate (in the sequence field) of the first nucleotide in the coding_sequence field + gene_end: + type: integer + description: Co-ordinate (in the sequence field) of the last gene-coding nucleotide in the coding_sequence field + utr_5_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + utr_5_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + leader_1_start: + type: integer + description: Start co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_1_end: + type: integer + description: End co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_2_start: + type: integer + description: Start co-ordinate (in the sequence field) of L-PART2 (V-genes only) + leader_2_end: + type: integer + description: End co-ordinate (in the sequence field) of L-PART2 (V-genes only) + v_rs_start: + type: integer + description: Start co-ordinate (in the sequence field) of V recombination site (V-genes only) + v_rs_end: + type: integer + description: End co-ordinate (in the sequence field) of V
recombination site (V-genes only) + d_rs_3_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_3_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_5_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + d_rs_5_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + j_cdr3_end: + type: integer + description: In the case of a J-gene, the co-ordinate (in the sequence field) of the first nucleotide of the conserved PHE or TRP (IMGT codon position 118) + j_rs_start: + type: integer + description: Start co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_rs_end: + type: integer + description: End co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_donor_splice: + type: integer + description: Co-ordinate (in the sequence field) of the 3' splice donor site (J-genes only) + v_gene_delineations: + type: array + items: + $ref: '#/SequenceDelineationV' + unrearranged_support: + type: array + items: + $ref: '#/UnrearrangedSequence' + rearranged_support: + type: array + items: + $ref: '#/RearrangedSequence' + paralogs: + type: array + items: + type: string + description: Gene symbols of any paralogs + notes: + type: string + description: Notes + curational_tags: + type: array + items: + type: string + enum: + - likely_truncated + - likely_full_length + description: Controlled-vocabulary tags applied to this description + +# Collection of gene descriptions into a germline set +GermlineSet: + discriminator: AIRR + description: Details of a 'germline set' bringing together multiple AlleleDescriptions from the same strain or species. All genes in a GermlineSet should be from a single locus.
+ type: object + required: + - germline_set_id + - author + - lab_name + - lab_address + - release_version + - release_description + - release_date + - germline_set_name + - germline_set_ref + - species + - locus + - allele_descriptions + properties: + germline_set_id: + type: string + description: Unique identifier of the GermlineSet within this file + x-airr: + nullable: false + author: + type: string + description: Corresponding author + x-airr: + nullable: false + lab_name: + type: string + description: Department of corresponding author + x-airr: + nullable: false + lab_address: + type: string + description: Institutional address of corresponding author + x-airr: + nullable: false + acknowledgements: + type: array + description: List of individuals whose contribution to the germline set should be acknowledged + items: + $ref: '#/Acknowledgement' + release_version: + type: number + description: Version number of this record, allocated automatically + x-airr: + nullable: false + release_description: + type: string + description: Brief descriptive notes of the reason for this release and the changes embodied + x-airr: + nullable: false + release_date: + type: string + description: Date of this release + x-airr: + nullable: false + germline_set_name: + type: string + description: descriptive name of this germline set + x-airr: + nullable: false + germline_set_ref: + type: string + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + nullable: false + pub_ids: + type: string + description: Publications describing the germline set + example: "PMID:85642,PMID:12345" + species: + type: string + description: Binomial designation of subject's species + example: Mus musculus + x-airr: + nullable: false + species_subgroup: + type: string + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: 
string + enum: + - breed + - strain + - inbred + - outbred + - locational + locus: + type: string + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + nullable: false + allele_descriptions: + type: array + items: + $ref: '#/AlleleDescription' + description: list of allele_descriptions in the germline set + x-airr: + nullable: false + notes: + type: string + description: Notes + # -# Gene descriptions from the same class organized together as a set -GermlineClassSet: +# Genotype schema +# + +# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject + +GenotypeSet: + discriminator: AIRR + type: object + required: + - receptor_genotype_set_id + properties: + receptor_genotype_set_id: + type: string + description: A unique identifier for this Receptor Genotype Set. + x-airr: + nullable: false + genotype_class_list: + description: List of Genotypes included in this Receptor Genotype Set. + type: array + items: + $ref: '#/Genotype' + +# This enumerates the alleles and gene deletions inferred in a single subject. Included alleles may either be listed by reference to a GermlineSet, or +# listed as 'undocumented', in which case the inferred sequence is provided + +# Genotype of adaptive immune receptors +Genotype: discriminator: AIRR type: object + required: + - receptor_genotype_id + - locus properties: - germline_class_id: + receptor_genotype_id: type: string - description: A unique identifier for this Germline Class Set. 
- germline_class: + description: A unique identifier for this Receptor Genotype + x-airr: + nullable: false + locus: type: string enum: - IGH - - IGI - IGK - IGL - TRA @@ -317,57 +963,129 @@ GermlineClassSet: - TRG example: IGH x-airr: - nullable: true + nullable: false adc-query-support: true format: controlled vocabulary - germline_alleles: + documented_alleles: type: array - description: Array of gene descriptions + description: Array of alleles inferred to be present which are documented in GermlineSets items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB). + type: object + properties: + label: + type: string + description: The accepted name for this allele, if any, taken from the GermlineSet + x-airr: + nullable: false + coding_sequence_identifier: + type: string + description: Unique identifier of the coding sequence, as allocated by an identified repository + x-airr: + nullable: false + germline_set_ref: + type: string + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. 
Alleles with the same value are inferred to be located on the same chromosome + x-airr: + nullable: true + adc-query-support: true + undocumented_alleles: + type: array + description: Array of alleles inferred to be present and not documented in an identified GermlineSet + items: + type: object + properties: + allele_name: + type: string + description: Allele name as allocated by the inference pipeline + x-airr: + nullable: false + sequence: + type: string + description: nt sequence of the allele, as provided by the inference pipeline + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome + x-airr: + nullable: true + adc-query-support: true + deleted_genes: + type: array + description: Array of genes identified as being deleted in this genotype + items: + type: object + properties: + gene_symbol: + type: string + description: The accepted name for this gene, taken from the GermlineSet + x-airr: + nullable: false + germline_set_ref: + type: string + description: GermlineSet from which it was taken (issuer/name/version) + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome x-airr: nullable: true adc-query-support: true - germline_process: + inference_process: type: string enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. 
+ title: Genotype acquisition process example: repertoire_sequencing x-airr: nullable: true adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe +# List of MHCGenotypes used for analysis to describe # a subject's genotype -GermlineSet: +MHCGenotypeSet: discriminator: AIRR type: object + required: + - mhc_genotype_set_id properties: - germline_set_id: + mhc_genotype_set_id: type: string - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. + description: A unique identifier for this MHC Genotype Set. + x-airr: + nullable: false + mhc_genotype_class_list: + description: List of classes included in this MHC Genotype set. type: array items: - $ref: '#/GermlineClassSet' + $ref: '#/MHCGenotype' -MHCGermlineClassSet: +# Genotype of major histocompatibility complex (MHC) class I and II receptors +MHCGenotype: discriminator: AIRR type: object + required: + - mhc_genotype_id properties: - germline_class_id: + mhc_genotype_id: type: string - description: A unique identifier for this Germline Set, assumed to be unique in the context of the study. - germline_class: + description: A unique identifier for this MHC Genotype, assumed to be unique in the context of the study. + x-airr: + nullable: false + genotype_class: type: string enum: - - MHC + - MHC # do we need this enum?? example: MHC x-airr: nullable: true @@ -377,39 +1095,34 @@ MHCGermlineClassSet: type: array description: Array of gene descriptions items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., HLA‐C*07:29). 
+ type: object + properties: + gene_symbol: + type: string + description: The accepted name for this gene + x-airr: + nullable: false + germline_set_ref: + type: string + description: Repository and list from which it was taken (issuer/name/version) + x-airr: + nullable: false x-airr: nullable: true adc-query-support: true - germline_process: + genotype_process: type: string enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. + title: Genotype acquisition process example: repertoire_sequencing x-airr: nullable: true adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe -# a subject's genotype -MHCGermlineSet: - discriminator: AIRR - type: object - properties: - germline_set_id: - type: string - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. - type: array - items: - $ref: '#/MHCGermlineClassSet' - # # Repertoire metadata schema # @@ -871,16 +1584,16 @@ Subject: x-airr: nullable: false adc-query-support: true - germline: + genotype: type: object - description: Germline for this subject, if known, by germline class. + description: Genotype for this subject, if known properties: - receptor_germline: - $ref: '#/GermlineSet' - description: Immune receptor germline set for this subject. - mhc_germline: - $ref: '#/MHCGermlineSet' - description: MHC germline set for this subject. + receptor_genotype_set: + $ref: '#/GenotypeSet' + description: Immune receptor genotype set for this subject. + mhc_genotype_set: + $ref: '#/MHCGenotypeSet' + description: MHC genotype set for this subject. 
# 1-to-n relationship between a subject and its diagnoses Diagnosis: @@ -1892,16 +2605,6 @@ DataProcessing: nullable: true adc-query-support: true name: Processed data file names - data_processing_germline: - type: object - description: Germline used for this data processing process - properties: - receptor_germline: - $ref: '#/GermlineSet' - description: Immune receptor germline set for this data processing process. - mhc_germline: - $ref: '#/MHCGermlineSet' - description: MHC germline set for this data processing process. germline_database: type: string description: Source of germline V(D)J genes with version number or date accessed. diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml index 4b10b024a..cd3140310 100644 --- a/specs/airr-schema-openapi3.yaml +++ b/specs/airr-schema-openapi3.yaml @@ -173,6 +173,18 @@ DataFile: description: List of Repertoire groups items: $ref: '#/RepertoireGroup' + AlleleDescription: + type: array + nullable: false + description: List of allele descriptions + items: + $ref: '#/AlleleDescription' + GermlineSet: + type: array + nullable: false + description: List of germline sets + items: + $ref: '#/GermlineSet' DataProcessing: type: array nullable: false @@ -289,28 +301,665 @@ TimePoint: label: time unit # -# Germline gene schema +# General objects # -# The GeneDescription object will be introduced at a later point here. Until -# then the term "gene description" below can be considered to be equivalent -# to "gene symbol". 
+ +# An individual +Acknowledgement: + discriminator: + propertyName: AIRR + description: Individual whose contribution to this work should be acknowledged + type: object + required: + - acknowledgement_id + - name + - institution_name + properties: + acknowledgement_id: + type: string + nullable: false + description: unique identifier of this Acknowledgement within the file + name: + type: string + nullable: true + description: Full name of individual + institution_name: + type: string + nullable: true + description: Individual's department and institution name + ORCID_id: + type: string + nullable: true + description: Individual's ORCID Id, if available + +# +# Germline gene schema # -# Gene descriptions from the same class organized together as a set -GermlineClassSet: + +# Rearranged and genomic germline sequences +RearrangedSequence: discriminator: propertyName: AIRR + description: Details of a directly observed rearranged sequence or an inference from rearranged sequences contributing support for a gene or allele type: object + required: + - sequence_id + - sequence + - derivation + - observation_type + - repository_name + - repository_id + - deposited_version + - seq_start + - seq_end properties: - germline_class_id: + sequence_id: + type: string + nullable: false + description: Unique identifier of this RearrangedSequence within the file + sequence: + type: string + nullable: false + description: nucleotide sequence + derivation: + type: string + nullable: false + enum: + - DNA + - RNA + description: The class of nucleic acid that was used as primary starting material + observation_type: + type: string + nullable: false + description: The type of observation from which this sequence was drawn, e.g. direct sequencing, inference from repertoire + enum: + - direct sequencing + - inference from repertoire + notes: type: string nullable: true - description: A unique identifier for this Germline Class Set. 
- germline_class: + description: Notes + repository_name: + type: string + nullable: false + description: Name of the repository in which the sequence has been deposited + repository_id: + type: string + nullable: false + description: Id or serial number of the sequence within the repository + deposited_version: + type: string + nullable: false + description: Version number of the sequence within the repository + seq_start: + type: integer + nullable: false + description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited + seq_end: + type: integer + nullable: false + description: End co-ordinate of the sequence detailed in this record, within the sequence deposited + +UnrearrangedSequence: + discriminator: + propertyName: AIRR + description: Details of an unrearranged sequence contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - repository_name + - assembly_id + - gff_seqid + - gff_start + - gff_end + - strand + properties: + sequence_id: + type: string + nullable: false + description: unique identifier of this UnrearrangedSequence within the file + sequence: + type: string + nullable: false + description: Sequence of interest described in this record (typically this will include gene and promoter region) + notes: + type: string + nullable: true + description: Notes + repository_name: + type: string + nullable: false + description: Name of the repository in which the assembly or contig is deposited + assembly_id: + type: string + nullable: false + description: Identifier of the assembly or contig within the repository + patch_no: + type: string + nullable: true + description: Genome assembly patch number in which this gene was determined + gff_seqid: + type: string + nullable: true + description: Sequence (from the assembly) of a window including the gene and preferably also the promoter region + gff_start: + type: integer + nullable: true + description: Genomic co-ordinates of 
the start of the sequence of interest described in this record, in Ensembl GFF version 3 + gff_end: + type: integer + nullable: true + description: Genomic co-ordinates of the end of the sequence of interest described in this record, in Ensembl GFF version 3 + strand: type: string nullable: true + enum: + - + + - "-" + description: sense (+ or -) + +# V gene delineation +SequenceDelineationV: + discriminator: + propertyName: AIRR + description: Delineation of a V-gene in a particular system + type: object + required: + - sequence_delineation_id + - delineation_scheme + - fwr1_start + - fwr1_end + - cdr1_start + - cdr1_end + - fwr2_start + - fwr2_end + - cdr2_start + - cdr2_end + - fwr3_start + - fwr3_end + - cdr3_start + properties: + sequence_delineation_id: + type: string + nullable: false + description: Unique identifier of this SequenceDelineationV within the file + delineation_scheme: + type: string + nullable: false + description: Name of the delineation scheme + example: Chothia + fwr1_start: + type: integer + nullable: false + description: FWR1 start co-ordinate in Gene Description 'alignment' field + fwr1_end: + type: integer + nullable: false + description: FWR1 end co-ordinate in Gene Description 'alignment' field + cdr1_start: + type: integer + nullable: false + description: CDR1 start co-ordinate in Gene Description 'alignment' field + cdr1_end: + type: integer + nullable: false + description: CDR1 end co-ordinate in Gene Description 'alignment' field + fwr2_start: + type: integer + nullable: false + description: FWR2 start co-ordinate in Gene Description 'alignment' field + fwr2_end: + type: integer + nullable: false + description: FWR2 end co-ordinate in Gene Description 'alignment' field + cdr2_start: + type: integer + nullable: false + description: CDR2 start co-ordinate in Gene Description 'alignment' field + cdr2_end: + type: integer + nullable: false + description: CDR2 end co-ordinate in Gene Description 'alignment' field + fwr3_start: + 
type: integer + nullable: false + description: FWR3 start co-ordinate in Gene Description 'alignment' field + fwr3_end: + type: integer + nullable: false + description: FWR3 end co-ordinate in Gene Description 'alignment' field + cdr3_start: + type: integer + nullable: false + description: CDR3 start co-ordinate in Gene Description 'alignment' field + alignment: + type: array + nullable: true + items: + type: string + description: one string for each codon in the fields v_start to cdr3_start indicating the label of that codon according to the numbering of the delineation scheme + +# Description of a putative or confirmed Ig receptor gene/allele +AlleleDescription: + discriminator: + propertyName: AIRR + description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations + type: object + required: + - allele_description_id + - maintainer + - lab_address + - release_version + - release_date + - release_description + - sequence + - coding_sequence + - coding_sequence_identifier + - locus + - sequence_type + - functional + - inference_type + - species + properties: + allele_description_id: + type: string + nullable: false + description: Unique identifier of this AlleleDescription within the file + maintainer: + type: string + nullable: false + description: Maintainer of this sequence record + acknowledgements: + type: array + nullable: true + description: List of individuals whose contribution to the gene description should be acknowledged + items: + $ref: '#/Acknowledgement' + lab_address: + type: string + nullable: false + description: Institution and full address of corresponding author + release_version: + type: integer + nullable: false + description: Version number of this record, updated whenever a revised version is published or released + release_date: + type: string + nullable: false + description: Date of this release + release_description: + type: string + nullable: false + description: Brief descriptive notes of 
the reason for this release and the changes embodied + label: + type: string + nullable: true + description: The accepted name for this gene or allele, if any + example: IGHV1-69*01 + sequence: + type: string + nullable: false + description: nt sequence of the gene. This should cover the full length that is available, including where possible RSS, and 5' UTR and lead-in for V-gene sequences + coding_sequence: + type: string + nullable: false + description: nucleotide sequence of the core region of the gene (V-, D-, J- or C-REGION), aligned, in the case of the V-REGION, with the IMGT numbering scheme + aliases: + type: array + nullable: true + items: + type: string + description: Alternative names for this sequence + locus: + type: string + nullable: false + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + chromosome: + type: integer + nullable: true + description: chromosome on which the gene is located + sequence_type: + type: string + nullable: false + enum: + - V + - D + - J + - C + description: Sequence type (V, D, J, C) + functional: + type: boolean + nullable: false + description: True if the gene is functional, false if it is a pseudogene + inference_type: + type: string + nullable: false + enum: + - Genomic and rearranged + - Genomic only + - Rearranged only + description: Type of inference(s) from which this gene sequence was inferred + species: + type: string + nullable: false + description: Binomial designation of subject's species + example: Mus musculus + species_subgroup: + type: string + nullable: true + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: string + nullable: true + enum: + - breed + - strain + - inbred + - outbred + - locational + status: + type: string + nullable: true + enum: + - active + - draft + - retired + - withdrawn + description: Status of record, assumed active if the field is not present + 
gene_subgroup: + type: string + nullable: true + description: Gene subgroup or clade, as (and if) identified for this species and gene + subgroup_designation: + type: string + nullable: true + description: Gene designation within this subgroup, if identified + allele_designation: + type: string + nullable: true + description: Allele designation, if identified + j_codon_frame: + type: integer + nullable: true + enum: + - 1 + - 2 + - 3 + description: Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. Not used for V or D genes. ('1' means the sequence is in-frame, '2' means that the first bp is missing from the first codon, '3' means that the first 2 bp are missing) + gene_start: + type: integer + nullable: true + description: Co-ordinate (in the sequence field) of the first nucleotide in the coding_sequence field + gene_end: + type: integer + nullable: true + description: Co-ordinate (in the sequence field) of the last gene-coding nucleotide in the coding_sequence field + utr_5_prime_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + utr_5_prime_end: + type: integer + nullable: true + description: End co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + leader_1_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_1_end: + type: integer + nullable: true + description: End co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_2_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of L-PART2 (V-genes only) + leader_2_end: + type: integer + nullable: true + description: End co-ordinate (in the sequence field) of L-PART2 (V-genes only) + v_rs_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of V recombination site (V-genes only) + v_rs_end: + type: 
integer + nullable: true + description: End co-ordinate (in the sequence field) of V recombination site (V-genes only) + d_rs_3_prime_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_3_prime_end: + type: integer + nullable: true + description: End co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_5_prime_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + d_rs_5_prime_end: + type: integer + nullable: true + description: End co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + j_cdr3_end: + type: integer + nullable: true + description: In the case of a J-gene, the co-ordinate (in the sequence field) of the first nucleotide of the conserved PHE or TRP (IMGT codon position 118) + j_rs_start: + type: integer + nullable: true + description: Start co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_rs_end: + type: integer + nullable: true + description: End co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_donor_splice: + type: integer + nullable: true + description: Co-ordinate (in the sequence field) of the 3' splice donor site (J-genes only) + v_gene_delineations: + type: array + nullable: true + items: + $ref: '#/SequenceDelineationV' + unrearranged_support: + type: array + nullable: true + items: + $ref: '#/UnrearrangedSequence' + rearranged_support: + type: array + nullable: true + items: + $ref: '#/RearrangedSequence' + paralogs: + type: array + nullable: true + items: + type: string + description: Gene symbols of any paralogs + notes: + type: string + nullable: true + description: Notes + curational_tags: + type: array + nullable: true + items: + type: string + enum: + - likely_truncated + - likely_full_length + description: 
Controlled-vocabulary tags applied to this description + +# Collection of gene descriptions into a germline set +GermlineSet: + discriminator: + propertyName: AIRR + description: Details of a 'germline set' bringing together multiple AlleleDescriptions from the same strain or species. All genes in a GermlineSet should be from a single locus. + type: object + required: + - germline_set_id + - author + - lab_name + - lab_address + - release_version + - release_description + - release_date + - germline_set_name + - germline_set_ref + - species + - locus + - allele_descriptions + properties: + germline_set_id: + type: string + nullable: false + description: Unique identifier of the GermlineSet within this file + author: + type: string + nullable: false + description: Corresponding author + lab_name: + type: string + nullable: false + description: Department of corresponding author + lab_address: + type: string + nullable: false + description: Institutional address of corresponding author + acknowledgements: + type: array + nullable: true + description: List of individuals whose contribution to the germline set should be acknowledged + items: + $ref: '#/Acknowledgement' + release_version: + type: number + nullable: false + description: Version number of this record, allocated automatically + release_description: + type: string + nullable: false + description: Brief descriptive notes of the reason for this release and the changes embodied + release_date: + type: string + nullable: false + description: Date of this release + germline_set_name: + type: string + nullable: false + description: descriptive name of this germline set + germline_set_ref: + type: string + nullable: false + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + pub_ids: + type: string + nullable: true + description: Publications describing the germline set + example: "PMID:85642,PMID:12345" + species: + type: 
string + nullable: false + description: Binomial designation of subject's species + example: Mus musculus + species_subgroup: + type: string + nullable: true + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: string + nullable: true + enum: + - breed + - strain + - inbred + - outbred + - locational + locus: + type: string + nullable: false + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + allele_descriptions: + type: array + nullable: false + items: + $ref: '#/AlleleDescription' + description: list of allele_descriptions in the germline set + notes: + type: string + nullable: true + description: Notes + +# +# Genotype schema +# + +# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject + +GenotypeSet: + discriminator: + propertyName: AIRR + type: object + required: + - receptor_genotype_set_id + properties: + receptor_genotype_set_id: + type: string + nullable: false + description: A unique identifier for this Receptor Genotype Set. + genotype_class_list: + description: List of Genotypes included in this Receptor Genotype Set. + type: array + nullable: true + items: + $ref: '#/Genotype' + + +# Genotype of adaptive immune receptors +# This enumerates the alleles and gene deletions inferred in a single subject. 
+# Included alleles may either be listed by reference to a GermlineSet, or +# listed as 'undocumented', in which case the inferred sequence is provided + +Genotype: + discriminator: + propertyName: AIRR + type: object + required: + - receptor_genotype_id + - locus + properties: + receptor_genotype_id: + type: string + nullable: false + description: A unique identifier for this Receptor Genotype + locus: + type: string + nullable: false enum: - IGH - - IGI - IGK - IGL - TRA @@ -321,60 +970,127 @@ GermlineClassSet: x-airr: adc-query-support: true format: controlled vocabulary - germline_alleles: + documented_alleles: type: array nullable: true - description: Array of gene descriptions + description: Array of alleles inferred to be present which are documented in GermlineSets items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB). + type: object + properties: + label: + type: string + nullable: false + description: The accepted name for this allele, if any, taken from the GermlineSet + coding_sequence_identifier: + type: string + nullable: false + description: Unique identifier of the coding sequence, as allocated by an identified repository + germline_set_ref: + type: string + nullable: false + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + phasing: + type: integer + nullable: true + description: Chromosomal phasing indicator. 
Alleles with the same value are inferred to be located on the same chromosome + x-airr: + adc-query-support: true + undocumented_alleles: + type: array + nullable: true + description: Array of alleles inferred to be present and not documented in an identified GermlineSet + items: + type: object + properties: + allele_name: + type: string + nullable: false + description: Allele name as allocated by the inference pipeline + sequence: + type: string + nullable: false + description: nt sequence of the allele, as provided by the inference pipeline + phasing: + type: integer + nullable: true + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome x-airr: adc-query-support: true - germline_process: + deleted_genes: + type: array + nullable: true + description: Array of genes identified as being deleted in this genotype + items: + type: object + properties: + gene_symbol: + type: string + nullable: false + description: The accepted name for this gene, taken from the GermlineSet + germline_set_ref: + type: string + nullable: false + description: GermlineSet from which it was taken (issuer/name/version) + phasing: + type: integer + nullable: true + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome + x-airr: + adc-query-support: true + inference_process: type: string nullable: true enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. 
+ title: Genotype acquisition process example: repertoire_sequencing x-airr: adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe + + +# List of MHCGenotypes used for analysis to describe # a subject's genotype -GermlineSet: +MHCGenotypeSet: discriminator: propertyName: AIRR type: object + required: + - mhc_genotype_set_id properties: - germline_set_id: + mhc_genotype_set_id: type: string - nullable: true - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. + nullable: false + description: A unique identifier for this MHC Genotype Set. + mhc_genotype_class_list: + description: List of classes included in this MHC Genotype set. type: array nullable: true items: - $ref: '#/GermlineClassSet' + $ref: '#/MHCGenotype' + -MHCGermlineClassSet: +# Genotype of major histocompatibility complex (MHC) class I and II receptors +MHCGenotype: discriminator: propertyName: AIRR type: object + required: + - mhc_genotype_id properties: - germline_class_id: + mhc_genotype_id: type: string - nullable: true - description: A unique identifier for this Germline Set, assumed to be unique in the context of the study. - germline_class: + nullable: false + description: A unique identifier for this MHC Genotype, assumed to be unique in the context of the study. + genotype_class: type: string nullable: true enum: - - MHC + - MHC # do we need this enum?? example: MHC x-airr: adc-query-support: true @@ -384,41 +1100,31 @@ MHCGermlineClassSet: nullable: true description: Array of gene descriptions items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., HLA‐C*07:29). 
+ type: object + properties: + gene_symbol: + type: string + nullable: false + description: The accepted name for this gene + germline_set_ref: + type: string + nullable: false + description: Repository and list from which it was taken (issuer/name/version) x-airr: adc-query-support: true - germline_process: + genotype_process: type: string nullable: true enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. + title: Genotype acquisition process example: repertoire_sequencing x-airr: adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe -# a subject's genotype -MHCGermlineSet: - discriminator: - propertyName: AIRR - type: object - properties: - germline_set_id: - type: string - nullable: true - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. - type: array - nullable: true - items: - $ref: '#/MHCGermlineClassSet' - # # Repertoire metadata schema # @@ -883,19 +1589,19 @@ Subject: $ref: '#/Diagnosis' x-airr: adc-query-support: true - germline: + genotype: type: object nullable: true - description: Germline for this subject, if known, by germline class. + description: Genotype for this subject, if known properties: - receptor_germline: + receptor_genotype_set: nullable: true - $ref: '#/GermlineSet' - description: Immune receptor germline set for this subject. - mhc_germline: + $ref: '#/GenotypeSet' + description: Immune receptor genotype set for this subject. + mhc_genotype_set: nullable: true - $ref: '#/MHCGermlineSet' - description: MHC germline set for this subject. + $ref: '#/MHCGenotypeSet' + description: MHC genotype set for this subject. 
# 1-to-n relationship between a subject and its diagnoses Diagnosis: @@ -1915,19 +2621,6 @@ DataProcessing: x-airr: adc-query-support: true name: Processed data file names - data_processing_germline: - type: object - nullable: true - description: Germline used for this data processing process - properties: - receptor_germline: - nullable: true - $ref: '#/GermlineSet' - description: Immune receptor germline set for this data processing process. - mhc_germline: - nullable: true - $ref: '#/MHCGermlineSet' - description: MHC germline set for this data processing process. germline_database: type: string nullable: true diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml index bcceb7ac1..b664c06eb 100644 --- a/specs/airr-schema.yaml +++ b/specs/airr-schema.yaml @@ -175,6 +175,20 @@ DataFile: $ref: '#/RepertoireGroup' x-airr: nullable: false + AlleleDescription: + type: array + description: List of allele descriptions + items: + $ref: '#/AlleleDescription' + x-airr: + nullable: false + GermlineSet: + type: array + description: List of germline sets + items: + $ref: '#/GermlineSet' + x-airr: + nullable: false DataProcessing: type: array description: List of data processing workflows @@ -289,26 +303,658 @@ TimePoint: id: UO:0000003 label: time unit +# +# General objects +# TODO: link to global schema with JSON-LD? 
+# + +# An individual +Acknowledgement: + discriminator: AIRR + description: Individual whose contribution to this work should be acknowledged + type: object + required: + - acknowledgement_id + - name + - institution_name + properties: + acknowledgement_id: + type: string + description: unique identifier of this Acknowledgement within the file + x-airr: + nullable: false + name: + type: string + description: Full name of individual + institution_name: + type: string + description: Individual's department and institution name + ORCID_id: + type: string + description: Individual's ORCID Id, if available + # # Germline gene schema # -# The GeneDescription object will be introduced at a later point here. Until -# then the term "gene description" below can be considered to be equivalent -# to "gene symbol". + +# Rearranged and genomic germline sequences +RearrangedSequence: + discriminator: AIRR + description: Details of a directly observed rearranged sequence or an inference from rearranged sequences contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - derivation + - observation_type + - repository_name + - repository_id + - deposited_version + - seq_start + - seq_end + properties: + sequence_id: + type: string + description: Unique identifier of this RearrangedSequence within the file + x-airr: + nullable: false + sequence: + type: string + description: nucleotide sequence + x-airr: + nullable: false + derivation: + type: string + enum: + - DNA + - RNA + description: The class of nucleic acid that was used as primary starting material + x-airr: + nullable: false + observation_type: + type: string + description: The type of observation from which this sequence was drawn, e.g. 
direct sequencing, inference from repertoire + enum: + - direct sequencing + - inference from repertoire + x-airr: + nullable: false + notes: + type: string + description: Notes + repository_name: + type: string + description: Name of the repository in which the sequence has been deposited + x-airr: + nullable: false + repository_id: + type: string + description: Id or serial number of the sequence within the repository + x-airr: + nullable: false + deposited_version: + type: string + description: Version number of the sequence within the repository + x-airr: + nullable: false + seq_start: + type: integer + description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited + x-airr: + nullable: false + seq_end: + type: integer + description: End co-ordinate of the sequence detailed in this record, within the sequence deposited + x-airr: + nullable: false + +UnrearrangedSequence: + discriminator: AIRR + description: Details of an unrearranged sequence contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - repository_name + - assembly_id + - gff_seqid + - gff_start + - gff_end + - strand + properties: + sequence_id: + type: string + description: unique identifier of this UnrearrangedSequence within the file + x-airr: + nullable: false + sequence: + type: string + description: Sequence of interest described in this record (typically this will include gene and promoter region) + x-airr: + nullable: false + notes: + type: string + description: Notes + repository_name: + type: string + description: Name of the repository in which the assembly or contig is deposited + x-airr: + nullable: false + assembly_id: + type: string + description: Identifier of the assembly or contig within the repository + x-airr: + nullable: false + patch_no: + type: string + description: Genome assembly patch number in which this gene was determined + gff_seqid: + type: string + description: Sequence (from the 
assembly) of a window including the gene and preferably also the promoter region + gff_start: + type: integer + description: Genomic co-ordinates of the start of the sequence of interest described in this record, in Ensembl GFF version 3 + gff_end: + type: integer + description: Genomic co-ordinates of the end of the sequence of interest described in this record, in Ensembl GFF version 3 + strand: + type: string + enum: + - + + - "-" + description: sense (+ or -) + +# V gene delineation +SequenceDelineationV: + discriminator: AIRR + description: Delineation of a V-gene in a particular system + type: object + required: + - sequence_delineation_id + - delineation_scheme + - fwr1_start + - fwr1_end + - cdr1_start + - cdr1_end + - fwr2_start + - fwr2_end + - cdr2_start + - cdr2_end + - fwr3_start + - fwr3_end + - cdr3_start + properties: + sequence_delineation_id: + type: string + description: Unique identifier of this SequenceDelineationV within the file + x-airr: + nullable: false + delineation_scheme: + type: string + description: Name of the delineation scheme + example: Chothia + x-airr: + nullable: false + fwr1_start: + type: integer + description: FWR1 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr1_end: + type: integer + description: FWR1 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr1_start: + type: integer + description: CDR1 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr1_end: + type: integer + description: CDR1 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr2_start: + type: integer + description: FWR2 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr2_end: + type: integer + description: FWR2 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr2_start: + type: integer + description: CDR2 start co-ordinate in Gene 
Description 'alignment' field + x-airr: + nullable: false + cdr2_end: + type: integer + description: CDR2 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr3_start: + type: integer + description: FWR3 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + fwr3_end: + type: integer + description: FWR3 end co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + cdr3_start: + type: integer + description: CDR3 start co-ordinate in Gene Description 'alignment' field + x-airr: + nullable: false + alignment: + type: array + items: + type: string + description: one string for each codon in the fields v_start to cdr3_start indicating the label of that codon according to the numbering of the delineation scheme + +# The Gene Description +AlleleDescription: + discriminator: AIRR + description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations + type: object + required: + - allele_description_id + - maintainer + - lab_address + - release_version + - release_date + - release_description + - sequence + - coding_sequence + - coding_sequence_identifier + - locus + - sequence_type + - functional + - inference_type + - species + properties: + allele_description_id: + type: string + description: Unique identifier of this AlleleDescription within the file + x-airr: + nullable: false + maintainer: + type: string + description: Maintainer of this sequence record + x-airr: + nullable: false + acknowledgements: + type: array + description: List of individuals whose contribution to the gene description should be acknowledged + items: + $ref: '#/Acknowledgement' + lab_address: + type: string + description: Institution and full address of corresponding author + x-airr: + nullable: false + release_version: + type: integer + description: Version number of this record, updated whenever a revised version is published or released + x-airr: + nullable: false + 
release_date: + type: string + description: Date of this release + x-airr: + nullable: false + release_description: + type: string + description: Brief descriptive notes of the reason for this release and the changes embodied + x-airr: + nullable: false + label: + type: string + description: The accepted name for this gene or allele, if any + example: IGHV1-69*01 + sequence: + type: string + description: nt sequence of the gene. This should cover the full length that is available, including where possible RSS, and 5' UTR and lead-in for V-gene sequences + x-airr: + nullable: false + coding_sequence: + type: string + description: nucleotide sequence of the core region of the gene (V-, D-, J- or C-REGION), aligned, in the case of the V-REGION, with the IMGT numbering scheme + x-airr: + nullable: false + aliases: + type: array + items: + type: string + description: Alternative names for this sequence + locus: + type: string + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + nullable: false + chromosome: + type: integer + description: chromosome on which the gene is located + sequence_type: + type: string + enum: + - V + - D + - J + - C + description: Sequence type (V, D, J, C) + x-airr: + nullable: false + functional: + type: boolean + description: True if the gene is functional, false if it is a pseudogene + x-airr: + nullable: false + inference_type: + type: string + enum: + - Genomic and rearranged + - Genomic only + - Rearranged only + description: Type of inference(s) from which this gene sequence was inferred + x-airr: + nullable: false + species: + type: string + description: Binomial designation of subject's species + example: Mus musculus + x-airr: + nullable: false + species_subgroup: + type: string + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: string + enum: + - breed + - strain + - inbred + - outbred + - locational + 
status: + type: string + enum: + - active + - draft + - retired + - withdrawn + description: Status of record, assumed active if the field is not present + gene_subgroup: + type: string + description: Gene subgroup or clade, as (and if) identified for this species and gene + subgroup_designation: + type: string + description: Gene designation within this subgroup, if identified + allele_designation: + type: string + description: Allele designation, if identified + j_codon_frame: + type: integer + enum: + - 1 + - 2 + - 3 + description: Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. Not used for V or D genes. ('1' means the sequence is in-frame, '2' means that the first bp is missing from the first codon, '3' means that the first 2 bp are missing) + gene_start: + type: integer + description: Co-ordinate (in the sequence field) of the first nucleotide in the coding_sequence field + gene_end: + type: integer + description: Co-ordinate (in the sequence field) of the last gene-coding nucleotide in the coding_sequence field + utr_5_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + utr_5_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 5 prime UTR (V-genes only) + leader_1_start: + type: integer + description: Start co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_1_end: + type: integer + description: End co-ordinate (in the sequence field) of L-PART1 (V-genes only) + leader_2_start: + type: integer + description: Start co-ordinate (in the sequence field) of L-PART2 (V-genes only) + leader_2_end: + type: integer + description: End co-ordinate (in the sequence field) of L-PART2 (V-genes only) + v_rs_start: + type: integer + description: Start co-ordinate (in the sequence field) of V recombination site (V-genes only) + v_rs_end: + type: integer + description: End co-ordinate (in the sequence field) of V 
recombination site (V-genes only) + d_rs_3_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_3_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 3 prime D recombination site (D-genes only) + d_rs_5_prime_start: + type: integer + description: Start co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + d_rs_5_prime_end: + type: integer + description: End co-ordinate (in the sequence field) of 5 prime D recombination site (D-genes only) + j_cdr3_end: + type: integer + description: In the case of a J-gene, the co-ordinate (in the sequence field) of the first nucleotide of the conserved PHE or TRP (IMGT codon position 118) + j_rs_start: + type: integer + description: Start co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_rs_end: + type: integer + description: End co-ordinate (in the sequence field) of J recombination site (J-genes only) + j_donor_splice: + type: integer + description: Co-ordinate (in the sequence field) of the 3' splice donor site (J-genes only) + v_gene_delineations: + type: array + items: + $ref: '#/SequenceDelineationV' + unrearranged_support: + type: array + items: + $ref: '#/UnrearrangedSequence' + rearranged_support: + type: array + items: + $ref: '#/RearrangedSequence' + paralogs: + type: array + items: + type: string + description: Gene symbols of any paralogs + notes: + type: string + description: Notes + curational_tags: + type: array + items: + type: string + enum: + - likely_truncated + - likely_full_length + description: Controlled-vocabulary tags applied to this description + +# Collection of gene descriptions into a germline set +GermlineSet: + discriminator: AIRR + description: Details of a 'germline set' bringing together multiple AlleleDescriptions from the same strain or species. All genes in a GermlineSet should be from a single locus.
+ type: object + required: + - germline_set_id + - author + - lab_name + - lab_address + - release_version + - release_description + - release_date + - germline_set_name + - germline_set_ref + - species + - locus + - allele_descriptions + properties: + germline_set_id: + type: string + description: Unique identifier of the GermlineSet within this file + x-airr: + nullable: false + author: + type: string + description: Corresponding author + x-airr: + nullable: false + lab_name: + type: string + description: Department of corresponding author + x-airr: + nullable: false + lab_address: + type: string + description: Institutional address of corresponding author + x-airr: + nullable: false + acknowledgements: + type: array + description: List of individuals whose contribution to the germline set should be acknowledged + items: + $ref: '#/Acknowledgement' + release_version: + type: number + description: Version number of this record, allocated automatically + x-airr: + nullable: false + release_description: + type: string + description: Brief descriptive notes of the reason for this release and the changes embodied + x-airr: + nullable: false + release_date: + type: string + description: Date of this release + x-airr: + nullable: false + germline_set_name: + type: string + description: descriptive name of this germline set + x-airr: + nullable: false + germline_set_ref: + type: string + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + nullable: false + pub_ids: + type: string + description: Publications describing the germline set + example: "PMID:85642,PMID:12345" + species: + type: string + description: Binomial designation of subject's species + example: Mus musculus + x-airr: + nullable: false + species_subgroup: + type: string + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/C + species_subgroup_type: + type: 
string + enum: + - breed + - strain + - inbred + - outbred + - locational + locus: + type: string + enum: + - IGH + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + nullable: false + allele_descriptions: + type: array + items: + $ref: '#/AlleleDescription' + description: list of allele_descriptions in the germline set + x-airr: + nullable: false + notes: + type: string + description: Notes + # -# Gene descriptions from the same class organized together as a set -GermlineClassSet: +# Genotype schema +# + +# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject + +GenotypeSet: + discriminator: AIRR + type: object + required: + - receptor_genotype_set_id + properties: + receptor_genotype_set_id: + type: string + description: A unique identifier for this Receptor Genotype Set. + x-airr: + nullable: false + genotype_class_list: + description: List of Genotypes included in this Receptor Genotype Set. + type: array + items: + $ref: '#/Genotype' + +# This enumerates the alleles and gene deletions inferred in a single subject. Included alleles may either be listed by reference to a GermlineSet, or +# listed as 'undocumented', in which case the inferred sequence is provided + +# Genotype of adaptive immune receptors +Genotype: discriminator: AIRR type: object + required: + - receptor_genotype_id + - locus properties: - germline_class_id: + receptor_genotype_id: type: string - description: A unique identifier for this Germline Class Set. 
- germline_class: + description: A unique identifier for this Receptor Genotype + x-airr: + nullable: false + locus: type: string enum: - IGH - - IGI - IGK - IGL - TRA @@ -317,57 +963,129 @@ GermlineClassSet: - TRG example: IGH x-airr: - nullable: true + nullable: false adc-query-support: true format: controlled vocabulary - germline_alleles: + documented_alleles: type: array - description: Array of gene descriptions + description: Array of alleles inferred to be present which are documented in GermlineSets items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB). + type: object + properties: + label: + type: string + description: The accepted name for this allele, if any, taken from the GermlineSet + x-airr: + nullable: false + coding_sequence_identifier: + type: string + description: Unique identifier of the coding sequence, as allocated by an identified repository + x-airr: + nullable: false + germline_set_ref: + type: string + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. 
Alleles with the same value are inferred to be located on the same chromosome + x-airr: + nullable: true + adc-query-support: true + undocumented_alleles: + type: array + description: Array of alleles inferred to be present and not documented in an identified GermlineSet + items: + type: object + properties: + allele_name: + type: string + description: Allele name as allocated by the inference pipeline + x-airr: + nullable: false + sequence: + type: string + description: nt sequence of the allele, as provided by the inference pipeline + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome + x-airr: + nullable: true + adc-query-support: true + deleted_genes: + type: array + description: Array of genes identified as being deleted in this genotype + items: + type: object + properties: + gene_symbol: + type: string + description: The accepted name for this gene, taken from the GermlineSet + x-airr: + nullable: false + germline_set_ref: + type: string + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + x-airr: + nullable: false + phasing: + type: integer + description: Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the same chromosome x-airr: nullable: true adc-query-support: true - germline_process: + inference_process: type: string enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary.
+ title: Genotype acquisition process example: repertoire_sequencing x-airr: nullable: true adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe +# List of MHCGenotypes used for analysis to describe # a subject's genotype -GermlineSet: +MHCGenotypeSet: discriminator: AIRR type: object + required: + - mhc_genotype_set_id properties: - germline_set_id: + mhc_genotype_set_id: type: string - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. + description: A unique identifier for this MHC Genotype Set. + x-airr: + nullable: false + mhc_genotype_class_list: + description: List of classes included in this MHC Genotype set. type: array items: - $ref: '#/GermlineClassSet' + $ref: '#/MHCGenotype' -MHCGermlineClassSet: +# Genotype of major histocompatibility complex (MHC) class I and II receptors +MHCGenotype: discriminator: AIRR type: object + required: + - mhc_genotype_id properties: - germline_class_id: + mhc_genotype_id: type: string - description: A unique identifier for this Germline Set, assumed to be unique in the context of the study. - germline_class: + description: A unique identifier for this MHC Genotype, assumed to be unique in the context of the study. + x-airr: + nullable: false + genotype_class: type: string enum: - - MHC + - MHC # do we need this enum?? example: MHC x-airr: nullable: true @@ -377,39 +1095,34 @@ MHCGermlineClassSet: type: array description: Array of gene descriptions items: - type: string - description: Gene description for a germline allele. If referring to a known reference sequence in a database the relevant gene/allele nomenclature should be followed (e.g., HLA‐C*07:29). 
+ type: object + properties: + gene_symbol: + type: string + description: The accepted name for this gene + x-airr: + nullable: false + germline_set_ref: + type: string + description: Repository and list from which it was taken (issuer/name/version) + x-airr: + nullable: false x-airr: nullable: true adc-query-support: true - germline_process: + genotype_process: type: string enum: - genomic_sequencing - repertoire_sequencing - description: Information on how the germline was acquired. Controlled vocabulary. - title: Germline acquisition process + description: Information on how the genotype was acquired. Controlled vocabulary. + title: Genotype acquisition process example: repertoire_sequencing x-airr: nullable: true adc-query-support: true format: controlled vocabulary -# List of germline class sets used for analysis or to describe -# a subject's genotype -MHCGermlineSet: - discriminator: AIRR - type: object - properties: - germline_set_id: - type: string - description: A unique identifier for this Germline Set. - germline_class_list: - description: List of classes included in this germline set. - type: array - items: - $ref: '#/MHCGermlineClassSet' - # # Repertoire metadata schema # @@ -871,16 +1584,16 @@ Subject: x-airr: nullable: false adc-query-support: true - germline: + genotype: type: object - description: Germline for this subject, if known, by germline class. + description: Genotype for this subject, if known properties: - receptor_germline: - $ref: '#/GermlineSet' - description: Immune receptor germline set for this subject. - mhc_germline: - $ref: '#/MHCGermlineSet' - description: MHC germline set for this subject. + receptor_genotype_set: + $ref: '#/GenotypeSet' + description: Immune receptor genotype set for this subject. + mhc_genotype_set: + $ref: '#/MHCGenotypeSet' + description: MHC genotype set for this subject. 
# 1-to-n relationship between a subject and its diagnoses Diagnosis: @@ -1892,16 +2605,6 @@ DataProcessing: nullable: true adc-query-support: true name: Processed data file names - data_processing_germline: - type: object - description: Germline used for this data processing process - properties: - receptor_germline: - $ref: '#/GermlineSet' - description: Immune receptor germline set for this data processing process. - mhc_germline: - $ref: '#/MHCGermlineSet' - description: MHC germline set for this data processing process. germline_database: type: string description: Source of germline V(D)J genes with version number or date accessed.