From 86034a8f337a8a6e4b01ce7a24302c56dc1855f0 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 11 Jan 2024 12:03:16 -0600 Subject: [PATCH 1/2] add display name and category for genome attrib --- .../genome_attribs-GTDB.yml | 94 +++++++++++++++++++ .../genome_attribs-PMI.yml | 58 ++++++++++++ 2 files changed, 152 insertions(+) diff --git a/src/common/collection_column_specs/genome_attribs-GTDB.yml b/src/common/collection_column_specs/genome_attribs-GTDB.yml index 481b77fd4..7e8867f6a 100644 --- a/src/common/collection_column_specs/genome_attribs-GTDB.yml +++ b/src/common/collection_column_specs/genome_attribs-GTDB.yml @@ -14,163 +14,257 @@ columns: - key: kbase_id type: string filter_strategy: identity + display_name: KBase ID + category: Identifiers - key: kbase_display_name type: string filter_strategy: ngram + display_name: KBase Name + category: Identifiers - key: accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: Accession + category: Identifiers - key: checkm_completeness type: float + display_name: CheckM Completeness + category: Quality - key: checkm_contamination type: float + display_name: CheckM Contamination + category: Quality - key: checkm_marker_count type: int + display_name: Number of CheckM Markers + category: Other - key: checkm_marker_lineage # GTDB node, might need better tokenizing or substring search type: string filter_strategy: fulltext + display_name: CheckM Marker Lineage + category: Other - key: checkm_marker_set_count type: int + display_name: Number of CheckM Marker Sets + category: Other - key: contig_count type: int + display_name: Number of Contigs + category: Other - key: gc_count type: int + display_name: GC Count + category: Other - key: gc_percentage type: float + display_name: GC Content + category: Statistics - key: genome_size type: int + display_name: Genome Size + category: Statistics - key: classification # GTDB classification, might need better tokenizing type: string filter_strategy: ngram + display_name: Classification + category: Taxonomy - key: longest_contig type: int + display_name: Longest Contig + category: Other - key: longest_scaffold type: int + display_name: Longest Scaffold + category: Other - key: mean_contig_length type: float + display_name: Mean Contig Length + category: Other - key: mean_scaffold_length type: float + display_name: Mean Scaffold Length + category: Other - key: mimag_high_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity + display_name: Mimag High Quality + category: Quality - key: mimag_low_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity + display_name: Mimag Low Quality + category: Quality - key: mimag_medium_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity + display_name: Mimag Medium Quality + category: Quality - key: n50_contigs type: int + display_name: N50 Contigs + category: Other - key: n50_scaffolds type: int + display_name: N50 Scaffolds + category: Other - key: ncbi_assembly_level type: string filter_strategy: fulltext + display_name: NCBI Assembly Level + category: Source - key: ncbi_assembly_name type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI Assembly Name + category: Other - key: ncbi_bioproject type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI BioProject + category: Source - key: ncbi_biosample type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI BioSample + category: Source - key: ncbi_country type: string filter_strategy: fulltext + display_name: NCBI Country + category: Other - key: ncbi_date # should put all dates into ISO8601 format, but not add significance type: date + display_name: NCBI Date + category: Source - key: ncbi_genbank_assembly_accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI GenBank Assembly Accession + category: Other - key: ncbi_genome_category type: string filter_strategy: fulltext + display_name: NCBI Genome Category + category: Other - key: ncbi_isolate type: string filter_strategy: fulltext + display_name: NCBI Isolate + category: Other - key: ncbi_isolation_source type: string filter_strategy: fulltext + display_name: NCBI Isolation Source + category: Other - key: ncbi_lat_lon # might want to do specialized processing here type: string filter_strategy: fulltext + display_name: NCBI Latitude/Longitude + category: Other - key: ncbi_organism_name type: string filter_strategy: fulltext + display_name: NCBI Organism Name + category: Taxonomy - key: ncbi_seq_rel_date # should put all dates into ISO8601 format, but not add significance type: date + display_name: NCBI Sequence Release Date + category: Other - key: ncbi_species_taxid type: int + display_name: NCBI Species Taxon ID + category: Other - key: ncbi_strain_identifiers type: string filter_strategy: fulltext + display_name: NCBI Strain Identifiers + category: Other - key: ncbi_submitter type: string filter_strategy: fulltext + display_name: NCBI Submitter + category: Other - key: ncbi_taxid type: int + display_name: NCBI Taxon ID + category: Other - key: ncbi_taxonomy_unfiltered # might need better tokenizing type: string filter_strategy: fulltext + display_name: NCBI Taxonomy (Unfiltered) + category: Other - key: protein_count type: int + display_name: Number of Protein Encoding Genes + category: Statistics - key: scaffold_count type: int + display_name: Number of Scaffolds + category: Other - key: ssu_count type: int + display_name: SSU Count + category: Other - key: ssu_length # this looks like an int that's a string in the DB. Should we coerce? type: string filter_strategy: fulltext + display_name: SSU Length + category: Other - key: trna_aa_count type: int + display_name: tRNA AA Count + category: Other - key: trna_count type: int + display_name: Number of tRNA Genes + category: Statistics - key: trna_selenocysteine_count type: int + display_name: tRNA Selenocysteine Count + category: Other diff --git a/src/common/collection_column_specs/genome_attribs-PMI.yml b/src/common/collection_column_specs/genome_attribs-PMI.yml index c36fd1776..4faea7ffa 100644 --- a/src/common/collection_column_specs/genome_attribs-PMI.yml +++ b/src/common/collection_column_specs/genome_attribs-PMI.yml @@ -14,85 +14,133 @@ columns: - key: kbase_id type: string filter_strategy: identity + display_name: KBase ID + category: Identifiers - key: kbase_sample_id type: string filter_strategy: identity + display_name: KBase Sample ID + category: Identifiers - key: kbase_display_name type: string filter_strategy: ngram + display_name: KBase Name + category: Identifiers - key: kbase_genome_size type: int + display_name: Genome Size (KBase) + category: Statistics - key: kbase_gc_content type: float + display_name: GC Content (KBase) + category: Statistics - key: kbase_num_contigs type: int + display_name: Number of Contigs (KBase) + category: Statistics - key: kbase_num_cds type: int + display_name: Number of CDS (KBase) + category: Statistics - key: kbase_num_protein_encoding_genes type: int + display_name: Number of Protein Encoding Genes (KBase) + category: Statistics - key: Contamination # checkm2 # GTDB is checkm_contamination type: float + display_name: CheckM Contamination + category: Quality - key: Completeness # checkm2 # GTDB is checkm_completeness type: float + display_name: CheckM Completeness + category: Quality - key: user_genome # id provided to GTDB_tk type: string filter_strategy: identity + display_name: User Genome + category: Other - key: classification # GTDB classification, might need better tokenizing type: string filter_strategy: ngram + display_name: Classification + category: Taxonomy - key: fastani_reference # e.g. the genbank accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: FastANI Reference + category: Other - key: fastani_reference_radius type: float + display_name: FastANI Reference Radius + category: Other - key: fastani_taxonomy # might need better tokenizing type: string filter_strategy: fulltext + display_name: FastANI Taxonomy + category: Other - key: fastani_ani type: float + display_name: FastANI ANI + category: Other - key: fastani_af type: float + display_name: FastANI AF + category: Other - key: closest_placement_reference # genbank accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: Closest Placement Reference + category: Other - key: closest_placement_radius type: float + display_name: Closest Placement Radius + category: Other - key: closest_placement_ani type: float + display_name: Closest Placement ANI + category: Other - key: closest_placement_af type: float + display_name: Closest Placement AF + category: Other - key: pplacer_taxonomy # might need better tokenizing type: string filter_strategy: fulltext + display_name: pplacer Taxonomy + category: Other - key: classification_method type: string filter_strategy: fulltext + display_name: Classification Method + category: Taxonomy - key: note type: string filter_strategy: fulltext + display_name: Note + category: Other # this field probably needs special processing as it's a list of accessions with # an array of associated numbers. It might even be best not to include it in search for @@ -100,17 +148,27 @@ columns: - key: other_related_references(genome_id,species_name,radius,ANI,AF) type: string filter_strategy: fulltext + display_name: Other Related References + category: Other - key: msa_percent type: float + display_name: MSA Percent + category: Other - key: translation_table type: int + display_name: Translation Table + category: Other - key: red_value # not sure what this is type: string filter_strategy: fulltext + display_name: Red Value + category: Other - key: warnings type: string filter_strategy: fulltext + display_name: Warnings + category: Other From 83bb78c27d9aa6256bd9356618fae4540c0905d9 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Thu, 11 Jan 2024 12:51:29 -0600 Subject: [PATCH 2/2] add descp placeholder --- .../genome_attribs-GTDB.yml | 47 +++++++++++++++++++ .../genome_attribs-PMI.yml | 29 ++++++++++++ 2 files changed, 76 insertions(+) diff --git a/src/common/collection_column_specs/genome_attribs-GTDB.yml b/src/common/collection_column_specs/genome_attribs-GTDB.yml index 7e8867f6a..2ef8cb2ec 100644 --- a/src/common/collection_column_specs/genome_attribs-GTDB.yml +++ b/src/common/collection_column_specs/genome_attribs-GTDB.yml @@ -16,255 +16,302 @@ columns: filter_strategy: identity display_name: KBase ID category: Identifiers + description: - key: kbase_display_name type: string filter_strategy: ngram display_name: KBase Name category: Identifiers + description: - key: accession type: string filter_strategy: identity # this seems like a good candidate for substring display_name: Accession category: Identifiers + description: - key: checkm_completeness type: float display_name: CheckM Completeness category: Quality + description: - key: checkm_contamination type: float display_name: CheckM Contamination category: Quality + description: - key: checkm_marker_count type: int display_name: Number of CheckM Markers category: Other + description: - key: checkm_marker_lineage # GTDB node, might need better tokenizing or substring search type: string filter_strategy: fulltext display_name: CheckM Marker Lineage category: Other + description: - key: checkm_marker_set_count type: int display_name: Number of CheckM Marker Sets category: Other + description: - key: contig_count type: int display_name: Number of Contigs category: Other + description: - key: gc_count type: int display_name: GC Count category: Other + description: - key: gc_percentage type: float display_name: GC Content category: Statistics + description: - key: genome_size type: int display_name: Genome Size category: Statistics + description: - key: classification # GTDB classification, might need better tokenizing type: string filter_strategy: ngram display_name: Classification category: Taxonomy + description: - key: longest_contig type: int display_name: Longest Contig category: Other + description: - key: longest_scaffold type: int display_name: Longest Scaffold category: Other + description: - key: mean_contig_length type: float display_name: Mean Contig Length category: Other + description: - key: mean_scaffold_length type: float display_name: Mean Scaffold Length category: Other + description: - key: mimag_high_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity display_name: Mimag High Quality category: Quality + description: - key: mimag_low_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity display_name: Mimag Low Quality category: Quality + description: - key: mimag_medium_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity display_name: Mimag Medium Quality category: Quality + description: - key: n50_contigs type: int display_name: N50 Contigs category: Other + description: - key: n50_scaffolds type: int display_name: N50 Scaffolds category: Other + description: - key: ncbi_assembly_level type: string filter_strategy: fulltext display_name: NCBI Assembly Level category: Source + description: - key: ncbi_assembly_name type: string filter_strategy: identity # this seems like a good candidate for substring display_name: NCBI Assembly Name category: Other + description: - key: ncbi_bioproject type: string filter_strategy: identity # this seems like a good candidate for substring display_name: NCBI BioProject category: Source + description: - key: ncbi_biosample type: string filter_strategy: identity # this seems like a good candidate for substring display_name: NCBI BioSample category: Source + description: - key: ncbi_country type: string filter_strategy: fulltext display_name: NCBI Country category: Other + description: - key: ncbi_date # should put all dates into ISO8601 format, but not add significance type: date display_name: NCBI Date category: Source + description: - key: ncbi_genbank_assembly_accession type: string filter_strategy: identity # this seems like a good candidate for substring display_name: NCBI GenBank Assembly Accession category: Other + description: - key: ncbi_genome_category type: string filter_strategy: fulltext display_name: NCBI Genome Category category: Other + description: - key: ncbi_isolate type: string filter_strategy: fulltext display_name: NCBI Isolate category: Other + description: - key: ncbi_isolation_source type: string filter_strategy: fulltext display_name: NCBI Isolation Source category: Other + description: - key: ncbi_lat_lon # might want to do specialized processing here type: string filter_strategy: fulltext display_name: NCBI Latitude/Longitude category: Other + description: - key: ncbi_organism_name type: string filter_strategy: fulltext display_name: NCBI Organism Name category: Taxonomy + description: - key: ncbi_seq_rel_date # should put all dates into ISO8601 format, but not add significance type: date display_name: NCBI Sequence Release Date category: Other + description: - key: ncbi_species_taxid type: int display_name: NCBI Species Taxon ID category: Other + description: - key: ncbi_strain_identifiers type: string filter_strategy: fulltext display_name: NCBI Strain Identifiers category: Other + description: - key: ncbi_submitter type: string filter_strategy: fulltext display_name: NCBI Submitter category: Other + description: - key: ncbi_taxid type: int display_name: NCBI Taxon ID category: Other + description: - key: ncbi_taxonomy_unfiltered # might need better tokenizing type: string filter_strategy: fulltext display_name: NCBI Taxonomy (Unfiltered) category: Other + description: - key: protein_count type: int display_name: Number of Protein Encoding Genes category: Statistics + description: - key: scaffold_count type: int display_name: Number of Scaffolds category: Other + description: - key: ssu_count type: int display_name: SSU Count category: Other + description: - key: ssu_length # this looks like an int that's a string in the DB. Should we coerce? type: string filter_strategy: fulltext display_name: SSU Length category: Other + description: - key: trna_aa_count type: int display_name: tRNA AA Count category: Other + description: - key: trna_count type: int display_name: Number of tRNA Genes category: Statistics + description: - key: trna_selenocysteine_count type: int display_name: tRNA Selenocysteine Count category: Other + description: diff --git a/src/common/collection_column_specs/genome_attribs-PMI.yml b/src/common/collection_column_specs/genome_attribs-PMI.yml index 4faea7ffa..be9cb3e3a 100644 --- a/src/common/collection_column_specs/genome_attribs-PMI.yml +++ b/src/common/collection_column_specs/genome_attribs-PMI.yml @@ -16,131 +16,155 @@ columns: filter_strategy: identity display_name: KBase ID category: Identifiers + description: - key: kbase_sample_id type: string filter_strategy: identity display_name: KBase Sample ID category: Identifiers + description: - key: kbase_display_name type: string filter_strategy: ngram display_name: KBase Name category: Identifiers + description: - key: kbase_genome_size type: int display_name: Genome Size (KBase) category: Statistics + description: - key: kbase_gc_content type: float display_name: GC Content (KBase) category: Statistics + description: - key: kbase_num_contigs type: int display_name: Number of Contigs (KBase) category: Statistics + description: - key: kbase_num_cds type: int display_name: Number of CDS (KBase) category: Statistics + description: - key: kbase_num_protein_encoding_genes type: int display_name: Number of Protein Encoding Genes (KBase) category: Statistics + description: - key: Contamination # checkm2 # GTDB is checkm_contamination type: float display_name: CheckM Contamination category: Quality + description: - key: Completeness # checkm2 # GTDB is checkm_completeness type: float display_name: CheckM Completeness category: Quality + description: - key: user_genome # id provided to GTDB_tk type: string filter_strategy: identity display_name: User Genome category: Other + description: - key: classification # GTDB classification, might need better tokenizing type: string filter_strategy: ngram display_name: Classification category: Taxonomy + description: - key: fastani_reference # e.g. the genbank accession type: string filter_strategy: identity # this seems like a good candidate for substring display_name: FastANI Reference category: Other + description: - key: fastani_reference_radius type: float display_name: FastANI Reference Radius category: Other + description: - key: fastani_taxonomy # might need better tokenizing type: string filter_strategy: fulltext display_name: FastANI Taxonomy category: Other + description: - key: fastani_ani type: float display_name: FastANI ANI category: Other + description: - key: fastani_af type: float display_name: FastANI AF category: Other + description: - key: closest_placement_reference # genbank accession type: string filter_strategy: identity # this seems like a good candidate for substring display_name: Closest Placement Reference category: Other + description: - key: closest_placement_radius type: float display_name: Closest Placement Radius category: Other + description: - key: closest_placement_ani type: float display_name: Closest Placement ANI category: Other + description: - key: closest_placement_af type: float display_name: Closest Placement AF category: Other + description: - key: pplacer_taxonomy # might need better tokenizing type: string filter_strategy: fulltext display_name: pplacer Taxonomy category: Other + description: - key: classification_method type: string filter_strategy: fulltext display_name: Classification Method category: Taxonomy + description: - key: note type: string filter_strategy: fulltext display_name: Note category: Other + description: # this field probably needs special processing as it's a list of accessions with # an array of associated numbers. It might even be best not to include it in search for @@ -150,25 +174,30 @@ columns: filter_strategy: fulltext display_name: Other Related References category: Other + description: - key: msa_percent type: float display_name: MSA Percent category: Other + description: - key: translation_table type: int display_name: Translation Table category: Other + description: - key: red_value # not sure what this is type: string filter_strategy: fulltext display_name: Red Value category: Other + description: - key: warnings type: string filter_strategy: fulltext display_name: Warnings category: Other + description: