diff --git a/src/common/collection_column_specs/genome_attribs-GTDB.yml b/src/common/collection_column_specs/genome_attribs-GTDB.yml index 481b77fd4..2ef8cb2ec 100644 --- a/src/common/collection_column_specs/genome_attribs-GTDB.yml +++ b/src/common/collection_column_specs/genome_attribs-GTDB.yml @@ -14,163 +14,304 @@ columns: - key: kbase_id type: string filter_strategy: identity + display_name: KBase ID + category: Identifiers + description: - key: kbase_display_name type: string filter_strategy: ngram + display_name: KBase Name + category: Identifiers + description: - key: accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: Accession + category: Identifiers + description: - key: checkm_completeness type: float + display_name: CheckM Completeness + category: Quality + description: - key: checkm_contamination type: float + display_name: CheckM Contamination + category: Quality + description: - key: checkm_marker_count type: int + display_name: Number of CheckM Markers + category: Other + description: - key: checkm_marker_lineage # GTDB node, might need better tokenizing or substring search type: string filter_strategy: fulltext + display_name: CheckM Marker Lineage + category: Other + description: - key: checkm_marker_set_count type: int + display_name: Number of CheckM Marker Sets + category: Other + description: - key: contig_count type: int + display_name: Number of Contigs + category: Other + description: - key: gc_count type: int + display_name: GC Count + category: Other + description: - key: gc_percentage type: float + display_name: GC Content + category: Statistics + description: - key: genome_size type: int + display_name: Genome Size + category: Statistics + description: - key: classification # GTDB classification, might need better tokenizing type: string filter_strategy: ngram + display_name: Classification + category: Taxonomy + description: - key: longest_contig type: int + display_name: Longest Contig + category: Other + description: - key: longest_scaffold type: int + display_name: Longest Scaffold + category: Other + description: - key: mean_contig_length type: float + display_name: Mean Contig Length + category: Other + description: - key: mean_scaffold_length type: float + display_name: Mean Scaffold Length + category: Other + description: - key: mimag_high_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity + display_name: Mimag High Quality + category: Quality + description: - key: mimag_low_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity + display_name: Mimag Low Quality + category: Quality + description: - key: mimag_medium_quality # is this a boolean? do we need a bool type & convert? type: string filter_strategy: identity + display_name: Mimag Medium Quality + category: Quality + description: - key: n50_contigs type: int + display_name: N50 Contigs + category: Other + description: - key: n50_scaffolds type: int + display_name: N50 Scaffolds + category: Other + description: - key: ncbi_assembly_level type: string filter_strategy: fulltext + display_name: NCBI Assembly Level + category: Source + description: - key: ncbi_assembly_name type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI Assembly Name + category: Other + description: - key: ncbi_bioproject type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI BioProject + category: Source + description: - key: ncbi_biosample type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI BioSample + category: Source + description: - key: ncbi_country type: string filter_strategy: fulltext + display_name: NCBI Country + category: Other + description: - key: ncbi_date # should put all dates into ISO8601 format, but not add significance type: date + display_name: NCBI Date + category: Source + description: - key: ncbi_genbank_assembly_accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: NCBI GenBank Assembly Accession + category: Other + description: - key: ncbi_genome_category type: string filter_strategy: fulltext + display_name: NCBI Genome Category + category: Other + description: - key: ncbi_isolate type: string filter_strategy: fulltext + display_name: NCBI Isolate + category: Other + description: - key: ncbi_isolation_source type: string filter_strategy: fulltext + display_name: NCBI Isolation Source + category: Other + description: - key: ncbi_lat_lon # might want to do specialized processing here type: string filter_strategy: fulltext + display_name: NCBI Latitude/Longitude + category: Other + description: - key: ncbi_organism_name type: string filter_strategy: fulltext + display_name: NCBI Organism Name + category: Taxonomy + description: - key: ncbi_seq_rel_date # should put all dates into ISO8601 format, but not add significance type: date + display_name: NCBI Sequence Release Date + category: Other + description: - key: ncbi_species_taxid type: int + display_name: NCBI Species Taxon ID + category: Other + description: - key: ncbi_strain_identifiers type: string filter_strategy: fulltext + display_name: NCBI Strain Identifiers + category: Other + description: - key: ncbi_submitter type: string filter_strategy: fulltext + display_name: NCBI Submitter + category: Other + description: - key: ncbi_taxid type: int + display_name: NCBI Taxon ID + category: Other + description: - key: ncbi_taxonomy_unfiltered # might need better tokenizing type: string filter_strategy: fulltext + display_name: NCBI Taxonomy (Unfiltered) + category: Other + description: - key: protein_count type: int + display_name: Number of Protein Encoding Genes + category: Statistics + description: - key: scaffold_count type: int + display_name: Number of Scaffolds + category: Other + description: - key: ssu_count type: int + display_name: SSU Count + category: Other + description: - key: ssu_length # this looks like an int that's a string in the DB. Should we coerce? type: string filter_strategy: fulltext + display_name: SSU Length + category: Other + description: - key: trna_aa_count type: int + display_name: tRNA AA Count + category: Other + description: - key: trna_count type: int + display_name: Number of tRNA Genes + category: Statistics + description: - key: trna_selenocysteine_count type: int + display_name: tRNA Selenocysteine Count + category: Other + description: diff --git a/src/common/collection_column_specs/genome_attribs-PMI.yml b/src/common/collection_column_specs/genome_attribs-PMI.yml index c36fd1776..be9cb3e3a 100644 --- a/src/common/collection_column_specs/genome_attribs-PMI.yml +++ b/src/common/collection_column_specs/genome_attribs-PMI.yml @@ -14,85 +14,157 @@ columns: - key: kbase_id type: string filter_strategy: identity + display_name: KBase ID + category: Identifiers + description: - key: kbase_sample_id type: string filter_strategy: identity + display_name: KBase Sample ID + category: Identifiers + description: - key: kbase_display_name type: string filter_strategy: ngram + display_name: KBase Name + category: Identifiers + description: - key: kbase_genome_size type: int + display_name: Genome Size (KBase) + category: Statistics + description: - key: kbase_gc_content type: float + display_name: GC Content (KBase) + category: Statistics + description: - key: kbase_num_contigs type: int + display_name: Number of Contigs (KBase) + category: Statistics + description: - key: kbase_num_cds type: int + display_name: Number of CDS (KBase) + category: Statistics + description: - key: kbase_num_protein_encoding_genes type: int + display_name: Number of Protein Encoding Genes (KBase) + category: Statistics + description: - key: Contamination # checkm2 # GTDB is checkm_contamination type: float + display_name: CheckM Contamination + category: Quality + description: - key: Completeness # checkm2 # GTDB is checkm_completeness type: float + display_name: CheckM Completeness + category: Quality + description: - key: user_genome # id provided to GTDB_tk type: string filter_strategy: identity + display_name: User Genome + category: Other + description: - key: classification # GTDB classification, might need better tokenizing type: string filter_strategy: ngram + display_name: Classification + category: Taxonomy + description: - key: fastani_reference # e.g. the genbank accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: FastANI Reference + category: Other + description: - key: fastani_reference_radius type: float + display_name: FastANI Reference Radius + category: Other + description: - key: fastani_taxonomy # might need better tokenizing type: string filter_strategy: fulltext + display_name: FastANI Taxonomy + category: Other + description: - key: fastani_ani type: float + display_name: FastANI ANI + category: Other + description: - key: fastani_af type: float + display_name: FastANI AF + category: Other + description: - key: closest_placement_reference # genbank accession type: string filter_strategy: identity # this seems like a good candidate for substring + display_name: Closest Placement Reference + category: Other + description: - key: closest_placement_radius type: float + display_name: Closest Placement Radius + category: Other + description: - key: closest_placement_ani type: float + display_name: Closest Placement ANI + category: Other + description: - key: closest_placement_af type: float + display_name: Closest Placement AF + category: Other + description: - key: pplacer_taxonomy # might need better tokenizing type: string filter_strategy: fulltext + display_name: pplacer Taxonomy + category: Other + description: - key: classification_method type: string filter_strategy: fulltext + display_name: Classification Method + category: Taxonomy + description: - key: note type: string filter_strategy: fulltext + display_name: Note + category: Other + description: # this field probably needs special processing as it's a list of accessions with # an array of associated numbers. It might even be best not to include it in search for @@ -100,17 +172,32 @@ columns: - key: other_related_references(genome_id,species_name,radius,ANI,AF) type: string filter_strategy: fulltext + display_name: Other Related References + category: Other + description: - key: msa_percent type: float + display_name: MSA Percent + category: Other + description: - key: translation_table type: int + display_name: Translation Table + category: Other + description: - key: red_value # not sure what this is type: string filter_strategy: fulltext + display_name: Red Value + category: Other + description: - key: warnings type: string filter_strategy: fulltext + display_name: Warnings + category: Other + description: