kbase · Tianhao-Gu · Jan 11, 2024 · Jan 11, 2024 · Jan 11, 2024
diff --git a/src/common/collection_column_specs/genome_attribs-GTDB.yml b/src/common/collection_column_specs/genome_attribs-GTDB.yml
@@ -14,163 +14,304 @@ columns:
    - key: kbase_id
      type: string
      filter_strategy: identity
+     display_name: KBase ID
+     category: Identifiers
+     description:
 
    - key: kbase_display_name
      type: string
      filter_strategy: ngram
+     display_name: KBase Name
+     category: Identifiers
+     description:
 
    - key: accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: Accession
+     category: Identifiers
+     description:
 
    - key: checkm_completeness
      type: float
+     display_name: CheckM Completeness
+     category: Quality
+     description:
 
    - key: checkm_contamination
      type: float
+     display_name: CheckM Contamination
+     category: Quality
+     description:
 
    - key: checkm_marker_count
      type: int
+     display_name: Number of CheckM Markers
+     category: Other
+     description:
 
    - key: checkm_marker_lineage  # GTDB node, might need better tokenizing or substring search
      type: string
      filter_strategy: fulltext
+     display_name: CheckM Marker Lineage
+     category: Other
+     description:
 
    - key: checkm_marker_set_count
      type: int
+     display_name: Number of CheckM Marker Sets
+     category: Other
+     description:
 
    - key: contig_count
      type: int
+     display_name: Number of Contigs
+     category: Other
+     description:
 
    - key: gc_count
      type: int
+     display_name: GC Count
+     category: Other
+     description:
 
    - key: gc_percentage
      type: float
+     display_name: GC Content
+     category: Statistics
+     description:
 
    - key: genome_size
      type: int
+     display_name: Genome Size
+     category: Statistics
+     description:
 
    - key: classification  # GTDB classification, might need better tokenizing
      type: string
      filter_strategy: ngram
+     display_name: Classification
+     category: Taxonomy
+     description:
 
    - key: longest_contig
      type: int
+     display_name: Longest Contig
+     category: Other
+     description:
 
    - key: longest_scaffold
      type: int
+     display_name: Longest Scaffold
+     category: Other
+     description:
 
    - key: mean_contig_length
      type: float
+     display_name: Mean Contig Length
+     category: Other
+     description:
 
    - key: mean_scaffold_length
      type: float
+     display_name: Mean Scaffold Length
+     category: Other
+     description:
 
    - key: mimag_high_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
+     display_name: Mimag High Quality
+     category: Quality
+     description:
 
    - key: mimag_low_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
+     display_name: Mimag Low Quality
+     category: Quality
+     description:
 
    - key: mimag_medium_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
+     display_name: Mimag Medium Quality
+     category: Quality
+     description:
 
    - key: n50_contigs
      type: int
+     display_name: N50 Contigs
+     category: Other
+     description:
 
    - key: n50_scaffolds
      type: int
+     display_name: N50 Scaffolds
+     category: Other
+     description:
 
    - key: ncbi_assembly_level
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Assembly Level
+     category: Source
+     description:
 
    - key: ncbi_assembly_name
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI Assembly Name
+     category: Other
+     description:
 
    - key: ncbi_bioproject
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI BioProject
+     category: Source
+     description:
 
    - key: ncbi_biosample
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI BioSample
+     category: Source
+     description:
 
    - key: ncbi_country
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Country
+     category: Other
+     description:
 
    - key: ncbi_date  # should put all dates into ISO8601 format, but not add significance
      type: date
+     display_name: NCBI Date
+     category: Source
+     description:
 
    - key: ncbi_genbank_assembly_accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI GenBank Assembly Accession
+     category: Other
+     description:
 
    - key: ncbi_genome_category
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Genome Category
+     category: Other
+     description:
 
    - key: ncbi_isolate
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Isolate
+     category: Other
+     description:
 
    - key: ncbi_isolation_source
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Isolation Source
+     category: Other
+     description:
 
    - key: ncbi_lat_lon  # might want to do specialized processing here
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Latitude/Longitude
+     category: Other
+     description:
 
    - key: ncbi_organism_name
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Organism Name
+     category: Taxonomy
+     description:
 
    - key: ncbi_seq_rel_date  # should put all dates into ISO8601 format, but not add significance
      type: date
+     display_name: NCBI Sequence Release Date
+     category: Other
+     description:
 
    - key: ncbi_species_taxid
      type: int
+     display_name: NCBI Species Taxon ID
+     category: Other
+     description:
 
    - key: ncbi_strain_identifiers
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Strain Identifiers
+     category: Other
+     description:
 
    - key: ncbi_submitter
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Submitter
+     category: Other
+     description:
 
    - key: ncbi_taxid
      type: int
+     display_name: NCBI Taxon ID
+     category: Other
+     description:
 
    - key: ncbi_taxonomy_unfiltered  # might need better tokenizing
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Taxonomy (Unfiltered)
+     category: Other
+     description:
 
    - key: protein_count
      type: int
+     display_name: Number of Protein Encoding Genes
+     category: Statistics
+     description:
 
    - key: scaffold_count
      type: int
+     display_name: Number of Scaffolds
+     category: Other
+     description:
 
    - key: ssu_count
      type: int
+     display_name: SSU Count
+     category: Other
+     description:
 
    - key: ssu_length  # this looks like an int that's a string in the DB. Should we coerce?
      type: string
      filter_strategy: fulltext
+     display_name: SSU Length
+     category: Other
+     description:
 
    - key: trna_aa_count
      type: int
+     display_name: tRNA AA Count
+     category: Other
+     description:
 
    - key: trna_count
      type: int
+     display_name: Number of tRNA Genes
+     category: Statistics
+     description:
 
    - key: trna_selenocysteine_count
      type: int
+     display_name: tRNA Selenocysteine Count
+     category: Other
+     description: