From 86034a8f337a8a6e4b01ce7a24302c56dc1855f0 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu <tgu@anl.gov>
Date: Thu, 11 Jan 2024 12:03:16 -0600
Subject: [PATCH 1/2] add display name and category for genome attribs columns

---
 .../genome_attribs-GTDB.yml                   | 94 +++++++++++++++++++
 .../genome_attribs-PMI.yml                    | 58 ++++++++++++
 2 files changed, 152 insertions(+)

diff --git a/src/common/collection_column_specs/genome_attribs-GTDB.yml b/src/common/collection_column_specs/genome_attribs-GTDB.yml
index 481b77fd4..7e8867f6a 100644
--- a/src/common/collection_column_specs/genome_attribs-GTDB.yml
+++ b/src/common/collection_column_specs/genome_attribs-GTDB.yml
@@ -14,163 +14,257 @@ columns:
    - key: kbase_id
      type: string
      filter_strategy: identity
+     display_name: KBase ID
+     category: Identifiers
 
    - key: kbase_display_name
      type: string
      filter_strategy: ngram
+     display_name: KBase Name
+     category: Identifiers
 
    - key: accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: Accession
+     category: Identifiers
 
    - key: checkm_completeness
      type: float
+     display_name: CheckM Completeness
+     category: Quality
 
    - key: checkm_contamination
      type: float
+     display_name: CheckM Contamination
+     category: Quality
 
    - key: checkm_marker_count
      type: int
+     display_name: Number of CheckM Markers
+     category: Other
 
    - key: checkm_marker_lineage  # GTDB node, might need better tokenizing or substring search
      type: string
      filter_strategy: fulltext
+     display_name: CheckM Marker Lineage
+     category: Other
 
    - key: checkm_marker_set_count
      type: int
+     display_name: Number of CheckM Marker Sets
+     category: Other
 
    - key: contig_count
      type: int
+     display_name: Number of Contigs
+     category: Other
 
    - key: gc_count
      type: int
+     display_name: GC Count
+     category: Other
 
    - key: gc_percentage
      type: float
+     display_name: GC Content
+     category: Statistics
 
    - key: genome_size
      type: int
+     display_name: Genome Size
+     category: Statistics
 
    - key: classification  # GTDB classification, might need better tokenizing
      type: string
      filter_strategy: ngram
+     display_name: Classification
+     category: Taxonomy
 
    - key: longest_contig
      type: int
+     display_name: Longest Contig
+     category: Other
 
    - key: longest_scaffold
      type: int
+     display_name: Longest Scaffold
+     category: Other
 
    - key: mean_contig_length
      type: float
+     display_name: Mean Contig Length
+     category: Other
 
    - key: mean_scaffold_length
      type: float
+     display_name: Mean Scaffold Length
+     category: Other
 
    - key: mimag_high_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
+     display_name: Mimag High Quality
+     category: Quality
 
    - key: mimag_low_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
+     display_name: Mimag Low Quality
+     category: Quality
 
    - key: mimag_medium_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
+     display_name: Mimag Medium Quality
+     category: Quality
 
    - key: n50_contigs
      type: int
+     display_name: N50 Contigs
+     category: Other
 
    - key: n50_scaffolds
      type: int
+     display_name: N50 Scaffolds
+     category: Other
 
    - key: ncbi_assembly_level
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Assembly Level
+     category: Source
 
    - key: ncbi_assembly_name
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI Assembly Name
+     category: Other
 
    - key: ncbi_bioproject
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI BioProject
+     category: Source
 
    - key: ncbi_biosample
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI BioSample
+     category: Source
 
    - key: ncbi_country
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Country
+     category: Other
 
    - key: ncbi_date  # should put all dates into ISO8601 format, but not add significance
      type: date
+     display_name: NCBI Date
+     category: Source
 
    - key: ncbi_genbank_assembly_accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: NCBI GenBank Assembly Accession
+     category: Other
 
    - key: ncbi_genome_category
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Genome Category
+     category: Other
 
    - key: ncbi_isolate
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Isolate
+     category: Other
 
    - key: ncbi_isolation_source
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Isolation Source
+     category: Other
 
    - key: ncbi_lat_lon  # might want to do specialized processing here
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Latitude/Longitude
+     category: Other
 
    - key: ncbi_organism_name
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Organism Name
+     category: Taxonomy
 
    - key: ncbi_seq_rel_date  # should put all dates into ISO8601 format, but not add significance
      type: date
+     display_name: NCBI Sequence Release Date
+     category: Other
 
    - key: ncbi_species_taxid
      type: int
+     display_name: NCBI Species Taxon ID
+     category: Other
 
    - key: ncbi_strain_identifiers
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Strain Identifiers
+     category: Other
 
    - key: ncbi_submitter
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Submitter
+     category: Other
 
    - key: ncbi_taxid
      type: int
+     display_name: NCBI Taxon ID
+     category: Other
 
    - key: ncbi_taxonomy_unfiltered  # might need better tokenizing
      type: string
      filter_strategy: fulltext
+     display_name: NCBI Taxonomy (Unfiltered)
+     category: Other
 
    - key: protein_count
      type: int
+     display_name: Number of Protein Encoding Genes
+     category: Statistics
 
    - key: scaffold_count
      type: int
+     display_name: Number of Scaffolds
+     category: Other
 
    - key: ssu_count
      type: int
+     display_name: SSU Count
+     category: Other
 
    - key: ssu_length  # this looks like an int that's a string in the DB. Should we coerce?
      type: string
      filter_strategy: fulltext
+     display_name: SSU Length
+     category: Other
 
    - key: trna_aa_count
      type: int
+     display_name: tRNA AA Count
+     category: Other
 
    - key: trna_count
      type: int
+     display_name: Number of tRNA Genes
+     category: Statistics
 
    - key: trna_selenocysteine_count
      type: int
+     display_name: tRNA Selenocysteine Count
+     category: Other
diff --git a/src/common/collection_column_specs/genome_attribs-PMI.yml b/src/common/collection_column_specs/genome_attribs-PMI.yml
index c36fd1776..4faea7ffa 100644
--- a/src/common/collection_column_specs/genome_attribs-PMI.yml
+++ b/src/common/collection_column_specs/genome_attribs-PMI.yml
@@ -14,85 +14,133 @@ columns:
    - key: kbase_id
      type: string
      filter_strategy: identity
+     display_name: KBase ID
+     category: Identifiers
 
    - key: kbase_sample_id
      type: string
      filter_strategy: identity
+     display_name: KBase Sample ID
+     category: Identifiers
 
    - key: kbase_display_name
      type: string
      filter_strategy: ngram
+     display_name: KBase Name
+     category: Identifiers
 
    - key: kbase_genome_size
      type: int
+     display_name: Genome Size (KBase)
+     category: Statistics
 
    - key: kbase_gc_content
      type: float
+     display_name: GC Content (KBase)
+     category: Statistics
 
    - key: kbase_num_contigs
      type: int
+     display_name: Number of Contigs (KBase)
+     category: Statistics
 
    - key: kbase_num_cds
      type: int
+     display_name: Number of CDS (KBase)
+     category: Statistics
 
    - key: kbase_num_protein_encoding_genes
      type: int
+     display_name: Number of Protein Encoding Genes (KBase)
+     category: Statistics
 
    - key: Contamination  # checkm2  # GTDB is checkm_contamination
      type: float
+     display_name: CheckM Contamination
+     category: Quality
    
    - key: Completeness  # checkm2  # GTDB is checkm_completeness
      type: float
+     display_name: CheckM Completeness
+     category: Quality
    
    - key: user_genome  # id provided to GTDB_tk
      type: string
      filter_strategy: identity
+     display_name: User Genome
+     category: Other
    
    - key: classification  # GTDB classification, might need better tokenizing
      type: string
      filter_strategy: ngram
+     display_name: Classification
+     category: Taxonomy
    
    - key: fastani_reference  # e.g. the genbank accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: FastANI Reference
+     category: Other
 
    - key: fastani_reference_radius
      type: float
+     display_name: FastANI Reference Radius
+     category: Other
 
    - key: fastani_taxonomy  # might need better tokenizing
      type: string
      filter_strategy: fulltext
+     display_name: FastANI Taxonomy
+     category: Other
 
    - key: fastani_ani
      type: float
+     display_name: FastANI ANI
+     category: Other
 
    - key: fastani_af
      type: float
+     display_name: FastANI AF
+     category: Other
 
    - key: closest_placement_reference  # genbank accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
+     display_name: Closest Placement Reference
+     category: Other
 
    - key: closest_placement_radius
      type: float
+     display_name: Closest Placement Radius
+     category: Other
 
    - key: closest_placement_ani
      type: float
+     display_name: Closest Placement ANI
+     category: Other
 
    - key: closest_placement_af
      type: float
+     display_name: Closest Placement AF
+     category: Other
 
    - key: pplacer_taxonomy  # might need better tokenizing
      type: string
      filter_strategy: fulltext
+     display_name: pplacer Taxonomy
+     category: Other
 
    - key: classification_method
      type: string
      filter_strategy: fulltext
+     display_name: Classification Method
+     category: Taxonomy
 
    - key: note
      type: string
      filter_strategy: fulltext
+     display_name: Note
+     category: Other
 
    # this field probably needs special processing as it's a list of accessions with
    # an array of associated numbers. It might even be best not to include it in search for
@@ -100,17 +148,27 @@ columns:
    - key: other_related_references(genome_id,species_name,radius,ANI,AF)
      type: string
      filter_strategy: fulltext
+     display_name: Other Related References
+     category: Other
 
    - key: msa_percent
      type: float
+     display_name: MSA Percent
+     category: Other
 
    - key: translation_table
      type: int
+     display_name: Translation Table
+     category: Other
 
    - key: red_value  # not sure what this is
      type: string
      filter_strategy: fulltext
+     display_name: Red Value
+     category: Other
 
    - key: warnings
      type: string
      filter_strategy: fulltext
+     display_name: Warnings
+     category: Other

From 83bb78c27d9aa6256bd9356618fae4540c0905d9 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu <tgu@anl.gov>
Date: Thu, 11 Jan 2024 12:51:29 -0600
Subject: [PATCH 2/2] add description placeholder

---
 .../genome_attribs-GTDB.yml                   | 47 +++++++++++++++++++
 .../genome_attribs-PMI.yml                    | 29 ++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/src/common/collection_column_specs/genome_attribs-GTDB.yml b/src/common/collection_column_specs/genome_attribs-GTDB.yml
index 7e8867f6a..2ef8cb2ec 100644
--- a/src/common/collection_column_specs/genome_attribs-GTDB.yml
+++ b/src/common/collection_column_specs/genome_attribs-GTDB.yml
@@ -16,255 +16,302 @@ columns:
      filter_strategy: identity
      display_name: KBase ID
      category: Identifiers
+     description:
 
    - key: kbase_display_name
      type: string
      filter_strategy: ngram
      display_name: KBase Name
      category: Identifiers
+     description:
 
    - key: accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: Accession
      category: Identifiers
+     description:
 
    - key: checkm_completeness
      type: float
      display_name: CheckM Completeness
      category: Quality
+     description:
 
    - key: checkm_contamination
      type: float
      display_name: CheckM Contamination
      category: Quality
+     description:
 
    - key: checkm_marker_count
      type: int
      display_name: Number of CheckM Markers
      category: Other
+     description:
 
    - key: checkm_marker_lineage  # GTDB node, might need better tokenizing or substring search
      type: string
      filter_strategy: fulltext
      display_name: CheckM Marker Lineage
      category: Other
+     description:
 
    - key: checkm_marker_set_count
      type: int
      display_name: Number of CheckM Marker Sets
      category: Other
+     description:
 
    - key: contig_count
      type: int
      display_name: Number of Contigs
      category: Other
+     description:
 
    - key: gc_count
      type: int
      display_name: GC Count
      category: Other
+     description:
 
    - key: gc_percentage
      type: float
      display_name: GC Content
      category: Statistics
+     description:
 
    - key: genome_size
      type: int
      display_name: Genome Size
      category: Statistics
+     description:
 
    - key: classification  # GTDB classification, might need better tokenizing
      type: string
      filter_strategy: ngram
      display_name: Classification
      category: Taxonomy
+     description:
 
    - key: longest_contig
      type: int
      display_name: Longest Contig
      category: Other
+     description:
 
    - key: longest_scaffold
      type: int
      display_name: Longest Scaffold
      category: Other
+     description:
 
    - key: mean_contig_length
      type: float
      display_name: Mean Contig Length
      category: Other
+     description:
 
    - key: mean_scaffold_length
      type: float
      display_name: Mean Scaffold Length
      category: Other
+     description:
 
    - key: mimag_high_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
      display_name: Mimag High Quality
      category: Quality
+     description:
 
    - key: mimag_low_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
      display_name: Mimag Low Quality
      category: Quality
+     description:
 
    - key: mimag_medium_quality  # is this a boolean? do we need a bool type & convert?
      type: string
      filter_strategy: identity
      display_name: Mimag Medium Quality
      category: Quality
+     description:
 
    - key: n50_contigs
      type: int
      display_name: N50 Contigs
      category: Other
+     description:
 
    - key: n50_scaffolds
      type: int
      display_name: N50 Scaffolds
      category: Other
+     description:
 
    - key: ncbi_assembly_level
      type: string
      filter_strategy: fulltext
      display_name: NCBI Assembly Level
      category: Source
+     description:
 
    - key: ncbi_assembly_name
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: NCBI Assembly Name
      category: Other
+     description:
 
    - key: ncbi_bioproject
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: NCBI BioProject
      category: Source
+     description:
 
    - key: ncbi_biosample
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: NCBI BioSample
      category: Source
+     description:
 
    - key: ncbi_country
      type: string
      filter_strategy: fulltext
      display_name: NCBI Country
      category: Other
+     description:
 
    - key: ncbi_date  # should put all dates into ISO8601 format, but not add significance
      type: date
      display_name: NCBI Date
      category: Source
+     description:
 
    - key: ncbi_genbank_assembly_accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: NCBI GenBank Assembly Accession
      category: Other
+     description:
 
    - key: ncbi_genome_category
      type: string
      filter_strategy: fulltext
      display_name: NCBI Genome Category
      category: Other
+     description:
 
    - key: ncbi_isolate
      type: string
      filter_strategy: fulltext
      display_name: NCBI Isolate
      category: Other
+     description:
 
    - key: ncbi_isolation_source
      type: string
      filter_strategy: fulltext
      display_name: NCBI Isolation Source
      category: Other
+     description:
 
    - key: ncbi_lat_lon  # might want to do specialized processing here
      type: string
      filter_strategy: fulltext
      display_name: NCBI Latitude/Longitude
      category: Other
+     description:
 
    - key: ncbi_organism_name
      type: string
      filter_strategy: fulltext
      display_name: NCBI Organism Name
      category: Taxonomy
+     description:
 
    - key: ncbi_seq_rel_date  # should put all dates into ISO8601 format, but not add significance
      type: date
      display_name: NCBI Sequence Release Date
      category: Other
+     description:
 
    - key: ncbi_species_taxid
      type: int
      display_name: NCBI Species Taxon ID
      category: Other
+     description:
 
    - key: ncbi_strain_identifiers
      type: string
      filter_strategy: fulltext
      display_name: NCBI Strain Identifiers
      category: Other
+     description:
 
    - key: ncbi_submitter
      type: string
      filter_strategy: fulltext
      display_name: NCBI Submitter
      category: Other
+     description:
 
    - key: ncbi_taxid
      type: int
      display_name: NCBI Taxon ID
      category: Other
+     description:
 
    - key: ncbi_taxonomy_unfiltered  # might need better tokenizing
      type: string
      filter_strategy: fulltext
      display_name: NCBI Taxonomy (Unfiltered)
      category: Other
+     description:
 
    - key: protein_count
      type: int
      display_name: Number of Protein Encoding Genes
      category: Statistics
+     description:
 
    - key: scaffold_count
      type: int
      display_name: Number of Scaffolds
      category: Other
+     description:
 
    - key: ssu_count
      type: int
      display_name: SSU Count
      category: Other
+     description:
 
    - key: ssu_length  # this looks like an int that's a string in the DB. Should we coerce?
      type: string
      filter_strategy: fulltext
      display_name: SSU Length
      category: Other
+     description:
 
    - key: trna_aa_count
      type: int
      display_name: tRNA AA Count
      category: Other
+     description:
 
    - key: trna_count
      type: int
      display_name: Number of tRNA Genes
      category: Statistics
+     description:
 
    - key: trna_selenocysteine_count
      type: int
      display_name: tRNA Selenocysteine Count
      category: Other
+     description:
diff --git a/src/common/collection_column_specs/genome_attribs-PMI.yml b/src/common/collection_column_specs/genome_attribs-PMI.yml
index 4faea7ffa..be9cb3e3a 100644
--- a/src/common/collection_column_specs/genome_attribs-PMI.yml
+++ b/src/common/collection_column_specs/genome_attribs-PMI.yml
@@ -16,131 +16,155 @@ columns:
      filter_strategy: identity
      display_name: KBase ID
      category: Identifiers
+     description:
 
    - key: kbase_sample_id
      type: string
      filter_strategy: identity
      display_name: KBase Sample ID
      category: Identifiers
+     description:
 
    - key: kbase_display_name
      type: string
      filter_strategy: ngram
      display_name: KBase Name
      category: Identifiers
+     description:
 
    - key: kbase_genome_size
      type: int
      display_name: Genome Size (KBase)
      category: Statistics
+     description:
 
    - key: kbase_gc_content
      type: float
      display_name: GC Content (KBase)
      category: Statistics
+     description:
 
    - key: kbase_num_contigs
      type: int
      display_name: Number of Contigs (KBase)
      category: Statistics
+     description:
 
    - key: kbase_num_cds
      type: int
      display_name: Number of CDS (KBase)
      category: Statistics
+     description:
 
    - key: kbase_num_protein_encoding_genes
      type: int
      display_name: Number of Protein Encoding Genes (KBase)
      category: Statistics
+     description:
 
    - key: Contamination  # checkm2  # GTDB is checkm_contamination
      type: float
      display_name: CheckM Contamination
      category: Quality
+     description:
    
    - key: Completeness  # checkm2  # GTDB is checkm_completeness
      type: float
      display_name: CheckM Completeness
      category: Quality
+     description:
    
    - key: user_genome  # id provided to GTDB_tk
      type: string
      filter_strategy: identity
      display_name: User Genome
      category: Other
+     description:
    
    - key: classification  # GTDB classification, might need better tokenizing
      type: string
      filter_strategy: ngram
      display_name: Classification
      category: Taxonomy
+     description:
    
    - key: fastani_reference  # e.g. the genbank accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: FastANI Reference
      category: Other
+     description:
 
    - key: fastani_reference_radius
      type: float
      display_name: FastANI Reference Radius
      category: Other
+     description:
 
    - key: fastani_taxonomy  # might need better tokenizing
      type: string
      filter_strategy: fulltext
      display_name: FastANI Taxonomy
      category: Other
+     description:
 
    - key: fastani_ani
      type: float
      display_name: FastANI ANI
      category: Other
+     description:
 
    - key: fastani_af
      type: float
      display_name: FastANI AF
      category: Other
+     description:
 
    - key: closest_placement_reference  # genbank accession
      type: string
      filter_strategy: identity  # this seems like a good candidate for substring
      display_name: Closest Placement Reference
      category: Other
+     description:
 
    - key: closest_placement_radius
      type: float
      display_name: Closest Placement Radius
      category: Other
+     description:
 
    - key: closest_placement_ani
      type: float
      display_name: Closest Placement ANI
      category: Other
+     description:
 
    - key: closest_placement_af
      type: float
      display_name: Closest Placement AF
      category: Other
+     description:
 
    - key: pplacer_taxonomy  # might need better tokenizing
      type: string
      filter_strategy: fulltext
      display_name: pplacer Taxonomy
      category: Other
+     description:
 
    - key: classification_method
      type: string
      filter_strategy: fulltext
      display_name: Classification Method
      category: Taxonomy
+     description:
 
    - key: note
      type: string
      filter_strategy: fulltext
      display_name: Note
      category: Other
+     description:
 
    # this field probably needs special processing as it's a list of accessions with
    # an array of associated numbers. It might even be best not to include it in search for
@@ -150,25 +174,30 @@ columns:
      filter_strategy: fulltext
      display_name: Other Related References
      category: Other
+     description:
 
    - key: msa_percent
      type: float
      display_name: MSA Percent
      category: Other
+     description:
 
    - key: translation_table
      type: int
      display_name: Translation Table
      category: Other
+     description:
 
    - key: red_value  # not sure what this is
      type: string
      filter_strategy: fulltext
      display_name: Red Value
      category: Other
+     description:
 
    - key: warnings
      type: string
      filter_strategy: fulltext
      display_name: Warnings
      category: Other
+     description: