Skip to content

Commit

Permalink
uwrite only most impactful variant per gene in each variant(row) outp…
Browse files Browse the repository at this point in the history
…ut from
  • Loading branch information
zqfang committed Aug 14, 2024
1 parent 75eb0c3 commit 43adf3d
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 123 deletions.
15 changes: 15 additions & 0 deletions haplomap/include/constants.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//
// Created by Zhuoqing Fang on 8/13/24.
//
// #pragma once
#ifndef CONFIG_H
#define CONFIG_H


#include <unordered_map>

extern std::unordered_map<std::string, std::string> CODONs;
extern std::unordered_map<std::string, int> PRIOR;
extern std::unordered_map<std::string, int> CSQs;

#endif
2 changes: 1 addition & 1 deletion haplomap/include/ghmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ void readFileToVec(char *fname, std::vector<std::string> &vec);
extern int numCategories; // default for non-categorical data.
extern const int AACLASSES[];
// extern std::unordered_map<std::string, int> PRIOR;
extern std::unordered_map<std::string, int> CSQs;
// extern std::unordered_map<std::string, int> CSQs;
extern std::vector<std::string> catNames; // maps strIdx -> category name.
// Maps gene names to a string of A's, M's, and P's
extern std::unordered_map<std::string, std::string> geneExprMap;
Expand Down
24 changes: 17 additions & 7 deletions haplomap/include/vep.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <string>
#include <cstring>
#include <vector>
#include <set>
#include <unordered_map>
#include <algorithm>
#include <numeric>
Expand All @@ -27,6 +28,22 @@ struct Key {
Key(std::string key);
};

struct CsqCoding {
std::string id;
std::string raw; // original string
std::string repr; // new representation
int code;
CsqCoding(): id(""), raw(""), repr(""), code(-100) {}
CsqCoding(std::string name, std::string csq, int priority):
id(name), raw(csq), repr(csq), code(priority) {}
CsqCoding(const CsqCoding& other) {
id = other.id;
raw = other.raw;
repr = other.repr;
code = other.code;
}
};

struct VEPSummary
{
std::string idx; //0
Expand All @@ -49,9 +66,6 @@ struct VEPSummary
std::string biotype; //23, protein_coding, lincRNA, etc
Dynum<std::string> HGVSc; //31 HGVSp strings
Dynum<std::string> HGVSp; //32 HGVSp strings
// std::string chrom;
// int start;
// int end;

VEPSummary(std::string uploaded_variant, std::string loc, std::string seq, std::string gene,
std::string transcript, std::string feature_type, std::string csq, std::string aa_pos, std::string aa,
Expand All @@ -67,12 +81,8 @@ class VarirantEeffectPredictor
private:
int numtoks;
bool hasIND; // whether contain samples (IND) column in vep output
std::unordered_map<std::string, int> PRIOR;
std::unordered_map<std::string, std::string> CODONs;
std::unordered_map<std::string, std::string> CSQs;
Dynum<std::string> strainAbbrevs;
std::string _line;
// variant loc -> transxid -> record, orded by variant loc
std::unordered_map<std::string, std::unordered_map<std::string, VEPSummary* >> geneCodingMap;
std::vector<VEPSummary*> data;
std::unordered_map<std::string, unsigned> columns; // header colum name to index
Expand Down
12 changes: 9 additions & 3 deletions haplomap/src/annotate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ std::shared_ptr<VEPOptions> parseVEPOptions(int argc, char **argv)
" in.vep.txt Input ensembl-VEP file name\n"
" -o, --output Output file name, for (eblocks -g)\n"
"\nOptional arguments:\n"
" -c, --csq Output a annotation with impact and sample names.\n"
" -c, --csq Output a annotation with impact score and sample names.\n"
" -s, --samples Only get annotation for the input samples, use same strains to (eblocks -s).\n"
" -t, --type Select variant type: [snp|indel|sv|all]. Default: all\n"
" -v, --verbose\n"
" -h, --help\n"
"\nImportant message:\n"
"Please run ensemble-vep containing following flags: \n"
" vep --fasta --individual_zyg all --gencode_basic --everything\n"
" vep --fasta --individual_zyg all --everything\n"
"For structrual variant input format, please ref to: \n"
"https://ensembl.org/info/docs/tools/vep/vep_formats.html#sv \n";

Expand Down Expand Up @@ -172,7 +172,13 @@ int main_annot(int argc, char **argv)
// read data
vep.readVEP(opts->inputVEPName, (char*)"\t", (char*)varType.c_str());
// write
if (opts->verbose) std::cout<<"Write detail Annotation"<<std::endl;
if (opts->verbose)
{
std::cout<<"Write Annotation"<<std::endl;
std::cout<<" For each variant (row), only write the most impactful variant per gene.\n";
std::cout<<" If multiple variant consequence has the same impact score, select one randomly.\n";
std::cout<<" Please refer to haplomap/src/constants.cpp file to see or update scores."<<std::endl;
}
vep.writeVEPCsq(opts->outputCSQName);
if (opts->outputSUMName != nullptr)
{
Expand Down
60 changes: 60 additions & 0 deletions haplomap/src/constants.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

#include "constants.h"

std::unordered_map<std::string, std::string> CODONs = {{"TTT", "F"}, {"TTC", "F"}, {"TCT", "S"}, {"TCC", "S"}, {"TAT", "Y"}, {"TAC", "Y"}, {"TGT", "C"}, {"TGC", "C"},
{"TTA", "L"}, {"TCA", "S"}, {"TAA", "X"}, {"TGA", "X"}, {"TTG", "L"}, {"TCG", "S"}, {"TAG", "X"}, {"TGG", "W"},
{"CTT", "L"}, {"CTC", "L"}, {"CCT", "P"}, {"CCC", "P"}, {"CAT", "H"}, {"CAC", "H"}, {"CGT", "R"}, {"CGC", "R"},
{"CTA", "L"}, {"CTG", "L"}, {"CCA", "P"}, {"CCG", "P"}, {"CAA", "Q"}, {"CAG", "Q"}, {"CGA", "R"}, {"CGG", "R"}, {"ATT", "I"},
{"ATC", "I"}, {"ACT", "T"}, {"ACC", "T"}, {"AAT", "N"}, {"AAC", "N"}, {"AGT", "S"}, {"AGC", "S"}, {"ATA", "I"}, {"ACA", "T"},
{"AAA", "K"}, {"AGA", "R"}, {"ATG", "M"}, {"ACG", "T"}, {"AAG", "K"}, {"AGG", "R"}, {"GTT", "V"}, {"GTC", "V"}, {"GCT", "A"},
{"GCC", "A"}, {"GAT", "D"}, {"GAC", "D"}, {"GGT", "G"}, {"GGC", "G"}, {"GTA", "V"}, {"GTG", "V"}, {"GCA", "A"}, {"GCG", "A"},
{"GAA", "E"}, {"GAG", "E"}, {"GGA", "G"}, {"GGG", "G"}};


/// see the variant impact coding here:
/// https://useast.ensembl.org/info/genome/variation/prediction/predicted_data.html
/// last update: 2024-08-15
std::unordered_map<std::string, int> PRIOR = {{"HIGH", 2}, {"MODERATE", 1}, {"LOW", 0}, {"MODIFIER", -1}};
std::unordered_map<std::string, int> CSQs = {
{"transcript_ablation", 2},
{"splice_acceptor_variant", 2},
{"splice_donor_variant", 2},
{"stop_gained", 2},
{"frameshift_variant", 2},
{"stop_lost", 2},
{"start_lost", 2},
{"transcript_amplification", 2},
{"feature_elongation", 2},
{"feature_truncation", 2},
{"inframe_insertion", 1},
{"inframe_deletion", 1},
{"missense_variant", 1},
{"protein_altering_variant", 1},
{"splice_donor_5th_base_variant", 0},
{"splice_region_variant", 0},
{"splice_donor_region_variant", 0},
{"splice_polypyrimidine_tract_variant", 0},
{"incomplete_terminal_codon_variant", 0},
{"start_retained_variant", 0},
{"stop_retained_variant", 0},
{"synonymous_variant", 0},
{"coding_sequence_variant", -1},
{"mature_miRNA_variant", -1},
{"5_prime_UTR_variant", -1},
{"3_prime_UTR_variant", -1},
{"non_coding_transcript_exon_variant", -1},
{"intron_variant", -1},
{"NMD_transcript_variant", -1},
{"non_coding_transcript_variant", -1},
{"coding_transcript_variant", -1},
{"upstream_gene_variant", -1},
{"downstream_gene_variant", -1},
{"TFBS_ablation", -1},
{"TFBS_amplification", -1},
{"TF_binding_site_variant", -1},
{"regulatory_region_ablation", -1},
{"regulatory_region_amplification", -1},
{"regulatory_region_variant", -1},
{"intergenic_variant", -1},
{"sequence_variant", -1},
};
42 changes: 37 additions & 5 deletions haplomap/src/eblocks.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

#include "eblocks.h"
// #include "constants.h"

// For setting initial datastructure sizes
const int approxNumSNPs = 8000000;
Expand Down Expand Up @@ -314,8 +315,6 @@ void readSNPGeneNames(char *fname)
if (rdr.getCurrentLineNum() < 1)
continue; // skip header
std::string snpName = rdr.getToken(0);
std::string geneName = rdr.getToken(1);

// cout << " geneName = " << geneName << endl;
std::unordered_map<std::string, SNPInfo *>::iterator fit = snpMap.find(snpName);
if (fit == snpMap.end())
Expand All @@ -326,17 +325,50 @@ void readSNPGeneNames(char *fname)
}

SNPInfo *pSNPInfo = fit->second;

// rest of line is alternating gene names/codons

/// rest of line is alternating gene names/codons
for (std::vector<std::string>::iterator rit = rdr.begin() + 1; rit != rdr.end(); rit += 2)
{
// Store "<" in the map even if there is already something there.
if (pSNPInfo->geneCodonMap.find(*rit) == pSNPInfo->geneCodonMap.end() || (*(rit + 1)).find("<") != std::string::npos)
{
// one gene, one annotate only
pSNPInfo->geneCodonMap[*rit] = *(rit + 1);
}
}
// std::unordered_map<std::string, std::string> gene_csq; // used to store priority value
// for (std::vector<std::string>::iterator rit = rdr.begin() + 1; rit != rdr.end(); rit += 2)
// {
// // max csq
// std::string csq_str = *(rit + 1);
// if (gene_csq.find(*rit) != gene_csq.end() )
// {
// // do a santicheck for csq
// if (traceFBB && csq_str.find("<") == std::string::npos && CSQs.find(csq_str) == CSQs.end())
// {
// std::cout<<"Variant Annotation file error: "<<csq_str<<" is not supported"<<std::endl;
// }

// if (csq_str.find("<") != std::string::npos || CSQs[csq_str] > CSQs[gene_csq[*rit]])
// {
// gene_csq[*rit] = csq_str;
// }
// }
// else
// {
// // this means if csq_str
// gene_csq[*rit] = csq_str;
// }
// }
// for (auto it = gene_csq.begin(); it != gene_csq.end(); ++it)
// {
// // Store "<" in the map even if there is already something there.
// if (pSNPInfo->geneCodonMap.find(it->first) == pSNPInfo->geneCodonMap.end() || ((it->second).find("<") != std::string::npos))
// {
// // one gene, one annotate only
// pSNPInfo->geneCodonMap[it->first] = it->second;
// }
// }

}
}

Expand Down
77 changes: 39 additions & 38 deletions haplomap/src/ghmap.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <algorithm>
#include <functional>
#include "ghmap.h"
#include "constants.h"

// defined globals
int numCategories = 1; // default for non-categorical data.
Expand All @@ -15,44 +16,44 @@ std::vector<std::string> geneExprHeader;
int traceFStat = false;

// std::unordered_map<std::string, int> PRIOR = {{"HIGH", 2}, {"MODERATE", 1}, {"LOW", 0}, {"MODIFIER", -1}};
std::unordered_map<std::string, int> CSQs = {
{"transcript_ablation", 2},
{"splice_acceptor_variant", 2},
{"splice_donor_variant", 2},
{"stop_gained", 2},
{"frameshift_variant", 2},
{"stop_lost", 2},
{"start_lost", 2},
{"transcript_amplification", 2},
{"inframe_insertion", 1},
{"inframe_deletion", 1},
{"missense_variant", 1},
{"protein_altering_variant", 1},
{"splice_region_variant", 0},
{"incomplete_terminal_codon_variant", 0},
{"start_retained_variant", 0},
{"stop_retained_variant", 0},
{"synonymous_variant", 0},
{"coding_sequence_variant", -1},
{"mature_miRNA_variant", -1},
{"5_prime_UTR_variant", -1},
{"3_prime_UTR_variant", -1},
{"non_coding_transcript_exon_variant", -1},
{"intron_variant", -1},
{"NMD_transcript_variant", -1},
{"non_coding_transcript_variant", -1},
{"upstream_gene_variant", -1},
{"downstream_gene_variant", -1},
{"TFBS_ablation", -1},
{"TFBS_amplification", -1},
{"TF_binding_site_variant", -1},
{"regulatory_region_ablation", -1},
{"regulatory_region_amplification", -1},
{"feature_elongation", -1},
{"regulatory_region_variant", -1},
{"feature_truncation", -1},
{"intergenic_variant", -1},
};
// std::unordered_map<std::string, int> CSQs = {
// {"transcript_ablation", 2},
// {"splice_acceptor_variant", 2},
// {"splice_donor_variant", 2},
// {"stop_gained", 2},
// {"frameshift_variant", 2},
// {"stop_lost", 2},
// {"start_lost", 2},
// {"transcript_amplification", 2},
// {"inframe_insertion", 1},
// {"inframe_deletion", 1},
// {"missense_variant", 1},
// {"protein_altering_variant", 1},
// {"splice_region_variant", 0},
// {"incomplete_terminal_codon_variant", 0},
// {"start_retained_variant", 0},
// {"stop_retained_variant", 0},
// {"synonymous_variant", 0},
// {"coding_sequence_variant", -1},
// {"mature_miRNA_variant", -1},
// {"5_prime_UTR_variant", -1},
// {"3_prime_UTR_variant", -1},
// {"non_coding_transcript_exon_variant", -1},
// {"intron_variant", -1},
// {"NMD_transcript_variant", -1},
// {"non_coding_transcript_variant", -1},
// {"upstream_gene_variant", -1},
// {"downstream_gene_variant", -1},
// {"TFBS_ablation", -1},
// {"TFBS_amplification", -1},
// {"TF_binding_site_variant", -1},
// {"regulatory_region_ablation", -1},
// {"regulatory_region_amplification", -1},
// {"feature_elongation", -1},
// {"regulatory_region_variant", -1},
// {"feature_truncation", -1},
// {"intergenic_variant", -1},
// };

// constructor
BlockSummary::BlockSummary(const char *chrnm, int num, int start, int size,
Expand Down
Loading

0 comments on commit 43adf3d

Please sign in to comment.