Skip to content

Commit

Permalink
Merge pull request #90 from gymreklab/add_sexchr_support
Browse files Browse the repository at this point in the history
Add sex chr support
  • Loading branch information
nmmsv authored Apr 24, 2020
2 parents 4dd65af + 7cba671 commit 6550a3b
Show file tree
Hide file tree
Showing 12 changed files with 178 additions and 49 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.version
config/compile
config/config.guess
config/config.sub
Expand Down
28 changes: 26 additions & 2 deletions src/bam_info_extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ BamInfoExtract::BamInfoExtract(const Options* options_,

bool BamInfoExtract::GetReadLen(int32_t* read_len){
*read_len = -1;
int32_t flank_size = 20000;
int32_t flank_size = 200000;
int32_t req_streak = 10;
bool found_read_len = false, has_reads = false;
// Header has info about chromosome names
Expand Down Expand Up @@ -236,8 +236,13 @@ bool BamInfoExtract::GetInsertSizeDistribution(std::map<std::string, SampleProfi
int num_regions_so_far = 0;
int region_offset = 1000; // Look this far away from STR
int region_length = 5000; // Use this length of region to look at

// Requirements to continue with each sample
size_t min_reads_per_sample = options->min_reads_per_sample / 2 * options->ploidy;
size_t min_reads_per_sample = options->min_reads_per_sample / 2 * (options->ploidy==-1?2:options->ploidy);

//int break_after_min_above = min_reads_per_sample;
//int min;
std::map<std::string,int> samp_read_counts;
// Set up
std::string read_group, rgid, sample, fname;
bool found_sample;
Expand Down Expand Up @@ -269,6 +274,25 @@ bool BamInfoExtract::GetInsertSizeDistribution(std::map<std::string, SampleProfi
if (!found_sample) continue;
if (rg_ids_to_sample.find(rgid) == rg_ids_to_sample.end()) continue;
sample = rg_ids_to_sample[rgid];
/*
if (samp_read_counts.find(sample) == samp_read_counts.end()){
samp_read_counts.insert(std::pair<std::string,int>(sample,1));
}
else{
samp_read_counts[sample]++;
min = 100000000;
for (std::pair<std::string,int> element : samp_read_counts){
cerr << element.first << " " << element.second << "\t";
if (element.second < min){
min = element.second;
}
if (min > break_after_min_above){
break;
}
}
cerr << endl;
}
*/
// Get template length and assign to that sample
sample_to_tlens[sample].push_back(abs(alignment.TemplateLength()));
}
Expand Down
6 changes: 3 additions & 3 deletions src/genotyper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ Genotyper::Genotyper(RefGenome& _refgenome,
it != rg_samples.end(); it++) {
SampleProfile sp;
if (sample_info->GetSampleProfile(*it, &sp)) {
sample_likelihood_maximizers[*it] = new LikelihoodMaximizer(_options, sp, sample_info->GetReadLength());
sample_likelihood_maximizers[*it] = new LikelihoodMaximizer(_options, sp, sample_info->GetReadLength(), sample_info->GetSampleSex(*it));
} else {
PrintMessageDieOnError("Could not find sample profile for " + *it, M_ERROR, false);
}
Expand Down Expand Up @@ -130,12 +130,12 @@ bool Genotyper::ProcessLocus(BamCramMultiReader* bamreader, Locus* locus) {
sample_likelihood_maximizers[samp]->SetLocusParams(str_info->GetSTRInfo(locus->chrom, locus->start),
sample_info->GetGCCoverage(samp, gcbin),
sample_info->GetReadLength(), (int32_t)(locus->motif.size()),
ref_count);
ref_count, locus->chrom);
} else {
sample_likelihood_maximizers[samp]->SetLocusParams(str_info->GetSTRInfo(locus->chrom, locus->start),
sample_info->GetCoverage(samp),
sample_info->GetReadLength(), (int32_t)(locus->motif.size()),
ref_count);
ref_count, locus->chrom);
}
if (!sample_likelihood_maximizers[samp]->InferGridSize() ) {
PrintMessageDieOnError("Error inferring grid size", M_PROGRESS, options->quiet);
Expand Down
96 changes: 67 additions & 29 deletions src/likelihood_maximizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ using namespace std;


LikelihoodMaximizer::LikelihoodMaximizer(const Options& _options, const SampleProfile& sp,
const int32_t& read_len) {
options = &_options; // TODO remove options
const int32_t& read_len, const std::string _sex) : sex(_sex) {

options = &_options; // TODO remove options
local_ploidy = 2;
enclosing_class_.SetGlobalParams(sp, options->flanklen, options->read_prob_mode, options->hist_mode);
frr_class_.SetGlobalParams(sp, options->flanklen, options->read_prob_mode, options->hist_mode);
spanning_class_.SetGlobalParams(sp, options->flanklen, options->read_prob_mode, options->hist_mode);
Expand Down Expand Up @@ -123,7 +124,7 @@ void LikelihoodMaximizer::AddOffTargetData(const int32_t& data) {

void LikelihoodMaximizer::SetLocusParams(const STRLocusInfo& sli, const double& cov,
const int32_t& _read_len, const int32_t _motif_len,
const int32_t& _ref_count) {
const int32_t& _ref_count, const std::string chrom) {
enclosing_class_.SetLocusParams(sli);
frr_class_.SetLocusParams(sli);
spanning_class_.SetLocusParams(sli);
Expand All @@ -146,6 +147,16 @@ void LikelihoodMaximizer::SetLocusParams(const STRLocusInfo& sli, const double&
read_len = _read_len;
motif_len = _motif_len;
ref_count = _ref_count;
// Set local_ploidy based on ploidy and sex
if (options->ploidy != -1){
local_ploidy = options->ploidy;
}
else{
local_ploidy = 2;
if (sex == male and (chrom == "chrY" or chrom == "Y" or chrom == "chrX" or chrom == "X")){
local_ploidy = 1;
}
}
locus_params_set = true;
}

Expand Down Expand Up @@ -223,7 +234,7 @@ bool LikelihoodMaximizer::GetConfidenceInterval(const int32_t& all1,
std::vector<int32_t> small_alleles, large_alleles;
for (int i = 0; i < num_boot_samp + 1; i++){
ResampleReadPool();
if (options->ploidy == 2){
if (local_ploidy == 2){
OptimizeLikelihood(true, 1, allele1,
offtarget_share,
&boot_al2_1, &boot_al2_2, &min_negLike);
Expand All @@ -234,14 +245,16 @@ bool LikelihoodMaximizer::GetConfidenceInterval(const int32_t& all1,
boot_al1 = boot_al1_2;
else if (boot_al1_2 == allele2)
boot_al1 = boot_al1_1;
else
else{
cerr<< "Error running bootstrap\n";
}
if (boot_al2_1 == allele1)
boot_al2 = boot_al2_2;
else if (boot_al2_2 == allele1)
boot_al2 = boot_al2_1;
else
std::cerr<< "Error running likelihood optimization\n";
else{
std::cerr<< "Error running likelihood optimization\n";
}
double gt_ll1, gt_ll2;
}
else{ // haploid
Expand Down Expand Up @@ -385,18 +398,18 @@ bool LikelihoodMaximizer::GetGenotypeNegLogLikelihood(const int32_t& allele1,
}
frr_class_.GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &frr_ll);
local_ploidy, &frr_ll);
spanning_class_.GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &span_ll);
local_ploidy, &span_ll);
enclosing_class_.GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &encl_ll);
local_ploidy, &encl_ll);

// flanking class overloads GetClassLogLikelihood function
flanking_class_.FlankingClass::GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &flank_ll);
local_ploidy, &flank_ll);
// TODO Substituting these lines changes optimization result. Find out why?!
//if ((options->coverage > 0) && (frr_class_.GetDataSize() > 0)){

Expand All @@ -406,7 +419,7 @@ bool LikelihoodMaximizer::GetGenotypeNegLogLikelihood(const int32_t& allele1,
read_len,
motif_len,
obj_cov,
options->ploidy,
local_ploidy,
2 * offtarget_count * offtarget_share,
&frr_count_ll);
//cerr << allele1 << ","<< allele2 << "\t"<< frr_count << " " << frr_count_ll << endl;
Expand All @@ -423,25 +436,25 @@ bool LikelihoodMaximizer::GetGenotypeNegLogLikelihood(const int32_t& allele1,
}
resampled_frr_class_.GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &frr_ll);
local_ploidy, &frr_ll);
resampled_spanning_class_.GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &span_ll);
local_ploidy, &span_ll);
resampled_enclosing_class_.GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &encl_ll);
local_ploidy, &encl_ll);
// flanking class overloads GetClassLogLikelihood function
resampled_flanking_class_.FlankingClass::GetClassLogLikelihood(allele1, allele2,
read_len, motif_len, ref_count,
options->ploidy, &flank_ll);
local_ploidy, &flank_ll);

if (use_cov && obj_cov > 0 && frr_count > 0){
resampled_frr_class_.GetCountLogLikelihood(allele1,
allele2,
read_len,
motif_len,
obj_cov,
options->ploidy,
local_ploidy,
2 * offtarget_count * offtarget_share,
&frr_count_ll);

Expand Down Expand Up @@ -513,10 +526,17 @@ void LikelihoodMaximizer::GetGridSize(int32_t* min_allele, int32_t* max_allele)
}

void LikelihoodMaximizer::InferAlleleList(std::vector<int32_t>* allele_list,
const int32_t& ploidy,
const int32_t& ovwr_ploidy,
const bool& resampled, const int32_t& fix_allele) {
allele_list->clear();
enclosing_class_.ExtractEnclosingAlleles(allele_list);
int32_t func_ploidy;
if (ovwr_ploidy == -1){
func_ploidy = local_ploidy;
}
else{
func_ploidy = ovwr_ploidy;
}
if (upper_bound-lower_bound <= grid_opt_threshold) {
for (int32_t i=lower_bound; i<=upper_bound; i++) {
allele_list->push_back(i);
Expand All @@ -525,7 +545,7 @@ void LikelihoodMaximizer::InferAlleleList(std::vector<int32_t>* allele_list,
std::vector<int32_t> sublist;
int32_t a1, a2, result;
double minf;
if (ploidy == 2) {
if (func_ploidy == 2) {
for (std::vector<int32_t>::iterator allele_it = allele_list->begin();
allele_it != allele_list->end();
allele_it++) {
Expand All @@ -548,7 +568,7 @@ void LikelihoodMaximizer::InferAlleleList(std::vector<int32_t>* allele_list,
allele_list->push_back(*subl_it);
}
}
} else if (ploidy == 1) {
} else if (func_ploidy == 1) {
nlopt_1D_optimize(read_len, motif_len, ref_count,
lower_bound, upper_bound, resampled,
options->seed, this, fix_allele, &a1, &result, &minf);
Expand Down Expand Up @@ -624,7 +644,7 @@ bool LikelihoodMaximizer::GetExpansionProb(std::vector<double>* prob_vec, const
return true;
}

bool LikelihoodMaximizer::OptimizeLikelihood(const bool& resampled, const int32_t& use_ploidy,
bool LikelihoodMaximizer::OptimizeLikelihood(const bool& resampled, const int32_t& ovwr_ploidy,
const int32_t& fix_allele,
const double& off_share,
int32_t* allele1, int32_t* allele2, double* min_negLike) {
Expand All @@ -636,12 +656,22 @@ bool LikelihoodMaximizer::OptimizeLikelihood(const bool& resampled, const int32_
// PrintMessageDieOnError("Skipping locus with likely extreme GC content", M_WARNING);
return false;
}
// If overwrite ploidy is set to a number, local ploidy is not used. If not, local ploidy used
int32_t func_ploidy;
if (ovwr_ploidy == -1){
func_ploidy = local_ploidy;
}
else {
func_ploidy = ovwr_ploidy;
}
//if (resampled)
// cerr << "func: " << func_ploidy << endl;

offtarget_share = off_share; // perc. of offtarget reads.

// Get list of potential alleles to try
std::vector<int32_t> allele_list;
InferAlleleList(&allele_list, use_ploidy, resampled, fix_allele);
InferAlleleList(&allele_list, ovwr_ploidy, resampled, fix_allele);
/*
if (!resampled){
double gt_ll1;
Expand All @@ -653,25 +683,33 @@ bool LikelihoodMaximizer::OptimizeLikelihood(const bool& resampled, const int32_
}
}
*/
if (use_ploidy == 2) {
findBestAlleleListTuple(allele_list, use_ploidy, resampled, 0,

if (func_ploidy == 2) {
findBestAlleleListTuple(allele_list, func_ploidy, resampled, 0,
allele1, allele2, min_negLike);
} else if (use_ploidy == 1) {
findBestAlleleListTuple(allele_list, use_ploidy, resampled, fix_allele,
} else if (func_ploidy == 1) {
findBestAlleleListTuple(allele_list, func_ploidy, resampled, fix_allele,
allele1, allele2, min_negLike);
}
return true;
}


bool LikelihoodMaximizer::findBestAlleleListTuple(std::vector<int32_t> allele_list,
int32_t use_ploidy,
int32_t ovwr_ploidy,
bool resampled, int32_t fix_allele,
int32_t* allele1, int32_t* allele2, double* min_negLike) {
double gt_ll;
*min_negLike = 1000000;
int32_t best_a1 = 0, best_a2 = 0;
if (use_ploidy == 2) {
int32_t func_ploidy;
if (ovwr_ploidy == -1){
func_ploidy = local_ploidy;
}
else {
func_ploidy = ovwr_ploidy;
}
if (func_ploidy == 2) {
for (std::vector<int32_t>::iterator a1_it = allele_list.begin();
a1_it != allele_list.end();
a1_it++){
Expand All @@ -692,7 +730,7 @@ bool LikelihoodMaximizer::findBestAlleleListTuple(std::vector<int32_t> allele_li
}
}
}
else if (use_ploidy == 1) {
else if (func_ploidy == 1) {
best_a2 = fix_allele;
for (std::vector<int32_t>::iterator a1_it = allele_list.begin();
a1_it != allele_list.end();
Expand Down
13 changes: 11 additions & 2 deletions src/likelihood_maximizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,16 @@ struct ReadRecord{
ReadType read_type;
};

// constant strings to check for sex chrom
const std::string male = "M";
const std::string chrY = "chrY";
const std::string chrYnoChr = "Y";

class LikelihoodMaximizer {
friend class Genotyper;
public:
LikelihoodMaximizer(const Options& _options, const SampleProfile& sp,
const int32_t& read_len);
const int32_t& read_len, const std::string sex);
// LikelihoodMaximizer(const LikelihoodMaximizer& lm_obj); // copy constructor
virtual ~LikelihoodMaximizer();

Expand Down Expand Up @@ -115,7 +120,7 @@ class LikelihoodMaximizer {
// Set per-locus params
void SetLocusParams(const STRLocusInfo& sli, const double& cov,
const int32_t& _read_len, const int32_t _motif_len,
const int32_t& _ref_count);
const int32_t& _ref_count, const std::string chrom);

// Print read pool
void PrintReadPool();
Expand All @@ -126,6 +131,7 @@ class LikelihoodMaximizer {
protected:
// Other params -> Made public for gslNegLikelihood to have access
const Options* options;
const std::string sex; // {"M", "F", "U"}
private:
double obj_cov; // TODO: This is a placeholder, until we figure out how to pass coverage through sample info
EnclosingClass enclosing_class_;
Expand Down Expand Up @@ -159,6 +165,9 @@ class LikelihoodMaximizer {
int32_t read_len;
int32_t motif_len;
int32_t ref_count;
int32_t local_ploidy; // ploidy of current likelihood maximizer
// If input ploidy is set, the value will represent that
// If not, this value will be 2 unless on chrY of a male sample
};

// Helper struct for NLOPT gradient optimizer
Expand Down
Loading

0 comments on commit 6550a3b

Please sign in to comment.