diff --git a/README.md b/README.md index fc2217a..610dafb 100755 --- a/README.md +++ b/README.md @@ -108,3 +108,10 @@ prediction (red). of the bias field might not be very useful, especially in segmenting the tumor data. Use rather, if available, the jumps seen in the BAF data for both CNA and BAF data (give the BAF jumps file to both `--cna-jumps` and `--baf-jumps`). + +#How to cite + +The cloneHD and filterHD software is free under the GNU General Public License v3. +If you use this software in your work, please cite the accompanying publication: + +Andrej Fischer, Ignacio Vazquez-Garcia, Christopher J.R. Illingworth and Ville Mustonen. High-definition reconstruction of subclonal composition in cancer. Cell Reports (2014), http://dx.doi.org/ 10.1016/j.celrep.2014.04.055 diff --git a/TODO.md b/TODO.md index b3a605f..9efd5d6 100644 --- a/TODO.md +++ b/TODO.md @@ -6,8 +6,6 @@ ## cloneHD -* with `--purity` and `--clones` (mass only), `--nmax` is ignored -* if using BAF-jumps for CNA: mapping to be checked * print full if `--*jump` is (also) given, only events if `--*jumps` * check memory leaks @@ -17,14 +15,17 @@ * filter loci incompatible with emission model via 2-state HMM * do Baum-Welch -* `--filter-shortSeg [int]` via posterior jump-prob (not via pmean)! +* `--filter-shortSeg [int]` via posterior jump-prob (not via pmean) +* Use diffusion constant that scales with size in mode 3/4. +* Implement jump subtraction ## cloneHD * bias field as distribution * re-think update_snp_fixed (hashing) -* different nLevels per chr via max-tcn (for memory only) +* different nLevels per chr via max-tcn (for memory efficiency only) ## both * string chromosome ids +* re-factor emission class diff --git a/changelog.md b/changelog.md index 234f1a9..f7e36c4 100644 --- a/changelog.md +++ b/changelog.md @@ -1,6 +1,19 @@ # changelog for cloneHD/filterHD -## v1.17.6 / to come +## v1.17.8 / 29.05.2014 + +* added checks whether files are open for writing +* changed to new defaults: `--(cna/baf/snv)-rnd [double:1.0e-6]` (nan) +* allowed `--cna-jump -1` and `--baf-jump -1` (no jumps) +* `--cna-jumps [baf-jumps-file]` and vice versa enabled (useful for exome data) +* jumps read and integrated with new function match_jumps() (not get_track()). +* fixed bug when chromosomes have no non-zero observations. + +## v1.17.7 / 25.04.2014 + +* fixed range error in `pre-filter` in pick-from/match-to mode. + +## v1.17.6 / 24.04.2014 * fixed nan bug in GOF, when N==0 (missing data). * fixed bugs in `pre-filter`, when `--window-size` is greater than length diff --git a/docs/README-cloneHD.md b/docs/README-cloneHD.md index a3ebecc..78ace92 100755 --- a/docs/README-cloneHD.md +++ b/docs/README-cloneHD.md @@ -116,9 +116,9 @@ models are used (Poisson or Binomial). The rate for indiviual random emissions per data set. -* `--cna-rnd [double:0.0]` -* `--baf-rnd [double:0.0]` -* `--snv-rnd [double:0.0]` +* `--cna-rnd [double:1.0e-6]` +* `--baf-rnd [double:1.0e-6]` +* `--snv-rnd [double:1.0e-6]` Both can be learned with filterHD for data with persistence. diff --git a/docs/README-pre-filter.md b/docs/README-pre-filter.md index b1ff8e0..00472f9 100755 --- a/docs/README-pre-filter.md +++ b/docs/README-pre-filter.md @@ -2,6 +2,11 @@ The program `pre-filter` can be used to remove loci based on the observed read depth. It includes two heuristic filtering methods: loci are removed based on (i) their local variability and (ii) their being an outlier (see below). +![pref](/images/prefilter-1.png "Pre-filtering of read depth via matched normal.") + +The effect of `pre-filter` on read depth data: (A) Centromeric regions of real chromosomes often show huge large scale variability in their read depth. But there are also many small regions with very low read depth throughout. (B) After pre-filtering, the problematic regions are masked out. (C) Removing the same regions in the tumor data improves quality visibly while retaining biologically relevant features. + + ## Typical usage options * `--data [file]` Input data to be pre-filtered. diff --git a/images/filterHD-1.png b/images/filterHD-1.png new file mode 100644 index 0000000..ddc8c14 Binary files /dev/null and b/images/filterHD-1.png differ diff --git a/images/prefilter-1.png b/images/prefilter-1.png new file mode 100644 index 0000000..9e75067 Binary files /dev/null and b/images/prefilter-1.png differ diff --git a/run-example.sh b/run-example.sh index 4d24f52..044726d 100644 --- a/run-example.sh +++ b/run-example.sh @@ -68,6 +68,7 @@ if [ -z $part ] || [ $part -eq 2 ] then ### cloneHD ### echo "*** cloneHD ***" + echo "True mass and cell fractions:" `cat test/data/clones.txt` echo # The CNA and BAF data is analysed for subclonality. # Try varying the --min-jump, --force and --max-tcn values and try --mass-gauging 0. diff --git a/src/clone-prior.cpp b/src/clone-prior.cpp index 6b57bc0..c5401fb 100644 --- a/src/clone-prior.cpp +++ b/src/clone-prior.cpp @@ -48,6 +48,8 @@ void Clone::set_cna_prior( gsl_vector * prior, int sample){ //used in SNV-only mode, w/o correlation and w/o cn-info... void Clone::initialize_snv_prior_param(){// SNV prior, conditional on max-tcn if (initial_snv_prior_param != NULL) gsl_matrix_free(initial_snv_prior_param); + initial_snv_prior_param = NULL; + if (nClones == 0) return; initial_snv_prior_param = gsl_matrix_calloc( maxtcn+1, maxtcn+1); gsl_matrix_set( initial_snv_prior_param, 0, 0, snv_fpr); double p = snv_pen_high;// penalty for higher genotypes diff --git a/src/clone-update.cpp b/src/clone-update.cpp index 1d6c3cd..19f2165 100644 --- a/src/clone-update.cpp +++ b/src/clone-update.cpp @@ -68,7 +68,10 @@ double Clone::update( gsl_vector * prior, gsl_vector * post, Emission * myEmit, } gsl_vector_mul( post, prior); norm = gsl_blas_dasum(post); - if ( norm != norm || norm <= 0.0) abort(); + if ( norm != norm || norm <= 0.0){ + cout<<"ERROR\n"; + abort(); + } gsl_vector_scale( post, 1.0 / norm); return( log(norm) ); } diff --git a/src/cloneHD-functions.cpp b/src/cloneHD-functions.cpp index e200c95..07c9de6 100755 --- a/src/cloneHD-functions.cpp +++ b/src/cloneHD-functions.cpp @@ -119,41 +119,45 @@ void get_track(const char * track_fn, line_ss.str(line); line_ss >> chr >> locus; l = (chr == old_chr) ? l+1 : 0; - if (chr != old_chr && (chr > myEmit->maxchr || myEmit->idx_of[chr] < 0) ){ - printf("ERROR: unexpected chromosome.\n"); + if (chr != old_chr && myEmit->chrs.count(chr) == 0){ + printf("ERROR: unexpected chromosome in %s.\n", track_fn); cout<loci[ myEmit->idx_of[chr] ][l] != locus){ - printf("ERROR in get_track()\n"); + int s = myEmit->idx_of[chr]; + if( (int) myEmit->loci[s][l] != locus){ + printf("ERROR in get_track(): unexpected coordinate in %s\n", track_fn); cout<idx_of[chr], l, myEmit->loci[ myEmit->idx_of[chr] ][l], locus); + printf("Expected:\n%i, %i, %i vs %i\n", s, l, myEmit->loci[s][l], locus); exit(1); } line_ss >> mn; - mean[ myEmit->idx_of[chr] ][l] = mn; + mean[s][l] = mn; if (var != NULL) { line_ss >> sd; - var[ myEmit->idx_of[chr] ][l] = pow(sd,2); + var[s][l] = pow(sd,2); } if (distribution == NULL) continue; double p; line_ss >> jp; - for (int i=0; i < (int) (distribution[ myEmit->idx_of[chr] ])->size2; i++){ + for (int i=0; i < (int) (distribution[s])->size2; i++){ if (line_ss.good() != true){ - printf("ERROR in get_track()\n"); + printf("ERROR in get_track(): distribution grid too small in %s\n",track_fn); exit(1); } line_ss >> p; - gsl_matrix_set( distribution[ myEmit->idx_of[chr] ], l, i, p); + gsl_matrix_set( distribution[s], l, i, p); } } ifs.close(); } + + + // get the maximum total c.n. per chromosome void get_maxtcn_input(const char * maxtcn_fn, int maxtcn_gw, Clone * myClone){ myClone->maxtcn_input.clear(); @@ -241,7 +245,7 @@ void get_maxtcn_input(const char * maxtcn_fn, int maxtcn_gw, Clone * myClone){ -//get total copynumber tracks from file... +//get mean total copynumber from file... void get_mean_tcn( const char * mntcn_fn, Clone * myClone, Emission * myEmit){ ifstream ifs; string line; @@ -605,6 +609,53 @@ void get_fixed_clones(gsl_matrix *& clones, gsl_vector *& mass, const char * clo +void match_jumps(const char * jumps_fn, Emission * myEmit){ + ifstream ifs; + string line; + stringstream line_ss; + ifs.open( jumps_fn, ios::in); + if (ifs.fail()){ + printf("ERROR file %s cannot be opened.\n", jumps_fn); + exit(1); + } + int chr=0, old_chr = -1; + int in_locus=0, locus=-1; + int s=-1,idx=0; + int wait=0; + double pj=0; + while( ifs.good() ){ + line.clear(); + getline( ifs, line); + if (line.empty()) break; + if (line[0] == '#') continue; + line_ss.clear(); + line_ss.str(line); + line_ss >> chr >> in_locus; + if (chr != old_chr){ + wait = (myEmit->chrs.count(chr)) == 0 ? 1 : 0; + idx=1; + } + old_chr = chr; + if (wait) continue; + s = myEmit->idx_of[chr]; + if (idx == myEmit->nSites[s]) continue;//chromosome is complete! + locus = (int) myEmit->loci[s][idx];//current target locus + if (in_locus < (int) myEmit->loci[s][0]) continue; + while(in_locus > locus){ + idx++; + if (idx == myEmit->nSites[s]) break; + locus = (int) myEmit->loci[s][idx]; + } + if (idx == myEmit->nSites[s]) continue; + line_ss >> pj; + myEmit->pjump[s][idx] = 1.0 - (1.0-pj)*(1.0-myEmit->pjump[s][idx]); + } + //done + ifs.close(); +} + + + //***posterior jump probability track*** void get_jump_probability( Clone * myClone, cmdl_opts& opts){ Emission * cnaEmit = myClone->cnaEmit; @@ -614,7 +665,8 @@ void get_jump_probability( Clone * myClone, cmdl_opts& opts){ gsl_matrix ** distdummy = NULL; if (cnaEmit->is_set){//***CNA JUMPS*** if(opts.cna_jumps_fn != NULL){//1. either use external jump probability track - get_track( opts.cna_jumps_fn, distdummy, cnaEmit->pjump, vardummy, cnaEmit); + //get_track( opts.cna_jumps_fn, distdummy, cnaEmit->pjump, vardummy, cnaEmit); + match_jumps(opts.cna_jumps_fn, cnaEmit); for (int s=0; s< cnaEmit->nSamples; s++){ cnaEmit->coarse_grain_jumps( s, opts.min_jump, 5); } @@ -624,9 +676,6 @@ void get_jump_probability( Clone * myClone, cmdl_opts& opts){ cnaEmit->set_pjump(opts.cna_jump); cnaEmit->get_events_via_jumps(); } - else{ - abort(); - } //map CNA events to BAF and SNV... if(bafEmit->is_set){ for (int s=0; snSamples; s++){ @@ -635,25 +684,42 @@ void get_jump_probability( Clone * myClone, cmdl_opts& opts){ } if (snvEmit->is_set){ for (int s=0; snSamples; s++){ - if ( !bafEmit->is_set ||bafEmit->chrs.count(snvEmit->chr[s]) == 0){ + if ( !bafEmit->is_set || bafEmit->chrs.count(snvEmit->chr[s]) == 0){ snvEmit->map_idx_to_Event( cnaEmit, s); } } } } if ( bafEmit->is_set ){//***BAF JUMPS*** + if (!cnaEmit->is_set) abort(); if (opts.baf_jumps_fn != NULL){//1. either external jump probability track - get_track( opts.baf_jumps_fn, distdummy, bafEmit->pjump, vardummy, bafEmit); + // get_track( opts.baf_jumps_fn, distdummy, bafEmit->pjump, vardummy, bafEmit); + match_jumps( opts.baf_jumps_fn, bafEmit); + if ( opts.cna_jumps_fn != NULL//allow transit at CNA jumps + && opts.cna_jumps_fn != opts.baf_jumps_fn//but not the same jumps + ){ + match_jumps( opts.cna_jumps_fn, bafEmit); + } + bafEmit->add_break_points_via_jumps( cnaEmit, opts.min_jump); for (int s=0; s< bafEmit->nSamples; s++){// ignore improbable jump events bafEmit->coarse_grain_jumps( s, opts.min_jump, 5); } bafEmit->get_events_via_jumps(); - if ( cnaEmit->is_set && opts.cna_jumps_fn != NULL ){//allow transit at CNA jumps - bafEmit->add_break_points_via_jumps( cnaEmit, opts.min_jump); + /* + cout<<"BAF-jumps\n"; + for (int evt=0;evtnEvents[0];evt++){ + int idx = bafEmit->idx_of_event[0][evt]; + printf("%3i %6i %6i\n", evt, idx, bafEmit->loci[0][idx]); } - bafEmit->get_events_via_jumps(); + cout<<"CNA-jumps\n"; + for (int evt=0;evtnEvents[0];evt++){ + int idx = cnaEmit->idx_of_event[0][evt]; + printf("%3i %6i %6i\n", evt, idx, cnaEmit->loci[0][idx]); + } + exit(0); + */ } - else if ( cnaEmit->is_set && opts.cna_jumps_fn != NULL ){//2. map the CNA jumps to BAF + else if (cnaEmit->is_set && opts.cna_jumps_fn != NULL ){//2. map the CNA jumps to BAF bafEmit->map_jumps(cnaEmit); bafEmit->get_events_via_jumps(); } @@ -661,9 +727,6 @@ void get_jump_probability( Clone * myClone, cmdl_opts& opts){ bafEmit->set_pjump(opts.baf_jump); bafEmit->get_events_via_jumps(); } - else{ - abort(); - } // map BAF events to SNV if (snvEmit->is_set){ for (int s=0; snSamples; s++){ diff --git a/src/cloneHD-functions.h b/src/cloneHD-functions.h index 256afc9..8cd7a3b 100755 --- a/src/cloneHD-functions.h +++ b/src/cloneHD-functions.h @@ -80,7 +80,7 @@ void get_baf_data(Emission * bafEmit, cmdl_opts& opts, int& nTimes, int& nT); void get_snv_data(Emission * snvEmit, cmdl_opts& opts, int& nTimes, int& nT); void get_snv_bulk_prior( Clone * myClone, cmdl_opts& opts); void get_track(const char * fn, gsl_matrix **& dist, double **& mn, double **& var, Emission * myEmit); - +void match_jumps(const char * jumps_fn, Emission * myEmit); void get_maxtcn_input(const char * maxtcn_fn, int maxtcn_gw, Clone * myClone); void get_mean_tcn( const char * mtcn_fn, Clone * myClone, Emission * myEmit); void get_avail_cn( const char * avcn_fn, Clone * myClone, Emission * myEmit); diff --git a/src/cloneHD-inference.cpp b/src/cloneHD-inference.cpp index 9a9ec14..694923c 100644 --- a/src/cloneHD-inference.cpp +++ b/src/cloneHD-inference.cpp @@ -89,7 +89,11 @@ int infer_clones( gsl_matrix * Clones, gsl_vector * Mass, Clone * myClone, cmdl_ char clonal_out[1024]; sprintf( clonal_out, "%s.summary.txt", opts.pre); FILE * clonal_fp = fopen(clonal_out,"w"); - fprintf(clonal_fp, "# n cna-llh baf-llh snv-llh total-llh total-bic\n"); + if (clonal_fp == NULL){ + printf("ERROR: file %s could not be opened for writing.\n", clonal_out); + exit(1); + } + fprintf( clonal_fp, "# n cna-llh baf-llh snv-llh total-llh total-bic\n"); //containers for the best estimates, given n... gsl_vector ** best_mass = new gsl_vector * [opts.nmax+1]; gsl_matrix ** best_clones = new gsl_matrix * [opts.nmax+1]; @@ -250,7 +254,7 @@ int infer_clones( gsl_matrix * Clones, gsl_vector * Mass, Clone * myClone, cmdl_ if (best_mass[bestn] != NULL){ myClone->set_mass(best_mass[bestn]); } - if (!cnaEmit->is_set && snvEmit->is_set && !snvEmit->connect){ + if (!cnaEmit->is_set && snvEmit->is_set && !snvEmit->connect && bestn > 0){ myClone->initialize_snv_prior_param(); } if (best_priors[bestn] != NULL && bestn > 0){ diff --git a/src/cloneHD.cpp b/src/cloneHD.cpp index a46bf0e..a46a56a 100755 --- a/src/cloneHD.cpp +++ b/src/cloneHD.cpp @@ -86,8 +86,8 @@ int main (int argc, const char * argv[]){ myClone.cna_pen_diff = opts.cna_pen_diff;//CNA penalty for different c.n. myClone.cna_pen_norm = opts.cna_pen_norm;//CNA penalty for non-normal c.n. myClone.baf_pen_comp = opts.baf_pen_comp;//BAF penalty for complex chr status - myClone.snv_pen_high = opts.snv_pen_high;//penalty for high SNV genotypes - myClone.snv_pen_mult = opts.snv_pen_mult;//penalty for multiple hit SNV + myClone.snv_pen_high = opts.snv_pen_high;//SNV penalty for high SNV genotypes + myClone.snv_pen_mult = opts.snv_pen_mult;//SNV penalty for multiple hit SNVs myClone.snv_fpr = opts.snv_fpr;//SNV false-positive rate myClone.snv_fpf = opts.snv_fpf;//SNV frequency of false positives myClone.bulk_fix = opts.bulk_fix; @@ -147,6 +147,7 @@ int main (int argc, const char * argv[]){ cout<<"done."<> chr >> l; - if (chr != old ){//new chromosome encounter - if (ct>0) nSites.push_back(ct); - chrs.push_back(chr); + if (chr != old ){//new chromosome encounter + if (ct>0){ + nSites.push_back(ct); + chrs.push_back(old); + } ct=0; } old=chr; @@ -58,7 +60,10 @@ void get_dims( const char * data_fn, } if (keep || r>0) ct++; } - nSites.push_back(ct); + if (ct>0){ + nSites.push_back(ct); + chrs.push_back(old); + } nTimes = nT; data_ifs.close(); } @@ -76,7 +81,7 @@ void get_data( const char * data_fn, Emission * myEmit){ } int ct=0,l; int chr=0,old=-1, sample=0; - int d,r, keep=0; + int d,r, keep=0, wait=0; //now collect all data... while( data_ifs.good()){ line.clear(); @@ -88,17 +93,17 @@ void get_data( const char * data_fn, Emission * myEmit){ line_ss >> chr >> l;//chromosome and locus if (chr != old){ if (myEmit->chrs.count(chr) == 0){ - cout<<"ERROR in get_data(): chr "<nSamples; s++){ - printf("sample %i = chr %i, idx = %i\n", - s+1, myEmit->chr[s], myEmit->idx_of[myEmit->chr[s]]); - } - exit(1); + printf("WARNING: chr %2i in file %s will be ignored.\n", chr, data_fn); + wait = 1; + } + else{ + sample = myEmit->idx_of[chr]; + ct = 0; + wait = 0; } - sample = myEmit->idx_of[chr]; - ct = 0; old = chr; } + if (wait) continue; if (ct >= myEmit->nSites[sample]) continue; keep = 0; for (int t=0; tnTimes; t++){ diff --git a/src/emission.cpp b/src/emission.cpp index ded2885..0c1a4e9 100755 --- a/src/emission.cpp +++ b/src/emission.cpp @@ -47,7 +47,10 @@ Emission::Emission(){ // real constructor void Emission::set(int ntimes, vector& Chrs, vector& nsites, int grid){ - if ((int) Chrs.size() != (int) nsites.size()) abort(); + if ((int) Chrs.size() != (int) nsites.size()){ + cout<<"FATAL ERROR (use gdb to locate).\n"; + abort(); + } nSamples = (int) nsites.size(); nTimes = ntimes; nSites = new int [nSamples]; @@ -119,8 +122,8 @@ void Emission::init_events(){ //map each observations to an event in another data track... void Emission::map_idx_to_Event(Emission * Emit, int sample){ if (Emit->is_set == 0) abort(); - if ( chr[sample] > Emit->maxchr || Emit->idx_of[ chr[sample] ] < 0 ) abort(); - int Sample = Emit->idx_of[ chr[sample] ]; + if (Emit->chrs.count(chr[sample]) == 0) abort(); + int Sample = Emit->idx_of[chr[sample]]; int Event = 0; int Idx = Emit->idx_of_event[Sample][Event]; int Locus = Emit->loci[Sample][Idx]; @@ -144,7 +147,7 @@ void Emission::map_idx_to_Event(Emission * Emit, int sample){ if (idx == nSites[sample]) break; } while( idx < nSites[sample]){//right over-hang - Event_of_idx[sample][idx]= Emit->nEvents[Sample] - 1; + Event_of_idx[sample][idx] = Emit->nEvents[Sample] - 1; idx++; } } @@ -911,72 +914,82 @@ void Emission::set_pjump(double jp){ } -void Emission::coarse_grain_jumps( int sample, double plow, int range){ - std::map::iterator it, max_it, first, last; +void Emission::coarse_grain_jumps( int sample, double min_jump, int range){ + std::map::iterator it, max_it, first_it, last_it; int L = nSites[sample]; double * cg_pjump = new double [L]; for (int i=0; i remaining; - pjump[sample][0]=0; - for (int i=0; i( i, pjump[sample][i])); + std::map idxOf; + pjump[sample][0] = 0; + int ct=0; + for (int i=0; i( ct, pjump[sample][i])); + idxOf.insert(std::pair(ct,i)); + ct++; + } //get global minimum... - first = remaining.begin(); - last = remaining.end(); - first++; - it = std::min_element( first, last, value_comparer); - double gmin = 0; - gmin = it->second; - int ct=0,idx=0,lidx=0; + first_it = remaining.begin(); + last_it = remaining.end(); + first_it++; + it = std::min_element( first_it, last_it, value_comparer); + double gmin = it->second; double p=0,p0=0,pmin=0,ps=0; - while (ct<1000){ - ct++; + while (remaining.size() > 0){ max_it = std::max_element( remaining.begin(), remaining.end(), value_comparer); p0 = max_it->second; - idx = max_it->first; - lidx=idx; - if (p0 < plow) break; + if (p0 < min_jump) break;//pjump at peak is below min_jump ps = 1.0 - p0; - first = max_it; - last = max_it; - pmin = p0; + first_it = max_it; + last_it = max_it; + pmin = p0; //above... it = max_it; - while(it != remaining.end()){ + int incl=0,done=0; + while( it != remaining.end() ){ it++; p = it->second; - if ( p>10.0*pmin || it->first != lidx+1 || p<10.0*gmin) break; - if (it->first < idx+range) ps *= 1.0 - p; + if ( p>10.0*pmin || it->first != last_it->first+1 || p<10.0*gmin) done=1; + else if (incl < range){ + ps *= 1.0 - p; + incl++; + } pmin = min(pmin,p); - lidx = it->first; - last = it; + last_it = it; + if (done) break; } - last++; //below... it = max_it; - lidx = idx; pmin = p0; + incl = 0; + done = 0; while(it != remaining.begin()){ it--; p = it->second; - if( p>10.0*pmin || it->first != lidx-1 || p<10.*gmin) break; - if (it->first > idx-range) ps *= 1.0 - p; - pmin = min(pmin,p); - lidx = it->first; - first = it; + if( p>10.0*pmin || it->first != first_it->first-1 || p<10.*gmin) done=1; + else if (inclfirst]] = 1.0-ps; + //remove... + remaining.erase( first_it, last_it); } //fill for (int i=0; i::value_type &i1, std::map::value_type &i2){ - return i1.secondsecond; } } + diffs.clear(); //now pick... int midx = 0; int mlocus = (int) mloci[0];