diff --git a/README.md b/README.md index de09f1e..1e1d299 100755 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ Format of input files: the first two columns of all three input file If not specified, the normal copy number is used as limit for each chromosome. This number should be chosen conservatively, since it increases the - HMM dimensionality and can open the possibility for spurious solutions. + HMM dimensionality and can open up the possibility for spurious solutions. * `--nmax [int:2]` The maximum number of subclones to be tried. @@ -200,7 +200,7 @@ Format of input files: the first two columns of all three input file the total log-likelihood. The best result out of `trials` independent, randomly seeded, runs will be used. -* `--mean-tcn [file]` Use mean total copy number constraint SNV data. +* `--mean-tcn [file]` Use a fixed mean total copy number for SNV data. For a SNV data analysis, the cloneHD output file ending `*mean_tcn.txt` from a CNA(+BAF) run can be supplied here. Since @@ -212,8 +212,11 @@ Format of input files: the first two columns of all three input file For a SNV data analysis, the cloneHD output file ending `*avail_cn.txt` from a CNA(+BAF) run can be supplied here. Since the subclonal decomposition can be different for SNVs, this option - ensures that the SNV genotype is consistent with the fraction of cells in which this number of copies is available. - Can only be used together with `--mean-tcn [file]`. In combination, this is a much stronger constraint. + ensures that the SNV genotype is consistent with the fraction of + cells in which this number of copies is available. + Can only be used together with `--mean-tcn [file]`. In + combination, this is a much stronger constraint than using + `mean-tcn` alone. ### Fuzzy segmentation options @@ -245,15 +248,6 @@ with filterHD for data with persistence. * `--baf-rnd [double:0.0]` * `--snv-rnd [double:0.0]` -A constant jump probability per base pair. If -1, then observations are -uncorrellated along the genome. Can be learned with filterHD. No fuzzy -data segmentation is performed. Very high-definition information -available. Useful in combination with `--clones`. - -* `--cna-jump [double:-1.0]` -* `--baf-jump [double:-1.0]` -* `--snv-jump [double:-1.0]` - ## Advanced options * `--clones [file]` Use fixed mass(es) and/or subclonal frequencies. @@ -332,7 +326,18 @@ available. Useful in combination with `--clones`. * `--snv-pen [double:0.01]` The penalty for higher than expected genotypes. -* `--baf-pen [double:1.0]` The penalty for complex minor allele status. +* `--baf-pen [double:1.0]` The penalty for complex minor allele + status. + +* `--cna-jump [double:-1.0]` +* `--baf-jump [double:-1.0]` +* `--snv-jump [double:-1.0]` + +A constant jump probability per base pair. If -1, then observations are +uncorrellated along the genome. Can be learned with filterHD. No fuzzy +data segmentation is performed. Useful in combination with +`--clones`, where very high-definition information +available. Using this option will change the posterior output file format. ## Bulk options @@ -367,3 +372,30 @@ The grid sizes for the pre-computed emission probabilities if fuzzy data segment # Tips and tricks +* Pre-filtering of data can be very important. If filterHD predicts + many more jumps than you would expect, it might be necessary to + filter the data, removing very short segments (with + `--filter-shortSeg 10`). + +* Make sure that the bias field for the tumor CNA data is + meaningful. If a matched normal sample was sequenced with the same + pipeline, its read depth profile, as predicted by filterHD, can be used as a + bias field for the tumor CNA data. Follow the logic of the example + given here. + +* filterHD can sometimes run into local optima. It might be useful to + fix initial values for the parameters via `--jumpi [double]` etc. + +* By default, cloneHD runs with mass gauging enabled. This seems like + an overkill, but is actually quite useful because you can see some + alternative explanations during the course of the analysis. + +* Don't put too much weight on the BIC criterion. It was calibrated + using simulated data. For real data, it should be supplied with + common sense and biological knowledge. Use `--force [int]` to use a + fixed number of subclones. + +* For exome sequencing data, the read depth bias can be enormous. Use rather, if + available, the jumps seen in the BAF data for both CNA and BAF. + + diff --git a/changelog.md b/changelog.md index 5634f25..6746ac8 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,10 @@ # changelog for cloneHD/filterHD +## v1.17.1 / to come + +* BAF posterior symmetrized for output +* CNA transition matrix penalizes clones with zero copies of a segment + ## v1.17.0 / 25.02.2014 major release ### changed the way SNV priors are computed: diff --git a/run-example.sh b/run-example.sh index 1e63093..1b782da 100644 --- a/run-example.sh +++ b/run-example.sh @@ -6,8 +6,10 @@ export OMP_NUM_THREADS=4; part=$1 # input data -data="./test/data/" -results="./test/results/" +#data="./test/data/" +#results="./test/results/" +data="/Users/af7/Projects/cloneHD/Test/data/" +results="/Users/af7/Projects/cloneHD/Test/results/" filterHD="./build/filterHD" cloneHD="./build/cloneHD" diff --git a/src/Makefile b/src/Makefile index 4bd25a0..8c2fb54 100755 --- a/src/Makefile +++ b/src/Makefile @@ -14,7 +14,7 @@ cloneHD: $(cloneHDobj) cloneHD.o $(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS) %.o: %.cpp $(CC) $(CC_FLAGS) -c $< -o $@ -$(cloneobj): clone.h $(cloneobj:.o=.cpp) +$(cloneobj): clone.h# $(cloneobj:.o=.cpp) $(filterHDobj) $(cloneHDobj): $(.TARGET:.o=.h) clean: rm -f ./*.o diff --git a/src/clone-fwd-bwd.cpp b/src/clone-fwd-bwd.cpp index 927df42..4397d2d 100644 --- a/src/clone-fwd-bwd.cpp +++ b/src/clone-fwd-bwd.cpp @@ -136,12 +136,11 @@ void Clone::do_cna_Bwd(int sample, double& ent){ void Clone::do_baf_Fwd( int sample, double& llh){ int cnaSample = 0; int bafChr = bafEmit->chr[sample]; - if (cnaEmit->is_set){ - cnaSample = cnaEmit->idx_of[bafChr]; - if (nClones > 0){ - if (gamma_cna == NULL) abort(); - if (gamma_cna[cnaSample] == NULL) abort(); - } + if (!cnaEmit->is_set) abort(); + cnaSample = cnaEmit->idx_of[bafChr]; + if (nClones > 0){ + if (gamma_cna == NULL) abort(); + if (gamma_cna[cnaSample] == NULL) abort(); } gsl_vector * prior = gsl_vector_alloc(nLevels); gsl_vector * post = gsl_vector_alloc(nLevels); @@ -165,33 +164,31 @@ void Clone::do_baf_Fwd( int sample, double& llh){ Clone::predict( prior, post, bafEmit, pj, flat); } //***CONSISTENCY WITH CNA*** - if (cnaEmit->is_set){//connect to above, with CNA - cna_evt = bafEmit->Event_of_idx[sample][idx]; - if (cna_evt != last_cna_evt){//new BAF prior from CNA post - cna_post = gsl_matrix_row( gamma_cna[cnaSample], cna_evt); - get_baf_prior_from_cna_post( Prior, &cna_post.vector); - last_cna_evt = cna_evt; - } - if (bafEmit->connect) gsl_vector_memcpy( mem, prior); - gsl_vector_memcpy( prior, Prior); - nidx = (evt < last_evt) ? bafEmit->idx_of_event[sample][evt+1] : bafEmit->nSites[sample]; - if ( nidx-idx > 1){//exponentiate prior for all observations in this block - gsl_vector_scale( prior, double(nidx-idx));//log-space! + cna_evt = bafEmit->Event_of_idx[sample][idx]; + if (cna_evt != last_cna_evt){//new BAF prior from CNA post + cna_post = gsl_matrix_row( gamma_cna[cnaSample], cna_evt); + get_baf_prior_from_cna_post( Prior, &cna_post.vector); + last_cna_evt = cna_evt; + } + if (bafEmit->connect) gsl_vector_memcpy( mem, prior); + gsl_vector_memcpy( prior, Prior); + nidx = (evt < last_evt) ? bafEmit->idx_of_event[sample][evt+1] : bafEmit->nSites[sample]; + if ( nidx-idx > 1){//exponentiate prior for all observations in this block + gsl_vector_scale( prior, double(nidx-idx));//log-space! + norm = log_vector_norm(prior); + gsl_vector_add_constant(prior,-norm); + } + if (bafEmit->connect){//multiply two priors and rescale... + if (bafEmit->log_space){ + gsl_vector_add(prior,mem); norm = log_vector_norm(prior); gsl_vector_add_constant(prior,-norm); - } - if (bafEmit->connect){//multiply two priors and rescale... - if (bafEmit->log_space){ - gsl_vector_add(prior,mem); - norm = log_vector_norm(prior); - gsl_vector_add_constant(prior,-norm); - } - else{ - gsl_vector_mul(prior,mem); - norm = gsl_blas_dasum(prior); - if (norm!=norm || norm < 0.0) abort(); - gsl_vector_scale( prior, 1.0/norm); - } + } + else{ + gsl_vector_mul(prior,mem); + norm = gsl_blas_dasum(prior); + if (norm!=norm || norm < 0.0) abort(); + gsl_vector_scale( prior, 1.0/norm); } } } @@ -201,7 +198,9 @@ void Clone::do_baf_Fwd( int sample, double& llh){ //***UPDATE STEP*** norm = Clone::update( prior, post, bafEmit, sample, evt); llh += norm; - if (save_baf_alpha == 1) gsl_matrix_set_row( alpha_baf[sample], evt, post); + if (save_baf_alpha == 1){ + gsl_matrix_set_row( alpha_baf[sample], evt, post); + } } // cleanup gsl_vector_free(prior); @@ -216,12 +215,11 @@ void Clone::do_baf_Bwd( int sample, double& ent){ if (alpha_baf[sample] == NULL || gamma_baf[sample] == NULL) abort(); int cnaSample = 0; int bafChr = bafEmit->chr[sample]; - if (cnaEmit->is_set){ - cnaSample = cnaEmit->idx_of[bafChr]; - if ( nClones>0 ){ - if ( gamma_cna == NULL ) abort(); - if ( gamma_cna[cnaSample] == NULL) abort(); - } + if (!cnaEmit->is_set) abort(); + cnaSample = cnaEmit->idx_of[bafChr]; + if ( nClones>0 ){ + if ( gamma_cna == NULL ) abort(); + if ( gamma_cna[cnaSample] == NULL) abort(); } gsl_vector * prior = gsl_vector_alloc(nLevels); gsl_vector * Prior = gsl_vector_alloc(nLevels); @@ -248,33 +246,31 @@ void Clone::do_baf_Bwd( int sample, double& ent){ last_idx = idx; } //***CONSISTENCY WITH CNA*** - if (cnaEmit->is_set){//connect with CNA... - cna_evt = bafEmit->Event_of_idx[sample][idx]; - if (cna_evt != last_cna_evt){ - cna_post = gsl_matrix_row( gamma_cna[cnaSample], cna_evt); - get_baf_prior_from_cna_post( Prior, &cna_post.vector); - last_cna_evt = cna_evt; - } - if (bafEmit->connect) gsl_vector_memcpy( mem, prior); - gsl_vector_memcpy( prior, Prior); - nidx = (evt < last_evt) ? bafEmit->idx_of_event[sample][evt+1] : bafEmit->nSites[sample]; - if ( nidx-idx > 1){//exponentiate prior for all observations in this block - gsl_vector_scale( prior, double(nidx-idx));//log-space! + cna_evt = bafEmit->Event_of_idx[sample][idx]; + if (cna_evt != last_cna_evt){ + cna_post = gsl_matrix_row( gamma_cna[cnaSample], cna_evt); + get_baf_prior_from_cna_post( Prior, &cna_post.vector); + last_cna_evt = cna_evt; + } + if (bafEmit->connect) gsl_vector_memcpy( mem, prior); + gsl_vector_memcpy( prior, Prior); + nidx = (evt < last_evt) ? bafEmit->idx_of_event[sample][evt+1] : bafEmit->nSites[sample]; + if ( nidx-idx > 1){//exponentiate prior for all observations in this block + gsl_vector_scale( prior, double(nidx-idx));//log-space! + norm = log_vector_norm(prior); + gsl_vector_add_constant(prior,-norm); + } + if (bafEmit->connect){//multiply two priors and rescale... + if (bafEmit->log_space){ + gsl_vector_add(prior,mem); norm = log_vector_norm(prior); gsl_vector_add_constant(prior,-norm); - } - if (bafEmit->connect){//multiply two priors and rescale... - if (bafEmit->log_space){ - gsl_vector_add(prior,mem); - norm = log_vector_norm(prior); - gsl_vector_add_constant(prior,-norm); - } - else{ - gsl_vector_mul(prior,mem); - norm = gsl_blas_dasum(prior); - if (norm!=norm || norm <0.0) abort(); - gsl_vector_scale(prior,1.0/norm); - } + } + else{ + gsl_vector_mul(prior,mem); + norm = gsl_blas_dasum(prior); + if (norm!=norm || norm <0.0) abort(); + gsl_vector_scale(prior,1.0/norm); } } } @@ -297,6 +293,7 @@ void Clone::do_baf_Bwd( int sample, double& ent){ if (norm <= 0.0 || norm != norm) abort(); gsl_vector_scale( mem, 1.0/norm); }//multiply done + if (nClones > 0 && symmetrize_baf) Clone::sym_baf( mem, &cna_post.vector); gsl_matrix_set_row( gamma_baf[sample], evt, mem); //ent += Clone::entropy(mem); //***UPDATE STEP*** (normalization term not needed here) @@ -562,3 +559,57 @@ void Clone::do_snv_Bwd( int sample, double& ent){ gsl_vector_free(Prior); if (Trans!= NULL) gsl_matrix_free(Trans); } + + + +void Clone::sym_baf( gsl_vector * bafPost, gsl_vector * cnaPost){ + if (map1==NULL && map2==NULL){ + map1 = new gsl_matrix * [maxtcn+1]; + for (int c=0; c<=maxtcn; c++){ + map1[c] = gsl_matrix_calloc(maxtcn+1,maxtcn+1); + for (int i=0; i<=maxtcn; i++){ + if (i>c) break; + for (int j=0; j<=maxtcn; j++){ + if (j>c) break; + double val + = (i==j && 2*j == c) + ? 1.0 + : ( (i==j || c-i==j) ? 0.5 : 0.0); + gsl_matrix_set( map1[c], i, j, val); + } + } + } + map2 = new gsl_matrix * [nLevels]; + for (int l=0; ldata[l] > 1.0e-4){ + abort(); + } + else if (norm > 0){ + gsl_vector_scale(mem2, cnaPost->data[l] / norm); + gsl_vector_add(mem1, mem2); + } + } + double norm = gsl_blas_dasum(mem1); + if (norm<=0) abort(); + gsl_vector_scale(mem1,1.0/norm); + gsl_vector_memcpy(bafPost,mem1); + gsl_vector_free(mem1); + gsl_vector_free(mem2); +} diff --git a/src/clone-predict.cpp b/src/clone-predict.cpp index 79591c8..ac99b8d 100644 --- a/src/clone-predict.cpp +++ b/src/clone-predict.cpp @@ -23,12 +23,13 @@ void Clone::set_TransMat_cna(){ // only one clone can change its state, // or clones in the same state can change in parallel void Clone::set_TransMat_cna( gsl_matrix * Trans, int chr){ - double norm; + double norm,p; gsl_vector_view row; int jumps,cni,cnf; for (int i=0; i maxtcn_per_clone[chr][k]){ jumps = 2; @@ -44,10 +45,11 @@ void Clone::set_TransMat_cna( gsl_matrix * Trans, int chr){ else if (cni - cnf != copynumber[i][k] - copynumber[j][k]){ jumps++; } + if (copynumber[j][k] == 0) p*= 0.01; } } if (jumps <= 1){ - gsl_matrix_set( Trans, i, j, 1.0); + gsl_matrix_set( Trans, i, j, p); } else{ gsl_matrix_set( Trans, i, j, 0.0); diff --git a/src/clone-prior.cpp b/src/clone-prior.cpp index 31651f7..ba739c5 100644 --- a/src/clone-prior.cpp +++ b/src/clone-prior.cpp @@ -124,9 +124,9 @@ void Clone::set_snv_prior_map(){//either via BAF or else via CNA } } double p = snv_pen;//penalty for SNVs in cn higher than max BAF cn (multiple hits) - for (int cn=0; cn <= maxtcn; cn++){ + for (int cn=0; cn <= maxtcn; cn++){//cn=total cn gsl_matrix_set_zero( snv_prior_from_cna_baf_map[cn]); - for (int j=0; j<=cn; j++){ + for (int j=0; j<=cn; j++){//j=minor cn for (int i=0; i<= cn; i++){ double pen = pow( p, max( 0, i - max(j,cn-j)) ); gsl_matrix_set( snv_prior_from_cna_baf_map[cn], i, j, pen); diff --git a/src/clone.cpp b/src/clone.cpp index 9786731..0c4e8f8 100755 --- a/src/clone.cpp +++ b/src/clone.cpp @@ -74,6 +74,9 @@ Clone::Clone(){ snvGrid = 100; bulkGrid = 100; logzero = -1.0e6; + map1=NULL; + map2=NULL; + symmetrize_baf=0; } diff --git a/src/clone.h b/src/clone.h index de9ffdc..fe4f02e 100755 --- a/src/clone.h +++ b/src/clone.h @@ -165,6 +165,9 @@ class Clone{ gsl_matrix ** alpha_cna, ** alpha_baf, ** alpha_snv; gsl_matrix ** gamma_cna, ** gamma_baf, ** gamma_snv; double entropy(gsl_vector * x); + void sym_baf( gsl_vector * bafPost, gsl_vector * cnvPost); + gsl_matrix ** map1, ** map2; + int symmetrize_baf; // *** in clone-llh.cpp **************************************************************************** double get_cna_posterior( int sample); double get_baf_posterior( int sample); diff --git a/src/cloneHD-functions.cpp b/src/cloneHD-functions.cpp index 4e7c8e1..a8d1cb7 100755 --- a/src/cloneHD-functions.cpp +++ b/src/cloneHD-functions.cpp @@ -785,6 +785,7 @@ void print_all_results( Clone * myClone, cmdl_opts& opts){ Emission * bafEmit = myClone->bafEmit; Emission * snvEmit = myClone->snvEmit; bafEmit->connect = 0; + myClone->symmetrize_baf=1; // margin-map to get the posterior per clone... char buff[1024]; sprintf( buff, "%s.clonal.margin_map.txt", opts.pre);