diff --git a/.travis.yml b/.travis.yml index d7e5769..7a6cbb0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,5 +11,13 @@ install: true compiler: - gcc +before_script: + - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O linasm.tar.gz + - tar -xzf linasm.tar.gz + - pushd linasm-1.13\(stable\) && make && sudo make install prefix=/usr && popd + - export PATH=/usr/bin:$PATH + - export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH + - export LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH + script: - - ./setup.sh ~/wtsi-opt \ No newline at end of file + - ./setup.sh ~/wtsi-opt diff --git a/CHANGES.md b/CHANGES.md index 1be0b3c..da4b82f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,12 @@ # CHANGES +## 1.14.0 + +* Making species and assembly required commandline arguments in the estep +* Change method by which read position index is resolved. Results in time saving. +* linasm is now required [linasm](http://linasm.sourceforge.net/index.php) +* New log and exp functions from [linasm](http://linasm.sourceforge.net/index.php) + ## 1.13.16 * Correct occasional memory blowout caused by split step not excluding an ignore region. diff --git a/INSTALL.TXT b/INSTALL.TXT index 346f291..6640d6c 100644 --- a/INSTALL.TXT +++ b/INSTALL.TXT @@ -3,7 +3,9 @@ System Requirements caveman is designed to run in a compute farm/clustre environment. caveman depends on: - htslib 1.3 + htslib >=1.3 + zlib >=1.2.3.5 + linasm >=1.13 Compilation diff --git a/Makefile b/Makefile index 6aedb75..2577b22 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CAVEMAN_VERSION=1.13.16 +CAVEMAN_VERSION=1.14.0 TEST_REF?="" #Compiler @@ -33,7 +33,7 @@ LFLAGS?=-L$(HTSTMP) # define any libraries to link into executable: # if I want to link in libraries (libx.so or libx.a) I use the -llibname # option, something like (this will link in libmylib.so and libm.so: -LIBS =-lhts -lpthread -lz -lm -ldl +LIBS =-lhts -lpthread -lz -lm -ldl -llinasm # define the C source files SRCS = ./src/file_tests.c ./src/List.c ./src/List_algos.c ./src/bam_access.c ./src/config_file_access.c ./src/fai_access.c ./src/ignore_reg_access.c ./src/alg_bean.c ./src/split_access.c ./src/covs_access.c ./src/cn_access.c ./src/genotype.c ./src/algos.c ./src/output.c ./src/setup.c ./src/split.c ./src/mstep.c ./src/merge.c ./src/estep.c diff --git a/README.md b/README.md index 32c4430..b202571 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ See INSTALL.TXT * Reference.fasta and index * A one based bed style format file of regions to ignore during analysis (see specified format). * [zlib](https://zlib.net/) >= 1.2.3.5 +* [linasm](http://linasm.sourceforge.net/index.php) >= 1.13 ## Optional inputs (will result in more accurate calls) diff --git a/setup.sh b/setup.sh index 3b83e4a..8f0f6c8 100755 --- a/setup.sh +++ b/setup.sh @@ -125,6 +125,8 @@ fi export HTSLIB="$SETUP_DIR/htslib" +set -e + echo -n "Building CaVEMan ..." if [ -e "$SETUP_DIR/caveman.success" ]; then echo -n " previously installed ..."; diff --git a/src/alg_bean.c b/src/alg_bean.c index bf3c34f..7dba49a 100644 --- a/src/alg_bean.c +++ b/src/alg_bean.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -166,11 +167,11 @@ int alg_bean_get_index_for_char_arr(List *list,char *val){ return -1; } -int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len){ - List *lengths = List_create(); - int i=0; - int last_stop = 1; - LIST_FOREACH(list, first, next, cur){ +List *alg_bean_get_position_list_from_read_pos_proportion_arr(List *list,int rd_len){ + List *lengths = List_create(); + int last_stop = 1; + int i=0; + LIST_FOREACH(list, first, next, cur){ float pct = *((float *)cur->value); int lng = (((float)rd_len/(float)100) * pct); alg_bean_intrange *range = malloc(sizeof(alg_bean_intrange)); @@ -182,11 +183,60 @@ int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len){ range->to = (range->from) + lng; last_stop = range->to; List_push(lengths,range); - i++; + i++; } - int result = alg_bean_get_index_for_intrange_arr(lengths,pos); - List_clear_destroy(lengths); - return result; + return lengths; +} + +int alg_bean_get_index_for_read_pos_prop_arr(void *_hash, int pos,int rd_len){ + List *lengths = NULL; + khiter_t k; + khash_t(readlenpos) *h_rd_len = (khash_t(readlenpos)*) _hash; + k = kh_get(readlenpos, h_rd_len, rd_len); + lengths = kh_value(h_rd_len, k); + return alg_bean_get_index_for_intrange_arr(lengths, pos); +} + +int alg_bean_add_read_length_arrs(alg_bean_t *bean, char* list_loc, char* contig){ + FILE *output_rp = NULL; + khash_t(readlenpos) *h_rd_len; + char *dircpy = NULL; + //Create map, key is readlength, List of sections is value. + char *read_len_pos_arr_file = malloc(strlen(contig) + strlen(list_loc) + 6); + check_mem(read_len_pos_arr_file); + //Create filename here through name concatenation. + dircpy = strdup(list_loc); + strcpy(read_len_pos_arr_file,dirname(dircpy)); + strcat(read_len_pos_arr_file,"/readpos."); + strcat(read_len_pos_arr_file,contig); + output_rp = fopen(read_len_pos_arr_file,"r"); + check(output_rp != NULL, "Error opening file %s for write.",read_len_pos_arr_file); + free(read_len_pos_arr_file); + + h_rd_len = kh_init(readlenpos); + char line[500]; + int rd_len_missing = 0; + khiter_t k; + while ( fgets(line,sizeof(line),output_rp) != NULL ){ + //75 1-2;3-38;39-56;57-66;67-76; + int len; + char list_txt [4950]; + int chk = sscanf(line,"%d\t%s",&len,list_txt); + List *lenrange = alg_bean_parse_int_range(list_txt); + k = kh_put(readlenpos, h_rd_len, len, &rd_len_missing); + if(rd_len_missing){ + kh_value(h_rd_len, k) = lenrange; + } + } + fclose(output_rp); + + bean->read_len_pos = h_rd_len; + free(dircpy); + return 0; +error: + if(dircpy) free(dircpy); + if(output_rp) fclose(output_rp); + return -1; } List *alg_bean_parse_str_list(char *txt){ diff --git a/src/alg_bean.h b/src/alg_bean.h index 74e811b..cd10aa5 100644 --- a/src/alg_bean.h +++ b/src/alg_bean.h @@ -35,6 +35,10 @@ #include #include +#include "khash.h" + +//New hash to store unique readlengths +KHASH_MAP_INIT_INT(readlenpos, List *) typedef struct alg_bean_intrange{ int from; @@ -58,6 +62,7 @@ typedef struct alg_bean_t{ int call_base_size; List *strand; int strand_size; + khash_t(readlenpos) *read_len_pos; } alg_bean_t; int alg_bean_create_default_file(FILE *file, char *norm, char *tum); @@ -72,7 +77,9 @@ List *alg_bean_hard_copy_char_list(List *new_list, List *old); int alg_bean_get_index_for_str_arr(List *list,char *value); int alg_bean_get_index_for_intrange_arr(List *list,int value); int alg_bean_get_index_for_char_arr(List *list,char *value); -int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len); +int alg_bean_get_index_for_read_pos_prop_arr(void *_hash, int pos,int rd_len); +int alg_bean_add_read_length_arrs(alg_bean_t *bean, char* list_loc, char* contig); +List *alg_bean_get_position_list_from_read_pos_proportion_arr(List *list,int rd_len); #define CEIL(a, b) (((a) / (b)) + (((a) % (b)) > 0 ? 1 : 0)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) diff --git a/src/algos.c b/src/algos.c index cd988a5..7675173 100644 --- a/src/algos.c +++ b/src/algos.c @@ -41,6 +41,7 @@ #include #include #include +#include "Math.h" #include #include #include @@ -150,7 +151,7 @@ void set_max_tum_cvg(int i){ max_tum_cvg = i; } -int algos_mstep_read_position(alg_bean_t *alg,uint64_t ********covs, char *chr_name, uint32_t from, uint32_t to, char *ref_base, int split_size){ +int algos_mstep_read_position(alg_bean_t *alg, uint64_t ********covs, char *chr_name, uint32_t from, uint32_t to, char *ref_base, int split_size){ //Fetch all reads for this pos? Or a struct for a single read at that position? List *reads = NULL; char *cbase = NULL; @@ -174,7 +175,7 @@ int algos_mstep_read_position(alg_bean_t *alg,uint64_t ********covs, char *chr_n || (ref_b_up == 'G') || (ref_b_up = 'T')) ){ //int lane_i = alg_bean_get_index_for_str_arr(alg->lane,pos_t->lane); //check(lane_i>=0,"Error calculating lane index."); - int rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->rd_pos,pos_t->rd_pos,pos_t->rd_len); + int rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->read_len_pos,pos_t->rd_pos,pos_t->rd_len); check(rpos_i>=0,"Error calculating read position index."); int mq_i = alg_bean_get_index_for_intrange_arr(alg->map_qual,pos_t->map_qual); check(mq_i>=0,"Error calculating map qual index."); @@ -275,29 +276,39 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double ref_base_prob = *(read->ref_base_probs[ref_base_idx]); long double res = genos->ref_geno_norm_prob + ref_base_prob; genos->ref_geno_norm_prob = res; - + long double norm_var_base_prop; + long double nu_fx; + long double tmp_psi_var_prob_norm; + long double ans; //iterate through from 0 to highest available of genos->hom_count, genos->het_norm_count, int iter=0; for(iter=0;iternorm_max;iter++){ //Hom snps if(iterhom_count){ - long double ans = genos->hom_snp_genotypes[iter]->prob + *(read->ref_base_probs[(genos->hom_snp_genotypes[iter]->norm_geno->var_base_idx)]); + ans = genos->hom_snp_genotypes[iter]->prob + *(read->ref_base_probs[(genos->hom_snp_genotypes[iter]->norm_geno->var_base_idx)]); genos->hom_snp_genotypes[iter]->prob = ans; }//End of iteration through hom snps //Het snps if(iterhet_norm_count){ - long double norm_var_base_prop = genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_prop; - long double nu_fx = (long double)ref_b * norm_var_base_prop; - long double tmp_psi_var_prob_norm = nu_fx / (nu_fx + ((long double)1 - norm_var_base_prop)); - long double ans = genos->het_snp_norm_genotypes[iter]->prob + + norm_var_base_prop = genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_prop; + nu_fx = (long double)ref_b * norm_var_base_prop; + tmp_psi_var_prob_norm = nu_fx / (nu_fx + ((long double)1 - norm_var_base_prop)); + ans = genos->het_snp_norm_genotypes[iter]->prob + (long double) ( - logl( - expl( ref_base_prob + logl((long double)1 - tmp_psi_var_prob_norm) ) + Math_Log_flt64( + Math_Exp_flt64( (flt64_t)(ref_base_prob + Math_Log_flt64( + (flt64_t)((long double)1 - tmp_psi_var_prob_norm) + ) + )) + - expl( *(read->ref_base_probs[(genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_idx)]) + logl(tmp_psi_var_prob_norm) ) - ) + Math_Exp_flt64( + (flt64_t)( + *(read->ref_base_probs[(genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_norm) + ) + ) + ) ); genos->het_snp_norm_genotypes[iter]->prob = ans; }//End of iteration through het snps @@ -307,6 +318,7 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double ref_base_prob = *(read->ref_base_probs[ref_base_idx]); long double res = genos->ref_geno_tum_prob + ref_base_prob; genos->ref_geno_tum_prob = res; + long double ans; //iterate through from 0 to highest available of genos->somatic_count, genos->hom_count, genos->het_count int iter=0; for(iter=0;itertum_max;iter++){ @@ -316,14 +328,14 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double nu_xf = (long double)ref_b * genos->somatic_genotypes[iter]->tum_geno->var_base_prop; long double tmp_psi_var_prob = nu_xf * ((long double)1 - base_norm_contam) / (nu_xf + ((long double)1 - genos->somatic_genotypes[iter]->tum_geno->var_base_prop)); //Calculate read component probability. - long double log_tmp_psi_var_prob = logl(tmp_psi_var_prob); - long double log_1_minus_tmp_psi_var_prob = logl((long double)1 - tmp_psi_var_prob); - long double ans = genos->somatic_genotypes[iter]->prob + - logl( + long double log_tmp_psi_var_prob = (long double) Math_Log_flt64((flt64_t)tmp_psi_var_prob); + long double log_1_minus_tmp_psi_var_prob = (long double) Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob)); + ans = genos->somatic_genotypes[iter]->prob + (long double) + Math_Log_flt64( ( - expl( (ref_base_prob + log_1_minus_tmp_psi_var_prob) ) + Math_Exp_flt64( (flt64_t)((ref_base_prob + log_1_minus_tmp_psi_var_prob)) ) + - expl( (*(read->ref_base_probs[genos->somatic_genotypes[iter]->tum_geno->var_base_idx]) + log_tmp_psi_var_prob) ) + Math_Exp_flt64( (flt64_t)((*(read->ref_base_probs[genos->somatic_genotypes[iter]->tum_geno->var_base_idx]) + log_tmp_psi_var_prob)) ) ) ); genos->somatic_genotypes[iter]->prob = ans; @@ -332,7 +344,7 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int //Hom snps if(iterhom_count){ - long double ans = genos->hom_snp_genotypes[iter]->prob + + ans = genos->hom_snp_genotypes[iter]->prob + *(read->ref_base_probs[(genos->hom_snp_genotypes[iter]->tum_geno->var_base_idx)]); genos->hom_snp_genotypes[iter]->prob = ans; }//End of hom snps @@ -350,10 +362,10 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int ( (long double)base_norm_contam * tmp_psi_var_prob_norm); long double tum_res = genos->het_snp_genotypes[iter]->prob + ( - logl( - expl( ref_base_prob + logl((long double)1 - tmp_psi_var_prob_tum) ) + Math_Log_flt64( + Math_Exp_flt64( (flt64_t)(ref_base_prob + Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob_tum)) )) + - expl( *(read->ref_base_probs[(genos->het_snp_genotypes[iter]->norm_geno->var_base_idx)]) + logl(tmp_psi_var_prob_tum) ) + Math_Exp_flt64( (flt64_t)(*(read->ref_base_probs[(genos->het_snp_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_tum) )) ) ); @@ -369,30 +381,30 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int } long double calculateLogSumExpNormFactor(estep_position_t *pos,long double norm_factor_max){ - long double sum_tot = 0.0; + flt64_t sum_tot = 0.0; //Reference - sum_tot += expl(pos->genos->ref_genotype->prob - norm_factor_max); + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor_max)); //Somatic int somatic_i = 0; for(somatic_i = 0;somatic_igenos->somatic_count;somatic_i++){ - sum_tot += expl(pos->genos->somatic_genotypes[somatic_i]->prob - norm_factor_max); + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->somatic_genotypes[somatic_i]->prob - norm_factor_max)); } //het int het_snp_it=0; for(het_snp_it=0;het_snp_itgenos->het_count;het_snp_it++){ - sum_tot += expl(pos->genos->het_snp_genotypes[het_snp_it]->prob - norm_factor_max); + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->het_snp_genotypes[het_snp_it]->prob - norm_factor_max)); } //Hom int hom_snp_i=0; for(hom_snp_i=0;hom_snp_igenos->hom_count;hom_snp_i++){ - sum_tot += expl(pos->genos->hom_snp_genotypes[hom_snp_i]->prob - norm_factor_max); - } + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->hom_snp_genotypes[hom_snp_i]->prob - norm_factor_max)); + } //Return final logged value. - long double norm_factor = norm_factor_max + logl(sum_tot); + long double norm_factor = norm_factor_max + Math_Log_flt64(sum_tot); return norm_factor; } @@ -403,7 +415,6 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //We find the normalisation factor via the log-sum-exp method to avoid underflow, then we can calculate the final probabilities long double norm_factor = calculateLogSumExpNormFactor(pos,norm_factor_max); - //printf("NORM FACTOR: %5.1Le\n",norm_factor); //Iterate through every genotype and normalise it. Adding it to a list. int iter = 0; long double somatic_sum = 0.0; @@ -411,8 +422,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double for(iter=0;itergenos->total_max;iter++){ //somatics if(itergenos->somatic_count){ - pos->genos->somatic_genotypes[iter]->prob = expl(pos->genos->somatic_genotypes[iter]->prob - norm_factor); - + pos->genos->somatic_genotypes[iter]->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->somatic_genotypes[iter]->prob - norm_factor))); somatic_sum += pos->genos->somatic_genotypes[iter]->prob; if(top_geno==NULL){ top_geno = pos->genos->somatic_genotypes[iter]; @@ -426,7 +436,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //het snps if(itergenos->het_count){ - pos->genos->het_snp_genotypes[iter]->prob = expl(pos->genos->het_snp_genotypes[iter]->prob - norm_factor); + pos->genos->het_snp_genotypes[iter]->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->het_snp_genotypes[iter]->prob - norm_factor))); snp_sum += pos->genos->het_snp_genotypes[iter]->prob; if(pos->genos->het_snp_genotypes[iter]->prob > top_geno->prob){ @@ -439,7 +449,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //hom snps if(itergenos->hom_count){ - pos->genos->hom_snp_genotypes[iter]->prob = expl(pos->genos->hom_snp_genotypes[iter]->prob - norm_factor); + pos->genos->hom_snp_genotypes[iter]->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->hom_snp_genotypes[iter]->prob - norm_factor))); snp_sum += pos->genos->hom_snp_genotypes[iter]->prob; if(pos->genos->hom_snp_genotypes[iter]->prob > top_geno->prob){ @@ -453,7 +463,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //normalise the reference genotype probability. - pos->genos->ref_genotype->prob = expl(pos->genos->ref_genotype->prob - norm_factor); + pos->genos->ref_genotype->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor))); if(pos->genos->ref_genotype->prob > top_geno->prob){ sec_geno = top_geno; @@ -475,7 +485,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double long double calculate_ref_prob(const long double norm, const long double tum){ long double ref_prob; - ref_prob = logl(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) + ref_prob = Math_Log_flt64(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) + norm + tum; return ref_prob; } @@ -486,7 +496,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ pos->genos->ref_genotype->prob = 0.0; long double ref_prob = calculate_ref_prob(pos->genos->ref_geno_norm_prob,pos->genos->ref_geno_tum_prob); pos->genos->ref_genotype->prob = ref_prob; - //long double ref_prob = logl(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) + //long double ref_prob = Math_Log_flt64(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) // + pos->genos->ref_geno_norm_prob + pos->genos->ref_geno_tum_prob; if(pos->genos->ref_genotype->prob > norm_factor_max){ norm_factor_max = pos->genos->ref_genotype->prob; @@ -497,7 +507,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ for(iter=0; itergenos->total_max; iter++){ //Somatics if(itergenos->somatic_count){ - long double res = logl((long double)prior_m_prob) - logl((long double)pos->genos->somatic_count) + long double res = Math_Log_flt64((long double)prior_m_prob) - Math_Log_flt64((long double)pos->genos->somatic_count) + pos->genos->ref_geno_norm_prob + pos->genos->somatic_genotypes[iter]->prob; pos->genos->somatic_genotypes[iter]->prob = res; @@ -513,7 +523,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ for(het_norm_i=0;het_norm_igenos->het_norm_count;het_norm_i++){ //find the correct het normal genotype if(genotype_equals(pos->genos->het_snp_norm_genotypes[het_norm_i]->norm_geno,pos->genos->het_snp_genotypes[iter]->norm_geno)==1){ - long double res = logl((long double)prior_s_prob) - logl((long double)(pos->genos->het_count + pos->genos->hom_count)) + long double res = Math_Log_flt64((long double)prior_s_prob) - Math_Log_flt64((long double)(pos->genos->het_count + pos->genos->hom_count)) + pos->genos->het_snp_norm_genotypes[het_norm_i]->prob + pos->genos->het_snp_genotypes[iter]->prob; pos->genos->het_snp_genotypes[iter]->prob = res; @@ -541,7 +551,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ int algos_get_read_specific_indices(alg_bean_t *alg, read_pos_t *pos_t, int *rpos_i, int *mq_i, int *bq_i, int *callbase_i){ - *rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->rd_pos,pos_t->rd_pos,pos_t->rd_len); + *rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->read_len_pos,pos_t->rd_pos,pos_t->rd_len); check(*rpos_i>=0,"Error calculating read position index."); *mq_i = alg_bean_get_index_for_intrange_arr(alg->map_qual,pos_t->map_qual); check(*mq_i>=0,"Error calculating map qual index."); @@ -680,7 +690,7 @@ int algos_estep_read_position(alg_bean_t *alg,long double ********prob_arr, char //build up genotype stores for genotype_store_t *genos pos->genos = genotype_generate_genotype_list_for_cn_and_ref_base(pos->norm_cn, pos->tum_cn, ref_b); //Setup initial hom snp probabilities. - long double init_hom_snp = logl((long double)prior_s_prob) - logl((long double) (pos->genos->het_count + pos->genos->hom_count)); + long double init_hom_snp = Math_Log_flt64((long double)prior_s_prob) - Math_Log_flt64((long double) (pos->genos->het_count + pos->genos->hom_count)); int hom_snp_i = 0; for(hom_snp_i=0;hom_snp_igenos->hom_count;hom_snp_i++){ pos->genos->hom_snp_genotypes[hom_snp_i]->prob = init_hom_snp; diff --git a/src/algos.h b/src/algos.h index 1d3d6bf..ea44bcf 100644 --- a/src/algos.h +++ b/src/algos.h @@ -38,6 +38,7 @@ #include #include #include "zlib.h" +#include "Math.h" typedef struct estep_position_t{ uint8_t norm_cn; diff --git a/src/estep.c b/src/estep.c index 34117db..d80f2ca 100644 --- a/src/estep.c +++ b/src/estep.c @@ -553,6 +553,8 @@ int estep_main(int argc, char *argv[]){ check(chk_write==0,"Error writing header to dbg file."); } + chk = alg_bean_add_read_length_arrs(alg, list_loc, chr_name); + check(chk==0,"Error reading alg_bean read length arrays from file."); //Iterate through analysis sections //Iterate through sections. diff --git a/src/mstep.c b/src/mstep.c index 3657c8a..7486b68 100644 --- a/src/mstep.c +++ b/src/mstep.c @@ -204,6 +204,9 @@ int mstep_main(int argc, char *argv[]){ //Iterate through each base in the section and create the genome profile. //an array to store the data full of zeroes. + int chk = alg_bean_add_read_length_arrs(alg, list_loc, chr_name); + check(chk==0,"Error reading alg_bean read length arrays from file."); + //Read order (1st/2nd) //Strand +/- //Lane diff --git a/src/split.c b/src/split.c index 58c0dd2..3252cb1 100644 --- a/src/split.c +++ b/src/split.c @@ -44,6 +44,10 @@ #include #include #include +#include "khash.h" + +//New hash to store unique readlengths +KHASH_MAP_INIT_INT(rdlenkhash, uint8_t) static int includeSW = 0; static int includeSingleEnd = 0; @@ -166,7 +170,12 @@ int split_main(int argc, char *argv[]){ bam1_t *norm_read = NULL; bam1_t *tum_read = NULL; seq_region_t **ignore_regs = NULL; + FILE *alg_bean_file = NULL; + alg_bean_t *alg = NULL; + char *read_len_pos_arr_file = NULL; + khash_t(rdlenkhash) *rdlen_h; int ignore_reg_count = 0; + FILE *output_rp = NULL; int is_err = split_setup_options(argc,argv); check(is_err==0,"Error parsing options"); @@ -198,7 +207,7 @@ int split_main(int argc, char *argv[]){ //Open a file to write sections, named according to CHR. char *fname = malloc(strlen(chr_name) + strlen(list_loc) + 3); check_mem(fname); - //Create filename here through name concatenation. + //Create filename here through name concatenation. strcpy(fname,list_loc); strcat(fname,"."); strcat(fname,chr_name); @@ -219,6 +228,10 @@ int split_main(int argc, char *argv[]){ //Check there's not a whole chromosome block. if(!(ignore_reg_count == 1 && ignore_regs[0]->beg == 1 && ignore_regs[0]->end >= chr_length)){ + //Initialise readlength hash + rdlen_h = kh_init(rdlenkhash); + check(rdlen_h != NULL,"Memory allocation error when initialising hash kh_init(rdlenkhash)."); + //No chromosome block, so carry on. uint32_t start = 1; uint32_t stop = chr_length; @@ -271,15 +284,20 @@ int split_main(int argc, char *argv[]){ uint32_t curr_n_pos = 0; uint32_t curr_t_pos = 0; - + khiter_t k; //Have both iterators, now need to iterate through each in sync so we don't get ahead of the stops. while(iter_n_status>=0 || iter_t_status>=0){ //Keep iterating until both iterators are out of reads. - while(curr_n_pos<=curr_t_pos && iter_n_status>=0 && iter_t_status>=0 && rd_count<=max_read_count){ //While the positions aren't equal and tumour has reads left. Normal jumps ahead iter_n_status = sam_itr_next(sf_norm,iter_norm,norm_read); check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); curr_n_pos = norm_read->core.pos; if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ + int read_len = norm_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } rd_count++; } }//End of this iteration through normal reads @@ -289,6 +307,12 @@ int split_main(int argc, char *argv[]){ check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); curr_t_pos = tum_read->core.pos; if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ + int read_len = tum_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } rd_count++; } }//End of this iteration through tumour reads @@ -300,7 +324,13 @@ int split_main(int argc, char *argv[]){ check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); curr_t_pos = tum_read->core.pos; if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ - rd_count++; + int read_len = tum_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } + rd_count++; } } }//End of iteration through tumour reads where only tumour reads remain @@ -311,6 +341,12 @@ int split_main(int argc, char *argv[]){ check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); curr_n_pos = norm_read->core.pos; if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ + int read_len = norm_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } rd_count++; } } @@ -337,7 +373,51 @@ int split_main(int argc, char *argv[]){ }//End of moving through both normal and tumour iterators + //Read in alg_bean + //Read in the alg bean + alg_bean_file = fopen(alg_bean_loc,"r"); + check(alg_bean_file != 0 ,"Error trying to open alg_bean file: %s.",alg_bean_loc); + alg = alg_bean_read_file(alg_bean_file); + check(alg != NULL,"Error reading alg_bean from file."); + check(fclose(alg_bean_file)==0,"Error closing alg bean file."); + + //Iterate through readlen hashkeys and output a format giving all readlength options for readpos boundaries + //This can be read in by mstep and estep + + //Open a file to write sections, named according to CHR. + char *read_len_pos_arr_file = malloc(strlen(chr_name) + strlen(list_loc) + 6); + check_mem(read_len_pos_arr_file); + //Create filename here through name concatenation. + char *dupdir; + dupdir = strdup(list_loc); + strcpy(read_len_pos_arr_file,dirname(dupdir)); + strcat(read_len_pos_arr_file,"/readpos."); + strcat(read_len_pos_arr_file,chr_name); + output_rp = fopen(read_len_pos_arr_file,"w"); + check(output_rp != NULL, "Error opening file %s for write.",read_len_pos_arr_file); + free(read_len_pos_arr_file); + + for (k = kh_begin(rdlen_h); k != kh_end(rdlen_h); ++k){ // traverse + // test if a bucket contains data + if (kh_exist(rdlen_h, k)){ + //Convert alg bean read length proportions to a 'by readlength' set of ranges + int len = kh_value(rdlen_h, k); + List *lengths = alg_bean_get_position_list_from_read_pos_proportion_arr(alg->rd_pos,len); + //Output readlength splits and read length itself to file + fprintf(output_rp,"%d\t",len); + LIST_FOREACH(lengths, first, next, cur){ + alg_bean_intrange *range = (alg_bean_intrange *) cur->value; + fprintf(output_rp,"%d-%d;",range->from,range->to); + } + fprintf(output_rp,"\n"); + List_clear_destroy(lengths); + } + } + fclose(output_rp); + + //No more reads left so we must print the last section. + kh_destroy(rdlenkhash, rdlen_h); split_access_print_section(output,chr_name,sect_start,chr_length); bam_destroy1(norm_read); bam_destroy1(tum_read); @@ -352,6 +432,8 @@ int split_main(int argc, char *argv[]){ return 0; error: + if(rdlen_h) kh_destroy(rdlenkhash, rdlen_h); + if(output_rp) fclose(output_rp); if(norm_read) bam_destroy1(norm_read); if(tum_read) bam_destroy1(tum_read); if(sf_norm) hts_close(sf_norm); diff --git a/testData/readpos.1 b/testData/readpos.1 new file mode 100644 index 0000000..a73dec4 --- /dev/null +++ b/testData/readpos.1 @@ -0,0 +1,2 @@ +75 1-2;3-38;39-56;57-66;67-76; +100 1-3;4-51;52-76;77-90;91-100; diff --git a/testData/readpos.2 b/testData/readpos.2 new file mode 100644 index 0000000..a73dec4 --- /dev/null +++ b/testData/readpos.2 @@ -0,0 +1,2 @@ +75 1-2;3-38;39-56;57-66;67-76; +100 1-3;4-51;52-76;77-90;91-100; diff --git a/testData/readpos.22 b/testData/readpos.22 new file mode 100644 index 0000000..ce8e7b1 --- /dev/null +++ b/testData/readpos.22 @@ -0,0 +1 @@ +75 1-2;3-38;39-56;57-66;67-76; diff --git a/tests/alg_bean_tests.c b/tests/alg_bean_tests.c index f58f786..20417fb 100644 --- a/tests/alg_bean_tests.c +++ b/tests/alg_bean_tests.c @@ -279,26 +279,6 @@ char *test_alg_bean_get_index_for_char_arr_cram(){ return NULL; } -char *test_alg_bean_get_index_for_read_pos_prop_arr(){ - test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); - //Expect the following results: - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); - return NULL; -} - -char *test_alg_bean_get_index_for_read_pos_prop_arr_cram(){ - test_bean = alg_bean_generate_default_alg_bean(norm_file_cram,tum_file_cram); - //Expect the following results: - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); - return NULL; -} - char *test_alg_bean_generate_default_alg_bean(){ alg_bean_t *test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); mu_assert(test_bean != NULL, "Couldn't retrieve default alg_bean.\n"); @@ -348,8 +328,6 @@ char *all_tests() { mu_run_test(test_alg_bean_get_index_for_intrange_arr_cram); mu_run_test(test_alg_bean_get_index_for_char_arr); mu_run_test(test_alg_bean_get_index_for_char_arr_cram); - mu_run_test(test_alg_bean_get_index_for_read_pos_prop_arr); - mu_run_test(test_alg_bean_get_index_for_read_pos_prop_arr_cram); mu_run_test(test_alg_bean_hard_copy_char_list); mu_run_test(test_alg_bean_hard_copy_char_list_cram); return NULL; diff --git a/tests/algos_tests.c b/tests/algos_tests.c index dae9a72..e4c9163 100644 --- a/tests/algos_tests.c +++ b/tests/algos_tests.c @@ -62,6 +62,7 @@ char *test_snp_out = "testData/snp.vcf.gz"; char *test_mut_out = "testData/mut.vcf.gz"; char *test_dbg_out = "testData/dbg.vcf.gz"; char *test_no_anal_out = "testData/no_analysis.bed"; +char split_loc_test[] = "testData/splitList"; char *test_algos_mstep_read_position(){ //algos_mstep_read_position(alg_bean_t *alg,uint64_t ********covs, char *chr_name, int from, int to, char *ref_base); @@ -70,9 +71,11 @@ char *test_algos_mstep_read_position(){ uint64_t ********arr = covs_access_generate_cov_array_given_dimensions(List_count(alg->read_order),List_count(alg->strand),List_count(alg->lane), List_count(alg->rd_pos),List_count(alg->map_qual),List_count(alg->base_qual),List_count(alg->ref_base),List_count(alg->call_base)); mu_assert(arr != NULL,"Array not properly created.\n"); + char *chr = "22"; int from = 17619559; int to = 17619559; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "A"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); //check we have a count of 2 in the expected place... @@ -93,6 +96,7 @@ char *test_algos_mstep_read_position_cram(){ char *chr = "22"; int from = 17619559; int to = 17619559; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "A"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); //check we have a count of 2 in the expected place... @@ -114,6 +118,7 @@ char *test_algos_mstep_read_position_two(){ char *chr = "2"; int from = 38000243; int to = 38000243; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "G"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); @@ -153,6 +158,7 @@ char *test_algos_mstep_read_position_two_cram(){ char *chr = "2"; int from = 38000243; int to = 38000243; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "G"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); @@ -200,6 +206,7 @@ int estep_no_analysis(){ check(no_anal_out!=NULL,"Error opening file."); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462250, 192462300, ref_base, mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); output_flush_no_analysis("1"); check(estep==0,"Error running estep."); @@ -252,6 +259,7 @@ int estep_no_analysis_cram(){ check(no_anal_out!=NULL,"Error opening file."); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462250, 192462300, ref_base, mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); output_flush_no_analysis("1"); check(estep==0,"Error running estep."); @@ -304,6 +312,7 @@ char *test_algos_estep_read_position(){ FILE *no_anal_out = fopen(test_no_anal_out,"w"); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462357, 192462357, "C", mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); mu_assert(estep==0,"Error running estep."); gzclose(snp_out); @@ -350,6 +359,7 @@ char *test_algos_estep_read_position_cram(){ FILE *no_anal_out = fopen(test_no_anal_out,"w"); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462357, 192462357, "C", mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); mu_assert(estep==0,"Error running estep."); gzclose(snp_out); diff --git a/tests/runtests.sh b/tests/runtests.sh index 596f001..f512669 100644 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -30,6 +30,7 @@ # ########################### +set -e echo "Running unit tests:" for i in tests/*_tests