From 84f2d23f40902d5bd6312d60b1bc86a98c0999aa Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 29 Jan 2019 15:59:22 +0000 Subject: [PATCH 01/29] Making assembly and species required commandline arguments --- CHANGES.md | 4 ++++ src/estep.c | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index fd60af7..91c769b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # CHANGES +## NEXT + +* Making species and assembly required commandline arguments in the estep + ## 1.13.10 ### Behaviour change diff --git a/src/estep.c b/src/estep.c index 63ca8e3..7df3f18 100644 --- a/src/estep.c +++ b/src/estep.c @@ -95,7 +95,9 @@ char *valid_protocols[3] = {"WGS","WXS","RNA"}; void estep_print_usage (int exit_code){ printf ("Usage: caveman estep -i jobindex [-f file] [-m int] [-k float] [-b float] [-p float] [-q float] [-x int] [-y int] [-c float] [-d float] [-a int]\n\n"); - printf("-i --index [int] Job index (e.g. from $LSB_JOBINDEX)\n\n"); + printf("-i --index [int] Job index (e.g. from $LSB_JOBINDEX)\n"); + printf("-v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information.\n"); + printf("-w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information.\n\n"); printf("Optional\n"); printf("-f --config-file [file] Path to the config file produced by setup. [default:'%s']\n",config_file); printf("-m --min-base-qual [int] Minimum base quality for inclusion of a read position [default:%d]\n",min_bq); @@ -111,8 +113,6 @@ void estep_print_usage (int exit_code){ printf("-s --debug Adds an extra output to a debug file. Every base analysed has an output\n"); printf("-g --cov-file [file] File location of the covariate array. [default:'%s']\n",covariate_file); printf("-o --prob-file [file] File location of the prob array. [default:'%s']\n",probs_file); - printf("-v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information.\n"); - printf("-w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information.\n"); printf("-n --normal-copy-number [int] Copy number to use when filling gaps in the normal copy number file [default:%d].\n",normal_copy_number); printf("-t --tumour-copy-number [int] Copy number to use when filling gaps in the tumour copy number file [default:%d].\n",tumour_copy_number); printf("-l --normal-protocol [string] Normal protocol. Ideally this should match -r but not checked (WGS|WXS|RNA) [default:%s].\n",norm_prot); @@ -342,6 +342,16 @@ int estep_setup_options(int argc, char *argv[]){ estep_print_usage(1); } + if(!assembly){ + printf("species-assembly is not set.",norm_prot); + estep_print_usage(1); + } + + if(!species){ + printf("species is not set.",norm_prot); + estep_print_usage(1); + } + set_max_tum_cvg(estep_max_tumour_coverage); return 0; From cc3f5730c973352ddc0b91c4d05a6f45261ab7fe Mon Sep 17 00:00:00 2001 From: David Jones Date: Mon, 11 Nov 2019 20:34:29 +0000 Subject: [PATCH 02/29] Modified means of calculating read pos covariate index using an input file rather than per read position calculation. --- CHANGES.md | 1 + src/alg_bean.c | 67 ++++++++++++++++++++++++++----- src/algos.c | 34 ++++++++++------ src/estep.c | 18 +++------ src/mstep.c | 3 ++ src/split.c | 90 ++++++++++++++++++++++++++++++++++++++++-- tests/alg_bean_tests.c | 40 +++++++++---------- tests/algos_tests.c | 10 +++++ 8 files changed, 205 insertions(+), 58 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index db818ee..7793a9f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,6 +3,7 @@ ## NEXT * Making species and assembly required commandline arguments in the estep +* Change method by which read position index is resolved. Results in time saving. ## 1.13.16 diff --git a/src/alg_bean.c b/src/alg_bean.c index bf3c34f..191999c 100644 --- a/src/alg_bean.c +++ b/src/alg_bean.c @@ -166,11 +166,11 @@ int alg_bean_get_index_for_char_arr(List *list,char *val){ return -1; } -int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len){ - List *lengths = List_create(); - int i=0; - int last_stop = 1; - LIST_FOREACH(list, first, next, cur){ +List *alg_bean_get_position_list_from_read_pos_proportion_arr(List *list,int rd_len){ + List *lengths = List_create(); + int last_stop = 1; + int i=0; + LIST_FOREACH(list, first, next, cur){ float pct = *((float *)cur->value); int lng = (((float)rd_len/(float)100) * pct); alg_bean_intrange *range = malloc(sizeof(alg_bean_intrange)); @@ -182,11 +182,60 @@ int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len){ range->to = (range->from) + lng; last_stop = range->to; List_push(lengths,range); - i++; + i++; } - int result = alg_bean_get_index_for_intrange_arr(lengths,pos); - List_clear_destroy(lengths); - return result; + return lengths; +} + +int alg_bean_get_index_for_read_pos_prop_arr(void *_hash, int pos,int rd_len){ + List *lengths = NULL; + khiter_t k; + khash_t(readlenpos) *h_rd_len = (khash_t(readlenpos)*) _hash; + k = kh_get(readlenpos, h_rd_len, rd_len); + lengths = kh_value(h_rd_len, k); + return alg_bean_get_index_for_intrange_arr(lengths, pos); +} + +int alg_bean_add_read_length_arrs(alg_bean_t *bean, char* list_loc, char* contig){ + FILE *output_rp = NULL; + khash_t(readlenpos) *h_rd_len; + char *dircpy = NULL; + //Create map, key is readlength, List of sections is value. + char *read_len_pos_arr_file = malloc(strlen(contig) + strlen(list_loc) + 6); + check_mem(read_len_pos_arr_file); + //Create filename here through name concatenation. + dircpy = strdup(list_loc); + strcpy(read_len_pos_arr_file,dirname(dircpy)); + strcat(read_len_pos_arr_file,"/readpos."); + strcat(read_len_pos_arr_file,contig); + output_rp = fopen(read_len_pos_arr_file,"r"); + check(output_rp != NULL, "Error opening file %s for write.",read_len_pos_arr_file); + free(read_len_pos_arr_file); + + h_rd_len = kh_init(readlenpos); + char line[500]; + int rd_len_missing = 0; + khiter_t k; + while ( fgets(line,sizeof(line),output_rp) != NULL ){ + //75 1-2;3-38;39-56;57-66;67-76; + int len; + char list_txt [4950]; + int chk = sscanf(line,"%d\t%s",&len,list_txt); + List *lenrange = alg_bean_parse_int_range(list_txt); + k = kh_put(readlenpos, h_rd_len, len, &rd_len_missing); + if(rd_len_missing){ + kh_value(h_rd_len, k) = lenrange; + } + } + fclose(output_rp); + + bean->read_len_pos = h_rd_len; + free(dircpy); + return 0; +error: + if(dircpy) free(dircpy); + if(output_rp) fclose(output_rp); + return -1; } List *alg_bean_parse_str_list(char *txt){ diff --git a/src/algos.c b/src/algos.c index cd988a5..e6d2eab 100644 --- a/src/algos.c +++ b/src/algos.c @@ -150,7 +150,15 @@ void set_max_tum_cvg(int i){ max_tum_cvg = i; } -int algos_mstep_read_position(alg_bean_t *alg,uint64_t ********covs, char *chr_name, uint32_t from, uint32_t to, char *ref_base, int split_size){ +// inline long double expl(long double x){ +// x = (long double)1.0 + x / (long double)1024; +// x *= x; x *= x; x *= x; x *= x; +// x *= x; x *= x; x *= x; x *= x; +// x *= x; x *= x; +// return x; +// } + +int algos_mstep_read_position(alg_bean_t *alg, uint64_t ********covs, char *chr_name, uint32_t from, uint32_t to, char *ref_base, int split_size){ //Fetch all reads for this pos? Or a struct for a single read at that position? List *reads = NULL; char *cbase = NULL; @@ -174,7 +182,7 @@ int algos_mstep_read_position(alg_bean_t *alg,uint64_t ********covs, char *chr_n || (ref_b_up == 'G') || (ref_b_up = 'T')) ){ //int lane_i = alg_bean_get_index_for_str_arr(alg->lane,pos_t->lane); //check(lane_i>=0,"Error calculating lane index."); - int rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->rd_pos,pos_t->rd_pos,pos_t->rd_len); + int rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->read_len_pos,pos_t->rd_pos,pos_t->rd_len); check(rpos_i>=0,"Error calculating read position index."); int mq_i = alg_bean_get_index_for_intrange_arr(alg->map_qual,pos_t->map_qual); check(mq_i>=0,"Error calculating map qual index."); @@ -275,23 +283,26 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double ref_base_prob = *(read->ref_base_probs[ref_base_idx]); long double res = genos->ref_geno_norm_prob + ref_base_prob; genos->ref_geno_norm_prob = res; - + long double norm_var_base_prop; + long double nu_fx; + long double tmp_psi_var_prob_norm; + long double ans; //iterate through from 0 to highest available of genos->hom_count, genos->het_norm_count, int iter=0; for(iter=0;iternorm_max;iter++){ //Hom snps if(iterhom_count){ - long double ans = genos->hom_snp_genotypes[iter]->prob + *(read->ref_base_probs[(genos->hom_snp_genotypes[iter]->norm_geno->var_base_idx)]); + ans = genos->hom_snp_genotypes[iter]->prob + *(read->ref_base_probs[(genos->hom_snp_genotypes[iter]->norm_geno->var_base_idx)]); genos->hom_snp_genotypes[iter]->prob = ans; }//End of iteration through hom snps //Het snps if(iterhet_norm_count){ - long double norm_var_base_prop = genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_prop; - long double nu_fx = (long double)ref_b * norm_var_base_prop; - long double tmp_psi_var_prob_norm = nu_fx / (nu_fx + ((long double)1 - norm_var_base_prop)); - long double ans = genos->het_snp_norm_genotypes[iter]->prob + + norm_var_base_prop = genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_prop; + nu_fx = (long double)ref_b * norm_var_base_prop; + tmp_psi_var_prob_norm = nu_fx / (nu_fx + ((long double)1 - norm_var_base_prop)); + ans = genos->het_snp_norm_genotypes[iter]->prob + ( logl( expl( ref_base_prob + logl((long double)1 - tmp_psi_var_prob_norm) ) @@ -307,6 +318,7 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double ref_base_prob = *(read->ref_base_probs[ref_base_idx]); long double res = genos->ref_geno_tum_prob + ref_base_prob; genos->ref_geno_tum_prob = res; + long double ans; //iterate through from 0 to highest available of genos->somatic_count, genos->hom_count, genos->het_count int iter=0; for(iter=0;itertum_max;iter++){ @@ -318,7 +330,7 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int //Calculate read component probability. long double log_tmp_psi_var_prob = logl(tmp_psi_var_prob); long double log_1_minus_tmp_psi_var_prob = logl((long double)1 - tmp_psi_var_prob); - long double ans = genos->somatic_genotypes[iter]->prob + + ans = genos->somatic_genotypes[iter]->prob + logl( ( expl( (ref_base_prob + log_1_minus_tmp_psi_var_prob) ) @@ -332,7 +344,7 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int //Hom snps if(iterhom_count){ - long double ans = genos->hom_snp_genotypes[iter]->prob + + ans = genos->hom_snp_genotypes[iter]->prob + *(read->ref_base_probs[(genos->hom_snp_genotypes[iter]->tum_geno->var_base_idx)]); genos->hom_snp_genotypes[iter]->prob = ans; }//End of hom snps @@ -541,7 +553,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ int algos_get_read_specific_indices(alg_bean_t *alg, read_pos_t *pos_t, int *rpos_i, int *mq_i, int *bq_i, int *callbase_i){ - *rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->rd_pos,pos_t->rd_pos,pos_t->rd_len); + *rpos_i = alg_bean_get_index_for_read_pos_prop_arr(alg->read_len_pos,pos_t->rd_pos,pos_t->rd_len); check(*rpos_i>=0,"Error calculating read position index."); *mq_i = alg_bean_get_index_for_intrange_arr(alg->map_qual,pos_t->map_qual); check(*mq_i>=0,"Error calculating map qual index."); diff --git a/src/estep.c b/src/estep.c index b5a246f..d80f2ca 100644 --- a/src/estep.c +++ b/src/estep.c @@ -95,9 +95,7 @@ char *valid_protocols[6] = {"WGS","WXS","RNA","RNA-Seq","AMPLICON","TARGETED"}; void estep_print_usage (int exit_code){ printf ("Usage: caveman estep -i jobindex [-f file] [-m int] [-k float] [-b float] [-p float] [-q float] [-x int] [-y int] [-c float] [-d float] [-a int]\n\n"); - printf("-i --index [int] Job index (e.g. from $LSB_JOBINDEX)\n"); - printf("-v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information.\n"); - printf("-w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information.\n\n"); + printf("-i --index [int] Job index (e.g. from $LSB_JOBINDEX)\n\n"); printf("Optional\n"); printf("-f --config-file [file] Path to the config file produced by setup. [default:'%s']\n",config_file); printf("-m --min-base-qual [int] Minimum base quality for inclusion of a read position [default:%d]\n",min_bq); @@ -113,6 +111,8 @@ void estep_print_usage (int exit_code){ printf("-s --debug Adds an extra output to a debug file. Every base analysed has an output\n"); printf("-g --cov-file [file] File location of the covariate array. [default:'%s']\n",covariate_file); printf("-o --prob-file [file] File location of the prob array. [default:'%s']\n",probs_file); + printf("-v --species-assembly [string] Species assembly (eg 37/GRCh37), required if bam header SQ lines do not contain AS and SP information.\n"); + printf("-w --species [string] Species name (eg Human), required if bam header SQ lines do not contain AS and SP information.\n"); printf("-n --normal-copy-number [int] Copy number to use when filling gaps in the normal copy number file [default:%d].\n",normal_copy_number); printf("-t --tumour-copy-number [int] Copy number to use when filling gaps in the tumour copy number file [default:%d].\n",tumour_copy_number); printf("-l --normal-protocol [string] Normal protocol. Ideally this should match -r but not checked (WGS|WXS|RNA|RNA-Seq|AMPLICON|TARGETED) [default:%s].\n",norm_prot); @@ -342,16 +342,6 @@ int estep_setup_options(int argc, char *argv[]){ estep_print_usage(1); } - if(!assembly){ - printf("species-assembly is not set.",norm_prot); - estep_print_usage(1); - } - - if(!species){ - printf("species is not set.",norm_prot); - estep_print_usage(1); - } - set_max_tum_cvg(estep_max_tumour_coverage); return 0; @@ -563,6 +553,8 @@ int estep_main(int argc, char *argv[]){ check(chk_write==0,"Error writing header to dbg file."); } + chk = alg_bean_add_read_length_arrs(alg, list_loc, chr_name); + check(chk==0,"Error reading alg_bean read length arrays from file."); //Iterate through analysis sections //Iterate through sections. diff --git a/src/mstep.c b/src/mstep.c index 3657c8a..7486b68 100644 --- a/src/mstep.c +++ b/src/mstep.c @@ -204,6 +204,9 @@ int mstep_main(int argc, char *argv[]){ //Iterate through each base in the section and create the genome profile. //an array to store the data full of zeroes. + int chk = alg_bean_add_read_length_arrs(alg, list_loc, chr_name); + check(chk==0,"Error reading alg_bean read length arrays from file."); + //Read order (1st/2nd) //Strand +/- //Lane diff --git a/src/split.c b/src/split.c index 58c0dd2..3252cb1 100644 --- a/src/split.c +++ b/src/split.c @@ -44,6 +44,10 @@ #include #include #include +#include "khash.h" + +//New hash to store unique readlengths +KHASH_MAP_INIT_INT(rdlenkhash, uint8_t) static int includeSW = 0; static int includeSingleEnd = 0; @@ -166,7 +170,12 @@ int split_main(int argc, char *argv[]){ bam1_t *norm_read = NULL; bam1_t *tum_read = NULL; seq_region_t **ignore_regs = NULL; + FILE *alg_bean_file = NULL; + alg_bean_t *alg = NULL; + char *read_len_pos_arr_file = NULL; + khash_t(rdlenkhash) *rdlen_h; int ignore_reg_count = 0; + FILE *output_rp = NULL; int is_err = split_setup_options(argc,argv); check(is_err==0,"Error parsing options"); @@ -198,7 +207,7 @@ int split_main(int argc, char *argv[]){ //Open a file to write sections, named according to CHR. char *fname = malloc(strlen(chr_name) + strlen(list_loc) + 3); check_mem(fname); - //Create filename here through name concatenation. + //Create filename here through name concatenation. strcpy(fname,list_loc); strcat(fname,"."); strcat(fname,chr_name); @@ -219,6 +228,10 @@ int split_main(int argc, char *argv[]){ //Check there's not a whole chromosome block. if(!(ignore_reg_count == 1 && ignore_regs[0]->beg == 1 && ignore_regs[0]->end >= chr_length)){ + //Initialise readlength hash + rdlen_h = kh_init(rdlenkhash); + check(rdlen_h != NULL,"Memory allocation error when initialising hash kh_init(rdlenkhash)."); + //No chromosome block, so carry on. uint32_t start = 1; uint32_t stop = chr_length; @@ -271,15 +284,20 @@ int split_main(int argc, char *argv[]){ uint32_t curr_n_pos = 0; uint32_t curr_t_pos = 0; - + khiter_t k; //Have both iterators, now need to iterate through each in sync so we don't get ahead of the stops. while(iter_n_status>=0 || iter_t_status>=0){ //Keep iterating until both iterators are out of reads. - while(curr_n_pos<=curr_t_pos && iter_n_status>=0 && iter_t_status>=0 && rd_count<=max_read_count){ //While the positions aren't equal and tumour has reads left. Normal jumps ahead iter_n_status = sam_itr_next(sf_norm,iter_norm,norm_read); check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); curr_n_pos = norm_read->core.pos; if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ + int read_len = norm_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } rd_count++; } }//End of this iteration through normal reads @@ -289,6 +307,12 @@ int split_main(int argc, char *argv[]){ check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); curr_t_pos = tum_read->core.pos; if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ + int read_len = tum_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } rd_count++; } }//End of this iteration through tumour reads @@ -300,7 +324,13 @@ int split_main(int argc, char *argv[]){ check(iter_t_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_t_status); curr_t_pos = tum_read->core.pos; if(iter_t_status>=0 && bam_access_check_bam_flags(tum_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_t_pos,ignore_regs,ignore_reg_count) == NULL){ - rd_count++; + int read_len = tum_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } + rd_count++; } } }//End of iteration through tumour reads where only tumour reads remain @@ -311,6 +341,12 @@ int split_main(int argc, char *argv[]){ check(iter_n_status>=-1,"Error detected (%d) when trying to iterate through region.",iter_n_status); curr_n_pos = norm_read->core.pos; if(iter_n_status>=0 && bam_access_check_bam_flags(norm_read) == 1 && ignore_reg_access_get_ign_reg_overlap(curr_n_pos,ignore_regs,ignore_reg_count) == NULL){ + int read_len = norm_read->core.l_qseq; + int rd_len_missing; + k = kh_put(rdlenkhash, rdlen_h, read_len, &rd_len_missing); + if(rd_len_missing){ // If the readname key doesn't yet exist + kh_value(rdlen_h, k) = read_len; + } rd_count++; } } @@ -337,7 +373,51 @@ int split_main(int argc, char *argv[]){ }//End of moving through both normal and tumour iterators + //Read in alg_bean + //Read in the alg bean + alg_bean_file = fopen(alg_bean_loc,"r"); + check(alg_bean_file != 0 ,"Error trying to open alg_bean file: %s.",alg_bean_loc); + alg = alg_bean_read_file(alg_bean_file); + check(alg != NULL,"Error reading alg_bean from file."); + check(fclose(alg_bean_file)==0,"Error closing alg bean file."); + + //Iterate through readlen hashkeys and output a format giving all readlength options for readpos boundaries + //This can be read in by mstep and estep + + //Open a file to write sections, named according to CHR. + char *read_len_pos_arr_file = malloc(strlen(chr_name) + strlen(list_loc) + 6); + check_mem(read_len_pos_arr_file); + //Create filename here through name concatenation. + char *dupdir; + dupdir = strdup(list_loc); + strcpy(read_len_pos_arr_file,dirname(dupdir)); + strcat(read_len_pos_arr_file,"/readpos."); + strcat(read_len_pos_arr_file,chr_name); + output_rp = fopen(read_len_pos_arr_file,"w"); + check(output_rp != NULL, "Error opening file %s for write.",read_len_pos_arr_file); + free(read_len_pos_arr_file); + + for (k = kh_begin(rdlen_h); k != kh_end(rdlen_h); ++k){ // traverse + // test if a bucket contains data + if (kh_exist(rdlen_h, k)){ + //Convert alg bean read length proportions to a 'by readlength' set of ranges + int len = kh_value(rdlen_h, k); + List *lengths = alg_bean_get_position_list_from_read_pos_proportion_arr(alg->rd_pos,len); + //Output readlength splits and read length itself to file + fprintf(output_rp,"%d\t",len); + LIST_FOREACH(lengths, first, next, cur){ + alg_bean_intrange *range = (alg_bean_intrange *) cur->value; + fprintf(output_rp,"%d-%d;",range->from,range->to); + } + fprintf(output_rp,"\n"); + List_clear_destroy(lengths); + } + } + fclose(output_rp); + + //No more reads left so we must print the last section. + kh_destroy(rdlenkhash, rdlen_h); split_access_print_section(output,chr_name,sect_start,chr_length); bam_destroy1(norm_read); bam_destroy1(tum_read); @@ -352,6 +432,8 @@ int split_main(int argc, char *argv[]){ return 0; error: + if(rdlen_h) kh_destroy(rdlenkhash, rdlen_h); + if(output_rp) fclose(output_rp); if(norm_read) bam_destroy1(norm_read); if(tum_read) bam_destroy1(tum_read); if(sf_norm) hts_close(sf_norm); diff --git a/tests/alg_bean_tests.c b/tests/alg_bean_tests.c index f58f786..5219129 100644 --- a/tests/alg_bean_tests.c +++ b/tests/alg_bean_tests.c @@ -279,25 +279,25 @@ char *test_alg_bean_get_index_for_char_arr_cram(){ return NULL; } -char *test_alg_bean_get_index_for_read_pos_prop_arr(){ - test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); - //Expect the following results: - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); - return NULL; -} - -char *test_alg_bean_get_index_for_read_pos_prop_arr_cram(){ - test_bean = alg_bean_generate_default_alg_bean(norm_file_cram,tum_file_cram); - //Expect the following results: - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); - mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); - return NULL; -} +// char *test_alg_bean_get_index_for_read_pos_prop_arr(){ +// test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); +// //Expect the following results: +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); +// return NULL; +// } + +// char *test_alg_bean_get_index_for_read_pos_prop_arr_cram(){ +// test_bean = alg_bean_generate_default_alg_bean(norm_file_cram,tum_file_cram); +// //Expect the following results: +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); +// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); +// return NULL; +// } char *test_alg_bean_generate_default_alg_bean(){ alg_bean_t *test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); @@ -348,8 +348,6 @@ char *all_tests() { mu_run_test(test_alg_bean_get_index_for_intrange_arr_cram); mu_run_test(test_alg_bean_get_index_for_char_arr); mu_run_test(test_alg_bean_get_index_for_char_arr_cram); - mu_run_test(test_alg_bean_get_index_for_read_pos_prop_arr); - mu_run_test(test_alg_bean_get_index_for_read_pos_prop_arr_cram); mu_run_test(test_alg_bean_hard_copy_char_list); mu_run_test(test_alg_bean_hard_copy_char_list_cram); return NULL; diff --git a/tests/algos_tests.c b/tests/algos_tests.c index dae9a72..e4c9163 100644 --- a/tests/algos_tests.c +++ b/tests/algos_tests.c @@ -62,6 +62,7 @@ char *test_snp_out = "testData/snp.vcf.gz"; char *test_mut_out = "testData/mut.vcf.gz"; char *test_dbg_out = "testData/dbg.vcf.gz"; char *test_no_anal_out = "testData/no_analysis.bed"; +char split_loc_test[] = "testData/splitList"; char *test_algos_mstep_read_position(){ //algos_mstep_read_position(alg_bean_t *alg,uint64_t ********covs, char *chr_name, int from, int to, char *ref_base); @@ -70,9 +71,11 @@ char *test_algos_mstep_read_position(){ uint64_t ********arr = covs_access_generate_cov_array_given_dimensions(List_count(alg->read_order),List_count(alg->strand),List_count(alg->lane), List_count(alg->rd_pos),List_count(alg->map_qual),List_count(alg->base_qual),List_count(alg->ref_base),List_count(alg->call_base)); mu_assert(arr != NULL,"Array not properly created.\n"); + char *chr = "22"; int from = 17619559; int to = 17619559; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "A"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); //check we have a count of 2 in the expected place... @@ -93,6 +96,7 @@ char *test_algos_mstep_read_position_cram(){ char *chr = "22"; int from = 17619559; int to = 17619559; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "A"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); //check we have a count of 2 in the expected place... @@ -114,6 +118,7 @@ char *test_algos_mstep_read_position_two(){ char *chr = "2"; int from = 38000243; int to = 38000243; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "G"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); @@ -153,6 +158,7 @@ char *test_algos_mstep_read_position_two_cram(){ char *chr = "2"; int from = 38000243; int to = 38000243; + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, chr); char *ref_base = "G"; algos_mstep_read_position(alg, arr, chr, from, to, ref_base, 50000); @@ -200,6 +206,7 @@ int estep_no_analysis(){ check(no_anal_out!=NULL,"Error opening file."); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462250, 192462300, ref_base, mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); output_flush_no_analysis("1"); check(estep==0,"Error running estep."); @@ -252,6 +259,7 @@ int estep_no_analysis_cram(){ check(no_anal_out!=NULL,"Error opening file."); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462250, 192462300, ref_base, mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); output_flush_no_analysis("1"); check(estep==0,"Error running estep."); @@ -304,6 +312,7 @@ char *test_algos_estep_read_position(){ FILE *no_anal_out = fopen(test_no_anal_out,"w"); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462357, 192462357, "C", mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); mu_assert(estep==0,"Error running estep."); gzclose(snp_out); @@ -350,6 +359,7 @@ char *test_algos_estep_read_position_cram(){ FILE *no_anal_out = fopen(test_no_anal_out,"w"); output_set_no_analysis_file(no_anal_out); output_set_no_analysis_section_list(List_create()); + int chk = alg_bean_add_read_length_arrs(alg, split_loc_test, "1"); int estep = algos_estep_read_position(alg, probs,"1", 192462357, 192462357, "C", mut_wt_cn, mut_mt_cn, snp_out, mut_out, dbg_out, 50000); mu_assert(estep==0,"Error running estep."); gzclose(snp_out); From eea76d249ed690c98c040ead64733cd4eb15d66d Mon Sep 17 00:00:00 2001 From: David Jones Date: Wed, 13 Nov 2019 14:56:16 +0000 Subject: [PATCH 03/29] first changes to use Math.h library from linasm --- src/algos.c | 65 ++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/src/algos.c b/src/algos.c index e6d2eab..acc8bc5 100644 --- a/src/algos.c +++ b/src/algos.c @@ -41,6 +41,7 @@ #include #include #include +#include "Math.h" #include #include #include @@ -150,14 +151,6 @@ void set_max_tum_cvg(int i){ max_tum_cvg = i; } -// inline long double expl(long double x){ -// x = (long double)1.0 + x / (long double)1024; -// x *= x; x *= x; x *= x; x *= x; -// x *= x; x *= x; x *= x; x *= x; -// x *= x; x *= x; -// return x; -// } - int algos_mstep_read_position(alg_bean_t *alg, uint64_t ********covs, char *chr_name, uint32_t from, uint32_t to, char *ref_base, int split_size){ //Fetch all reads for this pos? Or a struct for a single read at that position? List *reads = NULL; @@ -304,10 +297,16 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int tmp_psi_var_prob_norm = nu_fx / (nu_fx + ((long double)1 - norm_var_base_prop)); ans = genos->het_snp_norm_genotypes[iter]->prob + ( - logl( - expl( ref_base_prob + logl((long double)1 - tmp_psi_var_prob_norm) ) + Math_Log_flt64( + Math_Expm1_flt64( (flt64_t)ref_base_prob + Math_Log_flt64( + (flt64_t)((long double)1 - tmp_psi_var_prob_norm) + ) + ) + - expl( *(read->ref_base_probs[(genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_idx)]) + logl(tmp_psi_var_prob_norm) ) + Math_Expm1_flt64( + (flt64_t)( + *(read->ref_base_probs[(genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_norm) + ) ) ); genos->het_snp_norm_genotypes[iter]->prob = ans; @@ -328,14 +327,14 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double nu_xf = (long double)ref_b * genos->somatic_genotypes[iter]->tum_geno->var_base_prop; long double tmp_psi_var_prob = nu_xf * ((long double)1 - base_norm_contam) / (nu_xf + ((long double)1 - genos->somatic_genotypes[iter]->tum_geno->var_base_prop)); //Calculate read component probability. - long double log_tmp_psi_var_prob = logl(tmp_psi_var_prob); - long double log_1_minus_tmp_psi_var_prob = logl((long double)1 - tmp_psi_var_prob); + long double log_tmp_psi_var_prob = Math_Log_flt64((flt64_t)tmp_psi_var_prob); + long double log_1_minus_tmp_psi_var_prob = Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob)); ans = genos->somatic_genotypes[iter]->prob + - logl( + Math_Log_flt64( ( - expl( (ref_base_prob + log_1_minus_tmp_psi_var_prob) ) + Math_Expm1_flt64( (flt64_t)((ref_base_prob + log_1_minus_tmp_psi_var_prob)) ) + - expl( (*(read->ref_base_probs[genos->somatic_genotypes[iter]->tum_geno->var_base_idx]) + log_tmp_psi_var_prob) ) + Math_Expm1_flt64( (flt64_t)((*(read->ref_base_probs[genos->somatic_genotypes[iter]->tum_geno->var_base_idx]) + log_tmp_psi_var_prob)) ) ) ); genos->somatic_genotypes[iter]->prob = ans; @@ -362,10 +361,10 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int ( (long double)base_norm_contam * tmp_psi_var_prob_norm); long double tum_res = genos->het_snp_genotypes[iter]->prob + ( - logl( - expl( ref_base_prob + logl((long double)1 - tmp_psi_var_prob_tum) ) + Math_Log_flt64( + Math_Expm1_flt64( (flt64_t)(ref_base_prob + Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob_tum)) )) + - expl( *(read->ref_base_probs[(genos->het_snp_genotypes[iter]->norm_geno->var_base_idx)]) + logl(tmp_psi_var_prob_tum) ) + Math_Expm1_flt64( (flt64_t)(*(read->ref_base_probs[(genos->het_snp_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_tum) )) ) ); @@ -383,28 +382,28 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double calculateLogSumExpNormFactor(estep_position_t *pos,long double norm_factor_max){ long double sum_tot = 0.0; //Reference - sum_tot += expl(pos->genos->ref_genotype->prob - norm_factor_max); + sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor_max)); //Somatic int somatic_i = 0; for(somatic_i = 0;somatic_igenos->somatic_count;somatic_i++){ - sum_tot += expl(pos->genos->somatic_genotypes[somatic_i]->prob - norm_factor_max); + sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->somatic_genotypes[somatic_i]->prob - norm_factor_max)); } //het int het_snp_it=0; for(het_snp_it=0;het_snp_itgenos->het_count;het_snp_it++){ - sum_tot += expl(pos->genos->het_snp_genotypes[het_snp_it]->prob - norm_factor_max); + sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->het_snp_genotypes[het_snp_it]->prob - norm_factor_max)); } //Hom int hom_snp_i=0; for(hom_snp_i=0;hom_snp_igenos->hom_count;hom_snp_i++){ - sum_tot += expl(pos->genos->hom_snp_genotypes[hom_snp_i]->prob - norm_factor_max); + sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->hom_snp_genotypes[hom_snp_i]->prob - norm_factor_max)); } //Return final logged value. - long double norm_factor = norm_factor_max + logl(sum_tot); + long double norm_factor = norm_factor_max + Math_Log_flt64((flt64_t)sum_tot); return norm_factor; } @@ -423,7 +422,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double for(iter=0;itergenos->total_max;iter++){ //somatics if(itergenos->somatic_count){ - pos->genos->somatic_genotypes[iter]->prob = expl(pos->genos->somatic_genotypes[iter]->prob - norm_factor); + pos->genos->somatic_genotypes[iter]->prob = Math_Expm1_flt64((flt64_t)(pos->genos->somatic_genotypes[iter]->prob - norm_factor)); somatic_sum += pos->genos->somatic_genotypes[iter]->prob; if(top_geno==NULL){ @@ -438,7 +437,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //het snps if(itergenos->het_count){ - pos->genos->het_snp_genotypes[iter]->prob = expl(pos->genos->het_snp_genotypes[iter]->prob - norm_factor); + pos->genos->het_snp_genotypes[iter]->prob = Math_Expm1_flt64((flt64_t)(pos->genos->het_snp_genotypes[iter]->prob - norm_factor)); snp_sum += pos->genos->het_snp_genotypes[iter]->prob; if(pos->genos->het_snp_genotypes[iter]->prob > top_geno->prob){ @@ -451,7 +450,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //hom snps if(itergenos->hom_count){ - pos->genos->hom_snp_genotypes[iter]->prob = expl(pos->genos->hom_snp_genotypes[iter]->prob - norm_factor); + pos->genos->hom_snp_genotypes[iter]->prob = Math_Expm1_flt64((flt64_t)(pos->genos->hom_snp_genotypes[iter]->prob - norm_factor)); snp_sum += pos->genos->hom_snp_genotypes[iter]->prob; if(pos->genos->hom_snp_genotypes[iter]->prob > top_geno->prob){ @@ -465,7 +464,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //normalise the reference genotype probability. - pos->genos->ref_genotype->prob = expl(pos->genos->ref_genotype->prob - norm_factor); + pos->genos->ref_genotype->prob = Math_Expm1_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor)); if(pos->genos->ref_genotype->prob > top_geno->prob){ sec_geno = top_geno; @@ -487,7 +486,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double long double calculate_ref_prob(const long double norm, const long double tum){ long double ref_prob; - ref_prob = logl(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) + ref_prob = Math_Log_flt64(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) + norm + tum; return ref_prob; } @@ -498,7 +497,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ pos->genos->ref_genotype->prob = 0.0; long double ref_prob = calculate_ref_prob(pos->genos->ref_geno_norm_prob,pos->genos->ref_geno_tum_prob); pos->genos->ref_genotype->prob = ref_prob; - //long double ref_prob = logl(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) + //long double ref_prob = Math_Log_flt64(((long double)1 - (long double)prior_s_prob) - (long double)prior_m_prob) // + pos->genos->ref_geno_norm_prob + pos->genos->ref_geno_tum_prob; if(pos->genos->ref_genotype->prob > norm_factor_max){ norm_factor_max = pos->genos->ref_genotype->prob; @@ -509,7 +508,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ for(iter=0; itergenos->total_max; iter++){ //Somatics if(itergenos->somatic_count){ - long double res = logl((long double)prior_m_prob) - logl((long double)pos->genos->somatic_count) + long double res = Math_Log_flt64((long double)prior_m_prob) - Math_Log_flt64((long double)pos->genos->somatic_count) + pos->genos->ref_geno_norm_prob + pos->genos->somatic_genotypes[iter]->prob; pos->genos->somatic_genotypes[iter]->prob = res; @@ -525,7 +524,7 @@ void algos_run_per_position_estep_maths(estep_position_t *pos){ for(het_norm_i=0;het_norm_igenos->het_norm_count;het_norm_i++){ //find the correct het normal genotype if(genotype_equals(pos->genos->het_snp_norm_genotypes[het_norm_i]->norm_geno,pos->genos->het_snp_genotypes[iter]->norm_geno)==1){ - long double res = logl((long double)prior_s_prob) - logl((long double)(pos->genos->het_count + pos->genos->hom_count)) + long double res = Math_Log_flt64((long double)prior_s_prob) - Math_Log_flt64((long double)(pos->genos->het_count + pos->genos->hom_count)) + pos->genos->het_snp_norm_genotypes[het_norm_i]->prob + pos->genos->het_snp_genotypes[iter]->prob; pos->genos->het_snp_genotypes[iter]->prob = res; @@ -692,7 +691,7 @@ int algos_estep_read_position(alg_bean_t *alg,long double ********prob_arr, char //build up genotype stores for genotype_store_t *genos pos->genos = genotype_generate_genotype_list_for_cn_and_ref_base(pos->norm_cn, pos->tum_cn, ref_b); //Setup initial hom snp probabilities. - long double init_hom_snp = logl((long double)prior_s_prob) - logl((long double) (pos->genos->het_count + pos->genos->hom_count)); + long double init_hom_snp = Math_Log_flt64((long double)prior_s_prob) - Math_Log_flt64((long double) (pos->genos->het_count + pos->genos->hom_count)); int hom_snp_i = 0; for(hom_snp_i=0;hom_snp_igenos->hom_count;hom_snp_i++){ pos->genos->hom_snp_genotypes[hom_snp_i]->prob = init_hom_snp; From 8b0e6d8a6a721970398f3d7314c04b60c8b3fc35 Mon Sep 17 00:00:00 2001 From: David Jones Date: Wed, 13 Nov 2019 15:02:05 +0000 Subject: [PATCH 04/29] updated method desclarations in src/alg_bean.h --- src/alg_bean.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/alg_bean.h b/src/alg_bean.h index 74e811b..4718d47 100644 --- a/src/alg_bean.h +++ b/src/alg_bean.h @@ -35,6 +35,10 @@ #include #include +#include "khash.h" + +//New hash to store unique readlengths +KHASH_MAP_INIT_INT(readlenpos, List *) typedef struct alg_bean_intrange{ int from; @@ -58,6 +62,7 @@ typedef struct alg_bean_t{ int call_base_size; List *strand; int strand_size; + khash_t(readlenpos) *read_len_pos; } alg_bean_t; int alg_bean_create_default_file(FILE *file, char *norm, char *tum); @@ -72,7 +77,10 @@ List *alg_bean_hard_copy_char_list(List *new_list, List *old); int alg_bean_get_index_for_str_arr(List *list,char *value); int alg_bean_get_index_for_intrange_arr(List *list,int value); int alg_bean_get_index_for_char_arr(List *list,char *value); -int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len); +// int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len); +int alg_bean_get_index_for_read_pos_prop_arr(void *_hash, int pos,int rd_len); +int alg_bean_add_read_length_arrs(alg_bean_t *bean, char* list_loc, char* contig); +List *alg_bean_get_position_list_from_read_pos_proportion_arr(List *list,int rd_len); #define CEIL(a, b) (((a) / (b)) + (((a) % (b)) > 0 ? 1 : 0)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) From f50265ce6dbb2a7ae9ec8374bf5d7a9cc146732d Mon Sep 17 00:00:00 2001 From: David Jones Date: Thu, 14 Nov 2019 15:54:44 +0000 Subject: [PATCH 05/29] test files for readpos changes --- testData/readpos.1 | 2 ++ testData/readpos.2 | 2 ++ testData/readpos.22 | 1 + 3 files changed, 5 insertions(+) create mode 100644 testData/readpos.1 create mode 100644 testData/readpos.2 create mode 100644 testData/readpos.22 diff --git a/testData/readpos.1 b/testData/readpos.1 new file mode 100644 index 0000000..a73dec4 --- /dev/null +++ b/testData/readpos.1 @@ -0,0 +1,2 @@ +75 1-2;3-38;39-56;57-66;67-76; +100 1-3;4-51;52-76;77-90;91-100; diff --git a/testData/readpos.2 b/testData/readpos.2 new file mode 100644 index 0000000..a73dec4 --- /dev/null +++ b/testData/readpos.2 @@ -0,0 +1,2 @@ +75 1-2;3-38;39-56;57-66;67-76; +100 1-3;4-51;52-76;77-90;91-100; diff --git a/testData/readpos.22 b/testData/readpos.22 new file mode 100644 index 0000000..ce8e7b1 --- /dev/null +++ b/testData/readpos.22 @@ -0,0 +1 @@ +75 1-2;3-38;39-56;57-66;67-76; From 26d0fa60d50e21d400d8e753fe63a4e16b13485f Mon Sep 17 00:00:00 2001 From: David Jones Date: Thu, 14 Nov 2019 15:55:16 +0000 Subject: [PATCH 06/29] Add library -llinasm for faster exp and log functions --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6aedb75..98735e8 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,7 @@ LFLAGS?=-L$(HTSTMP) # define any libraries to link into executable: # if I want to link in libraries (libx.so or libx.a) I use the -llibname # option, something like (this will link in libmylib.so and libm.so: -LIBS =-lhts -lpthread -lz -lm -ldl +LIBS =-lhts -lpthread -lz -lm -ldl -llinasm # define the C source files SRCS = ./src/file_tests.c ./src/List.c ./src/List_algos.c ./src/bam_access.c ./src/config_file_access.c ./src/fai_access.c ./src/ignore_reg_access.c ./src/alg_bean.c ./src/split_access.c ./src/covs_access.c ./src/cn_access.c ./src/genotype.c ./src/algos.c ./src/output.c ./src/setup.c ./src/split.c ./src/mstep.c ./src/merge.c ./src/estep.c From ad00ea13e8a19b36274d1bc641bb5b8d654909d3 Mon Sep 17 00:00:00 2001 From: David Jones Date: Thu, 14 Nov 2019 16:24:07 +0000 Subject: [PATCH 07/29] Add Math.h from linasm library for rapid log and exp finctions --- src/algos.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/algos.h b/src/algos.h index 1d3d6bf..ea44bcf 100644 --- a/src/algos.h +++ b/src/algos.h @@ -38,6 +38,7 @@ #include #include #include "zlib.h" +#include "Math.h" typedef struct estep_position_t{ uint8_t norm_cn; From 8bce4800a103f09a2a16d57665c5e11a73f9c6aa Mon Sep 17 00:00:00 2001 From: David Jones Date: Thu, 14 Nov 2019 16:26:40 +0000 Subject: [PATCH 08/29] Convert logl and expl functions to use faster linasm versions. Unfortunately this may lead to underflow since only a double rather than long double type is available --- src/algos.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/src/algos.c b/src/algos.c index acc8bc5..7675173 100644 --- a/src/algos.c +++ b/src/algos.c @@ -295,19 +295,20 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int norm_var_base_prop = genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_prop; nu_fx = (long double)ref_b * norm_var_base_prop; tmp_psi_var_prob_norm = nu_fx / (nu_fx + ((long double)1 - norm_var_base_prop)); - ans = genos->het_snp_norm_genotypes[iter]->prob + + ans = genos->het_snp_norm_genotypes[iter]->prob + (long double) ( Math_Log_flt64( - Math_Expm1_flt64( (flt64_t)ref_base_prob + Math_Log_flt64( + Math_Exp_flt64( (flt64_t)(ref_base_prob + Math_Log_flt64( (flt64_t)((long double)1 - tmp_psi_var_prob_norm) ) - ) + )) + - Math_Expm1_flt64( + Math_Exp_flt64( (flt64_t)( *(read->ref_base_probs[(genos->het_snp_norm_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_norm) ) - ) + ) + ) ); genos->het_snp_norm_genotypes[iter]->prob = ans; }//End of iteration through het snps @@ -327,14 +328,14 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double nu_xf = (long double)ref_b * genos->somatic_genotypes[iter]->tum_geno->var_base_prop; long double tmp_psi_var_prob = nu_xf * ((long double)1 - base_norm_contam) / (nu_xf + ((long double)1 - genos->somatic_genotypes[iter]->tum_geno->var_base_prop)); //Calculate read component probability. - long double log_tmp_psi_var_prob = Math_Log_flt64((flt64_t)tmp_psi_var_prob); - long double log_1_minus_tmp_psi_var_prob = Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob)); - ans = genos->somatic_genotypes[iter]->prob + + long double log_tmp_psi_var_prob = (long double) Math_Log_flt64((flt64_t)tmp_psi_var_prob); + long double log_1_minus_tmp_psi_var_prob = (long double) Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob)); + ans = genos->somatic_genotypes[iter]->prob + (long double) Math_Log_flt64( ( - Math_Expm1_flt64( (flt64_t)((ref_base_prob + log_1_minus_tmp_psi_var_prob)) ) + Math_Exp_flt64( (flt64_t)((ref_base_prob + log_1_minus_tmp_psi_var_prob)) ) + - Math_Expm1_flt64( (flt64_t)((*(read->ref_base_probs[genos->somatic_genotypes[iter]->tum_geno->var_base_idx]) + log_tmp_psi_var_prob)) ) + Math_Exp_flt64( (flt64_t)((*(read->ref_base_probs[genos->somatic_genotypes[iter]->tum_geno->var_base_idx]) + log_tmp_psi_var_prob)) ) ) ); genos->somatic_genotypes[iter]->prob = ans; @@ -362,9 +363,9 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int long double tum_res = genos->het_snp_genotypes[iter]->prob + ( Math_Log_flt64( - Math_Expm1_flt64( (flt64_t)(ref_base_prob + Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob_tum)) )) + Math_Exp_flt64( (flt64_t)(ref_base_prob + Math_Log_flt64((flt64_t)((long double)1 - tmp_psi_var_prob_tum)) )) + - Math_Expm1_flt64( (flt64_t)(*(read->ref_base_probs[(genos->het_snp_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_tum) )) + Math_Exp_flt64( (flt64_t)(*(read->ref_base_probs[(genos->het_snp_genotypes[iter]->norm_geno->var_base_idx)]) + Math_Log_flt64((flt64_t)tmp_psi_var_prob_tum) )) ) ); @@ -380,30 +381,30 @@ int algos_run_per_read_estep_maths(genotype_store_t *genos,read_pos_t *read, int } long double calculateLogSumExpNormFactor(estep_position_t *pos,long double norm_factor_max){ - long double sum_tot = 0.0; + flt64_t sum_tot = 0.0; //Reference - sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor_max)); + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor_max)); //Somatic int somatic_i = 0; for(somatic_i = 0;somatic_igenos->somatic_count;somatic_i++){ - sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->somatic_genotypes[somatic_i]->prob - norm_factor_max)); + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->somatic_genotypes[somatic_i]->prob - norm_factor_max)); } //het int het_snp_it=0; for(het_snp_it=0;het_snp_itgenos->het_count;het_snp_it++){ - sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->het_snp_genotypes[het_snp_it]->prob - norm_factor_max)); + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->het_snp_genotypes[het_snp_it]->prob - norm_factor_max)); } //Hom int hom_snp_i=0; for(hom_snp_i=0;hom_snp_igenos->hom_count;hom_snp_i++){ - sum_tot += Math_Expm1_flt64((flt64_t)(pos->genos->hom_snp_genotypes[hom_snp_i]->prob - norm_factor_max)); - } + sum_tot += Math_Exp_flt64((flt64_t)(pos->genos->hom_snp_genotypes[hom_snp_i]->prob - norm_factor_max)); + } //Return final logged value. - long double norm_factor = norm_factor_max + Math_Log_flt64((flt64_t)sum_tot); + long double norm_factor = norm_factor_max + Math_Log_flt64(sum_tot); return norm_factor; } @@ -414,7 +415,6 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //We find the normalisation factor via the log-sum-exp method to avoid underflow, then we can calculate the final probabilities long double norm_factor = calculateLogSumExpNormFactor(pos,norm_factor_max); - //printf("NORM FACTOR: %5.1Le\n",norm_factor); //Iterate through every genotype and normalise it. Adding it to a list. int iter = 0; long double somatic_sum = 0.0; @@ -422,8 +422,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double for(iter=0;itergenos->total_max;iter++){ //somatics if(itergenos->somatic_count){ - pos->genos->somatic_genotypes[iter]->prob = Math_Expm1_flt64((flt64_t)(pos->genos->somatic_genotypes[iter]->prob - norm_factor)); - + pos->genos->somatic_genotypes[iter]->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->somatic_genotypes[iter]->prob - norm_factor))); somatic_sum += pos->genos->somatic_genotypes[iter]->prob; if(top_geno==NULL){ top_geno = pos->genos->somatic_genotypes[iter]; @@ -437,7 +436,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //het snps if(itergenos->het_count){ - pos->genos->het_snp_genotypes[iter]->prob = Math_Expm1_flt64((flt64_t)(pos->genos->het_snp_genotypes[iter]->prob - norm_factor)); + pos->genos->het_snp_genotypes[iter]->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->het_snp_genotypes[iter]->prob - norm_factor))); snp_sum += pos->genos->het_snp_genotypes[iter]->prob; if(pos->genos->het_snp_genotypes[iter]->prob > top_geno->prob){ @@ -450,7 +449,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //hom snps if(itergenos->hom_count){ - pos->genos->hom_snp_genotypes[iter]->prob = Math_Expm1_flt64((flt64_t)(pos->genos->hom_snp_genotypes[iter]->prob - norm_factor)); + pos->genos->hom_snp_genotypes[iter]->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->hom_snp_genotypes[iter]->prob - norm_factor))); snp_sum += pos->genos->hom_snp_genotypes[iter]->prob; if(pos->genos->hom_snp_genotypes[iter]->prob > top_geno->prob){ @@ -464,7 +463,7 @@ void finalise_probabilities_and_find_top_prob(estep_position_t *pos,long double //normalise the reference genotype probability. - pos->genos->ref_genotype->prob = Math_Expm1_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor)); + pos->genos->ref_genotype->prob = (long double) (Math_Exp_flt64((flt64_t)(pos->genos->ref_genotype->prob - norm_factor))); if(pos->genos->ref_genotype->prob > top_geno->prob){ sec_geno = top_geno; From 5fc5396211d0dab7c44a96ff7effbbbdd0374c3e Mon Sep 17 00:00:00 2001 From: David Jones Date: Fri, 15 Nov 2019 12:38:56 +0000 Subject: [PATCH 09/29] Update alg_bean.h to contain correct method names --- src/alg_bean.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/alg_bean.h b/src/alg_bean.h index 74e811b..4718d47 100644 --- a/src/alg_bean.h +++ b/src/alg_bean.h @@ -35,6 +35,10 @@ #include #include +#include "khash.h" + +//New hash to store unique readlengths +KHASH_MAP_INIT_INT(readlenpos, List *) typedef struct alg_bean_intrange{ int from; @@ -58,6 +62,7 @@ typedef struct alg_bean_t{ int call_base_size; List *strand; int strand_size; + khash_t(readlenpos) *read_len_pos; } alg_bean_t; int alg_bean_create_default_file(FILE *file, char *norm, char *tum); @@ -72,7 +77,10 @@ List *alg_bean_hard_copy_char_list(List *new_list, List *old); int alg_bean_get_index_for_str_arr(List *list,char *value); int alg_bean_get_index_for_intrange_arr(List *list,int value); int alg_bean_get_index_for_char_arr(List *list,char *value); -int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len); +// int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len); +int alg_bean_get_index_for_read_pos_prop_arr(void *_hash, int pos,int rd_len); +int alg_bean_add_read_length_arrs(alg_bean_t *bean, char* list_loc, char* contig); +List *alg_bean_get_position_list_from_read_pos_proportion_arr(List *list,int rd_len); #define CEIL(a, b) (((a) / (b)) + (((a) % (b)) > 0 ? 1 : 0)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) From 98e5d1e0f309cd985a7726bf8d0c3c515aaeda5f Mon Sep 17 00:00:00 2001 From: David Jones Date: Fri, 15 Nov 2019 12:43:25 +0000 Subject: [PATCH 10/29] Ensure dirname is available --- src/alg_bean.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/alg_bean.c b/src/alg_bean.c index 191999c..7dba49a 100644 --- a/src/alg_bean.c +++ b/src/alg_bean.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include From 5905da6ee5e92e1bfa7f7de3cf7631ba601cb106 Mon Sep 17 00:00:00 2001 From: David Jones Date: Fri, 15 Nov 2019 12:59:12 +0000 Subject: [PATCH 11/29] Add missing test files --- testData/readpos.1 | 2 ++ testData/readpos.2 | 2 ++ testData/readpos.22 | 1 + 3 files changed, 5 insertions(+) create mode 100644 testData/readpos.1 create mode 100644 testData/readpos.2 create mode 100644 testData/readpos.22 diff --git a/testData/readpos.1 b/testData/readpos.1 new file mode 100644 index 0000000..a73dec4 --- /dev/null +++ b/testData/readpos.1 @@ -0,0 +1,2 @@ +75 1-2;3-38;39-56;57-66;67-76; +100 1-3;4-51;52-76;77-90;91-100; diff --git a/testData/readpos.2 b/testData/readpos.2 new file mode 100644 index 0000000..a73dec4 --- /dev/null +++ b/testData/readpos.2 @@ -0,0 +1,2 @@ +75 1-2;3-38;39-56;57-66;67-76; +100 1-3;4-51;52-76;77-90;91-100; diff --git a/testData/readpos.22 b/testData/readpos.22 new file mode 100644 index 0000000..ce8e7b1 --- /dev/null +++ b/testData/readpos.22 @@ -0,0 +1 @@ +75 1-2;3-38;39-56;57-66;67-76; From e05e976907763fa3fca0adf3704f5536baf95ea3 Mon Sep 17 00:00:00 2001 From: David Jones Date: Fri, 15 Nov 2019 13:04:47 +0000 Subject: [PATCH 12/29] Add libgen so dirname is declared --- src/alg_bean.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/alg_bean.c b/src/alg_bean.c index 191999c..7dba49a 100644 --- a/src/alg_bean.c +++ b/src/alg_bean.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include From c35bddaf5e63a3d42b6199738aa3555167d4aad4 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 14:24:59 +0000 Subject: [PATCH 13/29] CHANGES and README update --- CHANGES.md | 2 ++ README.md | 1 + 2 files changed, 3 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 7793a9f..8d627ef 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,8 @@ * Making species and assembly required commandline arguments in the estep * Change method by which read position index is resolved. Results in time saving. +* linasm is now required [linasm](http://linasm.sourceforge.net/index.php) +* New log and exp functions from [linasm](http://linasm.sourceforge.net/index.php) ## 1.13.16 diff --git a/README.md b/README.md index 32c4430..f96d427 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ See INSTALL.TXT * Reference.fasta and index * A one based bed style format file of regions to ignore during analysis (see specified format). * [zlib](https://zlib.net/) >= 1.2.3.5 +* [linasm](http://linasm.sourceforge.net/index.php) ## Optional inputs (will result in more accurate calls) From 37b484f570e07cd4e45df0585b01ca54baa7c843 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 14:50:28 +0000 Subject: [PATCH 14/29] REmove no longer used test cases --- tests/alg_bean_tests.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/alg_bean_tests.c b/tests/alg_bean_tests.c index 5219129..20417fb 100644 --- a/tests/alg_bean_tests.c +++ b/tests/alg_bean_tests.c @@ -279,26 +279,6 @@ char *test_alg_bean_get_index_for_char_arr_cram(){ return NULL; } -// char *test_alg_bean_get_index_for_read_pos_prop_arr(){ -// test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); -// //Expect the following results: -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); -// return NULL; -// } - -// char *test_alg_bean_get_index_for_read_pos_prop_arr_cram(){ -// test_bean = alg_bean_generate_default_alg_bean(norm_file_cram,tum_file_cram); -// //Expect the following results: -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,2,75)==0,"Wrong index returned for RD pos check 1.\n"); -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,3,75)==1,"Wrong index returned for RD pos check 2.\n"); -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,4,150)==0,"Wrong index returned for RD pos check 3.\n"); -// mu_assert(alg_bean_get_index_for_read_pos_prop_arr(test_bean->rd_pos,5,150)==1,"Wrong index returned for RD pos check 4.\n"); -// return NULL; -// } - char *test_alg_bean_generate_default_alg_bean(){ alg_bean_t *test_bean = alg_bean_generate_default_alg_bean(norm_file,tum_file); mu_assert(test_bean != NULL, "Couldn't retrieve default alg_bean.\n"); From 1d97ee1f7755f25842f133df5c2e28223766eba5 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:10:40 +0000 Subject: [PATCH 15/29] Update setup to have another \-e --- setup.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.sh b/setup.sh index 3b83e4a..8f0f6c8 100755 --- a/setup.sh +++ b/setup.sh @@ -125,6 +125,8 @@ fi export HTSLIB="$SETUP_DIR/htslib" +set -e + echo -n "Building CaVEMan ..." if [ -e "$SETUP_DIR/caveman.success" ]; then echo -n " previously installed ..."; From a8ebf435d0b4247c58a7cfd9e8b2a794e745f29d Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:10:54 +0000 Subject: [PATCH 16/29] Install linasm in travis --- .travis.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d7e5769..b70bed4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,5 +11,13 @@ install: true compiler: - gcc +before_script: + - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O /tmp/linasm.tar.gz + - mkdir /tmp/linasm + - tar --strip-components 1 -C /tmp/linasm -xzf /tmp/linasm.tar.gz + - cd /tmp/linasm + - make + - make install + script: - - ./setup.sh ~/wtsi-opt \ No newline at end of file + - ./setup.sh ~/wtsi-opt From 379eb259fd58c2cf2bdf2ee8db5e47f4c88760eb Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:47:02 +0000 Subject: [PATCH 17/29] Install linasm in travis - add sudo --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b70bed4..386805d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ before_script: - tar --strip-components 1 -C /tmp/linasm -xzf /tmp/linasm.tar.gz - cd /tmp/linasm - make - - make install + - sudo make install script: - ./setup.sh ~/wtsi-opt From a35cc635e4b703d5975f95654b8758e7343cc3fc Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:49:31 +0000 Subject: [PATCH 18/29] Install linasm in travis - ensure we return to woring directory --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 386805d..42b4d72 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,12 +12,14 @@ compiler: - gcc before_script: + - WORKDIR=`pwd` - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O /tmp/linasm.tar.gz - mkdir /tmp/linasm - tar --strip-components 1 -C /tmp/linasm -xzf /tmp/linasm.tar.gz - cd /tmp/linasm - make - sudo make install + - cd $WORKDIR script: - ./setup.sh ~/wtsi-opt From 64b8b935ccf860067db3741a90d55d041cba47fd Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:52:41 +0000 Subject: [PATCH 19/29] Add -e to runtests.dh --- tests/runtests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/runtests.sh b/tests/runtests.sh index 596f001..f512669 100644 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -30,6 +30,7 @@ # ########################### +set -e echo "Running unit tests:" for i in tests/*_tests From b37763397d6ff93339beb950f17a07c68c598328 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:57:17 +0000 Subject: [PATCH 20/29] Install linasm in travis - ensure we return to woring directory --- .travis.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 42b4d72..87ab938 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,15 +11,10 @@ install: true compiler: - gcc -before_script: - - WORKDIR=`pwd` +install: - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O /tmp/linasm.tar.gz - - mkdir /tmp/linasm - - tar --strip-components 1 -C /tmp/linasm -xzf /tmp/linasm.tar.gz - - cd /tmp/linasm - - make - - sudo make install - - cd $WORKDIR + - tar -xzf /tmp/linasm.tar.gz + - pushd /tmp/linasm-1.13\(stable\)/ && make && sudo make install && popd script: - ./setup.sh ~/wtsi-opt From 0ae74b34e45b72aa79d77fc61a5527dbc66fe54e Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 15:59:30 +0000 Subject: [PATCH 21/29] Install linasm in travis - ensure we return to woring directory --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 87ab938..0ad8a95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ compiler: install: - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O /tmp/linasm.tar.gz - tar -xzf /tmp/linasm.tar.gz + - ls /tmp - pushd /tmp/linasm-1.13\(stable\)/ && make && sudo make install && popd script: From 83a8dc110a1301402540cd18c52d3dfdcadc826a Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:01:18 +0000 Subject: [PATCH 22/29] Install linasm in travis - ensure we return to woring directory --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0ad8a95..2f8b618 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,9 +12,9 @@ compiler: - gcc install: - - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O /tmp/linasm.tar.gz - - tar -xzf /tmp/linasm.tar.gz - - ls /tmp + - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O linasm.tar.gz + - tar -xzf linasm.tar.gz + - ls ./ - pushd /tmp/linasm-1.13\(stable\)/ && make && sudo make install && popd script: From 8be4952c773c56443e293920e3ad72b6e562ea69 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:02:39 +0000 Subject: [PATCH 23/29] Install linasm in travis - ensure we return to woring directory --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2f8b618..eaa3029 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ install: - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O linasm.tar.gz - tar -xzf linasm.tar.gz - ls ./ - - pushd /tmp/linasm-1.13\(stable\)/ && make && sudo make install && popd + - pushd linasm-1.13\(stable\) && make && sudo make install && popd script: - ./setup.sh ~/wtsi-opt From 3515896bec27fecfb5bdc893c04c43d7aa46a676 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:16:56 +0000 Subject: [PATCH 24/29] Install linasm in travis - ensure we return to woring directory --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index eaa3029..319c9b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,11 +11,13 @@ install: true compiler: - gcc -install: +before_script: - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O linasm.tar.gz - tar -xzf linasm.tar.gz - - ls ./ - pushd linasm-1.13\(stable\) && make && sudo make install && popd + - export PATH=/usr/bin:$PATH + - export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH + - export LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH script: - ./setup.sh ~/wtsi-opt From 978ce0bfe2eb2ea7720f47ebd3ba292c7d122b2e Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:19:04 +0000 Subject: [PATCH 25/29] Install linasm in travis - set install dir to /usr --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 319c9b5..4370c93 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ compiler: before_script: - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O linasm.tar.gz - tar -xzf linasm.tar.gz - - pushd linasm-1.13\(stable\) && make && sudo make install && popd + - pushd linasm-1.13\(stable\) && make && sudo make install --prefix=/usr && popd - export PATH=/usr/bin:$PATH - export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH - export LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH From d0b94b2abb347f229bb46c524d84b1ce9d6e4cb5 Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:20:59 +0000 Subject: [PATCH 26/29] Install linasm in travis - set install prefix to /usr --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4370c93..7a6cbb0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ compiler: before_script: - wget https://sourceforge.net/projects/linasm/files/linasm-1.13%28stable%29.tar.gz/download -O linasm.tar.gz - tar -xzf linasm.tar.gz - - pushd linasm-1.13\(stable\) && make && sudo make install --prefix=/usr && popd + - pushd linasm-1.13\(stable\) && make && sudo make install prefix=/usr && popd - export PATH=/usr/bin:$PATH - export C_INCLUDE_PATH=/usr/include/:$C_INCLUDE_PATH - export LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH From aadd40eabdb2fd3fd9e1809f5848cac6d9bcf78c Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:30:31 +0000 Subject: [PATCH 27/29] REmove int alg_bean_get_index_for_read_pos_prop_arr method definition as no longer used --- src/alg_bean.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/alg_bean.h b/src/alg_bean.h index 4718d47..cd10aa5 100644 --- a/src/alg_bean.h +++ b/src/alg_bean.h @@ -77,7 +77,6 @@ List *alg_bean_hard_copy_char_list(List *new_list, List *old); int alg_bean_get_index_for_str_arr(List *list,char *value); int alg_bean_get_index_for_intrange_arr(List *list,int value); int alg_bean_get_index_for_char_arr(List *list,char *value); -// int alg_bean_get_index_for_read_pos_prop_arr(List *list,int pos,int rd_len); int alg_bean_get_index_for_read_pos_prop_arr(void *_hash, int pos,int rd_len); int alg_bean_add_read_length_arrs(alg_bean_t *bean, char* list_loc, char* contig); List *alg_bean_get_position_list_from_read_pos_proportion_arr(List *list,int rd_len); From 0b4cc5b162f49fd902cae26c3b78638e09fed6ec Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:45:07 +0000 Subject: [PATCH 28/29] Update version in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 98735e8..2577b22 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CAVEMAN_VERSION=1.13.16 +CAVEMAN_VERSION=1.14.0 TEST_REF?="" #Compiler From 38f5be47920b920dba0c266d8290ff9510fe2e5e Mon Sep 17 00:00:00 2001 From: David Jones Date: Tue, 19 Nov 2019 16:45:37 +0000 Subject: [PATCH 29/29] Update CHANGES, INSTALL and README --- CHANGES.md | 2 +- INSTALL.TXT | 4 +++- README.md | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 8d627ef..da4b82f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,6 @@ # CHANGES -## NEXT +## 1.14.0 * Making species and assembly required commandline arguments in the estep * Change method by which read position index is resolved. Results in time saving. diff --git a/INSTALL.TXT b/INSTALL.TXT index 346f291..6640d6c 100644 --- a/INSTALL.TXT +++ b/INSTALL.TXT @@ -3,7 +3,9 @@ System Requirements caveman is designed to run in a compute farm/clustre environment. caveman depends on: - htslib 1.3 + htslib >=1.3 + zlib >=1.2.3.5 + linasm >=1.13 Compilation diff --git a/README.md b/README.md index f96d427..b202571 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ See INSTALL.TXT * Reference.fasta and index * A one based bed style format file of regions to ignore during analysis (see specified format). * [zlib](https://zlib.net/) >= 1.2.3.5 -* [linasm](http://linasm.sourceforge.net/index.php) +* [linasm](http://linasm.sourceforge.net/index.php) >= 1.13 ## Optional inputs (will result in more accurate calls)