diff --git a/DB.c b/DB.c index 69060d0..9b5c85f 100644 --- a/DB.c +++ b/DB.c @@ -41,6 +41,24 @@ char Ebuffer[1000]; #endif +int Count_Args(char *var) +{ int cnt, lev; + char *s; + + cnt = 1; + lev = 0; + for (s = var; *s != '\0'; s++) + if (*s == ',') + { if (lev == 0) + cnt += 1; + } + else if (*s == '(') + lev += 1; + else if (*s == ')') + lev -= 1; + return (cnt); +} + void *Malloc(int64 size, char *mesg) { void *p; @@ -382,7 +400,7 @@ void Number_Arrow(char *s) ********************************************************************************************/ -// Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has +// Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: @@ -390,8 +408,8 @@ void Number_Arrow(char *s) // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap -int Open_DB(char* path, HITS_DB *db) -{ HITS_DB dbcopy; +int Open_DB(char* path, DAZZ_DB *db) +{ DAZZ_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; @@ -437,7 +455,7 @@ int Open_DB(char* path, HITS_DB *db) if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; - if (fread(db,sizeof(HITS_DB),1,index) != 1) + if (fread(db,sizeof(DAZZ_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } @@ -505,28 +523,28 @@ int Open_DB(char* path, HITS_DB *db) nreads = ulast-ufirst; if (part <= 0) - { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); + { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; - if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) + if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads-1); goto error2; } } else - { HITS_READ *reads; + { DAZZ_READ *reads; int i, r, maxlen; int64 totlen; - reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); + reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; - fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); - if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) + fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR); + if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads-1); goto error2; @@ -580,13 +598,13 @@ int Open_DB(char* path, HITS_DB *db) // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. -void Trim_DB(HITS_DB *db) +void Trim_DB(DAZZ_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; - HITS_TRACK *record; - HITS_READ *reads; + DAZZ_TRACK *record; + DAZZ_READ *reads; if (db->trimmed) return; @@ -603,7 +621,7 @@ void Trim_DB(HITS_DB *db) for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) - { uint16 *table = ((HITS_QV *) record)->table; + { uint16 *table = ((DAZZ_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) @@ -675,7 +693,7 @@ void Trim_DB(HITS_DB *db) db->trimmed = 1; if (j < nreads) - { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); + { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL); db->reads += 1; } } @@ -683,12 +701,12 @@ void Trim_DB(HITS_DB *db) // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. -static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) +static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; - HITS_READ read; + DAZZ_READ read; FILE *indx; if (!db->trimmed) return (0); @@ -703,7 +721,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); - fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET); + fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else @@ -725,7 +743,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) - { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); @@ -744,7 +762,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) - { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); @@ -764,7 +782,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) - { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); @@ -789,8 +807,8 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). -void Close_DB(HITS_DB *db) -{ HITS_TRACK *t, *p; +void Close_DB(DAZZ_DB *db) +{ DAZZ_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); @@ -813,19 +831,19 @@ void Close_DB(HITS_DB *db) // Return the size in bytes of the memory occupied by a given DB -int64 sizeof_DB(HITS_DB *db) +int64 sizeof_DB(DAZZ_DB *db) { int64 s; - HITS_TRACK *t; + DAZZ_TRACK *t; - s = sizeof(HITS_DB) - + sizeof(HITS_READ)*(db->nreads+2) + s = sizeof(DAZZ_DB) + + sizeof(DAZZ_READ)*(db->nreads+2) + strlen(db->path)+1 + (db->totlen+db->nreads+4); t = db->tracks; if (t != NULL && strcmp(t->name,".@qvs") == 0) - { HITS_QV *q = (HITS_QV *) t; - s += sizeof(HITS_QV) + { DAZZ_QV *q = (DAZZ_QV *) t; + s += sizeof(DAZZ_QV) + sizeof(uint16) * db->nreads + q->ncodes * sizeof(QVcoding) + 6; @@ -833,7 +851,7 @@ int64 sizeof_DB(HITS_DB *db) } for (; t != NULL; t = t->next) - { s += sizeof(HITS_TRACK) + { s += sizeof(DAZZ_TRACK) + strlen(t->name)+1 + t->size * (db->nreads+1); if (t->data != NULL) @@ -854,14 +872,14 @@ int64 sizeof_DB(HITS_DB *db) * ********************************************************************************************/ -HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" -HITS_QV *Active_QV; // Becomes invalid after closing +DAZZ_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" +DAZZ_QV *Active_QV; // Becomes invalid after closing -int Load_QVs(HITS_DB *db) +int Load_QVs(DAZZ_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; - HITS_QV *qvtrk; + DAZZ_QV *qvtrk; QVcoding *coding, *nx; int ncodes = 0; @@ -956,7 +974,7 @@ int Load_QVs(HITS_DB *db) goto error; } - // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD + // Carefully get the first coding scheme (its offset is most likely in a DAZZ_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". @@ -974,10 +992,10 @@ int Load_QVs(HITS_DB *db) i = n-fbeg; if (first < pfirst) - { HITS_READ read; + { DAZZ_READ read; - fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); - if (fread(&read,sizeof(HITS_READ),1,indx) != 1) + fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET); + if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; @@ -1050,17 +1068,17 @@ int Load_QVs(HITS_DB *db) } } - // Allocate and fill in the HITS_QV record and add it to the front of the + // Allocate and fill in the DAZZ_QV record and add it to the front of the // track list - qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); + qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; - db->tracks = (HITS_TRACK *) qvtrk; + db->tracks = (DAZZ_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; @@ -1091,16 +1109,16 @@ int Load_QVs(HITS_DB *db) // Close the QV stream, free the QV pseudo track and all associated memory -void Close_QVs(HITS_DB *db) -{ HITS_TRACK *track; - HITS_QV *qvtrk; +void Close_QVs(DAZZ_DB *db) +{ DAZZ_TRACK *track; + DAZZ_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) - { qvtrk = (HITS_QV *) track; + { qvtrk = (DAZZ_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); @@ -1125,7 +1143,7 @@ void Close_QVs(HITS_DB *db) // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track -int Check_Track(HITS_DB *db, char *track, int *kind) +int Check_Track(DAZZ_DB *db, char *track, int *kind) { FILE *afile; int tracklen, size, ispart; int ureads, treads; @@ -1181,10 +1199,10 @@ int Check_Track(HITS_DB *db, char *track, int *kind) // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer -// to the newly created HITS_TRACK record. If the track does not exist or cannot be +// to the newly created DAZZ_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. -HITS_TRACK *Load_Track(HITS_DB *db, char *track) +DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track) { FILE *afile, *dfile; int tracklen, size; int nreads, ispart; @@ -1192,7 +1210,7 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) void *anno; void *data; char *name; - HITS_TRACK *record; + DAZZ_TRACK *record; if (track[0] == '.') { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); @@ -1340,7 +1358,7 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) fclose(afile); - record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record"); + record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record"); if (record == NULL) goto error; record->name = Strdup(track,"Allocating Track Name"); @@ -1379,8 +1397,161 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) EXIT (NULL); } -void Close_Track(HITS_DB *db, char *track) -{ HITS_TRACK *record, *prev; +// Assumming file pointer for afile is correctly positioned at the start of a extra item, +// and aname is the name of the .anno file, decode the value present and places it in +// extra if extra->nelem == 0, otherwise reduce the value just read into extra according +// according the to the directive given by 'accum'. Leave the read poinrt at the next +// extra or end-of-file. +// Returns: +// 1 if at the end of file, +// 0 if item was read and folded correctly, +// -1 if there was a system IO or allocation error (if interactive), and +// -2 if the new value could not be reduced into the currenct value of extra (interactive) + +int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra) +{ int vtype, nelem, accum, slen; + char *name; + void *value; + +#define EREAD(v,s,n,file,ret) \ + { if (fread(v,s,n,file) != (size_t) n) \ + { if (ferror(file)) \ + fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + else if (ret) \ + return (1); \ + else \ + fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,aname); \ + EXIT(-1); \ + } \ + } + + EREAD(&vtype,sizeof(int),1,afile,1) + EREAD(&nelem,sizeof(int),1,afile,0) + EREAD(&accum,sizeof(int),1,afile,0) + EREAD(&slen,sizeof(int),1,afile,0) + + if (extra == NULL) + { if (fseeko(afile,slen+8*nelem,SEEK_CUR) < 0) + { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); + EXIT(-1); + } + return (0); + } + + name = (char *) Malloc(slen+1,"Allocating extra name"); + value = Malloc(8*nelem,"Allocating extra value"); + if (name == NULL || value == NULL) + EXIT(-1); + + EREAD(name,1,slen,afile,0); + EREAD(value,8,nelem,afile,0); + name[slen] = '\0'; + + if (extra->nelem == 0) + { extra->vtype = vtype; + extra->nelem = nelem; + extra->accum = accum; + extra->name = name; + extra->value = value; + return (0); + } + + if (vtype != extra->vtype) + { fprintf(stderr,"%s: Type of extra %s does not agree with previous .anno block files\n", + Prog_Name,name); + goto error; + } + if (nelem != extra->nelem) + { fprintf(stderr,"%s: Length of extra %s does not agree with previous .anno block files\n", + Prog_Name,name); + goto error; + } + if (accum != extra->accum) + { fprintf(stderr,"%s: Reduction indicator of extra %s does not agree with",Prog_Name,name); + fprintf(stderr," previos .anno block files\n"); + goto error; + } + if (strcmp(name,extra->name) != 0) + { fprintf(stderr,"%s: Expecting extra %s in .anno block file, not %s\n", + Prog_Name,extra->name,name); + goto error; + } + + if (vtype == DB_INT) + { int64 *ival = (int64 *) value; + int64 *eval = (int64 *) (extra->value); + int j; + + if (accum == DB_EXACT) + { for (j = 0; j < nelem; j++) + if (eval[j] != ival[j]) + { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); + fprintf(stderr," with previous .anno block files\n"); + goto error; + } + } + else + { for (j = 0; j < nelem; j++) + eval[j] += ival[j]; + } + } + + else + { double *ival = (double *) value; + double *eval = (double *) (extra->value); + int j; + + if (accum == DB_EXACT) + { for (j = 0; j < nelem; j++) + if (eval[j] != ival[j]) + { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); + fprintf(stderr," with previous .anoo block files\n"); + goto error; + } + } + else + { for (j = 0; j < nelem; j++) + eval[j] += ival[j]; + } + } + + free(value); + free(name); + return (0); + +error: + free(value); + free(name); + EXIT(1); +} + +// Write extra record to end of file afile and advance write pointer +// If interactive, then return non-zero on error, if bash, then print +// and halt if an error + +int Write_Extra(FILE *afile, DAZZ_EXTRA *extra) +{ int slen; + +#define EWRITE(v,s,n,file) \ + { if (fwrite(v,s,n,file) != (size_t) n) \ + { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + EXIT(1); \ + } \ + } + + EWRITE(&(extra->vtype),sizeof(int),1,afile) + FWRITE(&(extra->nelem),sizeof(int),1,afile) + FWRITE(&(extra->accum),sizeof(int),1,afile) + slen = strlen(extra->name); + FWRITE(&slen,sizeof(int),1,afile) + FWRITE(extra->name,1,slen,afile) + FWRITE(extra->value,8,extra->nelem,afile) + + return (0); +} + +void Close_Track(DAZZ_DB *db, char *track) +{ DAZZ_TRACK *record, *prev; prev = NULL; for (record = db->tracks; record != NULL; record = record->next) @@ -1410,7 +1581,7 @@ void Close_Track(HITS_DB *db, char *track) // Allocate and return a buffer big enough for the largest read in 'db', leaving room // for an initial delimiter character -char *New_Read_Buffer(HITS_DB *db) +char *New_Read_Buffer(DAZZ_DB *db) { char *read; read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); @@ -1425,11 +1596,11 @@ char *New_Read_Buffer(HITS_DB *db) // // **NB**, the byte before read will be set to a delimiter character! -int Load_Read(HITS_DB *db, int i, char *read, int ascii) +int Load_Read(DAZZ_DB *db, int i, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; - HITS_READ *r = db->reads; + DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); @@ -1472,14 +1643,14 @@ int Load_Read(HITS_DB *db, int i, char *read, int ascii) // and as a numeric string otherwise. // -HITS_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" +DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" FILE *Arrow_File = NULL; // Becomes invalid after closing -int Load_Arrow(HITS_DB *db, int i, char *read, int ascii) +int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii) { FILE *arrow; int64 off; int len, clen; - HITS_READ *r = db->reads; + DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); @@ -1519,12 +1690,12 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii) return (0); } -char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) +char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; int bbeg, bend; - HITS_READ *r = db->reads; + DAZZ_READ *r = db->reads; if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); @@ -1578,7 +1749,7 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) // Allocate and return a buffer of 5 vectors big enough for the largest read in 'db' -char **New_QV_Buffer(HITS_DB *db) +char **New_QV_Buffer(DAZZ_DB *db) { char **entry; char *qvs; int i; @@ -1595,8 +1766,8 @@ char **New_QV_Buffer(HITS_DB *db) // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. -int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) -{ HITS_READ *reads; +int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii) +{ DAZZ_READ *reads; FILE *quiva; int rlen; @@ -1605,7 +1776,7 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } - Active_QV = (HITS_QV *) db->tracks; + Active_QV = (DAZZ_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) @@ -1655,10 +1826,10 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). -int Read_All_Sequences(HITS_DB *db, int ascii) +int Read_All_Sequences(DAZZ_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; - HITS_READ *reads = db->reads; + DAZZ_READ *reads = db->reads; void (*translate)(char *s); char *seq; @@ -1713,6 +1884,16 @@ int Read_All_Sequences(HITS_DB *db, int ascii) return (0); } +// For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all +// those of the form "prefix/[.]root.part" and call actor with the complete path to each file +// pointed at by path, and the suffix of the path by extension. The . proceeds the root +// name if the defined constant HIDE_FILES is set. Always the first call is with the +// path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for +// "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and +// so this routine gives one a way to know all the tracks associated with a given DB. +// -1 is returned if the path could not be found, and 1 is returned if an error (reported +// to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. + int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; @@ -1750,19 +1931,9 @@ int List_DB_Files(char *path, void actor(char *path, char *extension)) { isdam = 1; break; } - if (strcasecmp(name,Catenate("","",root,".db")) == 0) - { strncpy(root,name,rlen); - break; - } - if (strcasecmp(name,Catenate("","",root,".dam")) == 0) - { strncpy(root,name,rlen); - isdam = 1; - break; - } } if (dp == NULL) - { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); - status = -1; + { status = -1; closedir(dirp); goto error; } diff --git a/DB.h b/DB.h index dc281de..ff41e9f 100644 --- a/DB.h +++ b/DB.h @@ -12,9 +12,9 @@ * ********************************************************************************************/ -#ifndef _HITS_DB +#ifndef _DAZZ_DB -#define _HITS_DB +#define _DAZZ_DB #include @@ -74,11 +74,6 @@ extern char Ebuffer[]; #endif -#define SYSTEM_ERROR \ - { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ - exit (2); \ - } - #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ @@ -125,6 +120,108 @@ extern char Ebuffer[]; exit (1); \ } + +/******************************************************************************************* + * + * GUARDED BATCH IO MACROS + * + ********************************************************************************************/ + + // Utilitieis + +int Count_Args(char *arg); + +#define SYSTEM_READ_ERROR \ + { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + exit (2); \ + } + +#define SYSTEM_WRITE_ERROR \ + { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \ + exit (2); \ + } + +#define SYSTEM_CLOSE_ERROR \ + { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \ + exit (2); \ + } + + // Output + +#define FWRITE(v,s,n,file) \ + { if (fwrite(v,s,n,file) != (size_t) n) \ + SYSTEM_WRITE_ERROR \ + } + +#define FPRINTF(file,...) \ + { if (fprintf(file,__VA_ARGS__) < 0) \ + SYSTEM_WRITE_ERROR \ + } + +#define PRINTF(...) \ + { if (printf(__VA_ARGS__) < 0) \ + SYSTEM_WRITE_ERROR \ + } + +#define FPUTS(x,file) \ + { if (fputs(x,file) == EOF) \ + SYSTEM_WRITE_ERROR \ + } + + // Close + +#define FCLOSE(file) \ + { if (fclose(file) != 0) \ + SYSTEM_CLOSE_ERROR \ + } + + // Input + +#define FREAD(v,s,n,file) \ + { if (fread(v,s,n,file) != (size_t) n) \ + { if (ferror(file)) \ + SYSTEM_READ_ERROR \ + else \ + { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ + exit (1); \ + } \ + } \ + } + +#define FSCANF(file,...) \ + { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \ + { if (ferror(file)) \ + SYSTEM_READ_ERROR \ + else \ + { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ + exit (1); \ + } \ + } \ + } + +#define FGETS(v,n,file) \ + { if (fgets(v,n,file) == NULL) \ + { if (ferror(file)) \ + SYSTEM_READ_ERROR \ + else \ + { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ + exit (1); \ + } \ + } \ + } + +#define FSEEKO(file,p,d) \ + { if (fseeko(file,p,d) < 0) \ + SYSTEM_READ_ERROR \ + } + +#define FTELLO(file) \ + ( { int x = ftello(file); \ + if (x < 0) \ + SYSTEM_READ_ERROR \ + ; x; \ + } ) + /******************************************************************************************* * * UTILITIES @@ -193,7 +290,7 @@ typedef struct // Offset (in bytes) of scaffold header string in '.hdr' file (DAM) // 4 compressed shorts containing snr info if an arrow DB. int flags; // QV of read + flags above (DB only) - } HITS_READ; + } DAZZ_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. @@ -208,9 +305,31 @@ typedef struct _track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL - } HITS_TRACK; + } DAZZ_TRACK; + +// The tailing part of a .anno track file can contain meta-information produced by the +// command that produced the track. For example, the coverage, or good/bad parameters +// for trimming, or even say a histogram of QV values. Each item is an array of 'nelem' +// 64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that +// describes it, and an indicator as to whether the values should be equal accross all +// block tracks, or summed accross all block tracks (by Catrack). 'value' points at the +// array of values + +#define DB_INT 0 +#define DB_REAL 1 -// The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track" +#define DB_EXACT 0 +#define DB_SUM 1 + +typedef struct + { int vtype; // INT64 or FLOAST64 + int nelem; // >= 1 + int accum; // EXACT, SUM + char *name; + void *value; + } DAZZ_EXTRA; + +// The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . (this is enforced), this pseudo-track is never confused // with a normal track. @@ -223,11 +342,11 @@ typedef struct uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file - } HITS_QV; + } DAZZ_QV; // The DB record holds all information about the current state of an active DB including an -// array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which -// is always a HITS_QV pseudo-track (if the QVs have been loaded). +// array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which +// is always a DAZZ_QV pseudo-track (if the QVs have been loaded). typedef struct { int ureads; // Total number of reads in untrimmed DB @@ -257,9 +376,9 @@ typedef struct int loaded; // Are reads loaded in memory? void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. - HITS_READ *reads; // Array [-1..nreads] of HITS_READ - HITS_TRACK *tracks; // Linked list of loaded tracks - } HITS_DB; + DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ + DAZZ_TRACK *tracks; // Linked list of loaded tracks + } DAZZ_DB; /******************************************************************************************* @@ -294,7 +413,7 @@ typedef struct // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry - // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has + // Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: @@ -302,34 +421,34 @@ typedef struct // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap -int Open_DB(char *path, HITS_DB *db); +int Open_DB(char *path, DAZZ_DB *db); // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. -void Trim_DB(HITS_DB *db); +void Trim_DB(DAZZ_DB *db); // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). -void Close_DB(HITS_DB *db); +void Close_DB(DAZZ_DB *db); // Return the size in bytes of the given DB -int64 sizeof_DB(HITS_DB *db); +int64 sizeof_DB(DAZZ_DB *db); // If QV pseudo track is not already in db's track list, then load it and set it up. // The database must not have been trimmed yet. -1 is returned if a .qvs file is not // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE // is defined. Otherwise a 0 is returned. -int Load_QVs(HITS_DB *db); +int Load_QVs(DAZZ_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. -void Close_QVs(HITS_DB *db); +void Close_QVs(DAZZ_DB *db); // Look up the file and header in the file of the indicated track. Return: // 1: Track is for trimmed DB @@ -344,28 +463,47 @@ void Close_QVs(HITS_DB *db); #define CUSTOM_TRACK 0 #define MASK_TRACK 1 -int Check_Track(HITS_DB *db, char *track, int *kind); +int Check_Track(DAZZ_DB *db, char *track, int *kind); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer - // to the newly created HITS_TRACK record. If the track does not exist or cannot be + // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise // the routine prints an error message to stderr and exits if an error occurs, and returns // with NULL only if the track does not exist. -HITS_TRACK *Load_Track(HITS_DB *db, char *track); +DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track); + + // Assumming file pointer for afile is correctly positioned at the start of a extra item, + // and aname is the name of the .anno file, decode the value present and places it in + // extra if extra->nelem == 0, otherwise reduce the value just read into extra according + // according the to the directive given by 'accum'. Leave the read poinrt at the next + // extra or end-of-file. + // Returns: + // 1 if at the end of file, + // 0 if item was read and folded correctly, + // -1 if there was a system IO or allocation error (if interactive), and + // -2 if the new value could not be reduced into the currenct value of extra (interactive) + +int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra); + +// Write extra record to end of file afile and advance write pointer +// If interactive, then return non-zero on error, if bash, then print +// and halt if an error + +int Write_Extra(FILE *afile, DAZZ_EXTRA *extra); // If track is on the db's track list, then it is removed and all storage associated with it // is freed. -void Close_Track(HITS_DB *db, char *track); +void Close_Track(DAZZ_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte // are needed by the alignment algorithms. If cannot allocate memory then return NULL // if INTERACTIVE is defined, or print error to stderr and exit otherwise. -char *New_Read_Buffer(HITS_DB *db); +char *New_Read_Buffer(DAZZ_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) @@ -373,12 +511,12 @@ char *New_Read_Buffer(HITS_DB *db); // for traversals in either direction. A non-zero value is returned if an error occured // and INTERACTIVE is defined. -int Load_Read(HITS_DB *db, int i, char *read, int ascii); +int Load_Read(DAZZ_DB *db, int i, char *read, int ascii); // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence, // and there is only a choice between numeric (0) or ascii (1); -int Load_Arrow(HITS_DB *db, int i, char *read, int ascii); +int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii @@ -387,7 +525,7 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii); // the string holding the substring so it has a delimeter for traversals in either direction. // A NULL pointer is returned if an error occured and INTERACTIVE is defined. -char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); +char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, @@ -399,13 +537,13 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); #define SUB_QV 3 // The substitution QVs #define MRG_QV 4 // The merge QVs -char **New_QV_Buffer(HITS_DB *db); +char **New_QV_Buffer(DAZZ_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters // are converted to a numeric or upper/lower case ascii string as per ascii. Return with // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. -int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); +int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the @@ -415,7 +553,7 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); // Return with a zero, except when an error occurs and INTERACTIVE is defined in which // case return wtih 1. -int Read_All_Sequences(HITS_DB *db, int ascii); +int Read_All_Sequences(DAZZ_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file @@ -429,4 +567,4 @@ int Read_All_Sequences(HITS_DB *db, int ascii); int List_DB_Files(char *path, void actor(char *path, char *extension)); -#endif // _HITS_DB +#endif // _DAZZ_DB diff --git a/HPC.REPmask.c b/HPC.REPmask.c index d5c67d7..d9d32b3 100644 --- a/HPC.REPmask.c +++ b/HPC.REPmask.c @@ -215,12 +215,12 @@ int main(int argc, char *argv[]) } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) - SYSTEM_ERROR + SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) - SYSTEM_ERROR + SYSTEM_READ_ERROR } useblock = 1; diff --git a/HPC.TANmask.c b/HPC.TANmask.c index b91dfa2..abed031 100644 --- a/HPC.TANmask.c +++ b/HPC.TANmask.c @@ -152,12 +152,12 @@ int main(int argc, char *argv[]) } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) - SYSTEM_ERROR + SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) - SYSTEM_ERROR + SYSTEM_READ_ERROR } useblock = 1; diff --git a/REPmask.c b/REPmask.c index 5ffb89d..99e8ce8 100644 --- a/REPmask.c +++ b/REPmask.c @@ -41,7 +41,7 @@ static char *Usage = "[-v] [-m .. static int VERBOSE; static int MIN_COVER; -static HITS_DB _DB, *DB = &_DB; // Data base +static DAZZ_DB _DB, *DB = &_DB; // Data base static int DB_PART; // Input is an Overlap block static int DB_FIRST; // for reads DB_FIRST to DB_LAST-1 @@ -349,11 +349,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra else TBYTES = sizeof(uint16); - if (novl <= 0) - return (0); - - Read_Overlap(input,ovls); - if (trace) + if (Read_Overlap(input,ovls) != 0) + ovls[0].aread = INT32_MAX; + else if (trace) { if (ovls[0].path.tlen > pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); @@ -427,6 +425,12 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra ACTION(j,ovls,n); } + if (ovls[n].aread < INT32_MAX) + { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", + Prog_Name,DB_PART); + exit (1); + } + return (max); } @@ -541,19 +545,19 @@ int main(int argc, char *argv[]) if (dbfile == NULL) exit (1); if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) - SYSTEM_ERROR + SYSTEM_READ_ERROR if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) - SYSTEM_ERROR + SYSTEM_READ_ERROR if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) - SYSTEM_ERROR + SYSTEM_READ_ERROR for (i = 1; i <= part; i++) if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2) - SYSTEM_ERROR + SYSTEM_READ_ERROR if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2) - SYSTEM_ERROR + SYSTEM_READ_ERROR fclose(dbfile); DB_PART = part; *p = '\0'; diff --git a/TANmask.c b/TANmask.c index 0aeec0e..501d994 100644 --- a/TANmask.c +++ b/TANmask.c @@ -36,7 +36,7 @@ static char *Usage = "[-v] [-m] [-l] pmax) { pmax = 1.2*(ovls[0].path.tlen)+10000; paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer"); @@ -266,6 +264,12 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra ACTION(j,ovls,n); } + if (ovls[n].aread < INT32_MAX) + { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n", + Prog_Name,DB_PART); + exit (1); + } + return (max); } @@ -371,19 +375,19 @@ int main(int argc, char *argv[]) if (dbfile == NULL) exit (1); if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + SYSTEM_READ_ERROR for (i = 0; i < nfiles; i++) if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) - SYSTEM_ERROR + SYSTEM_READ_ERROR if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) - SYSTEM_ERROR + SYSTEM_READ_ERROR if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) - SYSTEM_ERROR + SYSTEM_READ_ERROR for (i = 1; i <= part; i++) if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2) - SYSTEM_ERROR + SYSTEM_READ_ERROR if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2) - SYSTEM_ERROR + SYSTEM_READ_ERROR fclose(dbfile); DB_PART = part; *p = '\0'; diff --git a/align.c b/align.c index d4fdfe8..e408eb9 100644 --- a/align.c +++ b/align.c @@ -161,6 +161,7 @@ static double Bias_Factor[10] = { .690, .690, .690, .690, .780, typedef struct { double ave_corr; int trace_space; + int reach; float freq[4]; int ave_path; int16 *score; @@ -196,7 +197,7 @@ static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms /* Create an alignment specification record including path tip tables & values */ -Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) +Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach) { _Align_Spec *spec; Table_Bits parms; double match; @@ -208,6 +209,7 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) spec->ave_corr = ave_corr; spec->trace_space = trace_space; + spec->reach = reach; spec->freq[0] = freq[0]; spec->freq[1] = freq[1]; spec->freq[2] = freq[2]; @@ -257,6 +259,9 @@ int Trace_Spacing(Align_Spec *espec) float *Base_Frequencies(Align_Spec *espec) { return (((_Align_Spec *) espec)->freq); } +int Overlap_If_Possible(Align_Spec *espec) +{ return (((_Align_Spec *) espec)->reach); } + /****************************************************************************************\ * * @@ -343,6 +348,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; + int REACH = spec->reach; int16 *SCORE = spec->score; int16 *TABLE = spec->table; @@ -874,7 +880,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P int a, b, k, h; int d, e; - if (morem >= 0) + if (morem >= 0 && REACH) { trimx = morea-morey; trimy = morey; trimd = mored; @@ -1004,6 +1010,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; + int REACH = spec->reach; int16 *SCORE = spec->score; int16 *TABLE = spec->table; @@ -1527,7 +1534,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P int a, b, k, h; int d, e; - if (morem >= 0) + if (morem >= 0 && REACH) { trimx = morea-morey; trimy = morey; trimd = mored; @@ -4194,7 +4201,10 @@ static int ToA[4] = { 'a', 'c', 'g', 't' }; #endif -static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) +static char *TP_Align = + "Bad alignment between trace points (Compute_Trace), source DB likely incorrect"; + +static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; @@ -4226,17 +4236,21 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) hgh = 0; } - posl = -INT32_MAX; - posh = INT32_MAX; + posl = -dmax; + posh = dmax; if (wave->Aabs == wave->Babs) { if (B == A) - { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); + { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); EXIT(-1); } else if (B < A) - posl = (B-A)+1; + { if ((B-A)+1 > posl) + posl = (B-A)+1; + } else - posh = (B-A)-1; + { if ((B-A)-1 < posh) + posh = (B-A)-1; + } } F1 = PVF[-2]; @@ -4254,6 +4268,11 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) int am, ac, ap; char *a; + if (D > dmax) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); + EXIT(-1); + } + F2 = F1; F1 = F0; F0 = PVF[D]; @@ -4523,7 +4542,7 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) return (D + abs(del)); } -static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) +static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; @@ -4555,17 +4574,21 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode hgh = 0; } - posl = -INT32_MAX; - posh = INT32_MAX; + posl = -dmax; + posh = dmax; if (wave->Aabs == wave->Babs) { if (B == A) - { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); + { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); EXIT(1); } else if (B < A) - posl = (B-A)+1; + { if ((B-A)+1 > posl) + posl = (B-A)+1; + } else - posh = (B-A)-1; + { if ((B-A)-1 < posh) + posh = (B-A)-1; + } } F1 = PVF[-2]; @@ -4583,6 +4606,11 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode int am, ac, ap; char *a; + if (D > dmax) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); + EXIT(-1); + } + F2 = F1; F1 = F0; F0 = PVF[D]; @@ -4795,14 +4823,20 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode * * \****************************************************************************************/ +static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect"; + int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; + int alen, blen; int M, N, D; + int dmax; + alen = align->alen; + blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; @@ -4812,7 +4846,6 @@ int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { int64 s; int d; - int dmax; int **PVF, **PHF; if (M < N) @@ -4851,7 +4884,11 @@ int Compute_Trace_ALL(Alignment *align, Work_Data *ework) wave.Aabs = aseq; wave.Babs = bseq; - D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST); + if (path->aepos > alen || path->bepos > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST,dmax); if (D < 0) EXIT(1); path->diffs = D; @@ -4867,12 +4904,15 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int Path *path; char *aseq, *bseq; + int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; - int diffs; + int diffs, dmax; + alen = align->alen; + blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; @@ -4882,7 +4922,7 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int { int64 s; int d; int M, N; - int dmax, nmax; + int nmax; int **PVF, **PHF; M = path->aepos-path->abpos; @@ -4938,7 +4978,11 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; - d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; @@ -4947,7 +4991,11 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int } ae = path->aepos; be = path->bepos; - d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; @@ -4966,12 +5014,15 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int Path *path; char *aseq, *bseq; + int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; - int diffs; + int diffs, dmax; + alen = align->alen; + blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; @@ -4981,7 +5032,7 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int { int64 s; int d; int M, N; - int dmax, nmax; + int nmax; int **PVF, **PHF; M = path->aepos-path->abpos; @@ -5039,11 +5090,15 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; - if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) EXIT(1); af = wave.mida; bf = wave.midb; - d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; @@ -5056,18 +5111,22 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int ae = path->aepos; be = path->bepos; - if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) EXIT(1); af = wave.mida; bf = wave.midb; - d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; as = af; bs = bf; - d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode); + d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; @@ -5086,12 +5145,15 @@ int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) Path *path; char *aseq, *bseq; + int alen, blen; uint16 *points; int tlen; int ab, bb; int ae, be; - int diffs; + int diffs, dmax; + alen = align->alen; + blen = align->blen; path = align->path; aseq = align->aseq; bseq = align->bseq; @@ -5101,7 +5163,7 @@ int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) { int64 s; int d; int M, N; - int mmax, nmax, dmax; + int mmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; @@ -5160,7 +5222,11 @@ int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) for (i = 0; i < tlen; i += 2) { ae = ab + points[i]; be = bb + points[i+1]; - d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); if (d < 0) EXIT(1); diffs += d; diff --git a/align.h b/align.h index daa9151..d5f49a9 100644 --- a/align.h +++ b/align.h @@ -138,6 +138,10 @@ typedef struct #define CHAIN_NEXT(x) ((x) & NEXT_FLAG) #define BEST_CHAIN(x) ((x) & BEST_FLAG) +#define ELIM_FLAG 0x20 // This LA should be ignored + +#define ELIM(x) ((x) & ELIM_FLAG) + typedef struct { Path *path; uint32 flags; /* Pipeline status and complementation flags */ @@ -173,7 +177,9 @@ void Complement_Seq(char *a, int n); description of 'trace' for Paths above) freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C), freq[2] = f(G), and freq[3] = f(T). This vector is part of the header - of every HITS database (see db.h). + of every DAZZ database (see db.h). + reach: a boolean, if set alignment extend to the boundary when reasonable, otherwise + the terminate only at suffix-positive points. If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be @@ -187,13 +193,14 @@ void Complement_Seq(char *a, int n); typedef void Align_Spec; - Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq); + Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach); void Free_Align_Spec(Align_Spec *spec); int Trace_Spacing (Align_Spec *spec); double Average_Correlation(Align_Spec *spec); float *Base_Frequencies (Align_Spec *spec); + int Overlap_If_Possible(Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in 'align' subject to: @@ -303,7 +310,7 @@ void Complement_Seq(char *a, int n); /*** OVERLAP ABSTRACTION: Externally, between modules an Alignment is modeled by an "Overlap" record, which - (a) replaces the pointers to the two sequences with their ID's in the HITS data bases, + (a) replaces the pointers to the two sequences with their ID's in the DAZZ data bases, (b) does not contain the length of the 2 sequences (must fetch from DB), and (c) contains its path as a subrecord rather than as a pointer (indeed, typically the corresponding Alignment record points at the Overlap's path sub-record). The trace pointer diff --git a/datander.c b/datander.c index d049ae3..1880ec3 100644 --- a/datander.c +++ b/datander.c @@ -48,7 +48,7 @@ int VERBOSE; // Globally visible to tandem.c char *SORT_PATH; int MINOVER; -static int read_DB(HITS_DB *block, char *name, int kmer) +static int read_DB(DAZZ_DB *block, char *name, int kmer) { int i, isdam; isdam = Open_DB(name,block); @@ -88,8 +88,8 @@ static char *CommandBuffer(char *bname) } int main(int argc, char *argv[]) -{ HITS_DB _bblock; - HITS_DB *bblock = &_bblock; +{ DAZZ_DB _bblock; + DAZZ_DB *bblock = &_bblock; char *bfile; char *broot; Align_Spec *settings; @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) else broot = Root(bfile,".db"); - settings = New_Align_Spec( AVE_ERROR, SPACING, bblock->freq); + settings = New_Align_Spec( AVE_ERROR, SPACING, bblock->freq, 0); Match_Self(broot,bblock,settings); diff --git a/tandem.c b/tandem.c index bf69266..7cdb50c 100644 --- a/tandem.c +++ b/tandem.c @@ -347,7 +347,7 @@ static Double *lex_sort(int bytes[16], Double *src, Double *trg, Lex_Arg *parmx) * ********************************************************************************************/ -static HITS_DB *TA_block; +static DAZZ_DB *TA_block; static KmerPos *TA_list; typedef struct @@ -388,7 +388,7 @@ static void *tuple_thread(void *arg) return (NULL); } -static KmerPos *Sort_Kmers(HITS_DB *block, int *len, KmerPos **buffer) +static KmerPos *Sort_Kmers(DAZZ_DB *block, int *len, KmerPos **buffer) { THREAD threads[NTHREADS]; Tuple_Arg parmt[NTHREADS]; Lex_Arg parmx[NTHREADS]; @@ -554,7 +554,7 @@ static void *count_thread(void *arg) // Report threads: using the linked lists of consereved K-mers find likely seeds and then check // for alignments as per "daligner". -static HITS_DB *MR_ablock; +static DAZZ_DB *MR_ablock; static Align_Spec *MR_spec; static int MR_tspace; @@ -857,7 +857,7 @@ typedef struct static void *report_thread(void *arg) { Report_Arg *data = (Report_Arg *) arg; - HITS_READ *aread = MR_ablock->reads; + DAZZ_READ *aread = MR_ablock->reads; KmerPos *asort = MG_alist; int *score = data->score; @@ -1130,7 +1130,7 @@ static void *report_thread(void *arg) * ********************************************************************************************/ -void Match_Self(char *aname, HITS_DB *ablock, Align_Spec *aspec) +void Match_Self(char *aname, DAZZ_DB *ablock, Align_Spec *aspec) { THREAD threads[NTHREADS]; Merge_Arg parmm[NTHREADS]; Lex_Arg parmx[NTHREADS]; diff --git a/tandem.h b/tandem.h index 79ddb5a..71b28da 100644 --- a/tandem.h +++ b/tandem.h @@ -20,6 +20,6 @@ extern char *SORT_PATH; int Set_Filter_Params(int kmer, int binshift, int hitmin, int nthreads); -void Match_Self(char *aname, HITS_DB *ablock, Align_Spec *settings); +void Match_Self(char *aname, DAZZ_DB *ablock, Align_Spec *settings); #endif