Skip to content

Commit

Permalink
Merge remote-tracking branch 'pd3/feature/ploidy/feature/ploidy' into…
Browse files Browse the repository at this point in the history
… production
  • Loading branch information
pd3 committed Jan 30, 2017
2 parents 9e6f79a + 1071aff commit 832335e
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 31 deletions.
6 changes: 4 additions & 2 deletions pbwtCore.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ PBWT *pbwtCreate (int M, int N)

p->M = M ; p->N = N ;
p->aFstart = myalloc (M, int) ; int i ; for (i = 0 ; i < M ; ++i) p->aFstart[i] = i ;

return p ;
}

Expand Down Expand Up @@ -690,6 +689,8 @@ PBWT *pbwtSelectSites (PBWT *pOld, Array sites, BOOL isKeepOld)
fprintf (logFile, "%d sites selected from %d, %d missing sites, pbwt size for %d haplotypes is %ld\n",
pNew->N, pOld->N, nMissingSites, pNew->M, arrayMax(pNew->yz)) ;

pNew->isX = pOld->isX ; pNew->isY = pOld->isY ;

if (isKeepOld)
{ if (pOld->samples) pNew->samples = arrayCopy (pOld->samples) ;
if (pOld->chrom) pNew->chrom = strdup (pOld->chrom) ;
Expand All @@ -702,11 +703,12 @@ PBWT *pbwtSelectSites (PBWT *pOld, Array sites, BOOL isKeepOld)
else
{ pNew->chrom = pOld->chrom ; pOld->chrom = 0 ;
pNew->samples = pOld->samples ; pOld->samples = 0 ;
if (pOld->missingOffset) arrayDestroy (pOld->missingOffset) ;
pOld->missingOffset = 0 ;
if (pOld->zMissing) arrayDestroy (pOld->zMissing) ;
pOld->zMissing = 0 ;
pbwtDestroy (pOld) ;
}
pNew->isX = pOld->isX ; pNew->isY = pOld->isY ;

free(x) ; pbwtCursorDestroy (uOld) ; pbwtCursorDestroy (uNew) ;
return pNew ;
Expand Down
27 changes: 10 additions & 17 deletions pbwtHtslib.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,13 @@ static void readVcfSamples (PBWT *p, bcf_hdr_t *hr)

static int variation (const char *ref, const char *alt)
{
static char *buf = 0 ;
static int buflen = 0 ;
if (!buf) { buflen = 64 ; buf = myalloc (buflen, char) ; }
int var ;
if (strlen (ref) + strlen (alt) + 2 > buflen)
{ do buflen *= 2 ; while (strlen (ref) + strlen (alt) + 2 > buflen) ;
free (buf) ; buf = myalloc (buflen, char) ;
}
int var, len = strlen (ref) + strlen (alt) + 2;
char *buf = (char*) malloc(len);
sprintf (buf, "%s\t%s", ref, alt) ;
char *ptr = buf;
while ( *ptr ) { *ptr = toupper(*ptr); ptr++; }
dictAdd (variationDict, buf, &var) ;
free(buf);
return var ;
}

Expand Down Expand Up @@ -114,9 +111,6 @@ PBWT *pbwtReadVcfGT (char *filename, int isXY) /* read GTs from vcf/bcf using h
if (!p->chrom) p->chrom = strdup (chrom) ;
else if (strcmp (chrom, p->chrom)) break ;
int pos = line->pos + 1 ; // bcf coordinates are 0-based
char *ref, *REF;
ref = REF = strdup(line->d.allele[0]);
while ( (*ref = toupper(*ref)) ) ++ref ;

// get a copy of GTs
int ngt = bcf_get_genotypes(hr, line, &gt_arr, &mgt_arr) ;
Expand Down Expand Up @@ -234,10 +228,6 @@ PBWT *pbwtReadVcfGT (char *filename, int isXY) /* read GTs from vcf/bcf using h
/* not in the REF/ALT site */
for (i = 1 ; i < n_allele ; i++)
{
char *alt, *ALT;
alt = ALT = no_alt ? "." : strdup(line->d.allele[i]) ;
if (!no_alt) while ( (*alt = toupper(*alt)) ) ++alt ;

/* and pack them into the PBWT */
for (j = 0 ; j < p->M ; ++j) u->y[j] = x[u->a[j]] == i ? 1 : 0;
pbwtCursorWriteForwards (u) ;
Expand All @@ -259,7 +249,7 @@ PBWT *pbwtReadVcfGT (char *filename, int isXY) /* read GTs from vcf/bcf using h
// add the site
Site *s = arrayp(p->sites, p->N++, Site) ;
s->x = pos ;
s->varD = variation (REF, ALT) ;
s->varD = variation (line->d.allele[0], no_alt ? "." : line->d.allele[i]) ;
}

if (nCheckPoint && !(p->N % nCheckPoint)) pbwtCheckPoint (u, p) ;
Expand Down Expand Up @@ -424,6 +414,7 @@ void pbwtWriteVcf (PBWT *p, char *filename, char *referenceFasta, char *mode)
float *fls = myalloc (3*nSamples, float);
uchar *missing = mycalloc (p->M, uchar) ;
if (!p->missingOffset) bzero (missing, p->M) ;

for (i = 0 ; i < p->N ; ++i)
{
Site *s = arrp(p->sites, i, Site) ;
Expand Down Expand Up @@ -577,10 +568,12 @@ void pbwtWriteVcf (PBWT *p, char *filename, char *referenceFasta, char *mode)
}

// cleanup
free(d) ;
free(hap) ;
free(gts) ;
free(missing) ;
if (isDosage) { free(fls) ; free(gps) ; free(ds) ; free(ad) ; }
/* pbwtCursorDestroy(u) ; */
pbwtCursorDestroy(u) ; // this was commented, not sure why??
bcf_hdr_destroy(bcfHeader) ;
bcf_destroy1(bcfRecord);
hts_close(fp) ;
Expand Down
3 changes: 3 additions & 0 deletions pbwtIO.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ PBWT *pbwtRead (FILE *fp)
if (fread (&m, sizeof(int), 1, fp) != 1) die ("error reading m in pbwtRead") ;
if (fread (&n, sizeof(int), 1, fp) != 1) die ("error reading n in pbwtRead") ;
p = pbwtCreate (m, n) ;
free(p->aFstart);
if (version > 1) /* read aFstart and aFend */
{ p->aFstart = myalloc (m, int) ;
if (fread (p->aFstart, sizeof(int), m, fp) != m) die ("error reading aFstart in pbwtRead") ;
Expand Down Expand Up @@ -386,7 +387,9 @@ Array pbwtReadSamplesFile (FILE *fp) {
Array samples = arrayCreate (1024, int) ;

while (1) {
char *rmme = line;
line = fgets(line, 1024, fp);
if ( !line ) free(rmme);
if(feof(fp)) break;
else if(sscanf(line, "ID_1%s", name)) { // IMPUTE2 file
die("IMPUTE2 style sample files not yet supported - coming soon");
Expand Down
37 changes: 26 additions & 11 deletions pbwtImpute.c
Original file line number Diff line number Diff line change
Expand Up @@ -1141,7 +1141,8 @@ static PBWT *referenceImpute3 (PBWT *pOld, PBWT *pRef, PBWT *pFrame,
if (pOld == pFrame) /* self-imputing - no sparse option yet */
matchMaximalWithin (pFrame, reportMatch) ;
else
matchSequencesSweepSparse (pFrame, pOld, nSparse, reportMatchSparse) ;
matchSequencesSweep (pFrame, pOld, reportMatch) ;
//matchSequencesSweepSparse (pFrame, pOld, nSparse, reportMatchSparse) ;

for (j = 0 ; j < pOld->M ; ++j) /* add terminating element to arrays */
{ if (nSparse > 1) /* can't guarantee order of sparse segments */
Expand All @@ -1167,11 +1168,11 @@ static PBWT *referenceImpute3 (PBWT *pOld, PBWT *pRef, PBWT *pFrame,
pNew->isRefFreq = TRUE ;
PbwtCursor *uNew = pbwtCursorCreate (pNew, TRUE, TRUE) ;
uchar *x = myalloc (pOld->M, uchar) ; /* uNew->y values in original sort order */
double *p = myalloc (pOld->M, double) ; /* estimated prob of uNew->y in original sort order */
// unused: double *p = myalloc (pOld->M, double) ; /* estimated prob of uNew->y in original sort order */
int *aRefInv = myalloc (pRef->M, int) ; /* holds the inverse mapping from uRef->a[i] -> i */
int *firstSeg = mycalloc (pOld->M, int) ; /* position in maxMatch to start looking at */
int nConflicts = 0 ;
uchar *missing = (pOld == pFrame) ? mycalloc (pOld->M, uchar) : 0 ;
uchar *missing = mycalloc (pOld->M, uchar) ;
double *xDosage = myalloc (pOld->M, double), *yDosage = myalloc (pOld->M, double) ;

pNew->dosageOffset = arrayReCreate (pNew->dosageOffset, pRef->N, long) ;
Expand All @@ -1181,9 +1182,15 @@ static PBWT *referenceImpute3 (PBWT *pOld, PBWT *pRef, PBWT *pFrame,
while (kRef < pRef->N)
{ if (arrp(pRef->sites,kRef,Site)->x == arrp(pFrame->sites,kOld,Site)->x
&& arrp(pRef->sites,kRef,Site)->varD == arrp(pFrame->sites,kOld,Site)->varD)
{ pbwtCursorForwardsRead (uOld) ; ++kOld ;
{
if ( pOld != pFrame )
{
if (!pOld->missingOffset || !arr(pOld->missingOffset, kOld, long)) bzero (missing, pOld->M) ;
else unpack3 (arrp(pOld->zMissing,arr(pOld->missingOffset,kOld,long), uchar), pOld->M, missing, 0) ;
}
pbwtCursorForwardsRead (uOld) ; ++kOld ;
for (j = 0 ; j < pOld->M ; ++j)
while (kOld >= (arrp(maxMatch[j],firstSeg[j],MatchSegment)->end & SPARSE_MASK)) ++firstSeg[j] ;
while (kOld > (arrp(maxMatch[j],firstSeg[j],MatchSegment)->end & SPARSE_MASK)) ++firstSeg[j] ;
}
else
arrp(pRef->sites,kRef,Site)->isImputed = TRUE ;
Expand All @@ -1204,8 +1211,8 @@ static PBWT *referenceImpute3 (PBWT *pOld, PBWT *pRef, PBWT *pFrame,
double bit = 0, sum = 0, score = 0 ;
MatchSegment *m = arrp(maxMatch[j],firstSeg[j],MatchSegment) ;
MatchSegment *mStop = arrp(maxMatch[j],arrayMax(maxMatch[j]),MatchSegment) ;
while (m->start <= kOld && m < mStop)
{ bit = (kOld - m->start + 1) * ((m->end & SPARSE_MASK) - kOld) ;
while (m->start <= kOld && m->end >= kOld && m < mStop)
{ bit = (kOld - m->start) * ((m->end & SPARSE_MASK) - kOld + 1) ;
if (m->end & SPARSE_BIT) bit *= fSparse ;
if (bit > 0)
{ sum += bit ;
Expand All @@ -1232,10 +1239,15 @@ static PBWT *referenceImpute3 (PBWT *pOld, PBWT *pRef, PBWT *pFrame,
}

for (j = 0 ; j < pOld->M ; ++j) uNew->y[j] = x[uNew->a[j]] ; /* transfer to uNew */
pbwtCursorWriteForwards (uNew) ;
/* need to sort the dosages into uNew cursor order as well */
for (j = 0 ; j < pOld->M ; ++j) yDosage[j] = xDosage[uNew->a[j]] ;
for (j = 0 ; j < pOld->M ; ++j)
{
/* make sure the dosages of the typed markers are {0,1} */
if ( arrp(pRef->sites,kRef,Site)->isImputed!=TRUE && !missing[uNew->a[j]] ) yDosage[j] = x[uNew->a[j]] ? 1 : 0;
else yDosage[j] = xDosage[uNew->a[j]] ;
}
pbwtDosageStore (pNew, yDosage, kRef) ;
pbwtCursorWriteForwards (uNew) ;

if (n)
{ psum /= n ; xsum /= n ; pxsum /= n ;
Expand All @@ -1254,8 +1266,8 @@ static PBWT *referenceImpute3 (PBWT *pOld, PBWT *pRef, PBWT *pFrame,

pbwtCursorDestroy (uOld) ; pbwtCursorDestroy (uRef) ; pbwtCursorDestroy (uNew) ;
free (aRefInv) ; free (firstSeg) ;
for (j = 0 ; j < pOld->M ; ++j) free (maxMatch[j]) ; free (maxMatch) ;
free (xDosage) ; free (yDosage) ; if (missing) free (missing) ;
for (j = 0 ; j < pOld->M ; ++j) arrayDestroy (maxMatch[j]) ; free (maxMatch) ;
free (xDosage) ; free (yDosage) ; if (missing) free (missing) ; free (x) ;
return pNew ;
}

Expand Down Expand Up @@ -1630,6 +1642,9 @@ PBWT *pbwtCopySamples (PBWT *pOld, int Mnew, double meanLength)
gl[n][0] = (1-d[2*n]) * (1-d[2*n+1])
gl[n][1] = d[2*n] + d[2*n+1] - 2*d[2*n]*d[2*n+1]
gl[n][2] = d[2*n] * d[2*n+1]
Note that only differences from the {0,1} calls are stored. This is why
pbwtDosageStore() does not need the cursor but pbwtDosageRetrieve() does.
*/

static inline uchar dosageEncode (double d)
Expand Down
2 changes: 2 additions & 0 deletions pbwtMain.c
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,8 @@ int main (int argc, char *argv[])
if (p) pbwtDestroy(p) ;
if (variationDict) dictDestroy(variationDict);
sampleDestroy();
metaDataDestroy();
if (*commandLine) free(commandLine);
fgetword (NULL) ; // to keep valgrind happy, free malloced memory
LOGCLOSE ;
return 0 ;
Expand Down
2 changes: 1 addition & 1 deletion pbwtSample.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ Sample *sample (int i)

int sampleCount (void)
{
return arrayMax(samples);
return arrayMax(samples) - 1;
}

int pbwtSamplePloidy(PBWT *p, int i)
Expand Down

0 comments on commit 832335e

Please sign in to comment.