From e3d61cb9283bfd964da0363a2ee2ed5df47ab8fc Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Sat, 8 Sep 2018 19:43:45 +0200 Subject: [PATCH] Got rid of tmp files, set numThreadsA automatically. --- src/call_sites.h | 200 +++++++++++------------------------------ src/hmm_1.h | 13 +-- src/parse_alignments.h | 4 + src/pureclip.cpp | 7 +- src/util.h | 4 +- 5 files changed, 66 insertions(+), 162 deletions(-) diff --git a/src/call_sites.h b/src/call_sites.h index d714616..78b2ce2 100644 --- a/src/call_sites.h +++ b/src/call_sites.h @@ -178,8 +178,8 @@ bool loadApplyChrs(AppOptions &options, TStore &store) -template -int loadObservations(TContigObservations &contigObservationsF, TContigObservations &contigObservationsR, unsigned contigId, TStore &store, AppOptions &options) +template +int loadObservations(TContigObservations &contigObservationsF, TContigObservations &contigObservationsR, unsigned contigId, TBai &baiIndex, TStore &store, AppOptions &options) { #ifdef HMM_PROFILE double timeStamp = sysTime(); @@ -197,13 +197,7 @@ int loadObservations(TContigObservations &contigObservationsF, TContigObservatio } BamHeader header; readHeader(header, inFile); - // Read BAI index. - BamIndex baiIndex; - if (!open(baiIndex, toCString(options.baiFileName))) - { - std::cerr << "ERROR: Could not read BAI index file " << options.baiFileName << "\n"; - return 1; - } + // Translate from contig name to rID. int rID = 0; if (!getIdByName(rID, contigNamesCache(context(inFile)), store.contigNameStore[contigId])) @@ -229,8 +223,8 @@ int loadObservations(TContigObservations &contigObservationsF, TContigObservatio } -template -bool loadBAMCovariates(Data &data, TStore &store, bool parallelize, TOptions &options) +template +bool loadBAMCovariates(Data &data, TBai &inputBaiIndex, TStore &store, bool parallelize, TOptions &options) { if (options.verbosity >= 1) std::cout << " Parse input BAM, get truncCounts, compute KDEs ... " << std::endl; @@ -254,15 +248,6 @@ bool loadBAMCovariates(Data &data, TStore &store, bool parallelize, TOptions &op } BamHeader header; readHeader(header, inFile); - // Read BAI index. - BamIndex baiIndex; - if (!open(baiIndex, toCString(options.inputBaiFileName))) - { - SEQAN_OMP_PRAGMA(critical) - std::cerr << "ERROR: Could not read BAI index file " << options.inputBaiFileName << "\n"; - stop = true; - continue; - } for (unsigned i = 0; i < length(data.setObs[s]); ++i) { @@ -281,14 +266,14 @@ bool loadBAMCovariates(Data &data, TStore &store, bool parallelize, TOptions &op { unsigned beginPos = data.setPos[s][i]; unsigned endPos = data.setPos[s][i] + data.setObs[s][i].length(); - parse_bamRegion(truncCounts, inFile, baiIndex, rID, beginPos, endPos, true, options); + parse_bamRegion(truncCounts, inFile, inputBaiIndex, rID, beginPos, endPos, true, options); } else { unsigned beginPos = length(store.contigStore[data.setObs[s][i].contigId].seq) - (data.setObs[s][i].length() + data.setPos[s][i]); unsigned endPos = beginPos + data.setObs[s][i].length(); - parse_bamRegion(truncCounts, inFile, baiIndex, rID, beginPos, endPos, false, options); + parse_bamRegion(truncCounts, inFile, inputBaiIndex, rID, beginPos, endPos, false, options); // reverse reverse(truncCounts); } @@ -816,8 +801,8 @@ void computeSLR(double &b0, double &b1, Data &data, TOptions &options) // TODO c } -template -void preproCoveredIntervals(Data &data, double &b0, double &b1, TStore &store, bool parallelize, TOptions &options) +template +void preproCoveredIntervals(Data &data, double &b0, double &b1, TBai &inputBaiIndex, TStore &store, bool parallelize, TOptions &options) { if (options.verbosity >= 1) std::cout << " Compute KDEs ... " << std::endl; for (unsigned s = 0; s < 2; ++s) @@ -855,7 +840,7 @@ void preproCoveredIntervals(Data &data, double &b0, double &b1, TStore &store, b // if input BAM file given if (options.useCov_RPKM && !empty(options.inputBamFileName)) { - loadBAMCovariates(data, store, parallelize, options); // interval-wise + loadBAMCovariates(data, inputBaiIndex, store, parallelize, options); // interval-wise } } @@ -966,36 +951,6 @@ bool applyHMM(Data &data, } -inline -const char * myTempFileName(std::string const suffix = "", std::string const tempPath = "") -{ - if (mkdir(tempPath.c_str(), 0777) == -1) - { - if(errno == EEXIST ) { - //std::cout << "Directory already exists " << std::endl; - } else { - std::cerr << tempPath << std::endl; - throw std::runtime_error("ERROR: Could not create directory"); - } - } - - static char fileNameBuffer[1000]; - strcpy(fileNameBuffer, (tempPath + "PURECLIP.XXXXXX" + suffix).c_str()); - - int _tmp = mkstemps(fileNameBuffer, suffix.size()); - if (_tmp == -1) - throw std::runtime_error("Could not create temp file"); - - close(_tmp); - return fileNameBuffer; -} - -inline bool exists_test(const CharString& fileName) { - std::ifstream f(toCString(fileName)); - return f.good(); -} - - template bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, TOptions &options) { @@ -1023,6 +978,12 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, } loadIntervals(options, store); loadApplyChrs(options, store); + if (options.numThreadsA == 0) + { + options.numThreadsA = std::min((int)options.numThreads, (int)length(options.intervals_contigIds)); + if (options.verbosity >= 2) std::cout << "Set no. of threads for applying HMM to: " << options.numThreadsA << std::endl; + } + #if SEQAN_HAS_ZLIB if (options.verbosity > 1) std::cout << "SEQAN_HAS_ZLIB" << std::endl; #else @@ -1074,6 +1035,23 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, std::cout << "Set n threshold used for learning of binomial p parameters and transition probabilities '2' -> '2'/'3' to: " << options.nThresholdForTransP << std::endl; } + // Read BAI index. + BamIndex baiIndex; + if (!open(baiIndex, toCString(options.baiFileName))) + { + std::cerr << "ERROR: Could not read BAI index file " << options.baiFileName << "\n"; + return 1; + } + // Read control BAI index. + BamIndex inputBaiIndex; + if (options.useCov_RPKM && !empty(options.inputBaiFileName)) + { + if (!open(inputBaiIndex, toCString(options.inputBaiFileName))) + { + std::cerr << "ERROR: Could not read input control BAI index file " << options.inputBaiFileName << "\n"; + return 1; + } + } // ***************** double slr_NfromKDE_b0 = 0.0; double slr_NfromKDE_b1 = 0.0; @@ -1097,7 +1075,7 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, { unsigned contigId = options.intervals_contigIds[i]; - int r = loadObservations(contigObservationsF[i], contigObservationsR[i], contigId, store, options); + int r = loadObservations(contigObservationsF[i], contigObservationsR[i], contigId, baiIndex, store, options); if (r == 1) { stop = true; @@ -1137,7 +1115,7 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, computeSLR(slr_NfromKDE_b0, slr_NfromKDE_b1, data, options); // precompute KDE values, estimate Ns, etc. - preproCoveredIntervals(data, slr_NfromKDE_b0, slr_NfromKDE_b1, store, true, options); + preproCoveredIntervals(data, slr_NfromKDE_b0, slr_NfromKDE_b1, inputBaiIndex, store, true, options); gamma1.tp = options.useKdeThreshold; gamma2.tp = options.useKdeThreshold; // left tuncated @@ -1158,10 +1136,12 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, #if HMM_PARALLEL omp_set_num_threads(options.numThreadsA); #endif - String contigTempFileNamesBed; - String contigTempFileNamesBed2; - resize(contigTempFileNamesBed, length(store.contigStore)); - resize(contigTempFileNamesBed2, length(store.contigStore)); + + // store contig-wise bedRecords + String > > bedRecords_sites; + String > > bedRecords_regions; + resize(bedRecords_sites, length(options.applyChr_contigIds), Exact()); + resize(bedRecords_regions, length(options.applyChr_contigIds), Exact()); #ifdef HMM_PROFILE double timeStamp2 = sysTime(); @@ -1178,7 +1158,7 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, ContigObservations contigObservationsF; ContigObservations contigObservationsR; - int r = loadObservations(contigObservationsF, contigObservationsR, contigId, store, options); + int r = loadObservations(contigObservationsF, contigObservationsR, contigId, baiIndex, store, options); if (r == 1) { stop = true; @@ -1204,7 +1184,7 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, if (!empty(c_data.setObs[0]) || !empty(c_data.setObs[1])) { - preproCoveredIntervals(c_data, slr_NfromKDE_b0, slr_NfromKDE_b1, store, false, options); + preproCoveredIntervals(c_data, slr_NfromKDE_b0, slr_NfromKDE_b1, inputBaiIndex, store, false, options); // Apply learned parameters unsigned contigLen = length(store.contigStore[contigId].seq); if (!applyHMM(c_data, transMatrix, gamma1, gamma2, bin1, bin2, (TDOUBLE)0.0, contigLen, options)) @@ -1212,91 +1192,30 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, SEQAN_OMP_PRAGMA(critical) stop = true; } - // Temp. output - CharString tempFileNameBed; - - if (empty(options.tempPath)) - { - SEQAN_OMP_PRAGMA(critical) - append(tempFileNameBed, SEQAN_TEMP_FILENAME()); - std::stringstream ss; - ss << contigId; - append(tempFileNameBed, ss.str()); - append(tempFileNameBed, ".bed"); - } - else - { - SEQAN_OMP_PRAGMA(critical) - tempFileNameBed = myTempFileName(".bed", toCString(options.tempPath)); - } - contigTempFileNamesBed[contigId] = tempFileNameBed; - if (options.verbosity >= 2) std::cout << "temp file Name: " << tempFileNameBed << std::endl; - BedFileOut outBed(toCString(tempFileNameBed)); - writeStates(outBed, c_data, store, contigId, options); + writeStates(bedRecords_sites[contigId], c_data, store, contigId, options); if (!empty(options.outRegionsFileName)) { - if (empty(options.tempPath)) - { - - SEQAN_OMP_PRAGMA(critical) - tempFileNameBed = SEQAN_TEMP_FILENAME(); - std::stringstream ss; - ss << contigId; - append(tempFileNameBed, ss.str()); - append(tempFileNameBed, ".regions.bed"); - } - else - { - SEQAN_OMP_PRAGMA(critical) - tempFileNameBed = myTempFileName(".regions.bed", toCString(options.tempPath)); - } - - contigTempFileNamesBed2[contigId] = tempFileNameBed; - if (options.verbosity >= 2) std::cout << "temp file Name: " << tempFileNameBed << std::endl; - BedFileOut outBed2(toCString(tempFileNameBed)); - - writeRegions(outBed2, c_data, store, contigId, options); + writeRegions(bedRecords_regions[contigId], c_data, store, contigId, options); } } } } if (stop) return 1; - if (options.verbosity >= 2) std::cout << "Merge output files ... " << std::endl; - // Append content of temp files to final output in contig order + if (options.verbosity >= 2) std::cout << "Write bedRecords to BED files ... " << std::endl; // crosslink sites BedFileOut outBed(toCString(options.outFileName)); for (unsigned i = 0; i < length(options.applyChr_contigIds); ++i) { unsigned contigId = options.applyChr_contigIds[i]; - if (!empty(contigTempFileNamesBed[contigId])) + for (unsigned j = 0; j < length(bedRecords_sites[contigId]); ++j) { - // BED - BedFileIn bedFileIn; - if (!open(bedFileIn, toCString(contigTempFileNamesBed[contigId]))) - { - //std::cerr << "ERROR: Could not open temporary bed file: " << contigTempFileNamesBed[contigId] << "\n"; - continue; - } - - BedRecord bedRecord; - while (!atEnd(bedFileIn)) - { - readRecord(bedRecord, bedFileIn); - writeRecord(outBed, bedRecord); - } - std::remove(toCString(contigTempFileNamesBed[contigId])); - if (exists_test(contigTempFileNamesBed[contigId])) - { - std::cerr << "ERROR: Could open temporary bed file which should be deleted: " << contigTempFileNamesBed[contigId] << std::endl; - return 1; - } + writeRecord(outBed, bedRecords_sites[contigId][j]); } } - // Append content of temp files to final output in contig order // binding regions if (!empty(options.outRegionsFileName)) { @@ -1304,28 +1223,9 @@ bool doIt(TGamma &gamma1, TGamma &gamma2, TBIN &bin1, TBIN &bin2, TDOUBLE /**/, for (unsigned i = 0; i < length(options.applyChr_contigIds); ++i) { unsigned contigId = options.applyChr_contigIds[i]; - if (!empty(contigTempFileNamesBed2[contigId])) + for (unsigned j = 0; j < length(bedRecords_regions[contigId]); ++j) { - // BED - BedFileIn bedFileIn; - if (!open(bedFileIn, toCString(contigTempFileNamesBed2[contigId]))) - { - //std::cerr << "ERROR: Could not open temporary bed file: " << contigTempFileNamesBed[contigId] << "\n"; - continue; - } - - BedRecord bedRecord; - while (!atEnd(bedFileIn)) - { - readRecord(bedRecord, bedFileIn); - writeRecord(outBed2, bedRecord); - } - std::remove(toCString(contigTempFileNamesBed2[contigId])); - if (exists_test(contigTempFileNamesBed2[contigId])) - { - std::cerr << "ERROR: Could open temporary bed file which should be deleted: " << contigTempFileNamesBed2[contigId] << std::endl; - return 1; - } + writeRecord(outBed2, bedRecords_regions[contigId][j]); } } } diff --git a/src/hmm_1.h b/src/hmm_1.h index 77885ed..1c3b8cf 100644 --- a/src/hmm_1.h +++ b/src/hmm_1.h @@ -1373,7 +1373,7 @@ void HMM::rmBoarderArtifacts(String > &bedRecords_sites, Data &data, FragmentStore<> &store, unsigned contigId, @@ -1462,7 +1462,7 @@ void writeStates(BedFileOut &outBed, ss.str(""); ss.clear(); - writeRecord(outBed, record); + appendValue(bedRecords_sites, record); } else if (data.setObs[s][i].discard && options.outputAll && data.setObs[s][i].truncCounts[t] >= 1) // discarded interval { @@ -1528,7 +1528,7 @@ void writeStates(BedFileOut &outBed, ss.str(""); ss.clear(); - writeRecord(outBed, record); + appendValue(bedRecords_sites, record); } else if (!data.setObs[s][i].discard && data.states[s][i][t] == 3) { @@ -1577,7 +1577,7 @@ void writeStates(BedFileOut &outBed, else record.strand = '-'; - writeRecord(outBed, record); + appendValue(bedRecords_sites, record); } } } @@ -1585,7 +1585,7 @@ void writeStates(BedFileOut &outBed, } -void writeRegions(BedFileOut &outBed, +void writeRegions(String > &bedRecords_regions, Data &data, FragmentStore<> &store, unsigned contigId, @@ -1689,7 +1689,8 @@ void writeRegions(BedFileOut &outBed, record.name = ss_indivScores.str(); ss_indivScores.str(""); ss_indivScores.clear(); - writeRecord(outBed, record); + + appendValue(bedRecords_regions, record); } } } diff --git a/src/parse_alignments.h b/src/parse_alignments.h index 222f246..f1dea3d 100644 --- a/src/parse_alignments.h +++ b/src/parse_alignments.h @@ -55,6 +55,8 @@ bool parse_bamRegion(TContigObservations &contigObservationsF, TContigObservatio // Seek linearly to the selected position BamAlignmentRecord bamRecord; BamAlignmentRecord newBamRecord; + + SEQAN_OMP_PRAGMA(critical) while (!atEnd(inFile)) { readRecord(bamRecord, inFile); @@ -111,6 +113,8 @@ bool parse_bamRegion(TTruncCounts &truncCounts, TBamIn &inFile, TBai &baiIndex, // Seek linearly to the selected position BamAlignmentRecord bamRecord; BamAlignmentRecord newBamRecord; + + SEQAN_OMP_PRAGMA(critical) while (!atEnd(inFile)) { readRecord(bamRecord, inFile); diff --git a/src/pureclip.cpp b/src/pureclip.cpp index 0918ac4..da0fd0f 100644 --- a/src/pureclip.cpp +++ b/src/pureclip.cpp @@ -164,7 +164,7 @@ parseCommandLine(AppOptions & options, int argc, char const ** argv) addOption(parser, ArgParseOption("et2", "epta", "Exclude intervals containing poly-U stretches from analysis.")); addOption(parser, ArgParseOption("mrtf", "mrtf", "Fit gamma shape k only for positions with min. covariate value.", ArgParseArgument::DOUBLE)); - addOption(parser, ArgParseOption("mtc", "mtc", "Maximum number of truncations at one position used for learning. For sites with counts above threshold the whole covered regions will be ignored for learning! Default: 250.", ArgParseArgument::INTEGER)); + addOption(parser, ArgParseOption("mtc", "mtc", "Maximum number of read starts at one position used for learning. For sites with counts above threshold the whole covered regions will be ignored for learning! Default: 500.", ArgParseArgument::INTEGER)); setMinValue(parser, "mtc", "50"); setMaxValue(parser, "mtc", "50000"); @@ -174,8 +174,7 @@ parseCommandLine(AppOptions & options, int argc, char const ** argv) addSection(parser, "General user options"); addOption(parser, ArgParseOption("nt", "nt", "Number of threads used for learning.", ArgParseArgument::INTEGER)); - addOption(parser, ArgParseOption("nta", "nta", "Number of threads used for applying learned parameters. Increases memory usage, if greater than number of chromosomes used for learning, since HMM will be build for multiple chromosomes in parallel.", ArgParseArgument::INTEGER)); - addOption(parser, ArgParseOption("tmp", "tmp", "Path to directory to store intermediate files. Default: /tmp", ArgParseArgument::STRING)); + addOption(parser, ArgParseOption("nta", "nta", "Number of threads used for applying learned parameters. Increases memory usage, if greater than number of chromosomes used for learning, since HMM will be build for multiple chromosomes in parallel. Default: min(nt, no. of chromosomes/transcripts used for learning).", ArgParseArgument::INTEGER)); addOption(parser, ArgParseOption("oa", "oa", "Outputs all sites with at least one read start in extended output format.")); addOption(parser, ArgParseOption("oe", "oe", "Outputs additionally all sites that are 'enriched' and contain at least one read start.")); @@ -302,7 +301,7 @@ parseCommandLine(AppOptions & options, int argc, char const ** argv) getOptionValue(options.numThreads, parser, "nt"); getOptionValue(options.numThreadsA, parser, "nta"); - getOptionValue(options.tempPath, parser, "tmp"); + if (isSet(parser, "oa")) options.outputAll = true; diff --git a/src/util.h b/src/util.h index baba102..b7ff6b6 100644 --- a/src/util.h +++ b/src/util.h @@ -120,7 +120,6 @@ namespace seqan { unsigned numThreads; unsigned numThreadsA; - CharString tempPath; bool outputAll; // Verbosity level. 0 -- quiet, 1 -- normal, 2 -- verbose, 3 -- very verbose. int verbosity; @@ -182,7 +181,7 @@ namespace seqan { useHighPrecision(false), selectRead(0), numThreads(1), - numThreadsA(1), + numThreadsA(0), outputAll(false), verbosity(1) {} @@ -447,6 +446,7 @@ namespace seqan { clear(data.states); } + }