diff --git a/ABYSS/abyss.cc b/ABYSS/abyss.cc index 2e48e0e24..f1e075ac9 100644 --- a/ABYSS/abyss.cc +++ b/ABYSS/abyss.cc @@ -1,35 +1,40 @@ +#include "Assembly/Options.h" #if PAIRED_DBG -# include "PairedDBG/SequenceCollection.h" -# include "PairedDBG/PairedDBGAlgorithms.h" +#include "PairedDBG/PairedDBGAlgorithms.h" +#include "PairedDBG/SequenceCollection.h" #else -# include "Assembly/SequenceCollection.h" +#include "Assembly/SequenceCollection.h" #endif #include "Assembly/AssemblyAlgorithms.h" #include "Assembly/DotWriter.h" +#include "DataBase/DB.h" + #include #include // for setvbuf #include #include #include -#include "DataBase/DB.h" using namespace std; DB db; -static void removeLowCoverageContigs(SequenceCollectionHash& g) +static void +removeLowCoverageContigs(SequenceCollectionHash& g) { AssemblyAlgorithms::markAmbiguous(&g); cout << "Removing low-coverage contigs " - "(mean k-mer coverage < " << opt::coverage << ")\n"; + "(mean k-mer coverage < " + << opt::coverage << ")\n"; AssemblyAlgorithms::assemble(&g); AssemblyAlgorithms::splitAmbiguous(&g); opt::coverage = 0; } -static void popBubbles(SequenceCollectionHash& g) +static void +popBubbles(SequenceCollectionHash& g) { cout << "Popping bubbles" << endl; ofstream out; @@ -39,8 +44,8 @@ static void popBubbles(SequenceCollectionHash& g) cout << "Removed " << numPopped << " bubbles\n"; } -static void write_graph(const string& path, - const SequenceCollectionHash& c) +static void +write_graph(const string& path, const SequenceCollectionHash& c) { if (path.empty()) return; @@ -49,16 +54,17 @@ static void write_graph(const string& path, DotWriter::write(out, c); } -static void assemble(const string& pathIn, const string& pathOut) +static void +assemble(const string& pathIn, const string& pathOut) { Timer timer(__func__); SequenceCollectionHash g; if (!pathIn.empty()) AssemblyAlgorithms::loadSequences(&g, pathIn.c_str()); - for_each(opt::inFiles.begin(), opt::inFiles.end(), bind1st( - ptr_fun(AssemblyAlgorithms::loadSequences), - &g)); + for_each(opt::inFiles.begin(), opt::inFiles.end(), [&g](std::string s) { + AssemblyAlgorithms::loadSequences(&g, s); + }); size_t numLoaded = g.size(); if (!opt::db.empty()) addToDb(db, "loadedKmer", numLoaded); @@ -70,16 +76,14 @@ static void assemble(const string& pathIn, const string& pathOut) exit(EXIT_FAILURE); } - AssemblyAlgorithms::setCoverageParameters( - AssemblyAlgorithms::coverageHistogram(g)); + AssemblyAlgorithms::setCoverageParameters(AssemblyAlgorithms::coverageHistogram(g)); if (opt::kc > 0) { cout << "Minimum k-mer multiplicity kc is " << opt::kc << endl; cout << "Removing low-multiplicity k-mers" << endl; size_t removed = AssemblyAlgorithms::applyKmerCoverageThreshold(g, opt::kc); - cout << "Removed " << removed - << " low-multiplicity k-mers, " << g.size() - << " k-mers remaining" << std::endl; + cout << "Removed " << removed << " low-multiplicity k-mers, " << g.size() + << " k-mers remaining" << std::endl; } cout << "Generating adjacency" << endl; @@ -122,13 +126,14 @@ static void assemble(const string& pathIn, const string& pathOut) size_t numAssembled = g.size(); size_t numRemoved = numLoaded - numAssembled; - cout << "Removed " << numRemoved << " k-mer.\n" - "The signal-to-noise ratio (SNR) is " - << 10 * log10((double)numAssembled / numRemoved) - << " dB.\n"; + cout << "Removed " << numRemoved + << " k-mer.\n" + "The signal-to-noise ratio (SNR) is " + << 10 * log10((double)numAssembled / numRemoved) << " dB.\n"; } -int main(int argc, char* const* argv) +int +main(int argc, char* const* argv) { Timer timer("Total"); @@ -142,16 +147,16 @@ int main(int argc, char* const* argv) bool krange = opt::kMin != opt::kMax; if (krange) - cout << "Assembling k=" << opt::kMin << "-" << opt::kMax - << ":" << opt::kStep << endl; + cout << "Assembling k=" << opt::kMin << "-" << opt::kMax << ":" << opt::kStep << endl; if (!opt::db.empty()) { - init(db, - opt::getUvalue(), - opt::getVvalue(), - "ABYSS", - opt::getCommand(), - opt::getMetaValue()); + init( + db, + opt::getUvalue(), + opt::getVvalue(), + "ABYSS", + opt::getCommand(), + opt::getMetaValue()); addToDb(db, "SS", opt::ss); addToDb(db, "k", opt::kmerSize); addToDb(db, "singleK", opt::singleKmerSize); @@ -175,7 +180,7 @@ int main(int argc, char* const* argv) opt::erodeStrand = (unsigned)-1; opt::coverage = -1; opt::trimLen = k; - opt::bubbleLen = 3*k; + opt::bubbleLen = 3 * k; } ostringstream k0, k1; diff --git a/Assembly/BranchGroup.h b/Assembly/BranchGroup.h index b0b46c8c2..a6bfe87bd 100644 --- a/Assembly/BranchGroup.h +++ b/Assembly/BranchGroup.h @@ -19,195 +19,194 @@ enum BranchGroupStatus /** A container of BranchRecord. */ class BranchGroup { - public: - typedef std::vector BranchGroupData; - typedef BranchGroupData::iterator iterator; - typedef BranchGroupData::const_iterator const_iterator; - - BranchGroup() - : m_dir(SENSE), m_maxNumBranches(0), - m_noExt(false), m_status(BGS_ACTIVE) - { } - - BranchGroup(extDirection dir, size_t maxNumBranches, - const BranchRecord::V &origin) - : m_dir(dir), m_origin(origin), - m_maxNumBranches(maxNumBranches), m_noExt(false), - m_status(BGS_ACTIVE) - { - m_branches.reserve(m_maxNumBranches); - } - - BranchGroup(extDirection dir, size_t maxNumBranches, - const BranchRecord::V &origin, const BranchRecord& branch) - : m_dir(dir), m_origin(origin), - m_maxNumBranches(maxNumBranches), m_noExt(false), - m_status(BGS_ACTIVE) - { - m_branches.reserve(m_maxNumBranches); - m_branches.push_back(branch); - } + public: + typedef std::vector BranchGroupData; + typedef BranchGroupData::iterator iterator; + typedef BranchGroupData::const_iterator const_iterator; + + BranchGroup() + : m_dir(SENSE) + , m_maxNumBranches(0) + , m_noExt(false) + , m_status(BGS_ACTIVE) + {} + + BranchGroup(extDirection dir, size_t maxNumBranches, const BranchRecord::V& origin) + : m_dir(dir) + , m_origin(origin) + , m_maxNumBranches(maxNumBranches) + , m_noExt(false) + , m_status(BGS_ACTIVE) + { + m_branches.reserve(m_maxNumBranches); + } - BranchGroup(const BranchGroup& o) - : m_branches(o.m_branches), m_dir(o.m_dir), - m_origin(o.m_origin), - m_maxNumBranches(o.m_maxNumBranches), - m_noExt(o.m_noExt), m_status(o.m_status) - { - m_branches.reserve(m_maxNumBranches); - } + BranchGroup( + extDirection dir, + size_t maxNumBranches, + const BranchRecord::V& origin, + const BranchRecord& branch) + : m_dir(dir) + , m_origin(origin) + , m_maxNumBranches(maxNumBranches) + , m_noExt(false) + , m_status(BGS_ACTIVE) + { + m_branches.reserve(m_maxNumBranches); + m_branches.push_back(branch); + } - /** Add a branch to this group. */ - BranchRecord& addBranch(const BranchRecord& branch) - { - assert(m_branches.size() < m_maxNumBranches); - m_branches.push_back(branch); - return m_branches.back(); - } + BranchGroup(const BranchGroup& o) + : m_branches(o.m_branches) + , m_dir(o.m_dir) + , m_origin(o.m_origin) + , m_maxNumBranches(o.m_maxNumBranches) + , m_noExt(o.m_noExt) + , m_status(o.m_status) + { + m_branches.reserve(m_maxNumBranches); + } - /** Add a branch to this group and extend the new branch with - * the given k-mer. */ - void addBranch(const BranchRecord& branch, - const BranchRecord::V& kmer) - { - if (m_branches.size() < m_maxNumBranches) - addBranch(branch).push_back( - std::make_pair(kmer, BranchRecord::VP())); - else - m_status = BGS_TOOMANYBRANCHES; - } + /** Add a branch to this group. */ + BranchRecord& addBranch(const BranchRecord& branch) + { + assert(m_branches.size() < m_maxNumBranches); + m_branches.push_back(branch); + return m_branches.back(); + } - /** Return the specified branch. */ - BranchRecord& operator [](unsigned id) - { - return m_branches[id]; - } + /** Add a branch to this group and extend the new branch with + * the given k-mer. */ + void addBranch(const BranchRecord& branch, const BranchRecord::V& kmer) + { + if (m_branches.size() < m_maxNumBranches) + addBranch(branch).push_back(std::make_pair(kmer, BranchRecord::VP())); + else + m_status = BGS_TOOMANYBRANCHES; + } - /** Return the number of branches in this group. */ - size_t size() const { return m_branches.size(); } - - /** Return whether a branch contains the specified k-mer at - * the index i. */ - bool exists(unsigned i, const BranchRecord::V& kmer) const - { - for (BranchGroupData::const_iterator it - = m_branches.begin(); - it != m_branches.end(); ++it) - if (it->exists(i, kmer)) - return true; - return false; - } + /** Return the specified branch. */ + BranchRecord& operator[](unsigned id) { return m_branches[id]; } - // return the current status of the branch - BranchGroupStatus getStatus() const { return m_status; } + /** Return the number of branches in this group. */ + size_t size() const { return m_branches.size(); } - // set the no extension flag - void setNoExtension() { m_noExt = true; } + /** Return whether a branch contains the specified k-mer at + * the index i. */ + bool exists(unsigned i, const BranchRecord::V& kmer) const + { + for (BranchGroupData::const_iterator it = m_branches.begin(); it != m_branches.end(); ++it) + if (it->exists(i, kmer)) + return true; + return false; + } - // is the no extension flag set? - bool isNoExt() const { return m_noExt; } + // return the current status of the branch + BranchGroupStatus getStatus() const { return m_status; } - // return the direction of growth - extDirection getDirection() const { return m_dir; } + // set the no extension flag + void setNoExtension() { m_noExt = true; } - iterator begin() { return m_branches.begin(); } - iterator end() { return m_branches.end(); } - const_iterator begin() const { return m_branches.begin(); } - const_iterator end() const { return m_branches.end(); } + // is the no extension flag set? + bool isNoExt() const { return m_noExt; } -// Check the stop conditions for the bubble growth -BranchGroupStatus -updateStatus(unsigned maxLength) -{ - assert(m_branches.size() <= m_maxNumBranches); + // return the direction of growth + extDirection getDirection() const { return m_dir; } - if (m_status != BGS_ACTIVE) - return m_status; + iterator begin() { return m_branches.begin(); } + iterator end() { return m_branches.end(); } + const_iterator begin() const { return m_branches.begin(); } + const_iterator end() const { return m_branches.end(); } - // Check if the no extension flag is set - if(m_noExt) + // Check the stop conditions for the bubble growth + BranchGroupStatus updateStatus(unsigned maxLength) { - m_status = BGS_NOEXT; - return m_status; - } + assert(m_branches.size() <= m_maxNumBranches); - // Check if any branches are too long or any sequence has a loop - for (BranchGroupData::const_iterator iter = m_branches.begin(); - iter != m_branches.end(); ++iter) { - if (iter->isTooLong(maxLength)) { - m_status = BGS_TOOLONG; + if (m_status != BGS_ACTIVE) + return m_status; + + // Check if the no extension flag is set + if (m_noExt) { + m_status = BGS_NOEXT; return m_status; } + + // Check if any branches are too long or any sequence has a loop + for (BranchGroupData::const_iterator iter = m_branches.begin(); iter != m_branches.end(); + ++iter) { + if (iter->isTooLong(maxLength)) { + m_status = BGS_TOOLONG; + return m_status; + } + } + + BranchGroupData::const_iterator it = m_branches.begin(); + const BranchRecord::V& lastSeq = it->back().first; + while (++it != m_branches.end()) + if (it->back().first != lastSeq) + return m_status = BGS_ACTIVE; + + // All the branches of the bubble have joined. + // Remove the last base, which is identical for every branch. + std::for_each( + m_branches.begin(), m_branches.end(), [](BranchRecord& b) { return b.pop_back(); }); + + // Sort the branches by coverage. + std::function lambda = [](const BranchRecord& b) { + return b.calculateBranchMultiplicity(); + }; + sort_by_transform(m_branches.begin(), m_branches.end(), lambda); + reverse(m_branches.begin(), m_branches.end()); + + return m_status = BGS_JOINED; } - BranchGroupData::const_iterator it = m_branches.begin(); - const BranchRecord::V& lastSeq = it->back().first; - while (++it != m_branches.end()) - if (it->back().first != lastSeq) - return m_status = BGS_ACTIVE; + /** Return whether any branches of this group are active. */ + bool isActive() const + { + for (BranchGroupData::const_iterator it = m_branches.begin(); it != m_branches.end(); ++it) + if (it->isActive()) + return true; + return false; + } - // All the branches of the bubble have joined. - // Remove the last base, which is identical for every branch. - std::for_each(m_branches.begin(), m_branches.end(), - std::mem_fun_ref(&BranchRecord::pop_back)); + /** Return whether this branch is extendable. */ + bool isExtendable() + { + if (m_noExt) + return false; - // Sort the branches by coverage. - sort_by_transform(m_branches.begin(), m_branches.end(), - std::mem_fun_ref(&BranchRecord::calculateBranchMultiplicity)); - reverse(m_branches.begin(), m_branches.end()); + // A group is extendable when all the branches are the same + // length. All the branches are lockstepped for growth. + BranchGroupData::iterator it = m_branches.begin(); + unsigned length = it++->size(); + for (; it != m_branches.end(); ++it) + if (it->size() != length) + return false; + return true; + } - return m_status = BGS_JOINED; -} + /** Return whether this branch is ambiguous at its origin. Also + * returns false if the origin of the branch has since been deleted. + */ + bool isAmbiguous(const SequenceCollectionHash& g) const + { + // Get fresh data from the collection to check that this bubble + // does in fact still exist. + const BranchRecord::VP& data = g.getSeqAndData(m_origin).second; + return data.deleted() ? false : data.isAmbiguous(m_dir); + } -/** Return whether any branches of this group are active. */ -bool -isActive() const -{ - for (BranchGroupData::const_iterator it = m_branches.begin(); - it != m_branches.end(); ++it) - if (it->isActive()) - return true; - return false; -} - -/** Return whether this branch is extendable. */ -bool -isExtendable() -{ - if (m_noExt) - return false; + private: + BranchGroup& operator=(const BranchGroup& o); - // A group is extendable when all the branches are the same - // length. All the branches are lockstepped for growth. - BranchGroupData::iterator it = m_branches.begin(); - unsigned length = it++->size(); - for (; it != m_branches.end(); ++it) - if (it->size() != length) - return false; - return true; -} - -/** Return whether this branch is ambiguous at its origin. Also - * returns false if the origin of the branch has since been deleted. - */ -bool -isAmbiguous(const SequenceCollectionHash& g) const -{ - // Get fresh data from the collection to check that this bubble - // does in fact still exist. - const BranchRecord::VP& data = g.getSeqAndData(m_origin).second; - return data.deleted() ? false : data.isAmbiguous(m_dir); -} - - private: - BranchGroup& operator =(const BranchGroup& o); - - BranchGroupData m_branches; - extDirection m_dir; - BranchRecord::V m_origin; - size_t m_maxNumBranches; - bool m_noExt; - BranchGroupStatus m_status; + BranchGroupData m_branches; + extDirection m_dir; + BranchRecord::V m_origin; + size_t m_maxNumBranches; + bool m_noExt; + BranchGroupStatus m_status; }; #endif diff --git a/Bloom/bloom.cc b/Bloom/bloom.cc index daa1ff4a2..7673598f0 100644 --- a/Bloom/bloom.cc +++ b/Bloom/bloom.cc @@ -99,7 +99,7 @@ static const char USAGE_MESSAGE[] = " -n, --num-locks=N number of write locks on bloom filter [1000]\n" " -q, --trim-quality=N trim bases from the ends of reads whose\n" " quality is less than the threshold\n" - " -t, --bloom-type=STR 'konnector' or 'rolling-hash' [konnector]\n" + " -t, --bloom-type=STR 'konnector', 'rolling-hash', or 'counting' [konnector]\n" " --standard-quality zero quality is `!' (33)\n" " default for FASTQ and SAM files\n" " --illumina-quality zero quality is `@' (64)\n" diff --git a/BloomDBG/bloom-dbg.h b/BloomDBG/bloom-dbg.h index d86476141..2b1b84b60 100644 --- a/BloomDBG/bloom-dbg.h +++ b/BloomDBG/bloom-dbg.h @@ -90,6 +90,24 @@ addKmersToBloom(const Sequence& seq, BloomT& bloom) } } +/** + * Returns the sum of all kmer multiplicities in `seq` by querying `bloom` + */ +template +inline static unsigned +getSeqAbsoluteKmerCoverage(const Sequence& seq, const CountingBloomT& bloom) +{ + const unsigned k = bloom.getKmerSize(); + const unsigned numHashes = bloom.getHashNum(); + assert(seq.length() >= k); + unsigned coverage = 0; + for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); + ++it) { + coverage += bloom.minCount(*it); + } + return coverage; +} + /** * Translate a DNA sequence to an equivalent path in the * de Bruijn graph. @@ -170,6 +188,8 @@ struct ContigRecord size_t contigID; /** length of contig (bp) */ unsigned length; + /** coverage of contig */ + unsigned coverage; /** FASTA ID of seeding read */ std::string readID; /** Type of sequence used to seed contig (branch k-mer or full read) */ @@ -433,6 +453,8 @@ trimSeq(Sequence& seq, const BloomT& goodKmerSet) inline static void printContig( const Sequence& seq, + unsigned length, + unsigned coverage, size_t contigID, const std::string& readID, unsigned k, @@ -448,6 +470,7 @@ printContig( /* add FASTA comment indicating extended read id */ std::ostringstream comment; + comment << length << ' ' << coverage << ' '; comment << "read:" << readID; assert(id.good()); contig.id = id.str(); @@ -510,11 +533,12 @@ hasBluntEnd(const Sequence& seq, const GraphT& graph, const AssemblyParams& para * Output a contig sequence if it is not redundant, i.e. it has not already * been generated from a different read / thread of execution. */ -template +template inline static void outputContig( const Path& contigPath, ContigRecord& rec, + SolidKmerSetT& solidKmerSet, AssembledKmerSetT& assembledKmerSet, KmerHash& contigEndKmers, const AssemblyParams& params, @@ -572,15 +596,17 @@ outputContig( if (!redundant) { #pragma omp critical(fasta) { + rec.length = seq.length(); + rec.coverage = getSeqAbsoluteKmerCoverage(seq, solidKmerSet); + /* add contig to output FASTA */ - printContig(seq, counters.contigID, rec.readID, params.k, streams.out); + printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.out); /* add contig to checkpoint FASTA file */ if (params.checkpointsEnabled()) - printContig(seq, counters.contigID, rec.readID, params.k, streams.checkpointOut); + printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.checkpointOut); rec.contigID = counters.contigID; - rec.length = seq.length(); counters.contigID++; counters.basesAssembled += seq.length(); @@ -842,7 +868,7 @@ processRead( /* output contig to FASTA file */ outputContig( - contigPath, contigRec, assembledKmerSet, contigEndKmers, params, counters, streams); + contigPath, contigRec, solidKmerSet, assembledKmerSet, contigEndKmers, params, counters, streams); } /* mark contig k-mers as visited */ diff --git a/ChangeLog b/ChangeLog index 096688c48..450d5c0f0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2020-01-30 Johnathan Wong + + * Release version 2.2.4 + + General: + * Refactor deprecated functions in clang-8 + + Sealer: + * Remove unsupported -D option from help page + + abyss-bloom: + * Add counting Bloom Filter instruction to help page + + abyss-bloom-dbg: + * Report coverage information of unitigs + + 2019-09-20 Johnathan Wong * Release version 2.2.3 diff --git a/FMIndex/BitArrays.h b/FMIndex/BitArrays.h index 113e0377e..5a4b3cabd 100644 --- a/FMIndex/BitArrays.h +++ b/FMIndex/BitArrays.h @@ -23,94 +23,84 @@ class BitArrays static T SENTINEL() { return std::numeric_limits::max(); } public: - -/** Count the occurrences of the symbols of [first, last). */ -template -void assign(It first, It last) -{ - assert(first < last); - m_data.clear(); - - // Determine the size of the alphabet ignoring the sentinel. - T n = 0; - for (It it = first; it != last; ++it) - if (*it != SENTINEL()) - n = std::max(n, *it); - n++; - - assert(n < std::numeric_limits::max()); - m_data.resize(n, wat_array::BitArray(last - first)); - - size_t i = 0; - for (It it = first; it != last; ++it, ++i) { - T c = *it; - if (c == SENTINEL()) - continue; - assert(c < m_data.size()); - m_data[c].SetBit(1, i); + /** Count the occurrences of the symbols of [first, last). */ + template + void assign(It first, It last) + { + assert(first < last); + m_data.clear(); + + // Determine the size of the alphabet ignoring the sentinel. + T n = 0; + for (It it = first; it != last; ++it) + if (*it != SENTINEL()) + n = std::max(n, *it); + n++; + + assert(n < std::numeric_limits::max()); + m_data.resize(n, wat_array::BitArray(last - first)); + + size_t i = 0; + for (It it = first; it != last; ++it, ++i) { + T c = *it; + if (c == SENTINEL()) + continue; + assert(c < m_data.size()); + m_data[c].SetBit(1, i); + } + + std::for_each( + m_data.begin(), m_data.end(), [](wat_array::BitArray& b) { return b.Build(); }); } - std::for_each(m_data.begin(), m_data.end(), - std::mem_fun_ref(&wat_array::BitArray::Build)); -} - -/** Return the size of the string. */ -size_t size() const -{ - assert(!m_data.empty()); - return m_data.front().length(); -} + /** Return the size of the string. */ + size_t size() const + { + assert(!m_data.empty()); + return m_data.front().length(); + } -/** Return the number of occurrences of the specified symbol. */ -size_t count(T c) const -{ - return m_data[c].one_num(); -} + /** Return the number of occurrences of the specified symbol. */ + size_t count(T c) const { return m_data[c].one_num(); } + + /** Return the count of symbol c in s[0, i). */ + size_t rank(T c, size_t i) const { return m_data[c].Rank(1, i); } + + /** Return the symbol at the specified position. */ + T at(size_t i) const + { + assert(!m_data.empty()); + assert(i < m_data.front().length()); + for (Data::const_iterator it = m_data.begin(); it != m_data.end(); ++it) + if (it->Lookup(i)) + return it - m_data.begin(); + return std::numeric_limits::max(); + } -/** Return the count of symbol c in s[0, i). */ -size_t rank(T c, size_t i) const -{ - return m_data[c].Rank(1, i); -} + /** Store this data structure. */ + friend std::ostream& operator<<(std::ostream& out, const BitArrays& o) + { + uint32_t n = o.m_data.size(); + out.write(reinterpret_cast(&n), sizeof n); + for (Data::const_iterator it = o.m_data.begin(); it != o.m_data.end(); ++it) + it->Save(out); + return out; + } -/** Return the symbol at the specified position. */ -T at(size_t i) const -{ - assert(!m_data.empty()); - assert(i < m_data.front().length()); - for (Data::const_iterator it = m_data.begin(); - it != m_data.end(); ++it) - if (it->Lookup(i)) - return it - m_data.begin(); - return std::numeric_limits::max(); -} - -/** Store this data structure. */ -friend std::ostream& operator<<(std::ostream& out, const BitArrays& o) -{ - uint32_t n = o.m_data.size(); - out.write(reinterpret_cast(&n), sizeof n); - for (Data::const_iterator it = o.m_data.begin(); - it != o.m_data.end(); ++it) - it->Save(out); - return out; -} - -/** Load this data structure. */ -friend std::istream& operator>>(std::istream& in, BitArrays& o) -{ - o.m_data.clear(); - uint32_t n = 0; - if (!in.read(reinterpret_cast(&n), sizeof n)) + /** Load this data structure. */ + friend std::istream& operator>>(std::istream& in, BitArrays& o) + { + o.m_data.clear(); + uint32_t n = 0; + if (!in.read(reinterpret_cast(&n), sizeof n)) + return in; + assert(n > 0); + assert(n < std::numeric_limits::max()); + o.m_data.resize(n); + for (Data::iterator it = o.m_data.begin(); it != o.m_data.end(); ++it) + it->Load(in); return in; - assert(n > 0); - assert(n < std::numeric_limits::max()); - o.m_data.resize(n); - for (Data::iterator it = o.m_data.begin(); - it != o.m_data.end(); ++it) - it->Load(in); - return in; -} + } private: typedef std::vector Data; diff --git a/FilterGraph/FilterGraph.cc b/FilterGraph/FilterGraph.cc index f73165af7..cc0628fcd 100644 --- a/FilterGraph/FilterGraph.cc +++ b/FilterGraph/FilterGraph.cc @@ -9,16 +9,16 @@ #include "ContigPath.h" #include "ContigProperties.h" #include "FastaReader.h" -#include "IOUtil.h" -#include "Uncompress.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" +#include "IOUtil.h" +#include "Uncompress.h" +#include #include #include -#include #include #include #include @@ -37,153 +37,159 @@ using boost::tie; #define PROGRAM "abyss-filtergraph" static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Tony Raymond.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Tony Raymond.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... ADJ [FASTA]\n" -"Remove short contigs that do not contribute any relevant\n" -"information to the assembly.\n" -"\n" -" Arguments:\n" -"\n" -" ADJ contig adjacency graph\n" -" FASTA contigs to check consistency of ADJ edges\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=N k-mer size\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation\n" -" -T, --island=N remove islands shorter than N [0]\n" -" -t, --tip=N remove tips shorter than N [0]\n" -" -l, --length=N remove contigs shorter than N [0]\n" -" -L, --max-length=N remove contigs longer than N [0]\n" -" -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n" -" -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n" -" --shim remove filler contigs that only contribute\n" -" to adjacency [default]\n" -" --no-shim disable filler contigs removal\n" -" --shim-max-degree=N only remove shims where the smaller of \n" -" in/out degree is smaller than N [1]\n" -" -m, --min-overlap=N require a minimum overlap of N bases [10]\n" -" --assemble assemble unambiguous paths\n" -" --no-assemble disable assembling of paths [default]\n" -" -g, --graph=FILE write the contig adjacency graph to FILE\n" -" -i, --ignore=FILE ignore contigs seen in FILE\n" -" -r, --remove=FILE remove contigs seen in FILE\n" -" --adj output the graph in ADJ format [default]\n" -" --asqg output the graph in ASQG format\n" -" --dot output the graph in GraphViz format\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... ADJ [FASTA]\n" + "Remove short contigs that do not contribute any relevant\n" + "information to the assembly.\n" + "\n" + " Arguments:\n" + "\n" + " ADJ contig adjacency graph\n" + " FASTA contigs to check consistency of ADJ edges\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=N k-mer size\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation\n" + " -T, --island=N remove islands shorter than N [0]\n" + " -t, --tip=N remove tips shorter than N [0]\n" + " -l, --length=N remove contigs shorter than N [0]\n" + " -L, --max-length=N remove contigs longer than N [0]\n" + " -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n" + " -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n" + " --shim remove filler contigs that only contribute\n" + " to adjacency [default]\n" + " --no-shim disable filler contigs removal\n" + " --shim-max-degree=N only remove shims where the smaller of \n" + " in/out degree is smaller than N [1]\n" + " -m, --min-overlap=N require a minimum overlap of N bases [10]\n" + " --assemble assemble unambiguous paths\n" + " --no-assemble disable assembling of paths [default]\n" + " -g, --graph=FILE write the contig adjacency graph to FILE\n" + " -i, --ignore=FILE ignore contigs seen in FILE\n" + " -r, --remove=FILE remove contigs seen in FILE\n" + " --adj output the graph in ADJ format [default]\n" + " --asqg output the graph in ASQG format\n" + " --dot output the graph in GraphViz format\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Remove island contigs less than this length. */ - static unsigned minIslandLen = 0; +/** Remove island contigs less than this length. */ +static unsigned minIslandLen = 0; - /** Remove tips less than this length. */ - static unsigned minTipLen = 0; +/** Remove tips less than this length. */ +static unsigned minTipLen = 0; - /** Remove all contigs less than this length. */ - static unsigned minLen = 0; +/** Remove all contigs less than this length. */ +static unsigned minLen = 0; - /** Remove all contigs more than this length. */ - static unsigned maxLen = 0; +/** Remove all contigs more than this length. */ +static unsigned maxLen = 0; - /** Remove contigs with mean k-mer coverage less than this threshold. */ - static float minCoverage = 0; +/** Remove contigs with mean k-mer coverage less than this threshold. */ +static float minCoverage = 0; - /** Remove contigs with mean k-mer coverage at least this threshold. */ - static float maxCoverage = 0; +/** Remove contigs with mean k-mer coverage at least this threshold. */ +static float maxCoverage = 0; - /** Remove short contigs that don't contribute any sequence. */ - static int shim = 1; +/** Remove short contigs that don't contribute any sequence. */ +static int shim = 1; - /** Only remove shims where the smaller of in/out degree is small - * enough. */ - static unsigned shimMaxDegree = 1; +/** Only remove shims where the smaller of in/out degree is small + * enough. */ +static unsigned shimMaxDegree = 1; - /** Assemble unambiguous paths. */ - static int assemble = 0; +/** Assemble unambiguous paths. */ +static int assemble = 0; - /** Write the contig adjacency graph to this file. */ - static string graphPath; +/** Write the contig adjacency graph to this file. */ +static string graphPath; - /** Contigs to ignore. */ - static string ignorePath; +/** Contigs to ignore. */ +static string ignorePath; - /** Contigs to remove. */ - static string removePath; +/** Contigs to remove. */ +static string removePath; - /** The minimum overlap allowed between two contigs. */ - static int minOverlap = 10; +/** The minimum overlap allowed between two contigs. */ +static int minOverlap = 10; - /** Output graph format. */ - int format = ADJ; // used by ContigProperties +/** Output graph format. */ +int format = ADJ; // used by ContigProperties } static const char shortopts[] = "c:C:g:i:r:k:l:L:m:t:T:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_SHIM_MAX_DEG }; +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_SHIM_MAX_DEG +}; static const struct option longopts[] = { - { "adj", no_argument, &opt::format, ADJ }, - { "asqg", no_argument, &opt::format, ASQG }, - { "dot", no_argument, &opt::format, DOT }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "graph", required_argument, NULL, 'g' }, - { "ignore", required_argument, NULL, 'i' }, - { "remove", required_argument, NULL, 'r' }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "kmer", required_argument, NULL, 'k' }, - { "island", required_argument, NULL, 'T' }, - { "tip", required_argument, NULL, 't' }, - { "length", required_argument, NULL, 'l' }, - { "max-length", required_argument, NULL, 'L' }, - { "coverage", required_argument, NULL, 'c' }, - { "max-coverage", required_argument, NULL, 'C' }, - { "shim", no_argument, &opt::shim, 1 }, - { "no-shim", no_argument, &opt::shim, 0 }, + { "adj", no_argument, &opt::format, ADJ }, + { "asqg", no_argument, &opt::format, ASQG }, + { "dot", no_argument, &opt::format, DOT }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "graph", required_argument, NULL, 'g' }, + { "ignore", required_argument, NULL, 'i' }, + { "remove", required_argument, NULL, 'r' }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "kmer", required_argument, NULL, 'k' }, + { "island", required_argument, NULL, 'T' }, + { "tip", required_argument, NULL, 't' }, + { "length", required_argument, NULL, 'l' }, + { "max-length", required_argument, NULL, 'L' }, + { "coverage", required_argument, NULL, 'c' }, + { "max-coverage", required_argument, NULL, 'C' }, + { "shim", no_argument, &opt::shim, 1 }, + { "no-shim", no_argument, &opt::shim, 0 }, { "shim-max-degree", required_argument, NULL, OPT_SHIM_MAX_DEG }, - { "assemble", no_argument, &opt::assemble, 1 }, - { "no-assemble", no_argument, &opt::assemble, 0 }, - { "min-overlap", required_argument, NULL, 'm' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, + { "assemble", no_argument, &opt::assemble, 1 }, + { "no-assemble", no_argument, &opt::assemble, 0 }, + { "min-overlap", required_argument, NULL, 'm' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, { NULL, 0, NULL, 0 } }; static vector g_removed; /** Contig adjacency graph. */ -typedef ContigGraph > Graph; +typedef ContigGraph> Graph; typedef Graph::vertex_descriptor vertex_descriptor; typedef Graph::edge_descriptor edge_descriptor; /** Data for verbose output. */ -static struct { +static struct +{ unsigned removed; unsigned tails; unsigned too_long; @@ -194,7 +200,8 @@ static struct { } g_count; /** Returns if the contig can be removed from the graph. */ -static bool removable(const Graph* pg, vertex_descriptor v) +static bool +removable(const Graph* pg, vertex_descriptor v) { typedef graph_traits GTraits; typedef GTraits::out_edge_iterator OEit; @@ -248,8 +255,7 @@ static bool removable(const Graph* pg, vertex_descriptor v) if (g[*maxvw].distance < g[*vw].distance) maxvw = vw; - if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance > - -opt::minOverlap) { + if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance > -opt::minOverlap) { g_count.too_long++; return false; } @@ -257,20 +263,32 @@ static bool removable(const Graph* pg, vertex_descriptor v) } /** Data to store information of an edge. */ -struct EdgeInfo { +struct EdgeInfo +{ vertex_descriptor u; vertex_descriptor w; edge_bundle_type::type ep; EdgeInfo(vertex_descriptor u, vertex_descriptor w, int ep) - : u(u), w(w), ep(ep) {} - EdgeInfo() : u(), w(), ep() {} + : u(u) + , w(w) + , ep(ep) + {} + EdgeInfo() + : u() + , w() + , ep() + {} }; /** Returns a list of edges that may be added when the vertex v is * removed. */ -static bool findNewEdges(const Graph& g, vertex_descriptor v, - vector& eds, vector& markedContigs) +static bool +findNewEdges( + const Graph& g, + vertex_descriptor v, + vector& eds, + vector& markedContigs) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -288,8 +306,7 @@ static bool findNewEdges(const Graph& g, vertex_descriptor v, // for every edge from u->v and v->w we must add an edge u->w for (IEit uv = iei0; uv != iei1; ++uv) { for (OEit vw = oei0; vw != oei1; ++vw) { - int x = g[*uv].distance + (int)g[v].length + - g[*vw].distance; + int x = g[*uv].distance + (int)g[v].length + g[*vw].distance; assert(x <= 0); EdgeInfo ed(source(*uv, g), target(*vw, g), x); eds.push_back(ed); @@ -299,17 +316,16 @@ static bool findNewEdges(const Graph& g, vertex_descriptor v, marked.push_back(ed.w); } } - for (vector::const_iterator it = marked.begin(); - it != marked.end(); it++) + for (vector::const_iterator it = marked.begin(); it != marked.end(); it++) markedContigs[get(vertex_index, g, *it)] = true; return true; } /** Adds all edges described in the vector eds. */ -static void addNewEdges(Graph& g, const vector& eds) +static void +addNewEdges(Graph& g, const vector& eds) { - for (vector::const_iterator edsit = eds.begin(); - edsit != eds.end(); ++edsit) { + for (vector::const_iterator edsit = eds.begin(); edsit != eds.end(); ++edsit) { // Don't add parallel edges! This can happen when removing a palindrome. if (edge(edsit->u, edsit->w, g).second) { g_count.parallel_edge++; @@ -320,16 +336,18 @@ static void addNewEdges(Graph& g, const vector& eds) } } -static void removeContig(vertex_descriptor v, Graph& g) +static void +removeContig(vertex_descriptor v, Graph& g) { - clear_vertex(v, g); - remove_vertex(v, g); - g_removed.push_back(get(vertex_contig_index, g, v)); - g_count.removed++; + clear_vertex(v, g); + remove_vertex(v, g); + g_removed.push_back(get(vertex_contig_index, g, v)); + g_count.removed++; } /** Remove the specified contig from the adjacency graph. */ -static void removeContigs(Graph& g, vector& sc) +static void +removeContigs(Graph& g, vector& sc) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -338,13 +356,11 @@ static void removeContigs(Graph& g, vector& sc) out.reserve(sc.size()); vector markedContigs(g.num_vertices()); - for (vector::iterator it = sc.begin(); - it != sc.end(); ++it) { + for (vector::iterator it = sc.begin(); it != sc.end(); ++it) { V v = *it; if (opt::verbose > 0 && ++g_count.checked % 10000000 == 0) - cerr << "Removed " << g_count.removed << "/" - << g_count.checked - << " vertices that have been checked.\n"; + cerr << "Removed " << g_count.removed << "/" << g_count.checked + << " vertices that have been checked.\n"; if (markedContigs[get(vertex_index, g, v)]) { out.push_back(v); @@ -366,40 +382,47 @@ static void removeContigs(Graph& g, vector& sc) } /** Return the value of the bit at the specified index. */ -struct Marked : unary_function { +struct Marked : unary_function +{ typedef vector Data; Marked(const Graph& g, const Data& data) - : m_g(g), m_data(data) { } - bool operator()(vertex_descriptor u) const - { - return m_data[get(vertex_contig_index, m_g, u)]; - } + : m_g(g) + , m_data(data) + {} + bool operator()(vertex_descriptor u) const { return m_data[get(vertex_contig_index, m_g, u)]; } + private: const Graph& m_g; const Data& m_data; }; /** Finds all potentially removable contigs in the graph. */ -static void findShortContigs(const Graph& g, const vector& seen, - vector& sc) +static void +findShortContigs(const Graph& g, const vector& seen, vector& sc) { typedef graph_traits GTraits; typedef GTraits::vertex_iterator Vit; Vit first, second; tie(first, second) = vertices(g); - ::copy_if(first, second, back_inserter(sc), - !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1)); + ::copy_if( + first, + second, + back_inserter(sc), + !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1)); } /** Functor used for sorting contigs based on degree, then size, * and then ID. */ -struct sortContigs { +struct sortContigs +{ const Graph& g; - sortContigs(const Graph& g) : g(g) {} + sortContigs(const Graph& g) + : g(g) + {} - template - bool operator() (V a, V b) + template + bool operator()(V a, V b) { const ContigProperties& ap = g[a]; const ContigProperties& bp = g[b]; @@ -407,60 +430,71 @@ struct sortContigs { unsigned dega = out_degree(a, g) * in_degree(a, g); unsigned degb = out_degree(b, g) * in_degree(b, g); - return dega != degb ? dega < degb - : ap.length != bp.length ? ap.length < bp.length - : a < b; + return dega != degb ? dega < degb : ap.length != bp.length ? ap.length < bp.length : a < b; } }; -struct ShorterThanX : unary_function { +struct ShorterThanX : unary_function +{ const Graph& g; const vector& seen; size_t x; ShorterThanX(const Graph& g, const vector& seen, size_t x) - : g(g), seen(seen), x(x) { } + : g(g) + , seen(seen) + , x(x) + {} bool operator()(vertex_descriptor y) const { - return g[y].length < x && !get(vertex_removed, g, y) - && !seen[get(vertex_contig_index, g, y)]; + return g[y].length < x && !get(vertex_removed, g, y) && + !seen[get(vertex_contig_index, g, y)]; } }; -struct LongerThanX : unary_function { +struct LongerThanX : unary_function +{ const Graph& g; const vector& seen; size_t x; LongerThanX(const Graph& g, const vector& seen, size_t x) - : g(g), seen(seen), x(x) { } + : g(g) + , seen(seen) + , x(x) + {} bool operator()(vertex_descriptor y) const { - return g[y].length > x && !get(vertex_removed, g, y) - && !seen[get(vertex_contig_index, g, y)]; + return g[y].length > x && !get(vertex_removed, g, y) && + !seen[get(vertex_contig_index, g, y)]; } }; -struct CoverageLessThan : unary_function { +struct CoverageLessThan : unary_function +{ const Graph& g; const vector& seen; float minCov; CoverageLessThan(const Graph& g, const vector& seen, float minCov) - : g(g), seen(seen), minCov(minCov) { } + : g(g) + , seen(seen) + , minCov(minCov) + {} bool operator()(vertex_descriptor u) const { assert(opt::k > 0); float meanCoverage = (float)g[u].coverage / (g[u].length - opt::k + 1); - return meanCoverage < minCov && !get(vertex_removed, g, u) - && !seen[get(vertex_contig_index, g, u)]; + return meanCoverage < minCov && !get(vertex_removed, g, u) && + !seen[get(vertex_contig_index, g, u)]; } }; -static void removeShims(Graph& g, const vector& seen) +static void +removeShims(Graph& g, const vector& seen) { if (opt::verbose > 0) cerr << "Removing shim contigs from the graph...\n"; @@ -468,25 +502,22 @@ static void removeShims(Graph& g, const vector& seen) findShortContigs(g, seen, shortContigs); for (unsigned i = 0; !shortContigs.empty(); ++i) { if (opt::verbose > 0) - cerr << "Pass " << i + 1 << ": Checking " - << shortContigs.size() << " contigs.\n"; - sort(shortContigs.begin(), shortContigs.end(), - sortContigs(g)); + cerr << "Pass " << i + 1 << ": Checking " << shortContigs.size() << " contigs.\n"; + sort(shortContigs.begin(), shortContigs.end(), sortContigs(g)); removeContigs(g, shortContigs); } if (opt::verbose > 0) { cerr << "Shim removal stats:\n"; - cerr << "Removed: " << g_count.removed/2 - << " Too Complex: " << g_count.too_complex/2 - << " Tails: " << g_count.tails/2 - << " Too Long: " << g_count.too_long/2 - << " Self Adjacent: " << g_count.self_adj/2 - << " Parallel Edges: " << g_count.parallel_edge/2 << '\n'; + cerr << "Removed: " << g_count.removed / 2 << " Too Complex: " << g_count.too_complex / 2 + << " Tails: " << g_count.tails / 2 << " Too Long: " << g_count.too_long / 2 + << " Self Adjacent: " << g_count.self_adj / 2 + << " Parallel Edges: " << g_count.parallel_edge / 2 << '\n'; } } -template -static void removeContigs_if(Graph& g, pred p) +template +static void +removeContigs_if(Graph& g, pred p) { typedef graph_traits GTraits; typedef GTraits::vertex_iterator Vit; @@ -496,10 +527,11 @@ static void removeContigs_if(Graph& g, pred p) vector sc; ::copy_if(first, second, back_inserter(sc), p); remove_vertex_if(g, sc.begin(), sc.end(), True()); - transform(sc.begin(), sc.end(), back_inserter(g_removed), - mem_fun_ref(&ContigNode::contigIndex)); + transform(sc.begin(), sc.end(), back_inserter(g_removed), [](const ContigNode& c) { + return c.contigIndex(); + }); if (opt::verbose > 0) - cerr << "Removed " << sc.size()/2 << " contigs.\n"; + cerr << "Removed " << sc.size() / 2 << " contigs.\n"; } /** Contig sequences. */ @@ -507,7 +539,8 @@ typedef vector Contigs; static Contigs g_contigs; /** Return the sequence of vertex u. */ -static string getSequence(const Graph& g, vertex_descriptor u) +static string +getSequence(const Graph& g, vertex_descriptor u) { size_t i = get(vertex_contig_index, g, u); assert(i < g_contigs.size()); @@ -516,11 +549,13 @@ static string getSequence(const Graph& g, vertex_descriptor u) } /** Return whether the specified edge is inconsistent. */ -struct is_edge_inconsistent : unary_function { +struct is_edge_inconsistent : unary_function +{ const Graph& g; is_edge_inconsistent(const Graph& g) - : g(g) { } + : g(g) + {} bool operator()(edge_descriptor e) const { @@ -541,15 +576,17 @@ struct is_edge_inconsistent : unary_function { } }; -template -static void remove_edge(Graph& g, It first, It last) +template +static void +remove_edge(Graph& g, It first, It last) { for (; first != last; first++) remove_edge(*first, g); } template -static void removeEdges_if(Graph& g, pred p) +static void +removeEdges_if(Graph& g, pred p) { typedef graph_traits GTraits; typedef GTraits::edge_iterator Eit; @@ -565,86 +602,86 @@ static void removeEdges_if(Graph& g, pred p) } } -int main(int argc, char** argv) +int +main(int argc, char** argv) { string commandLine; { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': + case '?': die = true; break; - case 'c': + case 'c': arg >> opt::minCoverage; break; - case 'C': + case 'C': arg >> opt::maxCoverage; break; - case 'l': + case 'l': arg >> opt::minLen; break; - case 'L': + case 'L': arg >> opt::maxLen; break; - case 'm': + case 'm': arg >> opt::minOverlap; break; - case 'g': + case 'g': arg >> opt::graphPath; break; - case 'i': + case 'i': arg >> opt::ignorePath; break; - case 'r': + case 'r': arg >> opt::removePath; break; - case 'k': + case 'k': arg >> opt::k; break; - case 'T': + case 'T': arg >> opt::minIslandLen; break; - case 't': + case 't': arg >> opt::minTipLen; break; - case 'v': + case 'v': opt::verbose++; break; - case OPT_SHIM_MAX_DEG: + case OPT_SHIM_MAX_DEG: arg >> opt::shimMaxDegree; break; - case OPT_HELP: + case OPT_HELP: cout << USAGE_MESSAGE; exit(EXIT_SUCCESS); - case OPT_VERSION: + case OPT_VERSION: cout << VERSION_MESSAGE; exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::minOverlap < 0) { cerr << PROGRAM ": " - << "--min-overlap must be a positive integer.\n"; + << "--min-overlap must be a positive integer.\n"; die = true; } if (opt::k <= 0) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } @@ -659,8 +696,7 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -669,8 +705,7 @@ int main(int argc, char** argv) { string adjPath(argv[optind++]); if (opt::verbose > 0) - cerr << "Loading graph from file: " << adjPath - << '\n'; + cerr << "Loading graph from file: " << adjPath << '\n'; ifstream fin(adjPath.c_str()); assert_good(fin, adjPath); fin >> g; @@ -698,12 +733,11 @@ int main(int argc, char** argv) size_t b = g_removed.size(); while (in >> s) { size_t i = get(g_contigNames, s); - removeContig(ContigNode(i,0), g); + removeContig(ContigNode(i, 0), g); } assert(in.eof()); if (opt::verbose) - cerr << "Removed " << g_removed.size() - b - << " contigs.\n"; + cerr << "Removed " << g_removed.size() - b << " contigs.\n"; } // Remove shims. @@ -713,11 +747,9 @@ int main(int argc, char** argv) // Remove islands. if (opt::minIslandLen > 0) { size_t s = g_removed.size(); - removeIslands_if(g, back_inserter(g_removed), - ShorterThanX(g, seen, opt::minIslandLen)); + removeIslands_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minIslandLen)); if (opt::verbose) - cerr << "Removed " << g_removed.size() - s - << " islands.\n"; + cerr << "Removed " << g_removed.size() - s << " islands.\n"; } // Remove tips. @@ -726,12 +758,10 @@ int main(int argc, char** argv) s = g_removed.size(); do { prev = g_removed.size(); - pruneTips_if(g, back_inserter(g_removed), - ShorterThanX(g, seen, opt::minTipLen)); + pruneTips_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minTipLen)); } while (prev < g_removed.size()); if (opt::verbose) - cerr << "Removed " << g_removed.size() - s - << " tips.\n"; + cerr << "Removed " << g_removed.size() - s << " tips.\n"; } // Remove short contigs. @@ -748,8 +778,7 @@ int main(int argc, char** argv) // Remove contigs with high mean k-mer coverage. if (opt::maxCoverage > 0) - removeContigs_if(g, - std::not1(CoverageLessThan(g, seen, opt::maxCoverage))); + removeContigs_if(g, std::not1(CoverageLessThan(g, seen, opt::maxCoverage))); // Remove inconsistent edges of spaceseeds if (argc - optind == 1) { @@ -793,8 +822,7 @@ int main(int argc, char** argv) else assemble(g, back_inserter(paths)); g_contigNames.unlock(); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { ContigNode u(numContigs + it - paths.begin(), false); string name = createContigName(); put(vertex_name, g, u, name); diff --git a/Graph/ContigGraphAlgorithms.h b/Graph/ContigGraphAlgorithms.h index dd98fd5bd..f6ea421ef 100644 --- a/Graph/ContigGraphAlgorithms.h +++ b/Graph/ContigGraphAlgorithms.h @@ -7,8 +7,8 @@ #include "Common/Functional.h" #include "Common/Iterator.h" #include "Graph/ContigGraph.h" -#include #include +#include #include #include #include @@ -18,51 +18,49 @@ using boost::graph_traits; /** Return true if the edge e is a palindrome. */ template -struct IsPalindrome : std::unary_function< - typename graph_traits::edge_descriptor, bool> +struct IsPalindrome : std::unary_function::edge_descriptor, bool> { - IsPalindrome(const Graph& g) : m_g(g) { } - bool operator()( - typename graph_traits::edge_descriptor e) const + IsPalindrome(const Graph& g) + : m_g(g) + {} + bool operator()(typename graph_traits::edge_descriptor e) const { - return source(e, m_g) - == get(vertex_complement, m_g, target(e, m_g)); + return source(e, m_g) == get(vertex_complement, m_g, target(e, m_g)); } + private: const Graph& m_g; }; /** Return whether the outgoing edge of vertex u is contiguous. */ template -bool contiguous_out(const Graph& g, - typename graph_traits::vertex_descriptor u) +bool +contiguous_out(const Graph& g, typename graph_traits::vertex_descriptor u) { - return out_degree(u, g) == 1 - && in_degree(*adjacent_vertices(u, g).first, g) == 1; + return out_degree(u, g) == 1 && in_degree(*adjacent_vertices(u, g).first, g) == 1; } /** Return whether the incoming edge of vertex u is contiguous. */ template -bool contiguous_in(const Graph& g, - typename graph_traits::vertex_descriptor u) +bool +contiguous_in(const Graph& g, typename graph_traits::vertex_descriptor u) { return contiguous_out(g, get(vertex_complement, g, u)); } /** Add the outgoing edges of vertex u to vertex uout. */ template -void copy_out_edges(Graph &g, - typename Graph::vertex_descriptor u, - typename Graph::vertex_descriptor uout) +void +copy_out_edges( + Graph& g, + typename Graph::vertex_descriptor u, + typename Graph::vertex_descriptor uout) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; - typedef typename graph_traits::out_edge_iterator - out_edge_iterator; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; + typedef typename graph_traits::out_edge_iterator out_edge_iterator; typedef typename edge_property::type edge_property_type; assert(u != uout); - std::pair - edges = g.out_edges(u); + std::pair edges = g.out_edges(u); bool palindrome = false; edge_property_type palindrome_ep; for (out_edge_iterator e = edges.first; e != edges.second; ++e) { @@ -87,24 +85,20 @@ void copy_out_edges(Graph &g, /** Add the incoming edges of vertex u to vertex v. */ template -void copy_in_edges(Graph& g, - typename Graph::vertex_descriptor u, - typename Graph::vertex_descriptor v) +void +copy_in_edges(Graph& g, typename Graph::vertex_descriptor u, typename Graph::vertex_descriptor v) { - copy_out_edges(g, - get(vertex_complement, g, u), - get(vertex_complement, g, v)); + copy_out_edges(g, get(vertex_complement, g, u), get(vertex_complement, g, v)); } /** Assemble a path of unambigous out edges starting at vertex u. * u itself is not copied to out. */ template -OutIt extend(const Graph& g, - typename Graph::vertex_descriptor u, OutIt out) +OutIt +extend(const Graph& g, typename Graph::vertex_descriptor u, OutIt out) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; std::set seen; while (out_degree(u, g) == 1 && seen.insert(u).second) { u = *adjacent_vertices(u, g).first; @@ -116,12 +110,10 @@ OutIt extend(const Graph& g, /** Assemble an unambiguous path starting at vertex u. * Every edge must satisfy the predicate. */ template -OutIt assemble_if(const Graph& g, - typename Graph::vertex_descriptor u, OutIt out, - Predicate pred) +OutIt +assemble_if(const Graph& g, typename Graph::vertex_descriptor u, OutIt out, Predicate pred) { - typedef typename graph_traits::edge_descriptor - edge_descriptor; + typedef typename graph_traits::edge_descriptor edge_descriptor; while (contiguous_out(g, u)) { edge_descriptor e = *out_edges(u, g).first; if (!pred(e)) @@ -138,20 +130,21 @@ OutIt assemble_if(const Graph& g, * are removed as well. */ template -void remove_vertex_if(Graph& g, It first, It last, Predicate p) +void +remove_vertex_if(Graph& g, It first, It last, Predicate p) { - for_each_if(first, last, - bind1st(std::mem_fun(&Graph::clear_vertex), &g), p); - for_each_if(first, last, - bind1st(std::mem_fun(&Graph::remove_vertex), &g), p); + for_each_if( + first, last, [&g](const ContigNode& c) { return g.clear_vertex(c); }, p); + for_each_if( + first, last, [&g](const ContigNode& c) { return g.remove_vertex(c); }, p); } /** Add the vertex and edge propeties of the path [first, last). */ template -VP addProp(const Graph& g, It first, It last, const VP*) +VP +addProp(const Graph& g, It first, It last, const VP*) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; assert(first != last); VP vp = get(vertex_bundle, g, *first); for (It it = first + 1; it != last; ++it) { @@ -164,17 +157,17 @@ VP addProp(const Graph& g, It first, It last, const VP*) } template -no_property addProp(const Graph&, It, It, const no_property*) +no_property +addProp(const Graph&, It, It, const no_property*) { return no_property(); } template -typename vertex_property::type addProp(const Graph& g, - It first, It last) +typename vertex_property::type +addProp(const Graph& g, It first, It last) { - return addProp(g, first, last, - (typename vertex_property::type*)NULL); + return addProp(g, first, last, (typename vertex_property::type*)NULL); } /** Merge the vertices in the sequence [first, last). @@ -185,8 +178,7 @@ template typename graph_traits::vertex_descriptor merge(Graph& g, It first, It last) { - typedef typename graph_traits::vertex_descriptor - vertex_descriptor; + typedef typename graph_traits::vertex_descriptor vertex_descriptor; assert(first != last); vertex_descriptor u = add_vertex(addProp(g, first, last), g); copy_in_edges(g, *first, u); @@ -197,26 +189,24 @@ merge(Graph& g, It first, It last) /** Assemble unambiguous paths. Write the paths to out. * Every edge must satisfy the predicate. */ template -OutIt assemble_if(Graph& g, OutIt out, Predicate pred0) +OutIt +assemble_if(Graph& g, OutIt out, Predicate pred0) { typedef typename Graph::vertex_iterator vertex_iterator; // pred(e) = !isPalindrome(e) && pred0(e) - binary_compose, - std::unary_negate >, Predicate> - pred(compose2(std::logical_and(), - std::not1(IsPalindrome(g)), pred0)); + binary_compose, std::unary_negate>, Predicate> pred( + compose2(std::logical_and(), std::not1(IsPalindrome(g)), pred0)); std::pair uit = g.vertices(); for (vertex_iterator u = uit.first; u != uit.second; ++u) { - if (!contiguous_out(g, *u) || contiguous_in(g, *u) - || !pred(*out_edges(*u, g).first)) + if (!contiguous_out(g, *u) || contiguous_in(g, *u) || !pred(*out_edges(*u, g).first)) continue; typename output_iterator_traits::value_type path; assemble_if(g, *u, back_inserter(path), pred); assert(path.size() >= 2); assert(path.front() != path.back()); merge(g, path.begin(), path.end()); - remove_vertex_if(g, path.begin(), path.end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); }); *out++ = path; } return out; @@ -224,25 +214,25 @@ OutIt assemble_if(Graph& g, OutIt out, Predicate pred0) /** Assemble unambiguous paths. Write the paths to out. */ template -OutIt assemble(Graph& g, OutIt out) +OutIt +assemble(Graph& g, OutIt out) { - typedef typename graph_traits::edge_descriptor - edge_descriptor; + typedef typename graph_traits::edge_descriptor edge_descriptor; return assemble_if(g, out, True()); } /** Return true if the edge e is +ve sense. */ template -struct IsPositive : std::unary_function< - typename graph_traits::edge_descriptor, bool> +struct IsPositive : std::unary_function::edge_descriptor, bool> { - IsPositive(const Graph& g) : m_g(g) { } - bool operator()( - typename graph_traits::edge_descriptor e) const + IsPositive(const Graph& g) + : m_g(g) + {} + bool operator()(typename graph_traits::edge_descriptor e) const { - return !get(vertex_sense, m_g, source(e, m_g)) - && !get(vertex_sense, m_g, target(e, m_g)); + return !get(vertex_sense, m_g, source(e, m_g)) && !get(vertex_sense, m_g, target(e, m_g)); } + private: const Graph& m_g; }; @@ -250,7 +240,8 @@ struct IsPositive : std::unary_function< /** Assemble unambiguous paths in forward orientation only. * Write the paths to out. */ template -OutIt assemble_stranded(Graph& g, OutIt out) +OutIt +assemble_stranded(Graph& g, OutIt out) { return assemble_if(g, out, IsPositive(g)); } @@ -260,8 +251,9 @@ OutIt assemble_stranded(Graph& g, OutIt out) * deg+(v) = 0, and p(v) is true. * Stores all removed vertices in result. */ -template -OutputIt pruneTips_if(Graph& g, OutputIt result, Pred p) +template +OutputIt +pruneTips_if(Graph& g, OutputIt result, Pred p) { typedef typename graph_traits::adjacency_iterator Vit; typedef typename graph_traits::vertex_iterator Uit; @@ -277,7 +269,7 @@ OutputIt pruneTips_if(Graph& g, OutputIt result, Pred p) std::pair vrange = adjacent_vertices(u, g); for (Vit vit = vrange.first; vit != vrange.second; ++vit) { V v = *vit; - //assert(v != u); + // assert(v != u); if (out_degree(v, g) == 0 && p(v)) tips.push_back(v); } @@ -285,23 +277,23 @@ OutputIt pruneTips_if(Graph& g, OutputIt result, Pred p) /** Remove the tips. */ remove_vertex_if(g, tips.begin(), tips.end(), True()); - std::transform(tips.begin(), tips.end(), result, - std::mem_fun_ref(&ContigNode::contigIndex)); - + std::transform( + tips.begin(), tips.end(), result, [](const ContigNode& c) { return c.contigIndex(); }); return result; } /** Return true if the vertex is a normal 1-in 0-out tip. */ template -struct IsTip : std::unary_function< - typename graph_traits::vertex_descriptor, bool> +struct IsTip : std::unary_function::vertex_descriptor, bool> { - IsTip(const Graph& g) : m_g(g) { } - bool operator()( - typename graph_traits::vertex_descriptor v) const + IsTip(const Graph& g) + : m_g(g) + {} + bool operator()(typename graph_traits::vertex_descriptor v) const { return in_degree(v, m_g) == 1; } + private: const Graph& m_g; }; @@ -311,8 +303,9 @@ struct IsTip : std::unary_function< * and deg-(v) = 1 and deg+(v) = 0. * Stores all removed vertices in result. */ -template -OutputIt pruneTips(Graph& g, OutputIt result) +template +OutputIt +pruneTips(Graph& g, OutputIt result) { return pruneTips_if(g, result, IsTip(g)); } @@ -322,8 +315,9 @@ OutputIt pruneTips(Graph& g, OutputIt result) * true. * Stores all removed vertices in result. */ -template -OutputIt removeIslands_if(Graph& g, OutputIt result, Pred p) +template +OutputIt +removeIslands_if(Graph& g, OutputIt result, Pred p) { typedef typename graph_traits::vertex_iterator Uit; typedef typename graph_traits::vertex_descriptor V; @@ -344,8 +338,9 @@ OutputIt removeIslands_if(Graph& g, OutputIt result, Pred p) } /** Add missing complementary edges. */ -template -size_t addComplementaryEdges(ContigGraph& g) +template +size_t +addComplementaryEdges(ContigGraph& g) { typedef ContigGraph Graph; typedef graph_traits GTraits; diff --git a/KAligner/Aligner.h b/KAligner/Aligner.h index 947b8f603..6f2b7a11b 100644 --- a/KAligner/Aligner.h +++ b/KAligner/Aligner.h @@ -1,13 +1,13 @@ #ifndef ALIGNER_H #define ALIGNER_H 1 -#include "config.h" -#include "KAligner/Options.h" #include "Alignment.h" #include "ConstString.h" #include "Functional.h" +#include "KAligner/Options.h" #include "Kmer.h" #include "UnorderedMap.h" +#include "config.h" #include #include #include // for strcpy @@ -27,131 +27,124 @@ struct Position { uint32_t contig; uint32_t pos; // 0 indexed - Position(uint32_t contig = std::numeric_limits::max(), - uint32_t pos = std::numeric_limits::max()) - : contig(contig), pos(pos) { } + Position( + uint32_t contig = std::numeric_limits::max(), + uint32_t pos = std::numeric_limits::max()) + : contig(contig) + , pos(pos) + {} /** Mark this seed as a duplicate. */ - void setDuplicate(const char* thisContig, const char* otherContig, - const Sequence& kmer) + void setDuplicate(const char* thisContig, const char* otherContig, const Sequence& kmer) { if (opt::multimap == opt::IGNORE) contig = std::numeric_limits::max(); else { - std::cerr << "error: duplicate k-mer in " - << thisContig - << " also in " - << otherContig - << ": " << kmer << '\n'; + std::cerr << "error: duplicate k-mer in " << thisContig << " also in " << otherContig + << ": " << kmer << '\n'; exit(EXIT_FAILURE); } } /** Return whether this seed is a duplciate. */ - bool isDuplicate() const - { - return contig == std::numeric_limits::max(); - } + bool isDuplicate() const { return contig == std::numeric_limits::max(); } }; -typedef unordered_multimap > - SeqPosHashMultiMap; +typedef unordered_multimap> SeqPosHashMultiMap; #if HAVE_GOOGLE_SPARSE_HASH_MAP -# include -typedef google::sparse_hash_map > - SeqPosHashUniqueMap; +#include +typedef google::sparse_hash_map> SeqPosHashUniqueMap; #else -typedef unordered_map > - SeqPosHashUniqueMap; +typedef unordered_map> SeqPosHashUniqueMap; #endif - typedef std::vector AlignmentVector; /** * Index a target sequence and align query sequences to that indexed * target. */ -template +template class Aligner { - public: - typedef typename SeqPosHashMap::iterator map_iterator; - typedef typename SeqPosHashMap::const_iterator - map_const_iterator; - - Aligner(int hashSize, size_t buckets) - : m_hashSize(hashSize), m_target(buckets) { } - - Aligner(int hashSize, size_t buckets, float factor) - : m_hashSize(hashSize) - { - m_target.max_load_factor(factor); - m_target.rehash(buckets); - } + public: + typedef typename SeqPosHashMap::iterator map_iterator; + typedef typename SeqPosHashMap::const_iterator map_const_iterator; - void addReferenceSequence(const StringID& id, - const Sequence& seq); - void addReferenceSequence(const Kmer& kmer, Position pos); + Aligner(int hashSize, size_t buckets) + : m_hashSize(hashSize) + , m_target(buckets) + {} - template - void alignRead(const std::string& qid, const Sequence& seq, - oiterator dest); + Aligner(int hashSize, size_t buckets, float factor) + : m_hashSize(hashSize) + { + m_target.max_load_factor(factor); + m_target.rehash(buckets); + } - size_t size() const { return m_target.size(); } - size_t bucket_count() const - { - return m_target.bucket_count(); - } + void addReferenceSequence(const StringID& id, const Sequence& seq); + void addReferenceSequence(const Kmer& kmer, Position pos); - /** Return the number of duplicate k-mer in the target. */ - size_t countDuplicates() const - { - assert(opt::multimap == opt::IGNORE); - return count_if(m_target.begin(), m_target.end(), - compose1(std::mem_fun_ref(&Position::isDuplicate), - mem_var(&SeqPosHashMap::value_type::second))); - } + template + void alignRead(const std::string& qid, const Sequence& seq, oiterator dest); + + size_t size() const { return m_target.size(); } + size_t bucket_count() const { return m_target.bucket_count(); } - private: - explicit Aligner(const Aligner&); + /** Return the number of duplicate k-mer in the target. */ + size_t countDuplicates() const + { + assert(opt::multimap == opt::IGNORE); + return count_if( + m_target.begin(), m_target.end(), [](const std::pair& s) { + return s.second.isDuplicate(); + }); + } - typedef std::map AlignmentSet; + private: + explicit Aligner(const Aligner&); - void alignKmer( - AlignmentSet& aligns, const Sequence& kmer, - bool isRC, bool good, int read_ind, int seqLen); + typedef std::map AlignmentSet; - AlignmentSet getAlignmentsInternal( - const Sequence& seq, bool isRC); + void alignKmer( + AlignmentSet& aligns, + const Sequence& kmer, + bool isRC, + bool good, + int read_ind, + int seqLen); - template - void coalesceAlignments( - const std::string& qid, const std::string& seq, - const AlignmentSet& alignSet, - oiterator& dest); + AlignmentSet getAlignmentsInternal(const Sequence& seq, bool isRC); - // The number of bases to hash on - int m_hashSize; + template + void coalesceAlignments( + const std::string& qid, + const std::string& seq, + const AlignmentSet& alignSet, + oiterator& dest); - /** A map of k-mer to contig coordinates. */ - SeqPosHashMap m_target; + // The number of bases to hash on + int m_hashSize; - /** A dictionary of contig IDs. */ - std::vector m_dict; + /** A map of k-mer to contig coordinates. */ + SeqPosHashMap m_target; - unsigned contigIDToIndex(const std::string& id) - { - m_dict.push_back(id); - return m_dict.size() - 1; - } + /** A dictionary of contig IDs. */ + std::vector m_dict; - cstring contigIndexToID(unsigned index) - { - assert(index < m_dict.size()); - return m_dict[index]; - } + unsigned contigIDToIndex(const std::string& id) + { + m_dict.push_back(id); + return m_dict.size() - 1; + } + + cstring contigIndexToID(unsigned index) + { + assert(index < m_dict.size()); + return m_dict[index]; + } }; #endif diff --git a/KAligner/PipeMux.h b/KAligner/PipeMux.h index a3d702d00..56afed07d 100644 --- a/KAligner/PipeMux.h +++ b/KAligner/PipeMux.h @@ -2,16 +2,19 @@ #define PIPEMUX_H 1 #include "Pipe.h" -#include +#include "Semaphore.h" #include #include -template -class PipeMux { +template +class PipeMux +{ public: - /** Default constructor. */ - PipeMux(size_t pipe_size = 1) - : m_index(0), m_entry_num(0), m_pipe_size(pipe_size) + /** Default constructor. */ + PipeMux(size_t pipe_size = 1) + : m_index(0) + , m_entry_num(0) + , m_pipe_size(pipe_size) { pthread_rwlock_init(&m_rwlock_vecs, NULL); pthread_mutex_init(&m_mutex_index, NULL); @@ -94,7 +97,8 @@ class PipeMux { } /** Checks that the PipeMux is empty. */ - bool empty() { + bool empty() + { pthread_rwlock_rdlock(&m_rwlock_vecs); bool isEmpty = m_pipes.empty(); pthread_rwlock_unlock(&m_rwlock_vecs); @@ -128,11 +132,11 @@ class PipeMux { } assert(i < m_pipes.size()); delete m_pipes[i]; - m_pipes.erase(m_pipes.begin()+i); + m_pipes.erase(m_pipes.begin() + i); assert(i < m_mutex_pipes.size()); pthread_mutex_destroy(m_mutex_pipes[i]); delete m_mutex_pipes[i]; - m_mutex_pipes.erase(m_mutex_pipes.begin()+i); + m_mutex_pipes.erase(m_mutex_pipes.begin() + i); // Make sure the index points to the next element. pthread_mutex_lock(&m_mutex_index); m_index = m_index == m_pipes.size() ? 0 : m_index; @@ -140,7 +144,6 @@ class PipeMux { pthread_rwlock_unlock(&m_rwlock_vecs); } - }; #endif diff --git a/Layout/layout.cc b/Layout/layout.cc index 43a51210e..cf02b8013 100644 --- a/Layout/layout.cc +++ b/Layout/layout.cc @@ -1,7 +1,5 @@ -#include "config.h" #include "ContigPath.h" #include "ContigProperties.h" -#include "Uncompress.h" #include "Graph/Assemble.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" @@ -9,6 +7,8 @@ #include "Graph/GraphAlgorithms.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" +#include "Uncompress.h" +#include "config.h" #include #include #include @@ -23,92 +23,94 @@ using boost::tie; #define PROGRAM "abyss-layout" -static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2012 Shaun Jackman\n"; +static const char VERSION_MESSAGE[] = PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2012 Shaun Jackman\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " [OPTION]... OVERLAP\n" -"Layout contigs using the sequence overlap graph.\n" -"Output sequence paths.\n" -"\n" -" Arguments:\n" -"\n" -" OVERLAP the sequence overlap graph\n" -"\n" -" Options:\n" -"\n" -" -s, --min-length=N minimum sequence length [0]\n" -" -m, --min-overlap=N minimum overlap [0]\n" -" -k, --kmer=N length of a k-mer\n" -" -o, --out=FILE write the paths to FILE\n" -" -g, --graph=FILE write the graph to FILE\n" -" --tred remove transitive edges\n" -" --no-tred do not remove transitive edges [default]\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " [OPTION]... OVERLAP\n" + "Layout contigs using the sequence overlap graph.\n" + "Output sequence paths.\n" + "\n" + " Arguments:\n" + "\n" + " OVERLAP the sequence overlap graph\n" + "\n" + " Options:\n" + "\n" + " -s, --min-length=N minimum sequence length [0]\n" + " -m, --min-overlap=N minimum overlap [0]\n" + " -k, --kmer=N length of a k-mer\n" + " -o, --out=FILE write the paths to FILE\n" + " -g, --graph=FILE write the graph to FILE\n" + " --tred remove transitive edges\n" + " --no-tred do not remove transitive edges [default]\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Minimum sequence length. */ - static unsigned minLength; +/** Minimum sequence length. */ +static unsigned minLength; - /** Minimum overlap. */ - static unsigned minOverlap; +/** Minimum overlap. */ +static unsigned minOverlap; - /** Write the paths to this file. */ - static string out; +/** Write the paths to this file. */ +static string out; - /** Write the graph to this file. */ - static string graphPath; +/** Write the graph to this file. */ +static string graphPath; - /** Remove transitive edges. */ - static int tred; +/** Remove transitive edges. */ +static int tred; - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Verbose output. */ - int verbose; // used by PopBubbles +/** Verbose output. */ +int verbose; // used by PopBubbles - /** Output format */ - int format = DOT; +/** Output format */ +int format = DOT; } static const char shortopts[] = "g:k:m:o:s:v"; -enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "graph", required_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, - { "min-overlap", required_argument, NULL, 'm' }, - { "out", required_argument, NULL, 'o' }, - { "min-length", required_argument, NULL, 's' }, - { "tred", no_argument, &opt::tred, true }, - { "no-tred", no_argument, &opt::tred, false }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION }; +static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, + { "min-overlap", required_argument, NULL, 'm' }, + { "out", required_argument, NULL, 'o' }, + { "min-length", required_argument, NULL, 's' }, + { "tred", no_argument, &opt::tred, true }, + { "no-tred", no_argument, &opt::tred, false }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { NULL, 0, NULL, 0 } }; + /** An overlap graph. */ typedef DirectedGraph DG; typedef ContigGraph Graph; /** Remove short vertices. */ -static void filterVertices(Graph& g, unsigned minLength) +static void +filterVertices(Graph& g, unsigned minLength) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -131,15 +133,17 @@ static void filterVertices(Graph& g, unsigned minLength) } if (opt::verbose > 0) { - cerr << "Ignored " << numRemoved << " sequences shorter than " - << minLength << " bp.\n"; + cerr << "Ignored " << numRemoved << " sequences shorter than " << minLength << " bp.\n"; printGraphStats(cerr, g); } } /** Return true if the edge is a small overlap. */ -struct IsSmallOverlap { - IsSmallOverlap(Graph& g) : m_g(g) { } +struct IsSmallOverlap +{ + IsSmallOverlap(Graph& g) + : m_g(g) + {} bool operator()(graph_traits::edge_descriptor e) const { int maxDistance = -opt::minOverlap; @@ -149,7 +153,8 @@ struct IsSmallOverlap { }; /** Remove small overlaps. */ -static void filterEdges(Graph& g, unsigned minOverlap) +static void +filterEdges(Graph& g, unsigned minOverlap) { if (minOverlap == 0) return; @@ -163,7 +168,8 @@ static void filterEdges(Graph& g, unsigned minOverlap) } /** Read a graph from the specified file. */ -static void readGraph(const string& path, Graph& g) +static void +readGraph(const string& path, Graph& g) { if (opt::verbose > 0) cerr << "Reading `" << path << "'...\n"; @@ -178,7 +184,8 @@ static void readGraph(const string& path, Graph& g) } /** Return the length histogram. */ -static Histogram buildLengthHistogram(const Graph& g) +static Histogram +buildLengthHistogram(const Graph& g) { typedef graph_traits::vertex_descriptor V; typedef graph_traits::vertex_iterator Vit; @@ -193,44 +200,43 @@ static Histogram buildLengthHistogram(const Graph& g) } /** Run abyss-layout. */ -int main(int argc, char** argv) +int +main(int argc, char** argv) { bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': + case '?': die = true; break; - case 'k': + case 'k': arg >> opt::k; break; - case 'g': + case 'g': arg >> opt::graphPath; break; - case 'm': + case 'm': arg >> opt::minOverlap; break; - case 'o': + case 'o': arg >> opt::out; break; - case 's': + case 's': arg >> opt::minLength; break; - case 'v': + case 'v': opt::verbose++; break; - case OPT_HELP: + case OPT_HELP: cout << USAGE_MESSAGE; exit(EXIT_SUCCESS); - case OPT_VERSION: + case OPT_VERSION: cout << VERSION_MESSAGE; exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -241,8 +247,7 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -263,8 +268,7 @@ int main(int argc, char** argv) if (opt::tred) { unsigned numTransitive = remove_transitive_edges(g); if (opt::verbose > 0) { - cerr << "Removed " << numTransitive - << " transitive edges.\n"; + cerr << "Removed " << numTransitive << " transitive edges.\n"; printGraphStats(cerr, g); } } @@ -278,11 +282,9 @@ int main(int argc, char** argv) sort(paths.begin(), paths.end()); if (opt::verbose > 0) { unsigned n = 0; - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) n += it->size(); - cerr << "Assembled " << n << " sequences in " - << paths.size() << " contigs.\n"; + cerr << "Assembled " << n << " sequences in " << paths.size() << " contigs.\n"; printGraphStats(cerr, g); } @@ -291,18 +293,16 @@ int main(int argc, char** argv) ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout; assert_good(out, opt::out); g_contigNames.unlock(); - for (vector::const_iterator it = paths.begin(); - it != paths.end(); ++it) + for (vector::const_iterator it = paths.begin(); it != paths.end(); ++it) out << createContigName() << '\t' << *it << '\n'; assert_good(out, opt::out); // Create the new vertices. - for (vector::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (vector::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; merge(g, path.begin(), path.end()); - remove_vertex_if(g, path.begin(), path.end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); }); } if (opt::verbose > 0) printGraphStats(cerr, g); diff --git a/Makefile.am b/Makefile.am index 747446486..b9d156088 100644 --- a/Makefile.am +++ b/Makefile.am @@ -69,8 +69,16 @@ clean-local: clang-format: for i in Bloom/RollingBloomDBGVisitor.h Bloom/bloom.cc BloomDBG/BloomIO.h \ BloomDBG/Checkpoint.h BloomDBG/HashAgnosticCascadingBloom.h BloomDBG/bloom-dbg.* \ + ABYSS/abyss.cc Assembly/BranchGroup.h FMIndex/BitArrays.h FilterGraph/FilterGraph.cc \ + Graph/ContigGraphAlgorithms.h KAligner/Aligner.h KAligner/PipeMux.h Layout/layout.cc \ + MergePaths/MergeContigs.cpp MergePaths/MergePaths.cpp ParseAligns/ParseAligns.cpp \ + ParseAligns/abyss-fixmate.cc PathOverlap/PathOverlap.cpp PopBubbles/PopBubbles.cpp Scaffold/scaffold.cc \ Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp; do clang-format -style=file $$i >$$i.fixed; done for i in Bloom/RollingBloomDBGVisitor.h Bloom/bloom.cc BloomDBG/BloomIO.h \ BloomDBG/Checkpoint.h BloomDBG/HashAgnosticCascadingBloom.h BloomDBG/bloom-dbg.* \ + ABYSS/abyss.cc Assembly/BranchGroup.h FMIndex/BitArrays.h FilterGraph/FilterGraph.cc \ + Graph/ContigGraphAlgorithms.h KAligner/Aligner.h KAligner/PipeMux.h Layout/layout.cc \ + MergePaths/MergeContigs.cpp MergePaths/MergePaths.cpp ParseAligns/ParseAligns.cpp \ + ParseAligns/abyss-fixmate.cc PathOverlap/PathOverlap.cpp PopBubbles/PopBubbles.cpp Scaffold/scaffold.cc \ Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp; do diff -su $$i $$i.fixed && rm -f $$i.fixed; done if ls *.fixed; then exit 1; fi diff --git a/MergePaths/MergeContigs.cpp b/MergePaths/MergeContigs.cpp index cad36ecf3..78692ef55 100644 --- a/MergePaths/MergeContigs.cpp +++ b/MergePaths/MergeContigs.cpp @@ -1,24 +1,26 @@ -#include "config.h" #include "Common/Options.h" #include "ContigNode.h" #include "ContigPath.h" #include "ContigProperties.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "DataLayer/Options.h" #include "Dictionary.h" #include "FastaReader.h" -#include "Histogram.h" -#include "IOUtil.h" -#include "MemoryUtil.h" -#include "smith_waterman.h" -#include "Sequence.h" -#include "StringUtil.h" -#include "Uncompress.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" #include "Graph/Options.h" +#include "Histogram.h" +#include "IOUtil.h" +#include "MemoryUtil.h" +#include "Sequence.h" +#include "StringUtil.h" +#include "Uncompress.h" +#include "config.h" +#include "smith_waterman.h" #include #include #include @@ -26,8 +28,6 @@ #include #include #include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; @@ -36,104 +36,116 @@ using namespace std; DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k -o [OPTION]... FASTA [OVERLAP] PATH\n" -"Merge paths of contigs to create larger contigs.\n" -"\n" -" Arguments:\n" -"\n" -" FASTA contigs in FASTA format\n" -" OVERLAP contig overlap graph\n" -" PATH sequences of contig IDs\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=KMER_SIZE k-mer size\n" -" -o, --out=FILE output the merged contigs to FILE [stdout]\n" -" -g, --graph=FILE write the contig overlap graph to FILE\n" -" --merged output only merged contigs\n" -" --adj output the graph in adj format\n" -" --dot output the graph in dot format [default]\n" -" --dot-meancov same as above but give the mean coverage\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for database\n" -" --strain=NAME specify strain NAME for database\n" -" --species=NAME specify species NAME for database\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k -o [OPTION]... FASTA [OVERLAP] PATH\n" + "Merge paths of contigs to create larger contigs.\n" + "\n" + " Arguments:\n" + "\n" + " FASTA contigs in FASTA format\n" + " OVERLAP contig overlap graph\n" + " PATH sequences of contig IDs\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=KMER_SIZE k-mer size\n" + " -o, --out=FILE output the merged contigs to FILE [stdout]\n" + " -g, --graph=FILE write the contig overlap graph to FILE\n" + " --merged output only merged contigs\n" + " --adj output the graph in adj format\n" + " --dot output the graph in dot format [default]\n" + " --dot-meancov same as above but give the mean coverage\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for database\n" + " --strain=NAME specify strain NAME for database\n" + " --species=NAME specify species NAME for database\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - unsigned k; // used by ContigProperties - unsigned pathCount; // num of initial paths +string db; +dbVars metaVars; +unsigned k; // used by ContigProperties +unsigned pathCount; // num of initial paths - /** Output FASTA path. */ - static string out = "-"; +/** Output FASTA path. */ +static string out = "-"; - /** Output graph path. */ - static string graphPath; +/** Output graph path. */ +static string graphPath; - /** Output graph format. */ - int format = DOT; +/** Output graph format. */ +int format = DOT; - /** Output only merged contigs. */ - int onlyMerged; +/** Output only merged contigs. */ +int onlyMerged; - /** Minimum overlap. */ - static unsigned minOverlap = 20; +/** Minimum overlap. */ +static unsigned minOverlap = 20; - /** Minimum alignment identity. */ - static float minIdentity = 0.9; +/** Minimum alignment identity. */ +static float minIdentity = 0.9; } static const char shortopts[] = "g:k:o:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; -//enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "adj", no_argument, &opt::format, ADJ }, - { "dot", no_argument, &opt::format, DOT }, - { "dot-meancov", no_argument, &opt::format, DOT_MEANCOV }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "graph", required_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, - { "merged", no_argument, &opt::onlyMerged, 1 }, - { "out", required_argument, NULL, 'o' }, - { "path", required_argument, NULL, 'p' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; +// enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { { "adj", no_argument, &opt::format, ADJ }, + { "dot", no_argument, &opt::format, DOT }, + { "dot-meancov", no_argument, &opt::format, DOT_MEANCOV }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "graph", required_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, + { "merged", no_argument, &opt::onlyMerged, 1 }, + { "out", required_argument, NULL, 'o' }, + { "path", required_argument, NULL, 'p' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; /* A contig sequence. */ -struct Contig { +struct Contig +{ Contig(const string& comment, const string& seq) - : comment(comment), seq(seq) { } - Contig(const FastaRecord& o) : comment(o.comment), seq(o.seq) { } + : comment(comment) + , seq(seq) + {} + Contig(const FastaRecord& o) + : comment(o.comment) + , seq(o.seq) + {} string comment; string seq; }; @@ -144,7 +156,8 @@ typedef vector Contigs; /** Return the sequence of the specified contig node. The sequence * may be ambiguous or reverse complemented. */ -static Sequence sequence(const Contigs& contigs, const ContigNode& id) +static Sequence +sequence(const Contigs& contigs, const ContigNode& id) { if (id.ambiguous()) { string s(id.ambiguousSequence()); @@ -160,22 +173,22 @@ static Sequence sequence(const Contigs& contigs, const ContigNode& id) /** Return a consensus sequence of a and b. * @return an empty string if a consensus could not be found */ -static string createConsensus(const Sequence& a, const Sequence& b) +static string +createConsensus(const Sequence& a, const Sequence& b) { assert(a.length() == b.length()); if (a == b) return a; string s; s.reserve(a.length()); - for (string::const_iterator ita = a.begin(), itb = b.begin(); - ita != a.end(); ++ita, ++itb) { + for (string::const_iterator ita = a.begin(), itb = b.begin(); ita != a.end(); ++ita, ++itb) { bool mask = islower(*ita) || islower(*itb); char ca = toupper(*ita), cb = toupper(*itb); - char c = ca == cb ? ca - : ca == 'N' ? cb - : cb == 'N' ? ca - : ambiguityIsSubset(ca, cb) ? ambiguityOr(ca, cb) - : 'x'; + char c = ca == cb + ? ca + : ca == 'N' + ? cb + : cb == 'N' ? ca : ambiguityIsSubset(ca, cb) ? ambiguityOr(ca, cb) : 'x'; if (c == 'x') return string(""); s += mask ? tolower(c) : c; @@ -183,25 +196,28 @@ static string createConsensus(const Sequence& a, const Sequence& b) return s; } -typedef ContigGraph > Graph; +typedef ContigGraph> Graph; typedef graph_traits::vertex_descriptor vertex_descriptor; /** Return the properties of the specified vertex, unless u is * ambiguous, in which case return the length of the ambiguous * sequence. */ -static inline -ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u) +static inline ContigProperties +get(vertex_bundle_t, const Graph& g, ContigNode u) { - return u.ambiguous() - ? ContigProperties(u.length() + opt::k - 1, 0) - : g[u]; + return u.ambiguous() ? ContigProperties(u.length() + opt::k - 1, 0) : g[u]; } /** Append the sequence of contig v to seq. */ -static void mergeContigs(const Graph& g, const Contigs& contigs, - vertex_descriptor u, vertex_descriptor v, - Sequence& seq, const ContigPath& path) +static void +mergeContigs( + const Graph& g, + const Contigs& contigs, + vertex_descriptor u, + vertex_descriptor v, + Sequence& seq, + const ContigPath& path) { int d = get(edge_bundle, g, u, v).distance; assert(d < 0); @@ -235,14 +251,13 @@ static void mergeContigs(const Graph& g, const Contigs& contigs, unsigned matches = o.overlap_match; const string& consensus = o.overlap_str; float identity = (float)matches / consensus.size(); - good = matches >= opt::minOverlap - && identity >= opt::minIdentity; + good = matches >= opt::minOverlap && identity >= opt::minIdentity; if (opt::verbose > 2) - cerr << matches << " / " << consensus.size() - << " = " << identity - << (matches < opt::minOverlap ? " (too few)" - : identity < opt::minIdentity ? " (too low)" - : " (good)") << '\n'; + cerr << matches << " / " << consensus.size() << " = " << identity + << (matches < opt::minOverlap + ? " (too few)" + : identity < opt::minIdentity ? " (too low)" : " (good)") + << '\n'; } if (good) { assert(overlaps.size() == 1); @@ -252,16 +267,18 @@ static void mergeContigs(const Graph& g, const Contigs& contigs, seq += Sequence(s, o.overlap_h_pos + 1); } else { cerr << "warning: the head of " << get(vertex_name, g, v) - << " does not match the tail of the previous contig\n" - << ao << '\n' << bo << '\n' << path << endl; + << " does not match the tail of the previous contig\n" + << ao << '\n' + << bo << '\n' + << path << endl; seq += 'n'; seq += s; } } /** Return a FASTA comment for the specified path. */ -static void pathToComment(ostream& out, - const Graph& g, const ContigPath& path) +static void +pathToComment(ostream& out, const Graph& g, const ContigPath& path) { out << get(vertex_name, g, path.front()); if (path.size() == 1) @@ -274,20 +291,19 @@ static void pathToComment(ostream& out, } /** Merge the specified path. */ -static Contig mergePath(const Graph& g, const Contigs& contigs, - const ContigPath& path) +static Contig +mergePath(const Graph& g, const Contigs& contigs, const ContigPath& path) { Sequence seq; unsigned coverage = 0; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { if (!it->ambiguous()) coverage += g[*it].coverage; if (seq.empty()) { seq = sequence(contigs, *it); } else { assert(it != path.begin()); - mergeContigs(g, contigs, *(it-1), *it, seq, path); + mergeContigs(g, contigs, *(it - 1), *it, seq, path); } } ostringstream ss; @@ -302,8 +318,8 @@ typedef vector ContigPaths; /** Read contig paths from the specified file. * @param ids [out] the string ID of the paths */ -static ContigPaths readPaths(const string& inPath, - vector* ids = NULL) +static ContigPaths +readPaths(const string& inPath, vector* ids = NULL) { if (ids != NULL) assert(ids->empty()); @@ -325,13 +341,16 @@ static ContigPaths readPaths(const string& inPath, ++count; if (opt::verbose > 1 && count % 1000000 == 0) - cerr << "Read " << count << " paths. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << count + << " paths. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; } if (opt::verbose > 0) - cerr << "Read " << count << " paths. " - "Using " << toSI(getMemoryUsage()) << "B of memory.\n"; + cerr << "Read " << count + << " paths. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; if (!opt::db.empty()) addToDb(db, "Init_paths", count); opt::pathCount = count; @@ -341,12 +360,11 @@ static ContigPaths readPaths(const string& inPath, /** Finds all contigs used in each path in paths, and * marks them as seen in the vector seen. */ -static void seenContigs(vector& seen, const ContigPaths& paths) +static void +seenContigs(vector& seen, const ContigPaths& paths) { - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) - for (ContigPath::const_iterator itc = it->begin(); - itc != it->end(); ++itc) + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) + for (ContigPath::const_iterator itc = it->begin(); itc != it->end(); ++itc) if (itc->id() < seen.size()) seen[itc->id()] = true; } @@ -354,14 +372,12 @@ static void seenContigs(vector& seen, const ContigPaths& paths) /** Mark contigs for removal. An empty path indicates that a contig * should be removed. */ -static void markRemovedContigs(vector& marked, - const vector& pathIDs, const ContigPaths& paths) +static void +markRemovedContigs(vector& marked, const vector& pathIDs, const ContigPaths& paths) { - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { if (it->empty()) { - size_t i = get(g_contigNames, - pathIDs[it - paths.begin()]); + size_t i = get(g_contigNames, pathIDs[it - paths.begin()]); assert(i < marked.size()); marked[i] = true; } @@ -369,16 +385,18 @@ static void markRemovedContigs(vector& marked, } /** Output the updated overlap graph. */ -static void outputGraph(Graph& g, - const vector& pathIDs, const ContigPaths& paths, - const string& commandLine) +static void +outputGraph( + Graph& g, + const vector& pathIDs, + const ContigPaths& paths, + const string& commandLine) { typedef graph_traits::vertex_descriptor V; // Add the path vertices. g_contigNames.unlock(); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; const string& id = pathIDs[it - paths.begin()]; if (!path.empty()) { @@ -389,15 +407,14 @@ static void outputGraph(Graph& g, g_contigNames.lock(); // Remove the vertices that are used in paths. - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; const string& id = pathIDs[it - paths.begin()]; if (path.empty()) { remove_vertex(find_vertex(id, false, g), g); } else { - remove_vertex_if(g, path.begin(), path.end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); }); } } @@ -414,7 +431,8 @@ static void outputGraph(Graph& g, printGraphStats(cerr, g); } -int main(int argc, char** argv) +int +main(int argc, char** argv) { opt::trimMasked = false; @@ -422,7 +440,7 @@ int main(int argc, char** argv) { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } @@ -431,33 +449,45 @@ int main(int argc, char** argv) opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'g': arg >> opt::graphPath; break; - case 'k': arg >> opt::k; break; - case 'o': arg >> opt::out; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; break; + case '?': + die = true; + break; + case 'g': + arg >> opt::graphPath; + break; + case 'k': + arg >> opt::k; + break; + case 'o': + arg >> opt::out; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -468,7 +498,8 @@ int main(int argc, char** argv) } if (opt::out.empty()) { - cerr << PROGRAM ": " << "missing -o,--out option\n"; + cerr << PROGRAM ": " + << "missing -o,--out option\n"; die = true; } @@ -483,19 +514,12 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars - ); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); addToDb(db, "K", opt::k); } @@ -513,9 +537,10 @@ int main(int argc, char** argv) fin >> g; assert(fin.eof()); if (opt::verbose > 0) - cerr << "Read " << num_vertices(g) << " vertices. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << num_vertices(g) + << " vertices. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; if (!opt::db.empty()) { addToDb(db, "Init_vertices", num_vertices(g)); addToDb(db, "Init_edges", num_edges(g)); @@ -531,12 +556,11 @@ int main(int argc, char** argv) unsigned count = 0; FastaReader in(contigFile, FastaReader::NO_FOLD_CASE); for (FastaRecord rec; in >> rec;) { - if (!adjPath.empty() - && g_contigNames.count(rec.id) == 0) + if (!adjPath.empty() && g_contigNames.count(rec.id) == 0) continue; if (adjPath.empty()) { - graph_traits::vertex_descriptor - u = add_vertex(ContigProperties(rec.seq.length(), 0), g); + graph_traits::vertex_descriptor u = + add_vertex(ContigProperties(rec.seq.length(), 0), g); put(vertex_name, g, u, rec.id); } assert(get(g_contigNames, rec.id) == contigs.size()); @@ -544,14 +568,16 @@ int main(int argc, char** argv) ++count; if (opt::verbose > 1 && count % 1000000 == 0) - cerr << "Read " << count << " sequences. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << count + << " sequences. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; } if (opt::verbose > 0) - cerr << "Read " << count << " sequences. " - "Using " << toSI(getMemoryUsage()) - << "B of memory.\n"; + cerr << "Read " << count + << " sequences. " + "Using " + << toSI(getMemoryUsage()) << "B of memory.\n"; if (!opt::db.empty()) addToDb(db, "Init_seq", count); assert(in.eof()); @@ -571,12 +597,10 @@ int main(int argc, char** argv) // Output those contigs that were not seen in a path. Histogram lengthHistogram; ofstream fout; - ostream& out = opt::out == "-" ? cout - : (fout.open(opt::out.c_str()), fout); + ostream& out = opt::out == "-" ? cout : (fout.open(opt::out.c_str()), fout); assert_good(out, opt::out); if (!opt::onlyMerged) { - for (Contigs::const_iterator it = contigs.begin(); - it != contigs.end(); ++it) { + for (Contigs::const_iterator it = contigs.begin(); it != contigs.end(); ++it) { ContigID id(it - contigs.begin()); if (!seen[id]) { const Contig& contig = *it; @@ -585,29 +609,23 @@ int main(int argc, char** argv) out << ' ' << contig.comment; out << '\n' << contig.seq << '\n'; if (opt::verbose > 0) - lengthHistogram.insert( - count_if(contig.seq.begin(), contig.seq.end(), - isACGT)); + lengthHistogram.insert(count_if(contig.seq.begin(), contig.seq.end(), isACGT)); } } } unsigned npaths = 0; - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; if (path.empty()) continue; Contig contig = mergePath(g, contigs, path); - out << '>' << pathIDs[it - paths.begin()] - << ' ' << contig.comment << '\n' - << contig.seq << '\n'; + out << '>' << pathIDs[it - paths.begin()] << ' ' << contig.comment << '\n' + << contig.seq << '\n'; assert_good(out, opt::out); npaths++; if (opt::verbose > 0) - lengthHistogram.insert( - count_if(contig.seq.begin(), contig.seq.end(), - isACGT)); + lengthHistogram.insert(count_if(contig.seq.begin(), contig.seq.end(), isACGT)); } if (!opt::graphPath.empty()) @@ -617,7 +635,7 @@ int main(int argc, char** argv) return 0; float minCov = numeric_limits::infinity(), - minCovUsed = numeric_limits::infinity(); + minCovUsed = numeric_limits::infinity(); for (unsigned i = 0; i < contigs.size(); i++) { ContigProperties vp = g[ContigNode(i, false)]; if (vp.coverage == 0 || vp.length < opt::k) @@ -628,18 +646,16 @@ int main(int argc, char** argv) minCovUsed = min(minCovUsed, cov); } - cerr << "The minimum coverage of single-end contigs is " - << minCov << ".\n" - << "The minimum coverage of merged contigs is " - << minCovUsed << ".\n"; + cerr << "The minimum coverage of single-end contigs is " << minCov << ".\n" + << "The minimum coverage of merged contigs is " << minCovUsed << ".\n"; if (minCov < minCovUsed) cerr << "Consider increasing the coverage threshold " - "parameter, c, to " << minCovUsed << ".\n"; + "parameter, c, to " + << minCovUsed << ".\n"; if (opt::verbose > 0) { const unsigned STATS_MIN_LENGTH = 200; // bp - printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH) - << '\t' << opt::out << '\n'; + printContiguityStats(cerr, lengthHistogram, STATS_MIN_LENGTH) << '\t' << opt::out << '\n'; } return 0; } diff --git a/MergePaths/MergePaths.cpp b/MergePaths/MergePaths.cpp index bf514e618..c203125e9 100644 --- a/MergePaths/MergePaths.cpp +++ b/MergePaths/MergePaths.cpp @@ -1,18 +1,18 @@ -#include "config.h" #include "Common/Options.h" #include "ContigID.h" #include "ContigPath.h" #include "Functional.h" // for mem_var -#include "IOUtil.h" -#include "Uncompress.h" #include "Graph/Assemble.h" #include "Graph/ContigGraph.h" #include "Graph/DirectedGraph.h" #include "Graph/DotIO.h" #include "Graph/GraphAlgorithms.h" #include "Graph/GraphUtil.h" -#include +#include "IOUtil.h" +#include "Uncompress.h" +#include "config.h" #include +#include #include #include // for UINT_MAX #include @@ -27,10 +27,10 @@ #include #include #if _OPENMP -# include +#include #endif -#include "DataBase/Options.h" #include "DataBase/DB.h" +#include "DataBase/Options.h" using namespace std; using boost::tie; @@ -40,89 +40,96 @@ using boost::tie; DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Jared Simpson and Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Jared Simpson and Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... LEN PATH\n" -"Merge sequences of contigs IDs.\n" -"\n" -" Arguments:\n" -"\n" -" LEN lengths of the contigs\n" -" PATH sequences of contig IDs\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=KMER_SIZE k-mer size\n" -" -s, --seed-length=L minimum length of a seed contig [0]\n" -" -G, --genome-size=N expected genome size. Used to calculate NG50\n" -" and associated stats [disabled]\n" -" -o, --out=FILE write result to FILE\n" -" --no-greedy use the non-greedy algorithm [default]\n" -" --greedy use the greedy algorithm\n" -" -g, --graph=FILE write the path overlap graph to FILE\n" -" -j, --threads=N use N parallel threads [1]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for database\n" -" --strain=NAME specify strain NAME for database\n" -" --species=NAME specify species NAME for database\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... LEN PATH\n" + "Merge sequences of contigs IDs.\n" + "\n" + " Arguments:\n" + "\n" + " LEN lengths of the contigs\n" + " PATH sequences of contig IDs\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=KMER_SIZE k-mer size\n" + " -s, --seed-length=L minimum length of a seed contig [0]\n" + " -G, --genome-size=N expected genome size. Used to calculate NG50\n" + " and associated stats [disabled]\n" + " -o, --out=FILE write result to FILE\n" + " --no-greedy use the non-greedy algorithm [default]\n" + " --greedy use the greedy algorithm\n" + " -g, --graph=FILE write the path overlap graph to FILE\n" + " -j, --threads=N use N parallel threads [1]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for database\n" + " --strain=NAME specify strain NAME for database\n" + " --species=NAME specify species NAME for database\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - unsigned k; // used by GraphIO - static string out; - static int threads = 1; +string db; +dbVars metaVars; +unsigned k; // used by GraphIO +static string out; +static int threads = 1; - /** Minimum length of a seed contig. */ - static unsigned seedLen; +/** Minimum length of a seed contig. */ +static unsigned seedLen; - /** Use a greedy algorithm. */ - static int greedy; +/** Use a greedy algorithm. */ +static int greedy; - /** Genome size. Used to calculate NG50. */ - static long long unsigned genomeSize; +/** Genome size. Used to calculate NG50. */ +static long long unsigned genomeSize; - /** Write the path overlap graph to this file. */ - static string graphPath; +/** Write the path overlap graph to this file. */ +static string graphPath; } static const char shortopts[] = "G:g:j:k:o:s:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; -//enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "genome-size", required_argument, NULL, 'G' }, - { "graph", no_argument, NULL, 'g' }, - { "greedy", no_argument, &opt::greedy, true }, - { "no-greedy", no_argument, &opt::greedy, false }, - { "kmer", required_argument, NULL, 'k' }, - { "out", required_argument, NULL, 'o' }, - { "seed-length", required_argument, NULL, 's' }, - { "threads", required_argument, NULL, 'j' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; +// enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { { "genome-size", required_argument, NULL, 'G' }, + { "graph", no_argument, NULL, 'g' }, + { "greedy", no_argument, &opt::greedy, true }, + { "no-greedy", no_argument, &opt::greedy, false }, + { "kmer", required_argument, NULL, 'k' }, + { "out", required_argument, NULL, 'o' }, + { "seed-length", required_argument, NULL, 's' }, + { "threads", required_argument, NULL, 'j' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; typedef map ContigPathMap; /** Orientation of an edge. */ -enum dir_type { +enum dir_type +{ DIR_X, // u--v none DIR_F, // u->v forward DIR_R, // u<-v reverse @@ -132,19 +139,22 @@ enum dir_type { /** Lengths of contigs measured in k-mer. */ typedef vector Lengths; -static ContigPath align(const Lengths& lengths, - const ContigPath& p1, const ContigPath& p2, - ContigNode pivot); -static ContigPath align(const Lengths& lengths, - const ContigPath& p1, const ContigPath& p2, - ContigNode pivot, dir_type& orientation); +static ContigPath +align(const Lengths& lengths, const ContigPath& p1, const ContigPath& p2, ContigNode pivot); +static ContigPath +align( + const Lengths& lengths, + const ContigPath& p1, + const ContigPath& p2, + ContigNode pivot, + dir_type& orientation); static bool gDebugPrint; /** - * Build a histogram of the lengths of the assembled paths and unused contigs. - * Note: This function does not account for the ammount of overlap between contigs. - */ + * Build a histogram of the lengths of the assembled paths and unused contigs. + * Note: This function does not account for the ammount of overlap between contigs. + */ static Histogram buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths) { @@ -153,8 +163,7 @@ buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths) // Compute the lengths of the paths // Mark the vertices that are used in paths vector used(lengths.size()); - for (ContigPaths::const_iterator pathIt = paths.begin(); - pathIt != paths.end(); ++pathIt) { + for (ContigPaths::const_iterator pathIt = paths.begin(); pathIt != paths.end(); ++pathIt) { const ContigPath& path = *pathIt; size_t totalLength = 0; for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { @@ -184,25 +193,23 @@ reportAssemblyMetrics(const Lengths& lengths, const ContigPaths& paths) Histogram h = buildAssembledLengthHistogram(lengths, paths); const unsigned STATS_MIN_LENGTH = opt::seedLen; printContiguityStats(cerr, h, STATS_MIN_LENGTH, true, "\t", opt::genomeSize) - << '\t' << opt::out << '\n'; + << '\t' << opt::out << '\n'; } /** Return all contigs that are tandem repeats, identified as those * contigs that appear more than once in a single path. */ -static set findRepeats(const ContigPathMap& paths) +static set +findRepeats(const ContigPathMap& paths) { set repeats; - for (ContigPathMap::const_iterator pathIt = paths.begin(); - pathIt != paths.end(); ++pathIt) { + for (ContigPathMap::const_iterator pathIt = paths.begin(); pathIt != paths.end(); ++pathIt) { const ContigPath& path = pathIt->second; map count; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) if (!it->ambiguous()) count[it->contigIndex()]++; - for (map::const_iterator - it = count.begin(); it != count.end(); ++it) + for (map::const_iterator it = count.begin(); it != count.end(); ++it) if (it->second > 1) repeats.insert(it->first); } @@ -212,14 +219,14 @@ static set findRepeats(const ContigPathMap& paths) /** Remove tandem repeats from the set of paths. * @return the removed paths */ -static set removeRepeats(ContigPathMap& paths) +static set +removeRepeats(ContigPathMap& paths) { set repeats = findRepeats(paths); if (gDebugPrint) { cout << "Repeats:"; if (!repeats.empty()) { - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) cout << ' ' << get(g_contigNames, *it); } else cout << " none"; @@ -227,8 +234,7 @@ static set removeRepeats(ContigPathMap& paths) } unsigned removed = 0; - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) if (paths.count(*it) > 0) removed++; if (removed == paths.size()) { @@ -239,8 +245,7 @@ static set removeRepeats(ContigPathMap& paths) } ostringstream ss; - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) if (paths.erase(*it) > 0) ss << ' ' << get(g_contigNames, *it); @@ -249,42 +254,46 @@ static set removeRepeats(ContigPathMap& paths) return repeats; } -static void appendToMergeQ(deque& mergeQ, - set& seen, const ContigPath& path) +static void +appendToMergeQ(deque& mergeQ, set& seen, const ContigPath& path) { - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) if (!it->ambiguous() && seen.insert(*it).second) mergeQ.push_back(*it); } /** A path overlap graph. */ -typedef ContigGraph > PathGraph; +typedef ContigGraph> PathGraph; /** Add an edge if the two paths overlap. * @param pivot the pivot at which to seed the alignment * @return whether an overlap was found */ -static bool addOverlapEdge(const Lengths& lengths, - PathGraph& gout, ContigNode pivot, - ContigNode seed1, const ContigPath& path1, - ContigNode seed2, const ContigPath& path2) +static bool +addOverlapEdge( + const Lengths& lengths, + PathGraph& gout, + ContigNode pivot, + ContigNode seed1, + const ContigPath& path1, + ContigNode seed2, + const ContigPath& path2) { assert(seed1 != seed2); // Determine the orientation of the overlap edge. dir_type orientation = DIR_X; - ContigPath consensus = align(lengths, - path1, path2, pivot, orientation); + ContigPath consensus = align(lengths, path1, path2, pivot, orientation); if (consensus.empty()) return false; assert(orientation != DIR_X); if (orientation == DIR_B) { // One of the paths subsumes the other. Use the order of the // seeds to determine the orientation of the edge. - orientation = find(consensus.begin(), consensus.end(), seed1) - < find(consensus.begin(), consensus.end(), seed2) - ? DIR_F : DIR_R; + orientation = find(consensus.begin(), consensus.end(), seed1) < + find(consensus.begin(), consensus.end(), seed2) + ? DIR_F + : DIR_R; } assert(orientation == DIR_F || orientation == DIR_R); @@ -301,7 +310,8 @@ static bool addOverlapEdge(const Lengths& lengths, } /** Return the specified path. */ -static ContigPath getPath(const ContigPathMap& paths, ContigNode u) +static ContigPath +getPath(const ContigPathMap& paths, ContigNode u) { ContigPathMap::const_iterator it = paths.find(u.contigIndex()); assert(it != paths.end()); @@ -312,45 +322,47 @@ static ContigPath getPath(const ContigPathMap& paths, ContigNode u) } /** Find the overlaps between paths and add edges to the graph. */ -static void findPathOverlaps(const Lengths& lengths, - const ContigPathMap& paths, - const ContigNode& seed1, const ContigPath& path1, - PathGraph& gout) +static void +findPathOverlaps( + const Lengths& lengths, + const ContigPathMap& paths, + const ContigNode& seed1, + const ContigPath& path1, + PathGraph& gout) { - for (ContigPath::const_iterator it = path1.begin(); - it != path1.end(); ++it) { + for (ContigPath::const_iterator it = path1.begin(); it != path1.end(); ++it) { ContigNode seed2 = *it; if (seed1 == seed2) continue; if (seed2.ambiguous()) continue; - ContigPathMap::const_iterator path2It - = paths.find(seed2.contigIndex()); + ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; if (seed2.sense()) reverseComplement(path2.begin(), path2.end()); - addOverlapEdge(lengths, - gout, seed2, seed1, path1, seed2, path2); + addOverlapEdge(lengths, gout, seed2, seed1, path1, seed2, path2); } } /** Attempt to merge the paths specified in mergeQ with path. * @return the number of paths merged */ -static unsigned mergePaths(const Lengths& lengths, - ContigPath& path, - deque& mergeQ, set& seen, - const ContigPathMap& paths) +static unsigned +mergePaths( + const Lengths& lengths, + ContigPath& path, + deque& mergeQ, + set& seen, + const ContigPathMap& paths) { unsigned merged = 0; deque invalid; for (ContigNode pivot; !mergeQ.empty(); mergeQ.pop_front()) { pivot = mergeQ.front(); - ContigPathMap::const_iterator path2It - = paths.find(pivot.contigIndex()); + ContigPathMap::const_iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; @@ -367,9 +379,7 @@ static unsigned mergePaths(const Lengths& lengths, path.swap(consensus); if (gDebugPrint) #pragma omp critical(cout) - cout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << '\t' << path << '\n'; + cout << get(g_contigNames, pivot) << '\t' << path2 << '\n' << '\t' << path << '\n'; merged++; } mergeQ.swap(invalid); @@ -379,13 +389,12 @@ static unsigned mergePaths(const Lengths& lengths, /** Merge the paths of the specified seed path. * @return the merged contig path */ -static ContigPath mergePath(const Lengths& lengths, - const ContigPathMap& paths, const ContigPath& seedPath) +static ContigPath +mergePath(const Lengths& lengths, const ContigPathMap& paths, const ContigPath& seedPath) { assert(!seedPath.empty()); ContigNode seed1 = seedPath.front(); - ContigPathMap::const_iterator path1It - = paths.find(seed1.contigIndex()); + ContigPathMap::const_iterator path1It = paths.find(seed1.contigIndex()); assert(path1It != paths.end()); ContigPath path(path1It->second); if (seedPath.front().sense()) @@ -393,36 +402,27 @@ static ContigPath mergePath(const Lengths& lengths, if (opt::verbose > 1) #pragma omp critical(cout) cout << "\n* " << seedPath << '\n' - << get(g_contigNames, seedPath.front()) - << '\t' << path << '\n'; - for (ContigPath::const_iterator it = seedPath.begin() + 1; - it != seedPath.end(); ++it) { + << get(g_contigNames, seedPath.front()) << '\t' << path << '\n'; + for (ContigPath::const_iterator it = seedPath.begin() + 1; it != seedPath.end(); ++it) { ContigNode seed2 = *it; - ContigPathMap::const_iterator path2It - = paths.find(seed2.contigIndex()); + ContigPathMap::const_iterator path2It = paths.find(seed2.contigIndex()); assert(path2It != paths.end()); ContigPath path2 = path2It->second; if (seed2.sense()) reverseComplement(path2.begin(), path2.end()); - ContigNode pivot - = find(path.begin(), path.end(), seed2) != path.end() - ? seed2 : seed1; + ContigNode pivot = find(path.begin(), path.end(), seed2) != path.end() ? seed2 : seed1; ContigPath consensus = align(lengths, path, path2, pivot); if (consensus.empty()) { // This seed could be removed from the seed path. if (opt::verbose > 1) #pragma omp critical(cout) - cout << get(g_contigNames, seed2) - << '\t' << path2 << '\n' - << "\tinvalid\n"; + cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << "\tinvalid\n"; } else { path.swap(consensus); if (opt::verbose > 1) #pragma omp critical(cout) - cout << get(g_contigNames, seed2) - << '\t' << path2 << '\n' - << '\t' << path << '\n'; + cout << get(g_contigNames, seed2) << '\t' << path2 << '\n' << '\t' << path << '\n'; } seed1 = seed2; } @@ -435,16 +435,15 @@ typedef vector ContigPaths; /** Merge the specified seed paths. * @return the merged contig paths */ -static ContigPaths mergeSeedPaths(const Lengths& lengths, - const ContigPathMap& paths, const ContigPaths& seedPaths) +static ContigPaths +mergeSeedPaths(const Lengths& lengths, const ContigPathMap& paths, const ContigPaths& seedPaths) { if (opt::verbose > 0) cout << "\nMerging paths\n"; ContigPaths out; out.reserve(seedPaths.size()); - for (ContigPaths::const_iterator it = seedPaths.begin(); - it != seedPaths.end(); ++it) + for (ContigPaths::const_iterator it = seedPaths.begin(); it != seedPaths.end(); ++it) out.push_back(mergePath(lengths, paths, *it)); return out; } @@ -452,23 +451,21 @@ static ContigPaths mergeSeedPaths(const Lengths& lengths, /** Extend the specified path as long as is unambiguously possible and * add the result to the specified container. */ -static void extendPaths(const Lengths& lengths, - ContigID id, const ContigPathMap& paths, - ContigPathMap& out) +static void +extendPaths(const Lengths& lengths, ContigID id, const ContigPathMap& paths, ContigPathMap& out) { ContigPathMap::const_iterator pathIt = paths.find(id); assert(pathIt != paths.end()); pair inserted; - #pragma omp critical(out) +#pragma omp critical(out) inserted = out.insert(*pathIt); assert(inserted.second); ContigPath& path = inserted.first->second; if (gDebugPrint) - #pragma omp critical(cout) - cout << "\n* " << get(g_contigNames, id) << "+\n" - << '\t' << path << '\n'; +#pragma omp critical(cout) + cout << "\n* " << get(g_contigNames, id) << "+\n" << '\t' << path << '\n'; set seen; seen.insert(ContigNode(id, false)); @@ -478,34 +475,33 @@ static void extendPaths(const Lengths& lengths, ; if (!mergeQ.empty() && gDebugPrint) { - #pragma omp critical(cout) +#pragma omp critical(cout) { cout << "invalid\n"; - for (deque::const_iterator it - = mergeQ.begin(); it != mergeQ.end(); ++it) - cout << get(g_contigNames, *it) << '\t' - << paths.find(it->contigIndex())->second << '\n'; + for (deque::const_iterator it = mergeQ.begin(); it != mergeQ.end(); ++it) + cout << get(g_contigNames, *it) << '\t' << paths.find(it->contigIndex())->second + << '\n'; } } } /** Return true if the contigs are equal or both are ambiguous. */ -static bool equalOrBothAmbiguos(const ContigNode& a, - const ContigNode& b) +static bool +equalOrBothAmbiguos(const ContigNode& a, const ContigNode& b) { return a == b || (a.ambiguous() && b.ambiguous()); } /** Return true if both paths are equal, ignoring ambiguous nodes. */ -static bool equalIgnoreAmbiguos(const ContigPath& a, - const ContigPath& b) +static bool +equalIgnoreAmbiguos(const ContigPath& a, const ContigPath& b) { - return a.size() == b.size() - && equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos); + return a.size() == b.size() && equal(a.begin(), a.end(), b.begin(), equalOrBothAmbiguos); } /** Return whether this path is a cycle. */ -static bool isCycle(const Lengths& lengths, const ContigPath& path) +static bool +isCycle(const Lengths& lengths, const ContigPath& path) { return !align(lengths, path, path, path.front()).empty(); } @@ -514,27 +510,26 @@ static bool isCycle(const Lengths& lengths, const ContigPath& path) * @param overlaps [out] paths that are found to overlap * @return the ID of the subsuming path */ -static ContigID identifySubsumedPaths(const Lengths& lengths, - ContigPathMap::const_iterator path1It, - ContigPathMap& paths, - set& out, - set& overlaps) +static ContigID +identifySubsumedPaths( + const Lengths& lengths, + ContigPathMap::const_iterator path1It, + ContigPathMap& paths, + set& out, + set& overlaps) { ostringstream vout; out.clear(); ContigID id(path1It->first); const ContigPath& path = path1It->second; if (gDebugPrint) - vout << get(g_contigNames, ContigNode(id, false)) - << '\t' << path << '\n'; + vout << get(g_contigNames, ContigNode(id, false)) << '\t' << path << '\n'; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { ContigNode pivot = *it; if (pivot.ambiguous() || pivot.id() == id) continue; - ContigPathMap::iterator path2It - = paths.find(pivot.contigIndex()); + ContigPathMap::iterator path2It = paths.find(pivot.contigIndex()); if (path2It == paths.end()) continue; ContigPath path2 = path2It->second; @@ -545,13 +540,11 @@ static ContigID identifySubsumedPaths(const Lengths& lengths, continue; if (equalIgnoreAmbiguos(consensus, path)) { if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n'; out.insert(path2It->first); } else if (equalIgnoreAmbiguos(consensus, path2)) { // This path is larger. Use it as the seed. - return identifySubsumedPaths(lengths, path2It, paths, out, - overlaps); + return identifySubsumedPaths(lengths, path2It, paths, out, overlaps); } else if (isCycle(lengths, consensus)) { // The consensus path is a cycle. bool isCyclePath1 = isCycle(lengths, path); @@ -559,17 +552,15 @@ static ContigID identifySubsumedPaths(const Lengths& lengths, if (!isCyclePath1 && !isCyclePath2) { // Neither path is a cycle. if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << "ignored\t" << consensus << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' + << "ignored\t" << consensus << '\n'; overlaps.insert(id); overlaps.insert(path2It->first); } else { // At least one path is a cycle. if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << "cycle\t" << consensus << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' + << "cycle\t" << consensus << '\n'; if (isCyclePath1 && isCyclePath2) out.insert(path2It->first); else if (!isCyclePath1) @@ -579,9 +570,8 @@ static ContigID identifySubsumedPaths(const Lengths& lengths, } } else { if (gDebugPrint) - vout << get(g_contigNames, pivot) - << '\t' << path2 << '\n' - << "ignored\t" << consensus << '\n'; + vout << get(g_contigNames, pivot) << '\t' << path2 << '\n' + << "ignored\t" << consensus << '\n'; overlaps.insert(id); overlaps.insert(path2It->first); } @@ -595,19 +585,20 @@ static ContigID identifySubsumedPaths(const Lengths& lengths, * @param overlaps [out] paths that are found to overlap * @return the next iterator after path1it */ -static ContigPathMap::const_iterator removeSubsumedPaths( - const Lengths& lengths, - ContigPathMap::const_iterator path1It, ContigPathMap& paths, - ContigID& seed, set& overlaps) +static ContigPathMap::const_iterator +removeSubsumedPaths( + const Lengths& lengths, + ContigPathMap::const_iterator path1It, + ContigPathMap& paths, + ContigID& seed, + set& overlaps) { if (gDebugPrint) cout << '\n'; set eq; - seed = identifySubsumedPaths(lengths, - path1It, paths, eq, overlaps); + seed = identifySubsumedPaths(lengths, path1It, paths, eq, overlaps); ++path1It; - for (set::const_iterator it = eq.begin(); - it != eq.end(); ++it) { + for (set::const_iterator it = eq.begin(); it != eq.end(); ++it) { if (*it == path1It->first) ++path1It; paths.erase(*it); @@ -618,16 +609,14 @@ static ContigPathMap::const_iterator removeSubsumedPaths( /** Remove paths subsumed by another path. * @return paths that are found to overlap */ -static set removeSubsumedPaths(const Lengths& lengths, - ContigPathMap& paths) +static set +removeSubsumedPaths(const Lengths& lengths, ContigPathMap& paths) { set overlaps, seen; - for (ContigPathMap::const_iterator iter = paths.begin(); - iter != paths.end();) { + for (ContigPathMap::const_iterator iter = paths.begin(); iter != paths.end();) { if (seen.count(iter->first) == 0) { ContigID seed; - iter = removeSubsumedPaths(lengths, - iter, paths, seed, overlaps); + iter = removeSubsumedPaths(lengths, iter, paths, seed, overlaps); seen.insert(seed); } else ++iter; @@ -639,8 +628,8 @@ static set removeSubsumedPaths(const Lengths& lengths, * outgoing edges, (u,v1) and (u,v2), add the edge (v1,v2) if v1 < v2, * and add the edge (v2,v1) if v2 < v1. */ -static void addMissingEdges(const Lengths& lengths, - PathGraph& g, const ContigPathMap& paths) +static void +addMissingEdges(const Lengths& lengths, PathGraph& g, const ContigPathMap& paths) { typedef graph_traits::adjacency_iterator Vit; typedef graph_traits::vertex_iterator Uit; @@ -658,8 +647,7 @@ static void addMissingEdges(const Lengths& lengths, ++vit1; assert(v1 != u); ContigPath path1 = getPath(paths, v1); - if (find(path1.begin(), path1.end(), - ContigPath::value_type(u)) == path1.end()) + if (find(path1.begin(), path1.end(), ContigPath::value_type(u)) == path1.end()) continue; for (Vit vit2 = vit1; vit2 != vrange.second; ++vit2) { V v2 = *vit2; @@ -668,11 +656,9 @@ static void addMissingEdges(const Lengths& lengths, if (edge(v1, v2, g).second || edge(v2, v1, g).second) continue; ContigPath path2 = getPath(paths, v2); - if (find(path2.begin(), path2.end(), - ContigPath::value_type(u)) == path2.end()) + if (find(path2.begin(), path2.end(), ContigPath::value_type(u)) == path2.end()) continue; - numAdded += addOverlapEdge(lengths, - g, u, v1, path1, v2, path2); + numAdded += addOverlapEdge(lengths, g, u, v1, path1, v2, path2); } } } @@ -683,15 +669,15 @@ static void addMissingEdges(const Lengths& lengths, } /** Remove transitive edges. */ -static void removeTransitiveEdges(PathGraph& pathGraph) +static void +removeTransitiveEdges(PathGraph& pathGraph) { unsigned nbefore = num_edges(pathGraph); unsigned nremoved = remove_transitive_edges(pathGraph); unsigned nafter = num_edges(pathGraph); if (opt::verbose > 0) - cout << "Removed " << nremoved << " transitive edges of " - << nbefore << " edges leaving " - << nafter << " edges.\n"; + cout << "Removed " << nremoved << " transitive edges of " << nbefore << " edges leaving " + << nafter << " edges.\n"; assert(nbefore - nremoved == nafter); if (!opt::db.empty()) { addToDb(db, "Edges_init", nbefore); @@ -703,8 +689,8 @@ static void removeTransitiveEdges(PathGraph& pathGraph) * Remove the edge (u,v) if deg+(u) > 1 and deg-(v) > 1 and the * overlap of (u,v) is small. */ -static void removeSmallOverlaps(PathGraph& g, - const ContigPathMap& paths) +static void +removeSmallOverlaps(PathGraph& g, const ContigPathMap& paths) { typedef graph_traits::edge_descriptor E; typedef graph_traits::out_edge_iterator Eit; @@ -726,21 +712,20 @@ static void removeSmallOverlaps(PathGraph& g, if (in_degree(v, g) < 2) continue; ContigPath pathv = getPath(paths, v); - if (pathu.back() == pathv.front() - && paths.count(pathu.back().contigIndex()) > 0) + if (pathu.back() == pathv.front() && paths.count(pathu.back().contigIndex()) > 0) edges.push_back(uv); } } remove_edges(g, edges.begin(), edges.end()); if (opt::verbose > 0) - cout << "Removed " << edges.size() - << " small overlap edges.\n"; + cout << "Removed " << edges.size() << " small overlap edges.\n"; if (!opt::db.empty()) addToDb(db, "Edges_removed_small_overlap", edges.size()); } /** Output the path overlap graph. */ -static void outputPathGraph(PathGraph& pathGraph) +static void +outputPathGraph(PathGraph& pathGraph) { if (opt::graphPath.empty()) return; @@ -751,20 +736,23 @@ static void outputPathGraph(PathGraph& pathGraph) } /** Sort and output the specified paths. */ -static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths) +static void +outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths) { // Sort the paths. vector sortedPaths(paths.size()); - transform(paths.begin(), paths.end(), sortedPaths.begin(), - mem_var(&ContigPathMap::value_type::second)); + transform( + paths.begin(), + paths.end(), + sortedPaths.begin(), + mem_var(&ContigPathMap::value_type::second)); sort(sortedPaths.begin(), sortedPaths.end()); // Output the paths. ofstream fout(opt::out.c_str()); ostream& out = opt::out.empty() ? cout : fout; assert_good(out, opt::out); - for (vector::const_iterator it = sortedPaths.begin(); - it != sortedPaths.end(); ++it) + for (vector::const_iterator it = sortedPaths.begin(); it != sortedPaths.end(); ++it) out << createContigName() << '\t' << *it << '\n'; assert_good(out, opt::out); @@ -773,28 +761,24 @@ static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths } /** Assemble the path overlap graph. */ -static void assemblePathGraph(const Lengths& lengths, - PathGraph& pathGraph, ContigPathMap& paths) +static void +assemblePathGraph(const Lengths& lengths, PathGraph& pathGraph, ContigPathMap& paths) { ContigPaths seedPaths; assembleDFS(pathGraph, back_inserter(seedPaths)); - ContigPaths mergedPaths = mergeSeedPaths(lengths, - paths, seedPaths); + ContigPaths mergedPaths = mergeSeedPaths(lengths, paths, seedPaths); if (opt::verbose > 1) cout << '\n'; // Replace each path with the merged path. - for (ContigPaths::const_iterator it1 = seedPaths.begin(); - it1 != seedPaths.end(); ++it1) { + for (ContigPaths::const_iterator it1 = seedPaths.begin(); it1 != seedPaths.end(); ++it1) { const ContigPath& path(mergedPaths[it1 - seedPaths.begin()]); ContigPath pathrc(path); reverseComplement(pathrc.begin(), pathrc.end()); - for (ContigPath::const_iterator it2 = it1->begin(); - it2 != it1->end(); ++it2) { + for (ContigPath::const_iterator it2 = it1->begin(); it2 != it1->end(); ++it2) { ContigNode seed(*it2); if (find(path.begin(), path.end(), seed) != path.end()) { - paths[seed.contigIndex()] - = seed.sense() ? pathrc : path; + paths[seed.contigIndex()] = seed.sense() ? pathrc : path; } else { // This seed was not included in the merged path. } @@ -812,8 +796,8 @@ static void assemblePathGraph(const Lengths& lengths, } /** Read a set of paths from the specified file. */ -static ContigPathMap readPaths(const Lengths& lengths, - const string& filePath) +static ContigPathMap +readPaths(const Lengths& lengths, const string& filePath) { if (opt::verbose > 0) cerr << "Reading `" << filePath << "'..." << endl; @@ -833,17 +817,15 @@ static ContigPathMap readPaths(const Lengths& lengths, continue; } - bool inserted = paths.insert( - make_pair(id, path)).second; + bool inserted = paths.insert(make_pair(id, path)).second; assert(inserted); (void)inserted; } assert(in.eof()); if (opt::seedLen > 0) - cout << "Ignored " << tooSmall - << " paths whose seeds are shorter than " - << opt::seedLen << " bp.\n"; + cout << "Ignored " << tooSmall << " paths whose seeds are shorter than " << opt::seedLen + << " bp.\n"; return paths; } @@ -851,16 +833,17 @@ static ContigPathMap readPaths(const Lengths& lengths, * @return true if out != last */ template -bool atomicInc(T1& it, T2 last, T3& out) +bool +atomicInc(T1& it, T2 last, T3& out) { - #pragma omp critical(atomicInc) +#pragma omp critical(atomicInc) out = it == last ? it : it++; return out != last; } /** Build the path overlap graph. */ -static void buildPathGraph(const Lengths& lengths, - PathGraph& g, const ContigPathMap& paths) +static void +buildPathGraph(const Lengths& lengths, PathGraph& g, const ContigPathMap& paths) { // Create the vertices of the path overlap graph. PathGraph(lengths.size()).swap(g); @@ -874,11 +857,9 @@ static void buildPathGraph(const Lengths& lengths, // Find the overlapping paths. ContigPathMap::const_iterator sharedIt = paths.begin(); - #pragma omp parallel - for (ContigPathMap::const_iterator it; - atomicInc(sharedIt, paths.end(), it);) - findPathOverlaps(lengths, paths, - ContigNode(it->first, false), it->second, g); +#pragma omp parallel + for (ContigPathMap::const_iterator it; atomicInc(sharedIt, paths.end(), it);) + findPathOverlaps(lengths, paths, ContigNode(it->first, false), it->second, g); if (gDebugPrint) cout << '\n'; @@ -890,23 +871,23 @@ static void buildPathGraph(const Lengths& lengths, // graph statistics vector vals = passGraphStatsVal(g); - vector keys = make_vector() - << "V" - << "E" - << "degree0pctg" - << "degree1pctg" - << "degree234pctg" - << "degree5pctg" - << "degree_max"; + vector keys = make_vector() << "V" + << "E" + << "degree0pctg" + << "degree1pctg" + << "degree234pctg" + << "degree5pctg" + << "degree_max"; if (!opt::db.empty()) { - for (unsigned i=0; i> x; - opt::genomeSize = x; - break; - } - case 'g': arg >> opt::graphPath; break; - case 'j': arg >> opt::threads; break; - case 'k': arg >> opt::k; break; - case 'o': arg >> opt::out; break; - case 's': arg >> opt::seedLen; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; break; + case '?': + die = true; + break; + case 'G': { + double x; + arg >> x; + opt::genomeSize = x; + break; + } + case 'g': + arg >> opt::graphPath; + break; + case 'j': + arg >> opt::threads; + break; + case 'k': + arg >> opt::k; + break; + case 'o': + arg >> opt::out; + break; + case 's': + arg >> opt::seedLen; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -994,8 +993,7 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -1013,18 +1011,11 @@ int main(int argc, char** argv) cerr << "Reading `" << argv[optind] << "'..." << endl; if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars - ); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); } Lengths lengths = readContigLengths(argv[optind++]); - ContigPathMap originalPathMap = readPaths( - lengths, argv[optind++]); + ContigPathMap originalPathMap = readPaths(lengths, argv[optind++]); removeRepeats(originalPathMap); @@ -1042,16 +1033,13 @@ int main(int argc, char** argv) ContigPathMap resultsPathMap; #if _OPENMP ContigPathMap::iterator sharedIt = originalPathMap.begin(); - #pragma omp parallel - for (ContigPathMap::iterator it; - atomicInc(sharedIt, originalPathMap.end(), it);) - extendPaths(lengths, - it->first, originalPathMap, resultsPathMap); +#pragma omp parallel + for (ContigPathMap::iterator it; atomicInc(sharedIt, originalPathMap.end(), it);) + extendPaths(lengths, it->first, originalPathMap, resultsPathMap); #else - for (ContigPathMap::const_iterator it = originalPathMap.begin(); - it != originalPathMap.end(); ++it) - extendPaths(lengths, - it->first, originalPathMap, resultsPathMap); + for (ContigPathMap::const_iterator it = originalPathMap.begin(); it != originalPathMap.end(); + ++it) + extendPaths(lengths, it->first, originalPathMap, resultsPathMap); #endif if (gDebugPrint) cout << '\n'; @@ -1060,27 +1048,23 @@ int main(int argc, char** argv) if (gDebugPrint) cout << "\nRemoving redundant contigs\n"; - set overlaps = removeSubsumedPaths(lengths, - resultsPathMap); + set overlaps = removeSubsumedPaths(lengths, resultsPathMap); if (!overlaps.empty() && !repeats.empty()) { // Remove the newly-discovered repeat contigs from the // original paths. - for (set::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (set::const_iterator it = repeats.begin(); it != repeats.end(); ++it) originalPathMap.erase(*it); // Reassemble the paths that were found to overlap. if (gDebugPrint) { cout << "\nReassembling overlapping contigs:"; - for (set::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) + for (set::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) cout << ' ' << get(g_contigNames, *it); cout << '\n'; } - for (set::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) { + for (set::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) { if (originalPathMap.count(*it) == 0) continue; // repeat ContigPathMap::iterator oldIt = resultsPathMap.find(*it); @@ -1088,8 +1072,7 @@ int main(int argc, char** argv) continue; // subsumed ContigPath old = oldIt->second; resultsPathMap.erase(oldIt); - extendPaths(lengths, - *it, originalPathMap, resultsPathMap); + extendPaths(lengths, *it, originalPathMap, resultsPathMap); if (gDebugPrint) { if (resultsPathMap[*it] == old) cout << "no change\n"; @@ -1104,8 +1087,7 @@ int main(int argc, char** argv) overlaps = removeSubsumedPaths(lengths, resultsPathMap); if (!overlaps.empty() && gDebugPrint) { cout << "\nOverlapping contigs:"; - for (set::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) + for (set::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) cout << ' ' << get(g_contigNames, *it); cout << '\n'; } @@ -1117,21 +1099,23 @@ int main(int argc, char** argv) } /** Return the length of the specified contig in k-mer. */ -static unsigned getLength(const Lengths& lengths, - const ContigNode& u) +static unsigned +getLength(const Lengths& lengths, const ContigNode& u) { - return u.ambiguous() ? u.length() - : lengths.at(u.id()); + return u.ambiguous() ? u.length() : lengths.at(u.id()); } /** Functor to add the number of k-mer in two contigs. */ struct AddLength { - AddLength(const Lengths& lengths) : m_lengths(lengths) { } + AddLength(const Lengths& lengths) + : m_lengths(lengths) + {} unsigned operator()(unsigned addend, const ContigNode& u) const { return addend + getLength(m_lengths, u); } + private: const Lengths& m_lengths; }; @@ -1141,10 +1125,15 @@ struct AddLength * found. * @return true if an alignment is found */ -template -static bool alignCoordinates(const Lengths& lengths, - iterator& first1, iterator last1, - iterator& first2, iterator last2, oiterator& result) +template +static bool +alignCoordinates( + const Lengths& lengths, + iterator& first1, + iterator last1, + iterator& first2, + iterator last2, + oiterator& result) { oiterator out = result; @@ -1206,10 +1195,15 @@ static bool alignCoordinates(const Lengths& lengths, * the consensus at out if an alignment is found. * @return true if an alignment is found */ -template -static bool buildConsensus(const Lengths& lengths, - iterator it1, iterator it1e, - iterator it2, iterator it2e, oiterator& out) +template +static bool +buildConsensus( + const Lengths& lengths, + iterator it1, + iterator it1e, + iterator it2, + iterator it2e, + oiterator& out) { iterator it1b = it1 + 1; assert(!it1b->ambiguous()); @@ -1229,21 +1223,16 @@ static bool buildConsensus(const Lengths& lengths, unsigned ambiguous1 = it1->length(); unsigned ambiguous2 = it2a->length(); - unsigned unambiguous1 = accumulate(it1b, it1e, - 0, AddLength(lengths)); - unsigned unambiguous2 = accumulate(it2, it2a, - 0, AddLength(lengths)); - if (ambiguous1 < unambiguous2 - || ambiguous2 < unambiguous1) { + unsigned unambiguous1 = accumulate(it1b, it1e, 0, AddLength(lengths)); + unsigned unambiguous2 = accumulate(it2, it2a, 0, AddLength(lengths)); + if (ambiguous1 < unambiguous2 || ambiguous2 < unambiguous1) { // Two gaps overlap and either of the gaps is smaller // than the unambiguous sequence that overlaps the // gap. No alignment. return false; } - unsigned n = max(1U, - max(ambiguous2 - unambiguous1, - ambiguous1 - unambiguous2)); + unsigned n = max(1U, max(ambiguous2 - unambiguous1, ambiguous1 - unambiguous2)); out = copy(it2, it2a, out); *out++ = ContigNode(n, 'N'); out = copy(it1b, it1e, out); @@ -1255,10 +1244,16 @@ static bool buildConsensus(const Lengths& lengths, * in it1 and it2. * @return true if an alignment is found */ -template -static bool alignAtSeed(const Lengths& lengths, - iterator& it1, iterator it1e, iterator last1, - iterator& it2, iterator last2, oiterator& out) +template +static bool +alignAtSeed( + const Lengths& lengths, + iterator& it1, + iterator it1e, + iterator last1, + iterator& it2, + iterator last2, + oiterator& out) { assert(it1 != last1); assert(it1->ambiguous()); @@ -1270,11 +1265,10 @@ static bool alignAtSeed(const Lengths& lengths, // fewest number of contigs in the consensus sequence. unsigned bestLen = UINT_MAX; iterator bestIt2e; - for (iterator it2e = it2; - (it2e = find(it2e, last2, *it1e)) != last2; ++it2e) { + for (iterator it2e = it2; (it2e = find(it2e, last2, *it1e)) != last2; ++it2e) { oiterator myOut = out; - if (buildConsensus(lengths, it1, it1e, it2, it2e, myOut) - && align(lengths, it1e, last1, it2e, last2, myOut)) { + if (buildConsensus(lengths, it1, it1e, it2, it2e, myOut) && + align(lengths, it1e, last1, it2e, last2, myOut)) { unsigned len = myOut - out; if (len <= bestLen) { bestLen = len; @@ -1283,8 +1277,7 @@ static bool alignAtSeed(const Lengths& lengths, } } if (bestLen != UINT_MAX) { - bool good = buildConsensus(lengths, - it1, it1e, it2, bestIt2e, out); + bool good = buildConsensus(lengths, it1, it1e, it2, bestIt2e, out); assert(good); it1 = it1e; it2 = bestIt2e; @@ -1297,10 +1290,15 @@ static bool alignAtSeed(const Lengths& lengths, * The end of the alignment is returned in it1 and it2. * @return true if an alignment is found */ -template -static bool alignAmbiguous(const Lengths& lengths, - iterator& it1, iterator last1, - iterator& it2, iterator last2, oiterator& out) +template +static bool +alignAmbiguous( + const Lengths& lengths, + iterator& it1, + iterator last1, + iterator& it2, + iterator last2, + oiterator& out) { assert(it1 != last1); assert(it1->ambiguous()); @@ -1324,10 +1322,15 @@ static bool alignAmbiguous(const Lengths& lengths, * The end of the alignment is returned in it1 and it2. * @return true if an alignment is found */ -template -static bool alignOne(const Lengths& lengths, - iterator& it1, iterator last1, - iterator& it2, iterator last2, oiterator& out) +template +static bool +alignOne( + const Lengths& lengths, + iterator& it1, + iterator last1, + iterator& it2, + iterator last2, + oiterator& out) { // Check for a trivial alignment. unsigned n1 = last1 - it1, n2 = last2 - it2; @@ -1347,17 +1350,14 @@ static bool alignOne(const Lengths& lengths, return true; } - return - it1->ambiguous() && it2->ambiguous() - ? (it1->length() > it2->length() - ? alignAmbiguous(lengths, it1, last1, it2, last2, out) - : alignAmbiguous(lengths, it2, last2, it1, last1, out) - ) - : it1->ambiguous() - ? alignAmbiguous(lengths, it1, last1, it2, last2, out) - : it2->ambiguous() - ? alignAmbiguous(lengths, it2, last2, it1, last1, out) - : (*out++ = *it1, *it1++ == *it2++); + return it1->ambiguous() && it2->ambiguous() + ? (it1->length() > it2->length() + ? alignAmbiguous(lengths, it1, last1, it2, last2, out) + : alignAmbiguous(lengths, it2, last2, it1, last1, out)) + : it1->ambiguous() + ? alignAmbiguous(lengths, it1, last1, it2, last2, out) + : it2->ambiguous() ? alignAmbiguous(lengths, it2, last2, it1, last1, out) + : (*out++ = *it1, *it1++ == *it2++); } /** Align the ambiguous region [it1, last1) to [it2, last2) @@ -1365,10 +1365,15 @@ static bool alignOne(const Lengths& lengths, * @return the orientation of the alignment if an alignments is found * or zero otherwise */ -template -static dir_type align(const Lengths& lengths, - iterator it1, iterator last1, - iterator it2, iterator last2, oiterator& out) +template +static dir_type +align( + const Lengths& lengths, + iterator it1, + iterator last1, + iterator it2, + iterator last2, + oiterator& out) { assert(it1 != last1); assert(it2 != last2); @@ -1379,9 +1384,7 @@ static dir_type align(const Lengths& lengths, out = copy(it1, last1, out); out = copy(it2, last2, out); return it1 == last1 && it2 == last2 ? DIR_B - : it1 == last1 ? DIR_F - : it2 == last2 ? DIR_R - : DIR_X; + : it1 == last1 ? DIR_F : it2 == last2 ? DIR_R : DIR_X; } /** Find an equivalent region of the two specified paths, starting the @@ -1389,27 +1392,27 @@ static dir_type align(const Lengths& lengths, * @param[out] orientation the orientation of the alignment * @return the consensus sequence */ -static ContigPath align(const Lengths& lengths, - const ContigPath& p1, const ContigPath& p2, - ContigPath::const_iterator pivot1, - ContigPath::const_iterator pivot2, - dir_type& orientation) +static ContigPath +align( + const Lengths& lengths, + const ContigPath& p1, + const ContigPath& p2, + ContigPath::const_iterator pivot1, + ContigPath::const_iterator pivot2, + dir_type& orientation) { assert(*pivot1 == *pivot2); - ContigPath::const_reverse_iterator - rit1 = ContigPath::const_reverse_iterator(pivot1+1), - rit2 = ContigPath::const_reverse_iterator(pivot2+1); + ContigPath::const_reverse_iterator rit1 = ContigPath::const_reverse_iterator(pivot1 + 1), + rit2 = ContigPath::const_reverse_iterator(pivot2 + 1); ContigPath alignmentr(p1.rend() - rit1 + p2.rend() - rit2); ContigPath::iterator rout = alignmentr.begin(); - dir_type alignedr = align(lengths, - rit1, p1.rend(), rit2, p2.rend(), rout); + dir_type alignedr = align(lengths, rit1, p1.rend(), rit2, p2.rend(), rout); alignmentr.erase(rout, alignmentr.end()); ContigPath::const_iterator it1 = pivot1, it2 = pivot2; ContigPath alignmentf(p1.end() - it1 + p2.end() - it2); ContigPath::iterator fout = alignmentf.begin(); - dir_type alignedf = align(lengths, - it1, p1.end(), it2, p2.end(), fout); + dir_type alignedf = align(lengths, it1, p1.end(), it2, p2.end(), fout); alignmentf.erase(fout, alignmentf.end()); ContigPath consensus; @@ -1417,10 +1420,9 @@ static ContigPath align(const Lengths& lengths, // Found an alignment. assert(!alignmentf.empty()); assert(!alignmentr.empty()); - consensus.reserve(alignmentr.size()-1 + alignmentf.size()); - consensus.assign(alignmentr.rbegin(), alignmentr.rend()-1); - consensus.insert(consensus.end(), - alignmentf.begin(), alignmentf.end()); + consensus.reserve(alignmentr.size() - 1 + alignmentf.size()); + consensus.assign(alignmentr.rbegin(), alignmentr.rend() - 1); + consensus.insert(consensus.end(), alignmentf.begin(), alignmentf.end()); // Determine the orientation of the alignment. unsigned dirs = alignedr << 2 | alignedf; @@ -1452,15 +1454,14 @@ static ContigPath align(const Lengths& lengths, /** Return a pivot suitable for aligning the two paths if one exists, * otherwise return false. */ -static pair findPivot( - const ContigPath& path1, const ContigPath& path2) +static pair +findPivot(const ContigPath& path1, const ContigPath& path2) { - for (ContigPath::const_iterator it = path2.begin(); - it != path2.end(); ++it) { + for (ContigPath::const_iterator it = path2.begin(); it != path2.end(); ++it) { if (it->ambiguous()) continue; - if (count(path2.begin(), path2.end(), *it) == 1 - && count(path1.begin(), path1.end(), *it) == 1) + if (count(path2.begin(), path2.end(), *it) == 1 && + count(path1.begin(), path1.end(), *it) == 1) return make_pair(*it, true); } return make_pair(ContigNode(0), false); @@ -1470,9 +1471,13 @@ static pair findPivot( * @param[out] orientation the orientation of the alignment * @return the consensus sequence */ -static ContigPath align(const Lengths& lengths, - const ContigPath& path1, const ContigPath& path2, - ContigNode pivot, dir_type& orientation) +static ContigPath +align( + const Lengths& lengths, + const ContigPath& path1, + const ContigPath& path2, + ContigNode pivot, + dir_type& orientation) { if (&path1 == &path2) { // Ignore the trivial alignment when aligning a path to @@ -1482,24 +1487,20 @@ static ContigPath align(const Lengths& lengths, orientation = DIR_B; return path1; } else { - ContigPath::const_iterator it - = search(path1.begin(), path1.end(), - path2.begin(), path2.end()); + ContigPath::const_iterator it = + search(path1.begin(), path1.end(), path2.begin(), path2.end()); if (it != path1.end()) { // path2 is subsumed in path1. // Determine the orientation of the edge. - orientation - = it == path1.begin() ? DIR_R - : it + path2.size() == path1.end() ? DIR_F - : DIR_B; + orientation = + it == path1.begin() ? DIR_R : it + path2.size() == path1.end() ? DIR_F : DIR_B; return path1; } } // Find a suitable pivot. - if (find(path1.begin(), path1.end(), pivot) == path1.end() - || find(path2.begin(), path2.end(), pivot) - == path2.end()) { + if (find(path1.begin(), path1.end(), pivot) == path1.end() || + find(path2.begin(), path2.end(), pivot) == path2.end()) { bool good; tie(pivot, good) = findPivot(path1, path2); if (!good) @@ -1507,29 +1508,26 @@ static ContigPath align(const Lengths& lengths, } assert(find(path1.begin(), path1.end(), pivot) != path1.end()); - ContigPath::const_iterator it2 = find(path2.begin(), path2.end(), - pivot); + ContigPath::const_iterator it2 = find(path2.begin(), path2.end(), pivot); assert(it2 != path2.end()); if (&path1 != &path2) { // The seed must be unique in path2, unless we're aligning a // path to itself. - assert(count(it2+1, path2.end(), pivot) == 0); + assert(count(it2 + 1, path2.end(), pivot) == 0); } ContigPath consensus; for (ContigPath::const_iterator it1 = find_if( - path1.begin(), path1.end(), - bind2nd(equal_to(), pivot)); - it1 != path1.end(); - it1 = find_if(it1+1, path1.end(), - bind2nd(equal_to(), pivot))) { + path1.begin(), path1.end(), [&pivot](const ContigNode& c) { return c == pivot; }); + it1 != path1.end(); + it1 = + find_if(it1 + 1, path1.end(), [&pivot](const ContigNode& c) { return c == pivot; })) { if (&*it1 == &*it2) { // We are aligning a path to itself, and this is the // trivial alignment, which we'll ignore. continue; } - consensus = align(lengths, - path1, path2, it1, it2, orientation); + consensus = align(lengths, path1, path2, it1, it2, orientation); if (!consensus.empty()) return consensus; } @@ -1539,9 +1537,8 @@ static ContigPath align(const Lengths& lengths, /** Find an equivalent region of the two specified paths. * @return the consensus sequence */ -static ContigPath align(const Lengths& lengths, - const ContigPath& path1, const ContigPath& path2, - ContigNode pivot) +static ContigPath +align(const Lengths& lengths, const ContigPath& path1, const ContigPath& path2, ContigNode pivot) { dir_type orientation; return align(lengths, path1, path2, pivot, orientation); diff --git a/Misc/samtobreak.hs b/Misc/samtobreak.hs index e1839d3e3..fd70deff9 100644 --- a/Misc/samtobreak.hs +++ b/Misc/samtobreak.hs @@ -279,7 +279,7 @@ parseArgs = do where help = putStr (usageInfo usage options) >> exitSuccess tryHelp = "Try 'abyss-samtobreak --help' for more information." - version = "abyss-samtobreak (ABySS) 2.2.3\n" + version = "abyss-samtobreak (ABySS) 2.2.4\n" usage = "Usage: samtobreak [OPTION]... [FILE]...\n\ \Calculate contig and scaffold contiguity and correctness metrics.\n" diff --git a/ParseAligns/ParseAligns.cpp b/ParseAligns/ParseAligns.cpp index 75672f015..9b168d43c 100644 --- a/ParseAligns/ParseAligns.cpp +++ b/ParseAligns/ParseAligns.cpp @@ -27,67 +27,76 @@ using namespace std; #define PROGRAM "ParseAligns" static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Jared Simpson and Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Jared Simpson and Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... [FILE]...\n" -"Write pairs that map to the same contig to the file SAME.\n" -"Write pairs that map to different contigs to standard output.\n" -"Alignments may be read from FILE(s) or standard input.\n" -"\n" -" Options:\n" -"\n" -" -l, --min-align=N minimum alignment length\n" -" -d, --dist=DISTANCE write distance estimates to this file\n" -" -f, --frag=SAME write fragment sizes to this file\n" -" -h, --hist=FILE write the fragment size histogram to FILE\n" -" --sam alignments are in SAM format\n" -" --kaligner alignments are in KAligner format\n" -" -c, --cover=COVERAGE coverage cut-off for distance estimates\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... [FILE]...\n" + "Write pairs that map to the same contig to the file SAME.\n" + "Write pairs that map to different contigs to standard output.\n" + "Alignments may be read from FILE(s) or standard input.\n" + "\n" + " Options:\n" + "\n" + " -l, --min-align=N minimum alignment length\n" + " -d, --dist=DISTANCE write distance estimates to this file\n" + " -f, --frag=SAME write fragment sizes to this file\n" + " -h, --hist=FILE write the fragment size histogram to FILE\n" + " --sam alignments are in SAM format\n" + " --kaligner alignments are in KAligner format\n" + " -c, --cover=COVERAGE coverage cut-off for distance estimates\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by DistanceEst - static unsigned c; - static int verbose; - static string distPath; - static string fragPath; - static string histPath; - - /** Input alignment format. */ - static int inputFormat; - enum { KALIGNER, SAM }; - - /** Output format */ - int format = ADJ; // used by Estimate +unsigned k; // used by DistanceEst +static unsigned c; +static int verbose; +static string distPath; +static string fragPath; +static string histPath; + +/** Input alignment format. */ +static int inputFormat; +enum +{ + KALIGNER, + SAM +}; + +/** Output format */ +int format = ADJ; // used by Estimate } static const char shortopts[] = "d:l:f:h:c:v"; -enum { OPT_HELP = 1, OPT_VERSION }; +enum +{ + OPT_HELP = 1, + OPT_VERSION +}; static const struct option longopts[] = { - { "dist", required_argument, NULL, 'd' }, + { "dist", required_argument, NULL, 'd' }, { "min-align", required_argument, NULL, 'l' }, - { "frag", required_argument, NULL, 'f' }, - { "hist", required_argument, NULL, 'h' }, - { "kaligner",no_argument, &opt::inputFormat, opt::KALIGNER }, - { "sam", no_argument, &opt::inputFormat, opt::SAM }, - { "cover", required_argument, NULL, 'c' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, + { "frag", required_argument, NULL, 'f' }, + { "hist", required_argument, NULL, 'h' }, + { "kaligner", no_argument, &opt::inputFormat, opt::KALIGNER }, + { "sam", no_argument, &opt::inputFormat, opt::SAM }, + { "cover", required_argument, NULL, 'c' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, { NULL, 0, NULL, 0 } }; -static struct { +static struct +{ size_t alignments; size_t bothUnaligned; size_t oneUnaligned; @@ -109,14 +118,17 @@ typedef unordered_map ReadAlignMap; typedef unordered_map EstimateMap; static EstimateMap estMap; -static bool checkUniqueAlignments(const AlignmentVector& alignVec); -static string makePairID(string id); +static bool +checkUniqueAlignments(const AlignmentVector& alignVec); +static string +makePairID(string id); /** * Return the size of the fragment demarcated by the specified * alignments. */ -static int fragmentSize(const Alignment& a0, const Alignment& a1) +static int +fragmentSize(const Alignment& a0, const Alignment& a1) { assert(a0.contig == a1.contig); assert(a0.isRC != a1.isRC); @@ -128,17 +140,16 @@ static int fragmentSize(const Alignment& a0, const Alignment& a1) typedef pair Estimate; typedef vector Estimates; -static void addEstimate(EstimateMap& map, const Alignment& a, - Estimate& est, bool reverse) +static void +addEstimate(EstimateMap& map, const Alignment& a, Estimate& est, bool reverse) { - //count up the number of estimates that agree + // count up the number of estimates that agree bool placed = false; bool a_isRC = a.isRC != reverse; EstimateMap::iterator estimatesIt = map.find(a.contig); if (estimatesIt != map.end()) { Estimates& estimates = estimatesIt->second.estimates[a_isRC]; - for (Estimates::iterator estIt = estimates.begin(); - estIt != estimates.end(); ++estIt) { + for (Estimates::iterator estIt = estimates.begin(); estIt != estimates.end(); ++estIt) { if (estIt->first.id() == est.first.id()) { estIt->second.numPairs++; estIt->second.distance += est.second.distance; @@ -149,10 +160,10 @@ static void addEstimate(EstimateMap& map, const Alignment& a, } if (!placed) map[a.contig].estimates[a_isRC].push_back(est); - } -static void doReadIntegrity(const ReadAlignMap::value_type& a) +static void +doReadIntegrity(const ReadAlignMap::value_type& a) { AlignmentVector::const_iterator refAlignIter = a.second.begin(); unsigned firstStart, lastEnd, largestSize; @@ -164,16 +175,14 @@ static void doReadIntegrity(const ReadAlignMap::value_type& a) first = last = largest = *refAlignIter; ++refAlignIter; - //for each alignment in the vector a.second + // for each alignment in the vector a.second for (; refAlignIter != a.second.end(); ++refAlignIter) { if ((unsigned)refAlignIter->read_start_pos < firstStart) { firstStart = refAlignIter->read_start_pos; first = *refAlignIter; } - if ((unsigned)(refAlignIter->read_start_pos + - refAlignIter->align_length) > lastEnd) { - lastEnd = refAlignIter->read_start_pos + - refAlignIter->align_length; + if ((unsigned)(refAlignIter->read_start_pos + refAlignIter->align_length) > lastEnd) { + lastEnd = refAlignIter->read_start_pos + refAlignIter->align_length; last = *refAlignIter; } if ((unsigned)refAlignIter->align_length > largestSize) { @@ -184,27 +193,20 @@ static void doReadIntegrity(const ReadAlignMap::value_type& a) if (largest.contig != last.contig) { Estimate est; - unsigned largest_end = - largest.read_start_pos + largest.align_length - opt::k; + unsigned largest_end = largest.read_start_pos + largest.align_length - opt::k; int distance = last.read_start_pos - largest_end; - est.first = find_vertex( - last.contig, largest.isRC != last.isRC, - g_contigNames); + est.first = find_vertex(last.contig, largest.isRC != last.isRC, g_contigNames); est.second.distance = distance - opt::k; est.second.numPairs = 1; est.second.stdDev = 0; addEstimate(estMap, largest, est, false); } - if (largest.contig != first.contig && - largest.contig != last.contig) { + if (largest.contig != first.contig && largest.contig != last.contig) { Estimate est; - unsigned first_end = - first.read_start_pos + first.align_length - opt::k; + unsigned first_end = first.read_start_pos + first.align_length - opt::k; int distance = last.read_start_pos - first_end; - est.first = find_vertex( - last.contig, first.isRC != last.isRC, - g_contigNames); + est.first = find_vertex(last.contig, first.isRC != last.isRC, g_contigNames); est.second.distance = distance - opt::k; est.second.numPairs = 1; est.second.stdDev = 0; @@ -215,12 +217,9 @@ static void doReadIntegrity(const ReadAlignMap::value_type& a) largest.flipQuery(); first.flipQuery(); Estimate est; - unsigned largest_end = - largest.read_start_pos + largest.align_length - opt::k; + unsigned largest_end = largest.read_start_pos + largest.align_length - opt::k; int distance = first.read_start_pos - largest_end; - est.first = find_vertex( - first.contig, largest.isRC != first.isRC, - g_contigNames); + est.first = find_vertex(first.contig, largest.isRC != first.isRC, g_contigNames); est.second.distance = distance - opt::k; est.second.numPairs = 1; est.second.stdDev = 0; @@ -259,31 +258,27 @@ static void doReadIntegrity(const ReadAlignMap::value_type& a) #endif } -static void generateDistFile() +static void +generateDistFile() { ofstream distFile(opt::distPath.c_str()); assert(distFile.is_open()); - for (EstimateMap::iterator mapIt = estMap.begin(); - mapIt != estMap.end(); ++mapIt) { - //Skip empty iterators + for (EstimateMap::iterator mapIt = estMap.begin(); mapIt != estMap.end(); ++mapIt) { + // Skip empty iterators assert(!mapIt->second.estimates[0].empty() || !mapIt->second.estimates[1].empty()); distFile << mapIt->first; for (int refIsRC = 0; refIsRC <= 1; refIsRC++) { if (refIsRC) distFile << " ;"; - for (Estimates::iterator vecIt - = mapIt->second.estimates[refIsRC].begin(); - vecIt != mapIt->second.estimates[refIsRC].end(); ++vecIt) { - vecIt->second.distance - = (int)round((double)vecIt->second.distance / - (double)vecIt->second.numPairs); - if (vecIt->second.numPairs >= opt::c - && vecIt->second.numPairs != 0 - /*&& vecIt->distance > 1 - opt::k*/) - distFile - << ' ' << get(g_contigNames, vecIt->first) - << ',' << vecIt->second; + for (Estimates::iterator vecIt = mapIt->second.estimates[refIsRC].begin(); + vecIt != mapIt->second.estimates[refIsRC].end(); + ++vecIt) { + vecIt->second.distance = + (int)round((double)vecIt->second.distance / (double)vecIt->second.numPairs); + if (vecIt->second.numPairs >= opt::c && vecIt->second.numPairs != 0 + /*&& vecIt->distance > 1 - opt::k*/) + distFile << ' ' << get(g_contigNames, vecIt->first) << ',' << vecIt->second; } } distFile << '\n'; @@ -292,8 +287,10 @@ static void generateDistFile() distFile.close(); } -static bool isSingleEnd(const string& id); -static bool needsFlipping(const string& id); +static bool +isSingleEnd(const string& id); +static bool +needsFlipping(const string& id); /** * Return an alignment flipped as necessary to produce an alignment @@ -302,14 +299,14 @@ static bool needsFlipping(const string& id); * alignment, so that the alignment is forward-reverse, which is * required by DistanceEst. */ -static const Alignment flipAlignment(const Alignment& a, - const string& id) +static const Alignment +flipAlignment(const Alignment& a, const string& id) { return needsFlipping(id) ? a.flipQuery() : a; } -static void handleAlignmentPair(const ReadAlignMap::value_type& curr, - const ReadAlignMap::value_type& pair) +static void +handleAlignmentPair(const ReadAlignMap::value_type& curr, const ReadAlignMap::value_type& pair) { const string& currID = curr.first; const string& pairID = pair.first; @@ -323,31 +320,24 @@ static void handleAlignmentPair(const ReadAlignMap::value_type& curr, stats.bothUnaligned++; } else if (curr.second.empty() || pair.second.empty()) { stats.oneUnaligned++; - } else if (!checkUniqueAlignments(curr.second) - || !checkUniqueAlignments(pair.second)) { + } else if (!checkUniqueAlignments(curr.second) || !checkUniqueAlignments(pair.second)) { stats.numMulti++; - } else if (curr.second.size() > MAX_SPAN - && pair.second.size() > MAX_SPAN) { + } else if (curr.second.size() > MAX_SPAN && pair.second.size() > MAX_SPAN) { stats.numSplit++; } else { // Iterate over the vectors, outputting the aligments bool counted = false; - for (AlignmentVector::const_iterator refAlignIter - = curr.second.begin(); - refAlignIter != curr.second.end(); ++refAlignIter) { - for (AlignmentVector::const_iterator pairAlignIter - = pair.second.begin(); - pairAlignIter != pair.second.end(); - ++pairAlignIter) { - const Alignment& a0 = flipAlignment(*refAlignIter, - currID); - const Alignment& a1 = flipAlignment(*pairAlignIter, - pairID); + for (AlignmentVector::const_iterator refAlignIter = curr.second.begin(); + refAlignIter != curr.second.end(); + ++refAlignIter) { + for (AlignmentVector::const_iterator pairAlignIter = pair.second.begin(); + pairAlignIter != pair.second.end(); + ++pairAlignIter) { + const Alignment& a0 = flipAlignment(*refAlignIter, currID); + const Alignment& a1 = flipAlignment(*pairAlignIter, pairID); bool sameTarget = a0.contig == a1.contig; - if (sameTarget - && curr.second.size() == 1 - && pair.second.size() == 1) { + if (sameTarget && curr.second.size() == 1 && pair.second.size() == 1) { // Same target and the only alignment. if (a0.isRC != a1.isRC) { // Correctly oriented. Add this alignment to @@ -363,11 +353,9 @@ static void handleAlignmentPair(const ReadAlignMap::value_type& curr, counted = true; } - bool outputSameTarget = opt::fragPath.empty() - && opt::histPath.empty(); + bool outputSameTarget = opt::fragPath.empty() && opt::histPath.empty(); if (!sameTarget || outputSameTarget) { - cout << SAMRecord(a0, a1) << '\n' - << SAMRecord(a1, a0) << '\n'; + cout << SAMRecord(a0, a1) << '\n' << SAMRecord(a1, a0) << '\n'; assert(cout.good()); } } @@ -377,7 +365,8 @@ static void handleAlignmentPair(const ReadAlignMap::value_type& curr, } } -static void printProgress(const ReadAlignMap& map) +static void +printProgress(const ReadAlignMap& map) { if (opt::verbose == 0) return; @@ -390,16 +379,16 @@ static void printProgress(const ReadAlignMap& map) if (stats.alignments % 1000000 == 0 || buckets != prevBuckets) { prevBuckets = buckets; size_t size = map.size(); - cerr << "Read " << stats.alignments << " alignments. " - "Hash load: " << size << " / " << buckets - << " = " << (float)size / buckets - << " using " << toSI(getMemoryUsage()) << "B." << endl; + cerr << "Read " << stats.alignments + << " alignments. " + "Hash load: " + << size << " / " << buckets << " = " << (float)size / buckets << " using " + << toSI(getMemoryUsage()) << "B." << endl; } } -static void handleAlignment( - const ReadAlignMap::value_type& alignments, - ReadAlignMap& out) +static void +handleAlignment(const ReadAlignMap::value_type& alignments, ReadAlignMap& out) { if (!isSingleEnd(alignments.first)) { string pairID = makePairID(alignments.first); @@ -408,8 +397,7 @@ static void handleAlignment( handleAlignmentPair(*pairIter, alignments); out.erase(pairIter); } else if (!out.insert(alignments).second) { - cerr << "error: duplicate read ID `" << alignments.first - << "'\n"; + cerr << "error: duplicate read ID `" << alignments.first << "'\n"; exit(EXIT_FAILURE); } } @@ -421,13 +409,13 @@ static void handleAlignment( printProgress(out); } -static void readAlignment(const string& line, ReadAlignMap& out) +static void +readAlignment(const string& line, ReadAlignMap& out) { istringstream s(line); pair v; switch (opt::inputFormat) { - case opt::SAM: - { + case opt::SAM: { SAMRecord sam; s >> sam; assert(s); @@ -439,23 +427,21 @@ static void readAlignment(const string& line, ReadAlignMap& out) if (!sam.isUnmapped()) v.second.push_back(sam); break; - } - case opt::KALIGNER: - { + } + case opt::KALIGNER: { s >> v.first; assert(s); v.second.reserve(count(line.begin(), line.end(), '\t')); - v.second.assign( - istream_iterator(s), - istream_iterator()); + v.second.assign(istream_iterator(s), istream_iterator()); assert(s.eof()); break; - } + } } handleAlignment(v, out); } -static void readAlignments(istream& in, ReadAlignMap* pout) +static void +readAlignments(istream& in, ReadAlignMap* pout) { for (string line; getline(in, line);) if (line.empty() || line[0] == '@') @@ -465,7 +451,8 @@ static void readAlignments(istream& in, ReadAlignMap* pout) assert(in.eof()); } -static void readAlignmentsFile(string path, ReadAlignMap* pout) +static void +readAlignmentsFile(string path, ReadAlignMap* pout) { if (opt::verbose > 0) cerr << "Reading `" << path << "'..." << endl; @@ -476,51 +463,65 @@ static void readAlignmentsFile(string path, ReadAlignMap* pout) } /** Return the specified number formatted as a percent. */ -static string percent(size_t x, size_t n) +static string +percent(size_t x, size_t n) { ostringstream ss; ss << setw((int)log10(n) + 1) << x; if (x > 0) - ss << " " << setprecision(3) << (float)100*x/n << '%'; + ss << " " << setprecision(3) << (float)100 * x / n << '%'; return ss.str(); } -int main(int argc, char* const* argv) +int +main(int argc, char* const* argv) { bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'l': arg >> opt::k; break; - case 'c': arg >> opt::c; break; - case 'd': arg >> opt::distPath; break; - case 'f': arg >> opt::fragPath; break; - case 'h': arg >> opt::histPath; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case '?': + die = true; + break; + case 'l': + arg >> opt::k; + break; + case 'c': + arg >> opt::c; + break; + case 'd': + arg >> opt::distPath; + break; + case 'f': + arg >> opt::fragPath; + break; + case 'h': + arg >> opt::histPath; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::k <= 0 && opt::inputFormat == opt::KALIGNER) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -531,8 +532,9 @@ int main(int argc, char* const* argv) ReadAlignMap alignTable(1); if (optind < argc) { - for_each(argv + optind, argv + argc, - bind2nd(ptr_fun(readAlignmentsFile), &alignTable)); + for_each(argv + optind, argv + argc, [&alignTable](const std::string& s) { + readAlignmentsFile(s, &alignTable); + }); } else { if (opt::verbose > 0) cerr << "Reading from standard input..." << endl; @@ -543,21 +545,36 @@ int main(int argc, char* const* argv) unsigned numRF = histogram.count(INT_MIN, 0); unsigned numFR = histogram.count(1, INT_MAX); - size_t sum = alignTable.size() - + stats.bothUnaligned + stats.oneUnaligned - + numFR + numRF + stats.numFF - + stats.numDifferent + stats.numMulti + stats.numSplit; - cerr << - "Mateless " << percent(alignTable.size(), sum) << "\n" - "Unaligned " << percent(stats.bothUnaligned, sum) << "\n" - "Singleton " << percent(stats.oneUnaligned, sum) << "\n" - "FR " << percent(numFR, sum) << "\n" - "RF " << percent(numRF, sum) << "\n" - "FF " << percent(stats.numFF, sum) << "\n" - "Different " << percent(stats.numDifferent, sum) << "\n" - "Multimap " << percent(stats.numMulti, sum) << "\n" - "Split " << percent(stats.numSplit, sum) << "\n" - "Total " << sum << endl; + size_t sum = alignTable.size() + stats.bothUnaligned + stats.oneUnaligned + numFR + numRF + + stats.numFF + stats.numDifferent + stats.numMulti + stats.numSplit; + cerr << "Mateless " << percent(alignTable.size(), sum) + << "\n" + "Unaligned " + << percent(stats.bothUnaligned, sum) + << "\n" + "Singleton " + << percent(stats.oneUnaligned, sum) + << "\n" + "FR " + << percent(numFR, sum) + << "\n" + "RF " + << percent(numRF, sum) + << "\n" + "FF " + << percent(stats.numFF, sum) + << "\n" + "Different " + << percent(stats.numDifferent, sum) + << "\n" + "Multimap " + << percent(stats.numMulti, sum) + << "\n" + "Split " + << percent(stats.numSplit, sum) + << "\n" + "Total " + << sum << endl; if (!opt::distPath.empty()) generateDistFile(); @@ -580,17 +597,25 @@ int main(int argc, char* const* argv) histogram.removeOutliers(); Histogram h = histogram.trimFraction(0.0001); if (opt::verbose > 0) - cerr << "Stats mean: " << setprecision(4) << h.mean() << " " - "median: " << setprecision(4) << h.median() << " " - "sd: " << setprecision(4) << h.sd() << " " - "n: " << h.size() << " " - "min: " << h.minimum() << " max: " << h.maximum() << '\n' - << h.barplot() << endl; + cerr << "Stats mean: " << setprecision(4) << h.mean() + << " " + "median: " + << setprecision(4) << h.median() + << " " + "sd: " + << setprecision(4) << h.sd() + << " " + "n: " + << h.size() + << " " + "min: " + << h.minimum() << " max: " << h.maximum() << '\n' + << h.barplot() << endl; if (stats.numFF > numFR && stats.numFF > numRF) { cerr << "error: The mate pairs of this library are oriented " - "forward-forward (FF), which is not supported by ABySS." - << endl; + "forward-forward (FF), which is not supported by ABySS." + << endl; exit(EXIT_FAILURE); } @@ -599,7 +624,8 @@ int main(int argc, char* const* argv) /** Return whether any k-mer in the query is aligned more than once. */ -static bool checkUniqueAlignments(const AlignmentVector& alignVec) +static bool +checkUniqueAlignments(const AlignmentVector& alignVec) { assert(!alignVec.empty()); if (alignVec.size() == 1) @@ -608,11 +634,9 @@ static bool checkUniqueAlignments(const AlignmentVector& alignVec) unsigned nKmer = alignVec.front().read_length - opt::k + 1; vector coverage(nKmer); - for (AlignmentVector::const_iterator iter = alignVec.begin(); - iter != alignVec.end(); ++iter) { + for (AlignmentVector::const_iterator iter = alignVec.begin(); iter != alignVec.end(); ++iter) { assert((unsigned)iter->align_length >= opt::k); - unsigned end = iter->read_start_pos - + iter->align_length - opt::k + 1; + unsigned end = iter->read_start_pos + iter->align_length - opt::k + 1; assert(end <= nKmer); for (unsigned i = iter->read_start_pos; i < end; i++) coverage[i]++; @@ -624,31 +648,30 @@ static bool checkUniqueAlignments(const AlignmentVector& alignVec) return true; } -static bool replaceSuffix(string& s, - const string& suffix0, const string& suffix1) +static bool +replaceSuffix(string& s, const string& suffix0, const string& suffix1) { if (endsWith(s, suffix0)) { - s.replace(s.length() - suffix0.length(), string::npos, - suffix1); + s.replace(s.length() - suffix0.length(), string::npos, suffix1); return true; } else if (endsWith(s, suffix1)) { - s.replace(s.length() - suffix1.length(), string::npos, - suffix0); + s.replace(s.length() - suffix1.length(), string::npos, suffix0); return true; } else return false; } /** Return true if the specified read ID is of a single-end read. */ -static bool isSingleEnd(const string& id) +static bool +isSingleEnd(const string& id) { unsigned l = id.length(); - return endsWith(id, ".fn") - || (l > 6 && id.substr(l-6, 5) == ".part"); + return endsWith(id, ".fn") || (l > 6 && id.substr(l - 6, 5) == ".part"); } /** Return the mate ID of the specified read ID. */ -static string makePairID(string id) +static string +makePairID(string id) { if (equal(id.begin(), id.begin() + 3, "SRR")) return id; @@ -656,27 +679,44 @@ static string makePairID(string id) assert(!id.empty()); char& c = id[id.length() - 1]; switch (c) { - case '1': c = '2'; return id; - case '2': c = '1'; return id; - case 'A': c = 'B'; return id; - case 'B': c = 'A'; return id; - case 'F': c = 'R'; return id; - case 'R': c = 'F'; return id; - case 'f': c = 'r'; return id; - case 'r': c = 'f'; return id; + case '1': + c = '2'; + return id; + case '2': + c = '1'; + return id; + case 'A': + c = 'B'; + return id; + case 'B': + c = 'A'; + return id; + case 'F': + c = 'R'; + return id; + case 'R': + c = 'F'; + return id; + case 'f': + c = 'r'; + return id; + case 'r': + c = 'f'; + return id; } - if (replaceSuffix(id, "forward", "reverse") - || replaceSuffix(id, "F3", "R3")) + if (replaceSuffix(id, "forward", "reverse") || replaceSuffix(id, "F3", "R3")) return id; - cerr << "error: read ID `" << id << "' must end in one of\n" - "\t1 and 2 or A and B or F and R or" - " F3 and R3 or forward and reverse\n"; + cerr << "error: read ID `" << id + << "' must end in one of\n" + "\t1 and 2 or A and B or F and R or" + " F3 and R3 or forward and reverse\n"; exit(EXIT_FAILURE); } -static bool needsFlipping(const string& id) +static bool +needsFlipping(const string& id) { return endsWith(id, "F3"); } diff --git a/ParseAligns/abyss-fixmate.cc b/ParseAligns/abyss-fixmate.cc index 83c5e182a..bb9fba381 100644 --- a/ParseAligns/abyss-fixmate.cc +++ b/ParseAligns/abyss-fixmate.cc @@ -1,3 +1,6 @@ +#include "ContigID.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "Histogram.h" #include "IOUtil.h" #include "MemoryUtil.h" @@ -5,8 +8,8 @@ #include "StringUtil.h" #include "Uncompress.h" #include "UnorderedMap.h" -#include "ContigID.h" #include +#include #include #include #include @@ -14,9 +17,6 @@ #include #include #include -#include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; @@ -25,47 +25,47 @@ using namespace std; DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " [OPTION]... [FILE]...\n" -"Write read pairs that map to the same contig to the file SAME.\n" -"Write read pairs that map to different contigs to stdout.\n" -"Alignments may be in FILE(s) or standard input.\n" -"\n" -" Options:\n" -"\n" -" --no-qname set the qname to * [default]\n" -" --qname do not alter the qname\n" -" --all print all alignments\n" -" --diff print alignments that align to different\n" -" contigs [default]\n" -" -l, --min-align=N the minimal alignment size [1]\n" -" -s, --same=SAME write properly-paired reads to this file\n" -" -h, --hist=FILE write the fragment size histogram to FILE\n" -" -c, --cov=FILE write the physical coverage to FILE\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for sqlite\n" -" --strain=NAME specify strain NAME for sqlite\n" -" --species=NAME specify species NAME for sqlite\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " [OPTION]... [FILE]...\n" + "Write read pairs that map to the same contig to the file SAME.\n" + "Write read pairs that map to different contigs to stdout.\n" + "Alignments may be in FILE(s) or standard input.\n" + "\n" + " Options:\n" + "\n" + " --no-qname set the qname to * [default]\n" + " --qname do not alter the qname\n" + " --all print all alignments\n" + " --diff print alignments that align to different\n" + " contigs [default]\n" + " -l, --min-align=N the minimal alignment size [1]\n" + " -s, --same=SAME write properly-paired reads to this file\n" + " -h, --hist=FILE write the fragment size histogram to FILE\n" + " -c, --cov=FILE write the physical coverage to FILE\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for sqlite\n" + " --strain=NAME specify strain NAME for sqlite\n" + " --species=NAME specify species NAME for sqlite\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - static string fragPath; - static string histPath; - static string covPath; - static int qname; - static int verbose; - static int print_all; +string db; +dbVars metaVars; +static string fragPath; +static string histPath; +static string covPath; +static int qname; +static int verbose; +static int print_all; } // for sqlite params @@ -74,28 +74,35 @@ static vector vals; static const char shortopts[] = "h:c:l:s:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; - -static const struct option longopts[] = { - { "qname", no_argument, &opt::qname, 1 }, - { "no-qname", no_argument, &opt::qname, 0 }, - { "all", no_argument, &opt::print_all, 1 }, - { "diff", no_argument, &opt::print_all, 0 }, - { "min-align", required_argument, NULL, 'l' }, - { "hist", required_argument, NULL, 'h' }, - { "cov", required_argument, NULL, 'c' }, - { "same", required_argument, NULL, 's' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; -static struct { +static const struct option longopts[] = { { "qname", no_argument, &opt::qname, 1 }, + { "no-qname", no_argument, &opt::qname, 0 }, + { "all", no_argument, &opt::print_all, 1 }, + { "diff", no_argument, &opt::print_all, 0 }, + { "min-align", required_argument, NULL, 'l' }, + { "hist", required_argument, NULL, 'h' }, + { "cov", required_argument, NULL, 'c' }, + { "same", required_argument, NULL, 's' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; + +static struct +{ size_t alignments; size_t bothUnaligned; size_t oneUnaligned; @@ -106,9 +113,10 @@ static struct { static ofstream g_fragFile; static Histogram g_histogram; static ofstream g_covFile; -static vector< vector > g_contigCov; +static vector> g_contigCov; -static void incrementRange(SAMRecord& a) +static void +incrementRange(SAMRecord& a) { unsigned inx = get(g_contigNames, a.rname); g_contigCov[inx][a.pos]++; @@ -119,14 +127,12 @@ static void incrementRange(SAMRecord& a) g_contigCov[inx][end]--; } -static void handlePair(SAMRecord& a0, SAMRecord& a1) +static void +handlePair(SAMRecord& a0, SAMRecord& a1) { - if ((a0.isRead1() && a1.isRead1()) - || (a0.isRead2() && a1.isRead2())) { - cerr << "error: duplicate read ID `" << a0.qname - << (a0.isRead1() ? "/1" : "") - << (a0.isRead2() ? "/2" : "") - << "'\n"; + if ((a0.isRead1() && a1.isRead1()) || (a0.isRead2() && a1.isRead2())) { + cerr << "error: duplicate read ID `" << a0.qname << (a0.isRead1() ? "/1" : "") + << (a0.isRead2() ? "/2" : "") << "'\n"; exit(EXIT_FAILURE); } @@ -175,7 +181,8 @@ typedef boost::unordered_map Alignments; typedef boost::unordered_map Alignments; #endif -static void printProgress(const Alignments& map) +static void +printProgress(const Alignments& map) { if (opt::verbose == 0) return; @@ -189,16 +196,15 @@ static void printProgress(const Alignments& map) prevBuckets = buckets; size_t size = map.size(); cerr << "Read " << stats.alignments << " alignments. " - << "Hash load: " << size << " / " << buckets - << " = " << (float)size / buckets - << " using " << toSI(getMemoryUsage()) << "B." << endl; + << "Hash load: " << size << " / " << buckets << " = " << (float)size / buckets + << " using " << toSI(getMemoryUsage()) << "B." << endl; } } -static void handleAlignment(SAMRecord& sam, Alignments& map) +static void +handleAlignment(SAMRecord& sam, Alignments& map) { - pair it = map.insert( - make_pair(sam.qname, sam)); + pair it = map.insert(make_pair(sam.qname, sam)); if (!it.second) { #if SAM_SEQ_QUAL SAMRecord& a0 = it.first->second; @@ -222,7 +228,8 @@ static void handleAlignment(SAMRecord& sam, Alignments& map) printProgress(map); } -static void assert_eof(istream& in) +static void +assert_eof(istream& in) { if (in.eof()) return; @@ -234,21 +241,26 @@ static void assert_eof(istream& in) } /** Print physical coverage in wiggle format. */ -static void printCov(string file) +static void +printCov(string file) { ofstream out(file.c_str()); for (unsigned i = 0; i < g_contigCov.size(); i++) { out << "variableStep\tchrom=" << get(g_contigNames, i) << '\n'; int prev = g_contigCov[i][0]; - if (prev != 0) out << "0\t" << prev << '\n'; + if (prev != 0) + out << "0\t" << prev << '\n'; for (unsigned j = 1; j < g_contigCov[i].size(); j++) { prev += g_contigCov[i][j]; - if (prev != 0) out << j << "\t" << prev << '\n'; + if (prev != 0) + out << j << "\t" << prev << '\n'; } } } -void parseTag(string line) { +void +parseTag(string line) +{ stringstream ss(line); string tag; char type[2]; @@ -269,10 +281,11 @@ void parseTag(string line) { assert(length > 0); assert(id.size() > 0); put(g_contigNames, g_contigCov.size(), id); - g_contigCov.push_back(vector(length)); + g_contigCov.push_back(vector(length)); } -static void readAlignments(istream& in, Alignments* pMap) +static void +readAlignments(istream& in, Alignments* pMap) { for (SAMRecord sam; in >> ws;) { if (in.peek() == '@') { @@ -294,7 +307,8 @@ static void readAlignments(istream& in, Alignments* pMap) assert_eof(in); } -static void readAlignmentsFile(string path, Alignments* pMap) +static void +readAlignmentsFile(string path, Alignments* pMap) { if (opt::verbose > 0) cerr << "Reading `" << path << "'..." << endl; @@ -305,103 +319,118 @@ static void readAlignmentsFile(string path, Alignments* pMap) } /** Return the specified number formatted as a percent. */ -static string percent(size_t x, size_t n) +static string +percent(size_t x, size_t n) { ostringstream ss; ss << setw((int)log10(n) + 1) << x; if (x > 0) - ss << " " << setprecision(3) << (float)100*x/n << '%'; + ss << " " << setprecision(3) << (float)100 * x / n << '%'; return ss.str(); } /** Print statistics of the specified histogram. h not passed by - * reference because we want to make a copy + * reference because we want to make a copy **/ -static void printHistogramStats(Histogram h) +static void +printHistogramStats(Histogram h) { unsigned n_orig = h.size(); h.eraseNegative(); h.removeNoise(); h.removeOutliers(); h = h.trimFraction(0.0001); - cerr << "Stats mean: " << setprecision(4) << h.mean() << " " - "median: " << setprecision(4) << h.median() << " " - "sd: " << setprecision(4) << h.sd() << " " - "n: " << h.size() << " " - "min: " << h.minimum() << " " - "max: " << h.maximum() << " " - "ignored: " << n_orig - h.size() << '\n' - << h.barplot() << endl; + cerr << "Stats mean: " << setprecision(4) << h.mean() + << " " + "median: " + << setprecision(4) << h.median() + << " " + "sd: " + << setprecision(4) << h.sd() + << " " + "n: " + << h.size() + << " " + "min: " + << h.minimum() + << " " + "max: " + << h.maximum() + << " " + "ignored: " + << n_orig - h.size() << '\n' + << h.barplot() << endl; if (!opt::db.empty()) { - vals = make_vector() - << (int)round(h.mean()) - << h.median() - << (int)round(h.sd()) - << h.size() - << h.minimum() - << h.maximum() - << n_orig-h.size(); - - keys = make_vector() - << "mean" - << "median" - << "sd" - << "n" - << "min" - << "max" - << "ignored"; - - for (unsigned i=0; i() << (int)round(h.mean()) << h.median() << (int)round(h.sd()) + << h.size() << h.minimum() << h.maximum() << n_orig - h.size(); + + keys = make_vector() << "mean" + << "median" + << "sd" + << "n" + << "min" + << "max" + << "ignored"; + + for (unsigned i = 0; i < vals.size(); i++) addToDb(db, keys[i], vals[i]); } } -int main(int argc, char* const* argv) +int +main(int argc, char* const* argv) { opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'l': - arg >> opt::minAlign; - break; - case 's': arg >> opt::fragPath; break; - case 'h': arg >> opt::histPath; break; - case 'c': arg >> opt::covPath; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; - break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; - break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; - break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; - break; + case '?': + die = true; + break; + case 'l': + arg >> opt::minAlign; + break; + case 's': + arg >> opt::fragPath; + break; + case 'h': + arg >> opt::histPath; + break; + case 'c': + arg >> opt::covPath; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -411,18 +440,13 @@ int main(int argc, char* const* argv) } if (!opt::db.empty()) - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars - ); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); Alignments alignments(1); if (optind < argc) { - for_each(argv + optind, argv + argc, - bind2nd(ptr_fun(readAlignmentsFile), &alignments)); + for_each(argv + optind, argv + argc, [&alignments](const std::string& s) { + readAlignmentsFile(s, &alignments); + }); } else { if (opt::verbose > 0) cerr << "Reading from standard input..." << endl; @@ -435,8 +459,7 @@ int main(int argc, char* const* argv) // Print the unpaired alignments. if (opt::print_all) { - for (Alignments::iterator it = alignments.begin(); - it != alignments.end(); it++) { + for (Alignments::iterator it = alignments.begin(); it != alignments.end(); it++) { #if SAM_SEQ_QUAL SAMRecord& a0 = it->second; #else @@ -450,49 +473,52 @@ int main(int argc, char* const* argv) unsigned numRF = g_histogram.count(INT_MIN, 0); unsigned numFR = g_histogram.count(1, INT_MAX); - size_t sum = alignments.size() - + stats.bothUnaligned + stats.oneUnaligned - + numFR + numRF + stats.numFF - + stats.numDifferent; - cerr << - "Mateless " << percent(alignments.size(), sum) << "\n" - "Unaligned " << percent(stats.bothUnaligned, sum) << "\n" - "Singleton " << percent(stats.oneUnaligned, sum) << "\n" - "FR " << percent(numFR, sum) << "\n" - "RF " << percent(numRF, sum) << "\n" - "FF " << percent(stats.numFF, sum) << "\n" - "Different " << percent(stats.numDifferent, sum) << "\n" - "Total " << sum << endl; + size_t sum = alignments.size() + stats.bothUnaligned + stats.oneUnaligned + numFR + numRF + + stats.numFF + stats.numDifferent; + cerr << "Mateless " << percent(alignments.size(), sum) + << "\n" + "Unaligned " + << percent(stats.bothUnaligned, sum) + << "\n" + "Singleton " + << percent(stats.oneUnaligned, sum) + << "\n" + "FR " + << percent(numFR, sum) + << "\n" + "RF " + << percent(numRF, sum) + << "\n" + "FF " + << percent(stats.numFF, sum) + << "\n" + "Different " + << percent(stats.numDifferent, sum) + << "\n" + "Total " + << sum << endl; if (!opt::db.empty()) { - vals = make_vector() - << alignments.size() - << stats.bothUnaligned - << stats.oneUnaligned - << numFR - << numRF - << stats.numFF - << stats.numDifferent - << sum; - - keys = make_vector() - << "Mateless" - << "Unaligned" - << "Singleton" - << "FR" - << "RF" - << "FF" - << "Different" - << "Total"; - - for (unsigned i=0; i() << alignments.size() << stats.bothUnaligned << stats.oneUnaligned + << numFR << numRF << stats.numFF << stats.numDifferent << sum; + + keys = make_vector() << "Mateless" + << "Unaligned" + << "Singleton" + << "FR" + << "RF" + << "FF" + << "Different" + << "Total"; + + for (unsigned i = 0; i < vals.size(); i++) addToDb(db, keys[i], vals[i]); } if (alignments.size() == sum) { cerr << PROGRAM ": error: All reads are mateless. This " - "can happen when first and second read IDs do not match." - << endl; + "can happen when first and second read IDs do not match." + << endl; exit(EXIT_FAILURE); } @@ -525,9 +551,9 @@ int main(int argc, char* const* argv) if (stats.numFF > numFR && stats.numFF > numRF) { cerr << PROGRAM ": error: The mate pairs of this library are " - "oriented forward-forward (FF), which is not supported " - "by ABySS." - << endl; + "oriented forward-forward (FF), which is not supported " + "by ABySS." + << endl; exit(EXIT_FAILURE); } diff --git a/PathOverlap/PathOverlap.cpp b/PathOverlap/PathOverlap.cpp index 361e00277..531a7b009 100644 --- a/PathOverlap/PathOverlap.cpp +++ b/PathOverlap/PathOverlap.cpp @@ -1,27 +1,27 @@ -#include "config.h" #include "ContigID.h" #include "ContigPath.h" #include "ContigProperties.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "Functional.h" -#include "IOUtil.h" -#include "Uncompress.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" +#include "IOUtil.h" +#include "Uncompress.h" +#include "config.h" #include #include #include -#include // for strerror #include -#include -#include +#include // for strerror #include +#include #include +#include #include #include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; @@ -29,116 +29,124 @@ using namespace std; DB db; -static const char *VERSION_MESSAGE = -PROGRAM " (ABySS) " VERSION "\n" -"Written by Shaun Jackman and Tony Raymond.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; - -static const char *USAGE_MESSAGE = -"Usage: " PROGRAM " -k [OPTION]... ADJ PATH\n" -"Find paths that overlap. Either output the graph of overlapping\n" -"paths, assemble overlapping paths into larger paths, or trim the\n" -"overlapping paths.\n" -"\n" -" Arguments:\n" -"\n" -" ADJ contig adjacency graph\n" -" PATH sequences of contig IDs\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=N k-mer size\n" -" -g, --graph=FILE write the contig adjacency graph to FILE\n" -" -r, --repeats=FILE write repeat contigs to FILE\n" -" --overlap find overlapping paths [default]\n" -" --assemble assemble overlapping paths\n" -" --trim trim overlapping paths\n" -" --adj output the graph in ADJ format [default]\n" -" --asqg output the graph in ASQG format\n" -" --dot output the graph in GraphViz format\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for sqlite\n" -" --strain=NAME specify strain NAME for sqlite\n" -" --species=NAME specify species NAME for sqlite\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; +static const char* VERSION_MESSAGE = + PROGRAM " (ABySS) " VERSION "\n" + "Written by Shaun Jackman and Tony Raymond.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + +static const char* USAGE_MESSAGE = + "Usage: " PROGRAM " -k [OPTION]... ADJ PATH\n" + "Find paths that overlap. Either output the graph of overlapping\n" + "paths, assemble overlapping paths into larger paths, or trim the\n" + "overlapping paths.\n" + "\n" + " Arguments:\n" + "\n" + " ADJ contig adjacency graph\n" + " PATH sequences of contig IDs\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=N k-mer size\n" + " -g, --graph=FILE write the contig adjacency graph to FILE\n" + " -r, --repeats=FILE write repeat contigs to FILE\n" + " --overlap find overlapping paths [default]\n" + " --assemble assemble overlapping paths\n" + " --trim trim overlapping paths\n" + " --adj output the graph in ADJ format [default]\n" + " --asqg output the graph in ASQG format\n" + " --dot output the graph in GraphViz format\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for sqlite\n" + " --strain=NAME specify strain NAME for sqlite\n" + " --species=NAME specify species NAME for sqlite\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; - unsigned k; - - /** Output format. */ - int format; // used by ContigProperties - - /** Write the contig adjacency graph to this file. */ - static string graphPath; - - /** Output the IDs of contigs in overlaps to this file. */ - static string repeatContigs; - - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; - - /** Mode of operation. */ - enum { - /** Find overlapping paths, do not assemble. */ - OVERLAP, - /** Assemble overlapping paths. */ - ASSEMBLE, - /** Trim overlapping paths. */ - TRIM, - }; - static int mode; - - static int verbose; +string db; +dbVars metaVars; +unsigned k; + +/** Output format. */ +int format; // used by ContigProperties + +/** Write the contig adjacency graph to this file. */ +static string graphPath; + +/** Output the IDs of contigs in overlaps to this file. */ +static string repeatContigs; + +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; + +/** Mode of operation. */ +enum +{ + /** Find overlapping paths, do not assemble. */ + OVERLAP, + /** Assemble overlapping paths. */ + ASSEMBLE, + /** Trim overlapping paths. */ + TRIM, +}; +static int mode; + +static int verbose; } static const char* shortopts = "g:k:r:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; -//enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "graph", required_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, - { "assemble", no_argument, &opt::mode, opt::ASSEMBLE }, - { "overlap", no_argument, &opt::mode, opt::OVERLAP }, - { "trim", no_argument, &opt::mode, opt::TRIM }, - { "adj", no_argument, &opt::format, ADJ }, - { "asqg", no_argument, &opt::format, ASQG }, - { "dot", no_argument, &opt::format, DOT }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "repeats", required_argument, NULL, 'r' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES }; +// enum { OPT_HELP = 1, OPT_VERSION }; + +static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, + { "assemble", no_argument, &opt::mode, opt::ASSEMBLE }, + { "overlap", no_argument, &opt::mode, opt::OVERLAP }, + { "trim", no_argument, &opt::mode, opt::TRIM }, + { "adj", no_argument, &opt::format, ADJ }, + { "asqg", no_argument, &opt::format, ASQG }, + { "dot", no_argument, &opt::format, DOT }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "repeats", required_argument, NULL, 'r' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, + { NULL, 0, NULL, 0 } }; /** A vertex of the overlap graph. */ -struct Vertex { +struct Vertex +{ unsigned id; bool sense; @@ -146,23 +154,20 @@ struct Vertex { static unsigned s_offset; Vertex(unsigned id, bool sense) - : id(id), sense(sense) { } + : id(id) + , sense(sense) + {} - bool operator ==(const Vertex& v) const - { - return id == v.id && sense == v.sense; - } + bool operator==(const Vertex& v) const { return id == v.id && sense == v.sense; } - ContigNode descriptor() const - { - return ContigNode(s_offset + id, sense); - } + ContigNode descriptor() const { return ContigNode(s_offset + id, sense); } }; unsigned Vertex::s_offset; /** An alignment of two overlapping contigs. */ -struct Overlap { +struct Overlap +{ Vertex source; Vertex target; @@ -172,10 +177,12 @@ struct Overlap { /** Overlap measured in bp. */ int distance; - Overlap(const Vertex& source, const Vertex& target, - unsigned overlap, int distance) - : source(source), target(target), - overlap(overlap), distance(distance) { } + Overlap(const Vertex& source, const Vertex& target, unsigned overlap, int distance) + : source(source) + , target(target) + , overlap(overlap) + , distance(distance) + {} }; /** The contig IDs that have been removed from paths. */ @@ -188,13 +195,15 @@ typedef ContigGraph Graph; typedef vector Paths; /** Return whether this vertex is a path or a contig. */ -static bool isPath(const ContigNode& u) +static bool +isPath(const ContigNode& u) { return u.id() >= Vertex::s_offset; } /** Return a path, complemented if necessary. */ -static ContigPath getPath(const Paths& paths, const ContigNode& u) +static ContigPath +getPath(const Paths& paths, const ContigNode& u) { if (isPath(u)) { unsigned i = u.id() - Vertex::s_offset; @@ -209,8 +218,8 @@ static ContigPath getPath(const Paths& paths, const ContigNode& u) * @param[out] pathIDs the path IDs * @return the paths */ -static Paths readPaths(Graph& g, - const string& inPath, vector& pathIDs) +static Paths +readPaths(Graph& g, const string& inPath, vector& pathIDs) { typedef graph_traits::vertex_descriptor V; @@ -245,42 +254,45 @@ typedef multimap SeedMap; /** Index the first and last contig of each path to facilitate finding * overlaps between paths. */ -static SeedMap makeSeedMap(const Paths& paths) +static SeedMap +makeSeedMap(const Paths& paths) { SeedMap seedMap; - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { if (it->empty()) continue; assert(!it->front().ambiguous()); - seedMap.insert(make_pair(it->front(), - Vertex(it - paths.begin(), false))); + seedMap.insert(make_pair(it->front(), Vertex(it - paths.begin(), false))); assert(!it->back().ambiguous()); - seedMap.insert(make_pair(it->back() ^ 1, - Vertex(it - paths.begin(), true))); + seedMap.insert(make_pair(it->back() ^ 1, Vertex(it - paths.begin(), true))); } return seedMap; } /** Check whether path starts with the sequence [first, last). */ -static bool startsWith(ContigPath path, bool rc, - ContigPath::const_iterator first, - ContigPath::const_iterator last) +static bool +startsWith( + ContigPath path, + bool rc, + ContigPath::const_iterator first, + ContigPath::const_iterator last) { if (rc) reverseComplement(path.begin(), path.end()); assert(*first == path.front()); assert(first < last); - return unsigned(last - first) > path.size() ? false - : equal(first, last, path.begin()); + return unsigned(last - first) > path.size() ? false : equal(first, last, path.begin()); } /** Check whether path starts with the sequence [first, last). */ -static unsigned findOverlap(const Graph& g, - const Paths& paths, - ContigPath::const_iterator first, - ContigPath::const_iterator last, - const Vertex& v, int &distance) +static unsigned +findOverlap( + const Graph& g, + const Paths& paths, + ContigPath::const_iterator first, + ContigPath::const_iterator last, + const Vertex& v, + int& distance) { if (!startsWith(paths[v.id], v.sense, first, last)) return 0; @@ -291,9 +303,13 @@ static unsigned findOverlap(const Graph& g, typedef vector Overlaps; /** Find every path that overlaps with the specified path. */ -static void findOverlaps(const Graph& g, - const Paths& paths, const SeedMap& seedMap, - const Vertex& v, Overlaps& overlaps) +static void +findOverlaps( + const Graph& g, + const Paths& paths, + const SeedMap& seedMap, + const Vertex& v, + Overlaps& overlaps) { ContigPath rc; if (v.sense) { @@ -302,36 +318,30 @@ static void findOverlaps(const Graph& g, } const ContigPath& path = v.sense ? rc : paths[v.id]; - for (ContigPath::const_iterator it = path.begin(); - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) { if (it->ambiguous()) continue; - pair - range = seedMap.equal_range(*it); - for (SeedMap::const_iterator seed = range.first; - seed != range.second; ++seed) { + pair range = seedMap.equal_range(*it); + for (SeedMap::const_iterator seed = range.first; seed != range.second; ++seed) { if (v == seed->second) continue; int distance = 0; - unsigned overlap = findOverlap(g, paths, it, path.end(), - seed->second, distance); + unsigned overlap = findOverlap(g, paths, it, path.end(), seed->second, distance); if (overlap > 0) - overlaps.push_back(Overlap(v, seed->second, - overlap, distance)); - + overlaps.push_back(Overlap(v, seed->second, overlap, distance)); } } } /** Find every pair of overlapping paths. */ -static Overlaps findOverlaps(const Graph& g, const Paths& paths) +static Overlaps +findOverlaps(const Graph& g, const Paths& paths) { SeedMap seedMap = makeSeedMap(paths); Overlaps overlaps; - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { unsigned i = it - paths.begin(); findOverlaps(g, paths, seedMap, Vertex(i, false), overlaps); findOverlaps(g, paths, seedMap, Vertex(i, true), overlaps); @@ -340,9 +350,8 @@ static Overlaps findOverlaps(const Graph& g, const Paths& paths) } /** Record the trimmed contigs. */ -static void recordTrimmedContigs( - ContigPath::const_iterator first, - ContigPath::const_iterator last) +static void +recordTrimmedContigs(ContigPath::const_iterator first, ContigPath::const_iterator last) { for (ContigPath::const_iterator it = first; it != last; ++it) if (!it->ambiguous()) @@ -350,7 +359,8 @@ static void recordTrimmedContigs( } /** Remove ambiguous contigs from the ends of the path. */ -static void removeAmbiguousContigs(ContigPath& path) +static void +removeAmbiguousContigs(ContigPath& path) { if (!path.empty() && path.back().ambiguous()) path.erase(path.end() - 1); @@ -359,8 +369,8 @@ static void removeAmbiguousContigs(ContigPath& path) } /** Remove the overlapping portion of the specified contig. */ -static void removeContigs(ContigPath& path, - unsigned first, unsigned last) +static void +removeContigs(ContigPath& path, unsigned first, unsigned last) { assert(first <= path.size()); assert(last <= path.size()); @@ -377,14 +387,14 @@ static void removeContigs(ContigPath& path, } /** Find the largest overlap for each contig and remove it. */ -static void trimOverlaps(Paths& paths, const Overlaps& overlaps) +static void +trimOverlaps(Paths& paths, const Overlaps& overlaps) { vector removed[2]; removed[0].resize(paths.size()); removed[1].resize(paths.size()); - for (Overlaps::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) { + for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) { unsigned& a = removed[!it->source.sense][it->source.id]; unsigned& b = removed[it->target.sense][it->target.id]; a = max(a, it->overlap); @@ -392,41 +402,41 @@ static void trimOverlaps(Paths& paths, const Overlaps& overlaps) } for (Paths::iterator it = paths.begin(); it != paths.end(); ++it) - removeContigs(*it, removed[0][it - paths.begin()], - it->size() - removed[1][it - paths.begin()]); + removeContigs( + *it, removed[0][it - paths.begin()], it->size() - removed[1][it - paths.begin()]); } /** Trim the ends of paths that overlap another path. */ -static void trimOverlaps(const Graph& g, Paths& paths) +static void +trimOverlaps(const Graph& g, Paths& paths) { - for (Overlaps overlaps = findOverlaps(g, paths); - !overlaps.empty(); - overlaps = findOverlaps(g, paths)) { + for (Overlaps overlaps = findOverlaps(g, paths); !overlaps.empty(); + overlaps = findOverlaps(g, paths)) { cerr << "Found " << overlaps.size() / 2 << " overlaps.\n"; trimOverlaps(paths, overlaps); } } -static inline -ContigProperties get(vertex_bundle_t, const Graph& g, ContigNode u) +static inline ContigProperties +get(vertex_bundle_t, const Graph& g, ContigNode u) { - return u.ambiguous() - ? ContigProperties(u.length() + opt::k - 1, 0) - : g[u]; + return u.ambiguous() ? ContigProperties(u.length() + opt::k - 1, 0) : g[u]; } /** Add the path overlap edges to the specified graph. */ -static void addPathOverlapEdges(Graph& g, - const Paths& paths, const vector& pathIDs, - const Overlaps& overlaps) +static void +addPathOverlapEdges( + Graph& g, + const Paths& paths, + const vector& pathIDs, + const Overlaps& overlaps) { typedef graph_traits::vertex_descriptor V; const bool allowParallelEdge = opt::mode == opt::ASSEMBLE; // Add the path vertices. g_contigNames.unlock(); - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { const ContigPath& path = *it; const string& id = pathIDs[it - paths.begin()]; if (!path.empty()) { @@ -437,21 +447,19 @@ static void addPathOverlapEdges(Graph& g, g_contigNames.lock(); // Remove the single-end contigs that are in paths. - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) - remove_vertex_if(g, it->begin(), it->end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) + remove_vertex_if( + g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); }); // Add the path edges. - for (Overlaps::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) { + for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) { V u = it->source.descriptor(); V v = it->target.descriptor(); if (allowParallelEdge || !edge(u, v, g).second) add_edge(u, v, it->distance, static_cast(g)); else if (opt::verbose > 0) - cerr << "ambiguous overlap: " << get(vertex_name, g, u) - << " -> " << get(vertex_name, g, v) << '\n'; + cerr << "ambiguous overlap: " << get(vertex_name, g, u) << " -> " + << get(vertex_name, g, v) << '\n'; } } @@ -462,14 +470,15 @@ typedef graph_traits::edge_descriptor edge_descriptor; typedef map OverlapMap; /** Return the number of contigs by which the two paths overlap. */ -static unsigned getOverlap(const OverlapMap& pmap, - graph_traits::vertex_descriptor u, - graph_traits::vertex_descriptor v) +static unsigned +getOverlap( + const OverlapMap& pmap, + graph_traits::vertex_descriptor u, + graph_traits::vertex_descriptor v) { if (isPath(u) && isPath(v)) { // Both vertices are paths. - OverlapMap::const_iterator it = pmap.find( - edge_descriptor(u, v)); + OverlapMap::const_iterator it = pmap.find(edge_descriptor(u, v)); return it == pmap.end() ? 0 : it->second; } else { // One of the two vertices is a contig. @@ -478,21 +487,19 @@ static unsigned getOverlap(const OverlapMap& pmap, } /** Merge a sequence of overlapping paths. */ -static ContigPath mergePaths(const Paths& paths, - const OverlapMap& overlaps, const ContigPath& merge) +static ContigPath +mergePaths(const Paths& paths, const OverlapMap& overlaps, const ContigPath& merge) { assert(!merge.empty()); ContigNode u = merge.front(); ContigPath path(getPath(paths, u)); - for (ContigPath::const_iterator it = merge.begin() + 1; - it != merge.end(); ++it) { + for (ContigPath::const_iterator it = merge.begin() + 1; it != merge.end(); ++it) { ContigNode v = *it; ContigPath vpath(getPath(paths, v)); unsigned overlap = getOverlap(overlaps, u, v); assert(path.size() > overlap); assert(vpath.size() > overlap); - assert(equal(path.end() - overlap, path.end(), - vpath.begin())); + assert(equal(path.end() - overlap, path.end(), vpath.begin())); path.insert(path.end(), vpath.begin() + overlap, vpath.end()); u = v; } @@ -500,18 +507,21 @@ static ContigPath mergePaths(const Paths& paths, } /** Return true if the edge e is a path overlap. */ -struct IsPathOverlap : unary_function { - IsPathOverlap(const Graph& g, const OverlapMap& pmap, - const IsPositive& pred) - : m_g(g), m_pmap(pmap), m_isPositive(pred) { } +struct IsPathOverlap : unary_function +{ + IsPathOverlap(const Graph& g, const OverlapMap& pmap, const IsPositive& pred) + : m_g(g) + , m_pmap(pmap) + , m_isPositive(pred) + {} bool operator()(edge_descriptor e) const { bool stranded = true; if (opt::ss) stranded = m_isPositive(e); - return stranded && - getOverlap(m_pmap, source(e, m_g), target(e, m_g)); + return stranded && getOverlap(m_pmap, source(e, m_g), target(e, m_g)); } + private: const Graph& m_g; const OverlapMap& m_pmap; @@ -519,8 +529,8 @@ struct IsPathOverlap : unary_function { }; /** Assemble overlapping paths. */ -static void assembleOverlappingPaths(Graph& g, - Paths& paths, vector& pathIDs) +static void +assembleOverlappingPaths(Graph& g, Paths& paths, vector& pathIDs) { if (paths.empty()) return; @@ -531,25 +541,19 @@ static void assembleOverlappingPaths(Graph& g, // Create a property map of path overlaps. OverlapMap overlapMap; - for (Overlaps::const_iterator it = overlaps.begin(); - it != overlaps.end(); ++it) + for (Overlaps::const_iterator it = overlaps.begin(); it != overlaps.end(); ++it) overlapMap.insert(OverlapMap::value_type( - OverlapMap::key_type( - it->source.descriptor(), - it->target.descriptor()), - it->overlap)); + OverlapMap::key_type(it->source.descriptor(), it->target.descriptor()), it->overlap)); // Assemble unambiguously overlapping paths. Paths merges; - assemble_if(g, back_inserter(merges), - IsPathOverlap(g, overlapMap, IsPositive(g))); + assemble_if(g, back_inserter(merges), IsPathOverlap(g, overlapMap, IsPositive(g))); // Merge overlapping paths. g_contigNames.unlock(); assert(!pathIDs.empty()); setNextContigName(pathIDs.back()); - for (Paths::const_iterator it = merges.begin(); - it != merges.end(); ++it) { + for (Paths::const_iterator it = merges.begin(); it != merges.end(); ++it) { string name = createContigName(); if (opt::verbose > 0) cerr << name << '\t' << *it << '\n'; @@ -559,8 +563,7 @@ static void assembleOverlappingPaths(Graph& g, paths.push_back(mergePaths(paths, overlapMap, *it)); // Remove the merged paths. - for (ContigPath::const_iterator it2 = it->begin(); - it2 != it->end(); ++it2) { + for (ContigPath::const_iterator it2 = it->begin(); it2 != it->end(); ++it2) { if (isPath(*it2)) paths[it2->id() - Vertex::s_offset].clear(); } @@ -568,13 +571,14 @@ static void assembleOverlappingPaths(Graph& g, g_contigNames.lock(); } -int main(int argc, char** argv) +int +main(int argc, char** argv) { string commandLine; { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } @@ -583,33 +587,45 @@ int main(int argc, char** argv) opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'g': arg >> opt::graphPath; break; - case 'k': arg >> opt::k; break; - case 'r': arg >> opt::repeatContigs; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_DB: - arg >> opt::db; break; - case OPT_LIBRARY: - arg >> opt::metaVars[0]; break; - case OPT_STRAIN: - arg >> opt::metaVars[1]; break; - case OPT_SPECIES: - arg >> opt::metaVars[2]; break; + case '?': + die = true; + break; + case 'g': + arg >> opt::graphPath; + break; + case 'k': + arg >> opt::k; + break; + case 'r': + arg >> opt::repeatContigs; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_DB: + arg >> opt::db; + break; + case OPT_LIBRARY: + arg >> opt::metaVars[0]; + break; + case OPT_STRAIN: + arg >> opt::metaVars[1]; + break; + case OPT_SPECIES: + arg >> opt::metaVars[2]; + break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } @@ -628,12 +644,11 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } - const char *adjPath = argv[optind++]; + const char* adjPath = argv[optind++]; if (opt::verbose > 0) cerr << "Reading `" << adjPath << "'..." << endl; ifstream fin(adjPath); @@ -647,37 +662,35 @@ int main(int argc, char** argv) Paths paths = readPaths(g, pathsFile, pathIDs); switch (opt::mode) { - case opt::OVERLAP: + case opt::OVERLAP: // Find overlapping paths, do not assemble. - addPathOverlapEdges(g, paths, pathIDs, - findOverlaps(g, paths)); + addPathOverlapEdges(g, paths, pathIDs, findOverlaps(g, paths)); paths.clear(); if (opt::graphPath.empty()) opt::graphPath = "-"; break; - case opt::ASSEMBLE: + case opt::ASSEMBLE: // Assemble overlapping paths. assembleOverlappingPaths(g, paths, pathIDs); break; - case opt::TRIM: + case opt::TRIM: // Trim overlapping paths. trimOverlaps(g, paths); // Remove paths consisting of a single contig. - for_each_if(paths.begin(), paths.end(), - mem_fun_ref(&ContigPath::clear), - compose1( - bind2nd(equal_to(), 1), - mem_fun_ref(&ContigPath::size))); + for_each_if( + paths.begin(), + paths.end(), + [](ContigPath& c) { return c.clear(); }, + [](const ContigPath& c) { return c.size() == 1; }); // Add the paths to the graph. addPathOverlapEdges(g, paths, pathIDs, Overlaps()); break; } // Output the paths. - for (Paths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (Paths::const_iterator it = paths.begin(); it != paths.end(); ++it) { if (it->empty()) continue; assert(it->size() != 1); @@ -688,8 +701,7 @@ int main(int argc, char** argv) // Output the graph. if (!opt::graphPath.empty()) { ofstream fout; - ostream& out = opt::graphPath == "-" ? cout - : (fout.open(opt::graphPath.c_str()), fout); + ostream& out = opt::graphPath == "-" ? cout : (fout.open(opt::graphPath.c_str()), fout); assert_good(out, opt::graphPath); write_graph(out, g, PROGRAM, commandLine); assert_good(out, opt::graphPath); @@ -699,24 +711,18 @@ int main(int argc, char** argv) if (!opt::repeatContigs.empty()) { sort(s_trimmedContigs.begin(), s_trimmedContigs.end()); s_trimmedContigs.erase( - unique(s_trimmedContigs.begin(), - s_trimmedContigs.end()), s_trimmedContigs.end()); + unique(s_trimmedContigs.begin(), s_trimmedContigs.end()), s_trimmedContigs.end()); ofstream out(opt::repeatContigs.c_str()); assert_good(out, opt::repeatContigs); - for (vector::const_iterator it - = s_trimmedContigs.begin(); - it != s_trimmedContigs.end(); ++it) + for (vector::const_iterator it = s_trimmedContigs.begin(); + it != s_trimmedContigs.end(); + ++it) out << get(g_contigNames, *it) << '\n'; assert_good(out, opt::repeatContigs); } if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); addToDb(db, "SS", opt::ss); addToDb(db, "K", opt::k); } diff --git a/PopBubbles/PopBubbles.cpp b/PopBubbles/PopBubbles.cpp index 1b511bdc2..c0e6e5a11 100644 --- a/PopBubbles/PopBubbles.cpp +++ b/PopBubbles/PopBubbles.cpp @@ -3,40 +3,40 @@ * Written by Shaun Jackman . */ -#include "config.h" +#include "Graph/PopBubbles.h" #include "Common/Options.h" #include "ConstString.h" #include "ContigPath.h" #include "ContigProperties.h" #include "FastaReader.h" -#include "IOUtil.h" -#include "Sequence.h" -#include "Uncompress.h" -#include "alignGlobal.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" #include "Graph/DepthFirstSearch.h" #include "Graph/DirectedGraph.h" #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" -#include "Graph/PopBubbles.h" +#include "IOUtil.h" +#include "Sequence.h" +#include "Uncompress.h" +#include "alignGlobal.h" +#include "config.h" +#include #include #include -#include #include // for UINT_MAX #include #include #include -#include #include #include +#include #include #include #include #include #include #if _OPENMP -# include +#include #endif using namespace std; @@ -46,126 +46,133 @@ using boost::tie; #define PROGRAM "PopBubbles" static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... FASTA ADJ\n" -"Identify and pop simple bubbles.\n" -"\n" -" Arguments:\n" -"\n" -" FASTA contigs in FASTA format\n" -" ADJ contig adjacency graph\n" -"\n" -" Options:\n" -"\n" -" -k, --kmer=N k-mer size\n" -" -a, --branches=N maximum number of branches, default: 2\n" -" -b, --bubble-length=N pop bubbles shorter than N bp\n" -" default is 10000\n" -" -p, --identity=REAL minimum identity, default: 0.9\n" -" -c, --coverage=REAL remove contigs with mean k-mer coverage\n" -" less than this threshold [0]\n" -" --scaffold scaffold over bubbles that have\n" -" insufficient identity\n" -" --no-scaffold disable scaffolding [default]\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -g, --graph=FILE write the contig adjacency graph to FILE\n" -" --adj output the graph in ADJ format [default]\n" -" --asqg output the graph in ASQG format\n" -" --dot output the graph in GraphViz format\n" -" --gfa output the graph in GFA1 format\n" -" --gfa1 output the graph in GFA1 format\n" -" --gfa2 output the graph in GFA2 format\n" -" --gv output the graph in GraphViz format\n" -" --sam output the graph in SAM format\n" -" --bubble-graph output a graph of the bubbles\n" -" -j, --threads=N use N parallel threads [1]\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... FASTA ADJ\n" + "Identify and pop simple bubbles.\n" + "\n" + " Arguments:\n" + "\n" + " FASTA contigs in FASTA format\n" + " ADJ contig adjacency graph\n" + "\n" + " Options:\n" + "\n" + " -k, --kmer=N k-mer size\n" + " -a, --branches=N maximum number of branches, default: 2\n" + " -b, --bubble-length=N pop bubbles shorter than N bp\n" + " default is 10000\n" + " -p, --identity=REAL minimum identity, default: 0.9\n" + " -c, --coverage=REAL remove contigs with mean k-mer coverage\n" + " less than this threshold [0]\n" + " --scaffold scaffold over bubbles that have\n" + " insufficient identity\n" + " --no-scaffold disable scaffolding [default]\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -g, --graph=FILE write the contig adjacency graph to FILE\n" + " --adj output the graph in ADJ format [default]\n" + " --asqg output the graph in ASQG format\n" + " --dot output the graph in GraphViz format\n" + " --gfa output the graph in GFA1 format\n" + " --gfa1 output the graph in GFA1 format\n" + " --gfa2 output the graph in GFA2 format\n" + " --gv output the graph in GraphViz format\n" + " --sam output the graph in SAM format\n" + " --bubble-graph output a graph of the bubbles\n" + " -j, --threads=N use N parallel threads [1]\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Maximum number of branches. */ - static unsigned maxBranches = 2; +/** Maximum number of branches. */ +static unsigned maxBranches = 2; - /** Pop bubbles shorter than this threshold. */ - static unsigned maxLength = 10000; +/** Pop bubbles shorter than this threshold. */ +static unsigned maxLength = 10000; - /** Minimum identity. */ - static float identity = 0.9; +/** Minimum identity. */ +static float identity = 0.9; - /** Minimum mean k-mer coverage. */ - static float minCoverage; +/** Minimum mean k-mer coverage. */ +static float minCoverage; - /** Scaffold over bubbles that have insufficient identity. */ - static int scaffold; +/** Scaffold over bubbles that have insufficient identity. */ +static int scaffold; - /** Write the contig adjacency graph to this file. */ - static string graphPath; +/** Write the contig adjacency graph to this file. */ +static string graphPath; - /** Output a graph of the bubbles. */ - static int bubbleGraph; +/** Output a graph of the bubbles. */ +static int bubbleGraph; - int format; // used by ContigProperties +int format; // used by ContigProperties - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Number of threads. */ - static int threads = 1; +/** Number of threads. */ +static int threads = 1; } static const char shortopts[] = "a:b:c:g:j:k:p:v"; -enum { OPT_HELP = 1, OPT_VERSION }; - -static const struct option longopts[] = { - { "branches", required_argument, NULL, 'a' }, - { "bubble-length", required_argument, NULL, 'b' }, - { "coverage", required_argument, NULL, 'c' }, - { "bubble-graph", no_argument, &opt::bubbleGraph, 1, }, - { "graph", required_argument, NULL, 'g' }, - { "adj", no_argument, &opt::format, ADJ }, - { "asqg", no_argument, &opt::format, ASQG }, - { "dot", no_argument, &opt::format, DOT }, - { "gfa", no_argument, &opt::format, GFA1 }, - { "gfa1", no_argument, &opt::format, GFA1 }, - { "gfa2", no_argument, &opt::format, GFA2 }, - { "gv", no_argument, &opt::format, DOT }, - { "sam", no_argument, &opt::format, SAM }, - { "kmer", required_argument, NULL, 'k' }, - { "identity", required_argument, NULL, 'p' }, - { "scaffold", no_argument, &opt::scaffold, 1}, - { "no-scaffold", no_argument, &opt::scaffold, 0}, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "threads", required_argument, NULL, 'j' }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { NULL, 0, NULL, 0 } +enum +{ + OPT_HELP = 1, + OPT_VERSION }; +static const struct option longopts[] = { { "branches", required_argument, NULL, 'a' }, + { "bubble-length", required_argument, NULL, 'b' }, + { "coverage", required_argument, NULL, 'c' }, + { + "bubble-graph", + no_argument, + &opt::bubbleGraph, + 1, + }, + { "graph", required_argument, NULL, 'g' }, + { "adj", no_argument, &opt::format, ADJ }, + { "asqg", no_argument, &opt::format, ASQG }, + { "dot", no_argument, &opt::format, DOT }, + { "gfa", no_argument, &opt::format, GFA1 }, + { "gfa1", no_argument, &opt::format, GFA1 }, + { "gfa2", no_argument, &opt::format, GFA2 }, + { "gv", no_argument, &opt::format, DOT }, + { "sam", no_argument, &opt::format, SAM }, + { "kmer", required_argument, NULL, 'k' }, + { "identity", required_argument, NULL, 'p' }, + { "scaffold", no_argument, &opt::scaffold, 1 }, + { "no-scaffold", no_argument, &opt::scaffold, 0 }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "threads", required_argument, NULL, 'j' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { NULL, 0, NULL, 0 } }; + /** Popped branches. */ static vector g_popped; /** Contig adjacency graph. */ -typedef ContigGraph > Graph; +typedef ContigGraph> Graph; typedef Graph::vertex_descriptor vertex_descriptor; typedef Graph::adjacency_iterator adjacency_iterator; /** Return the distance from vertex u to v. */ -static int getDistance(const Graph& g, - vertex_descriptor u, vertex_descriptor v) +static int +getDistance(const Graph& g, vertex_descriptor u, vertex_descriptor v) { typedef graph_traits::edge_descriptor edge_descriptor; pair e = edge(u, v, g); @@ -173,9 +180,12 @@ static int getDistance(const Graph& g, return g[e.first].distance; } -struct CompareCoverage { +struct CompareCoverage +{ const Graph& g; - CompareCoverage(const Graph& g) : g(g) { } + CompareCoverage(const Graph& g) + : g(g) + {} bool operator()(vertex_descriptor u, vertex_descriptor v) { return g[u].coverage > g[v].coverage; @@ -183,33 +193,33 @@ struct CompareCoverage { }; /** Pop the bubble between vertices v and tail. */ -static void popBubble(Graph& g, - vertex_descriptor v, vertex_descriptor tail) +static void +popBubble(Graph& g, vertex_descriptor v, vertex_descriptor tail) { unsigned nbranches = g.out_degree(v); assert(nbranches > 1); assert(nbranches == g.in_degree(tail)); vector sorted(nbranches); - pair - adj = g.adjacent_vertices(v); + pair adj = g.adjacent_vertices(v); copy(adj.first, adj.second, sorted.begin()); sort(sorted.begin(), sorted.end(), CompareCoverage(g)); if (opt::bubbleGraph) #pragma omp critical(cout) { cout << '"' << get(vertex_name, g, v) << "\" -> {"; - for (vector::const_iterator - it = sorted.begin(); it != sorted.end(); ++it) + for (vector::const_iterator it = sorted.begin(); it != sorted.end(); + ++it) cout << " \"" << get(vertex_name, g, *it) << '"'; cout << " } -> \"" << get(vertex_name, g, tail) << "\"\n"; } #pragma omp critical(g_popped) - transform(sorted.begin() + 1, sorted.end(), - back_inserter(g_popped), - mem_fun_ref(&ContigNode::contigIndex)); + transform(sorted.begin() + 1, sorted.end(), back_inserter(g_popped), [](const ContigNode& c) { + return c.contigIndex(); + }); } -static struct { +static struct +{ unsigned bubbles; unsigned popped; unsigned scaffold; @@ -224,7 +234,8 @@ typedef vector Contigs; static Contigs g_contigs; /** Return the sequence of vertex u. */ -static string getSequence(const Graph* g, vertex_descriptor u) +static string +getSequence(const Graph* g, vertex_descriptor u) { size_t i = get(vertex_contig_index, *g, u); assert(i < g_contigs.size()); @@ -233,7 +244,8 @@ static string getSequence(const Graph* g, vertex_descriptor u) } /** Return the length of vertex v. */ -static unsigned getLength(const Graph* g, vertex_descriptor v) +static unsigned +getLength(const Graph* g, vertex_descriptor v) { return (*g)[v].length; } @@ -243,38 +255,35 @@ static unsigned getLength(const Graph* g, vertex_descriptor v) * @param v the vertex to the right of the bubble * @return the identity of the global alignment */ -template -static float getAlignmentIdentity(const Graph& g, - vertex_descriptor t, vertex_descriptor v, - It first, It last) +template +static float +getAlignmentIdentity(const Graph& g, vertex_descriptor t, vertex_descriptor v, It first, It last) { unsigned nbranches = distance(first, last); vector inDists(nbranches); - transform(first, last, inDists.begin(), - boost::lambda::bind(getDistance, boost::cref(g), t, _1)); + transform( + first, last, inDists.begin(), boost::lambda::bind(getDistance, boost::cref(g), t, _1)); vector outDists(nbranches); - transform(first, last, outDists.begin(), - boost::lambda::bind(getDistance, boost::cref(g), _1, v)); + transform( + first, last, outDists.begin(), boost::lambda::bind(getDistance, boost::cref(g), _1, v)); vector insertLens(nbranches); - transform(first, last, insertLens.begin(), - boost::lambda::bind(getDistance, boost::cref(g), t, _1) - + boost::lambda::bind(getLength, &g, _1) - + boost::lambda::bind(getDistance, boost::cref(g), _1, v)); - - int max_in_overlap = -(*min_element(inDists.begin(), - inDists.end())); + transform( + first, + last, + insertLens.begin(), + boost::lambda::bind(getDistance, boost::cref(g), t, _1) + + boost::lambda::bind(getLength, &g, _1) + + boost::lambda::bind(getDistance, boost::cref(g), _1, v)); + + int max_in_overlap = -(*min_element(inDists.begin(), inDists.end())); assert(max_in_overlap >= 0); - int max_out_overlap = -(*min_element(outDists.begin(), - outDists.end())); + int max_out_overlap = -(*min_element(outDists.begin(), outDists.end())); assert(max_out_overlap >= 0); - int min_insert_len = *min_element(insertLens.begin(), - insertLens.end()); - int max_insert_len = *max_element(insertLens.begin(), - insertLens.end()); - - float max_identity = - (float)(min_insert_len + max_in_overlap + max_out_overlap) / - (max_insert_len + max_in_overlap + max_out_overlap); + int min_insert_len = *min_element(insertLens.begin(), insertLens.end()); + int max_insert_len = *max_element(insertLens.begin(), insertLens.end()); + + float max_identity = (float)(min_insert_len + max_in_overlap + max_out_overlap) / + (max_insert_len + max_in_overlap + max_out_overlap); if (min_insert_len <= 0 || max_identity < opt::identity) return max_identity; @@ -291,13 +300,14 @@ static float getAlignmentIdentity(const Graph& g, unsigned matches, consensusSize; tie(matches, consensusSize) = align(seqs); return (float)(matches + max_in_overlap + max_out_overlap) / - (consensusSize + max_in_overlap + max_out_overlap); + (consensusSize + max_in_overlap + max_out_overlap); } /** Pop the specified bubble if it is a simple bubble. * @return whether the bubble is popped */ -static bool popSimpleBubble(Graph* pg, vertex_descriptor v) +static bool +popSimpleBubble(Graph* pg, vertex_descriptor v) { Graph& g = *pg; unsigned nbranches = g.out_degree(v); @@ -310,15 +320,14 @@ static bool popSimpleBubble(Graph* pg, vertex_descriptor v) } vertex_descriptor tail = *g.adjacent_vertices(v1).first; if (v == get(vertex_complement, g, tail) // Palindrome - || g.in_degree(tail) != nbranches) { + || g.in_degree(tail) != nbranches) { #pragma omp atomic g_count.notSimple++; return false; } // Check that every branch is simple and ends at the same node. - pair - adj = g.adjacent_vertices(v); + pair adj = g.adjacent_vertices(v); for (adjacency_iterator it = adj.first; it != adj.second; ++it) { if (g.out_degree(*it) != 1 || g.in_degree(*it) != 1) { #pragma omp atomic @@ -337,8 +346,7 @@ static bool popSimpleBubble(Graph* pg, vertex_descriptor v) #pragma omp critical(cerr) { cerr << "\n* " << get(vertex_name, g, v) << " ->"; - for (adjacency_iterator it = adj.first; - it != adj.second; ++it) + for (adjacency_iterator it = adj.first; it != adj.second; ++it) cerr << ' ' << get(vertex_name, g, *it); cerr << " -> " << get(vertex_name, g, tail) << '\n'; } @@ -354,8 +362,9 @@ static bool popSimpleBubble(Graph* pg, vertex_descriptor v) } vector lengths(nbranches); - transform(adj.first, adj.second, lengths.begin(), - bind1st(ptr_fun(getLength), &g)); + transform(adj.first, adj.second, lengths.begin(), [&g](const ContigNode& c) { + return getLength(&g, c); + }); unsigned minLength = *min_element(lengths.begin(), lengths.end()); unsigned maxLength = *max_element(lengths.begin(), lengths.end()); if (maxLength >= opt::maxLength) { @@ -364,18 +373,17 @@ static bool popSimpleBubble(Graph* pg, vertex_descriptor v) g_count.tooLong++; if (opt::verbose > 1) #pragma omp critical(cerr) - cerr << minLength << '\t' << maxLength - << "\t0\t(too long)\n"; + cerr << minLength << '\t' << maxLength << "\t0\t(too long)\n"; return false; } - float identity = opt::identity == 0 ? 0 - : getAlignmentIdentity(g, v, tail, adj.first, adj.second); + float identity = + opt::identity == 0 ? 0 : getAlignmentIdentity(g, v, tail, adj.first, adj.second); bool dissimilar = identity < opt::identity; if (opt::verbose > 1) #pragma omp critical(cerr) cerr << minLength << '\t' << maxLength << '\t' << identity - << (dissimilar ? "\t(dissimilar)" : "") << '\n'; + << (dissimilar ? "\t(dissimilar)" : "") << '\n'; if (dissimilar) { // Insufficient identity. #pragma omp atomic @@ -390,14 +398,14 @@ static bool popSimpleBubble(Graph* pg, vertex_descriptor v) } /** Add distances to a path. */ -static ContigPath addDistance(const Graph& g, const ContigPath& path) +static ContigPath +addDistance(const Graph& g, const ContigPath& path) { ContigPath out; out.reserve(path.size()); ContigNode u = path.front(); out.push_back(u); - for (ContigPath::const_iterator it = path.begin() + 1; - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) { ContigNode v = *it; int distance = getDistance(g, u, v); if (distance >= 0) { @@ -413,7 +421,8 @@ static ContigPath addDistance(const Graph& g, const ContigPath& path) } /** Return the length of the longest path through the bubble. */ -static int longestPath(const Graph& g, const Bubble& topo) +static int +longestPath(const Graph& g, const Bubble& topo) { typedef graph_traits::edge_descriptor E; typedef graph_traits::out_edge_iterator Eit; @@ -422,8 +431,7 @@ static int longestPath(const Graph& g, const Bubble& topo) EdgeWeightMap weight(g); map distance; distance[topo.front()] = 0; - for (Bubble::const_iterator it = topo.begin(); - it != topo.end(); ++it) { + for (Bubble::const_iterator it = topo.begin(); it != topo.end(); ++it) { V u = *it; Eit eit, elast; for (tie(eit, elast) = out_edges(u, g); eit != elast; ++eit) { @@ -440,7 +448,8 @@ static int longestPath(const Graph& g, const Bubble& topo) * Add an edge (u,w) with the distance property set to the length of * the largest branch of the bubble. */ -static void scaffoldBubble(Graph& g, const Bubble& bubble) +static void +scaffoldBubble(Graph& g, const Bubble& bubble) { typedef graph_traits::vertex_descriptor V; assert(opt::scaffold); @@ -456,15 +465,15 @@ static void scaffoldBubble(Graph& g, const Bubble& bubble) assert(bubble.size() > 2); size_t n = bubble.size() - 2; g_popped.reserve(g_popped.size() + n); - for (Bubble::const_iterator it = bubble.begin() + 1; - it != bubble.end() - 1; ++it) + for (Bubble::const_iterator it = bubble.begin() + 1; it != bubble.end() - 1; ++it) g_popped.push_back(it->contigIndex()); add_edge(u, w, max(longestPath(g, bubble), 1), g); } /** Pop the specified bubble if it is simple, otherwise scaffold. */ -static void popOrScaffoldBubble(Graph& g, const Bubble& bubble) +static void +popOrScaffoldBubble(Graph& g, const Bubble& bubble) { #pragma omp atomic g_count.bubbles++; @@ -476,20 +485,23 @@ static void popOrScaffoldBubble(Graph& g, const Bubble& bubble) } /** Return the length of the specified vertex in k-mer. */ -static unsigned getKmerLength(const ContigProperties& vp) +static unsigned +getKmerLength(const ContigProperties& vp) { assert(vp.length >= opt::k); return vp.length - opt::k + 1; } /** Return the mean k-mer coverage of the specified vertex. */ -static float getMeanCoverage(const ContigProperties& vp) +static float +getMeanCoverage(const ContigProperties& vp) { return (float)vp.coverage / getKmerLength(vp); } /** Remove contigs with insufficient coverage. */ -static void filterGraph(Graph& g) +static void +filterGraph(Graph& g) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -511,62 +523,82 @@ static void filterGraph(Graph& g) } } if (opt::verbose > 0) { - cerr << "Removed " << removedKmer << " k-mer in " - << removedContigs << " contigs with mean k-mer coverage " - "less than " << opt::minCoverage << ".\n"; + cerr << "Removed " << removedKmer << " k-mer in " << removedContigs + << " contigs with mean k-mer coverage " + "less than " + << opt::minCoverage << ".\n"; printGraphStats(cerr, g); } } /** Remove the specified contig from the adjacency graph. */ -static void removeContig(Graph* g, ContigID id) +static void +removeContig(Graph* g, ContigID id) { ContigNode v(id, false); g->clear_vertex(v); g->remove_vertex(v); } -int main(int argc, char** argv) +int +main(int argc, char** argv) { string commandLine; { ostringstream ss; char** last = argv + argc - 1; - copy(argv, last, ostream_iterator(ss, " ")); + copy(argv, last, ostream_iterator(ss, " ")); ss << *last; commandLine = ss.str(); } bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': die = true; break; - case 'a': arg >> opt::maxBranches; break; - case 'b': arg >> opt::maxLength; break; - case 'c': arg >> opt::minCoverage; break; - case 'g': arg >> opt::graphPath; break; - case 'j': arg >> opt::threads; break; - case 'k': arg >> opt::k; break; - case 'p': arg >> opt::identity; break; - case 'v': opt::verbose++; break; - case OPT_HELP: - cout << USAGE_MESSAGE; - exit(EXIT_SUCCESS); - case OPT_VERSION: - cout << VERSION_MESSAGE; - exit(EXIT_SUCCESS); + case '?': + die = true; + break; + case 'a': + arg >> opt::maxBranches; + break; + case 'b': + arg >> opt::maxLength; + break; + case 'c': + arg >> opt::minCoverage; + break; + case 'g': + arg >> opt::graphPath; + break; + case 'j': + arg >> opt::threads; + break; + case 'k': + arg >> opt::k; + break; + case 'p': + arg >> opt::identity; + break; + case 'v': + opt::verbose++; + break; + case OPT_HELP: + cout << USAGE_MESSAGE; + exit(EXIT_SUCCESS); + case OPT_VERSION: + cout << VERSION_MESSAGE; + exit(EXIT_SUCCESS); } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::k <= 0) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } @@ -581,8 +613,7 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } @@ -626,37 +657,33 @@ int main(int argc, char** argv) cout << "digraph bubbles {\n"; Bubbles bubbles = discoverBubbles(g); - for (Bubbles::const_iterator it = bubbles.begin(); - it != bubbles.end(); ++it) + for (Bubbles::const_iterator it = bubbles.begin(); it != bubbles.end(); ++it) popOrScaffoldBubble(g, *it); // Each bubble should be identified twice. Remove the duplicate. sort(g_popped.begin(), g_popped.end()); - g_popped.erase(unique(g_popped.begin(), g_popped.end()), - g_popped.end()); + g_popped.erase(unique(g_popped.begin(), g_popped.end()), g_popped.end()); if (opt::bubbleGraph) { cout << "}\n"; } else { - for (vector::const_iterator it = g_popped.begin(); - it != g_popped.end(); ++it) + for (vector::const_iterator it = g_popped.begin(); it != g_popped.end(); ++it) cout << get(g_contigNames, *it) << '\n'; } if (opt::verbose > 0) - cerr << "Bubbles: " << (g_count.bubbles + 1) / 2 - << " Popped: " << (g_count.popped + 1) / 2 - << " Scaffolds: " << (g_count.scaffold + 1) / 2 - << " Complex: " << (g_count.notSimple + 1) / 2 - << " Too long: " << (g_count.tooLong + 1) / 2 - << " Too many: " << (g_count.tooMany + 1) / 2 - << " Dissimilar: " << (g_count.dissimilar + 1) / 2 - << '\n'; + cerr << "Bubbles: " << (g_count.bubbles + 1) / 2 << " Popped: " << (g_count.popped + 1) / 2 + << " Scaffolds: " << (g_count.scaffold + 1) / 2 + << " Complex: " << (g_count.notSimple + 1) / 2 + << " Too long: " << (g_count.tooLong + 1) / 2 + << " Too many: " << (g_count.tooMany + 1) / 2 + << " Dissimilar: " << (g_count.dissimilar + 1) / 2 << '\n'; if (!opt::graphPath.empty()) { // Remove the popped contigs from the adjacency graph. - for_each(g_popped.begin(), g_popped.end(), - bind1st(ptr_fun(removeContig), &g)); + for_each(g_popped.begin(), g_popped.end(), [&g](const ContigID& c) { + return removeContig(&g, c); + }); // Assemble unambiguous paths. g_contigNames.unlock(); @@ -669,21 +696,18 @@ int main(int argc, char** argv) assemble_stranded(g, back_inserter(paths)); else assemble(g, back_inserter(paths)); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { ContigNode u(numContigs + it - paths.begin(), false); string name = createContigName(); put(vertex_name, g, u, name); - cout << name << '\t' - << addDistance(gorig, *it) << '\n'; + cout << name << '\t' << addDistance(gorig, *it) << '\n'; } } else { if (opt::ss) assemble_stranded(g, back_inserter(paths)); else assemble(g, back_inserter(paths)); - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { ContigNode u(numContigs + it - paths.begin(), false); string name = createContigName(); put(vertex_name, g, u, name); diff --git a/Scaffold/scaffold.cc b/Scaffold/scaffold.cc index 3343881cf..139bf2bfa 100644 --- a/Scaffold/scaffold.cc +++ b/Scaffold/scaffold.cc @@ -1,12 +1,10 @@ -#include "config.h" +#include "Common/UnorderedMap.h" #include "ContigNode.h" #include "ContigPath.h" #include "ContigProperties.h" +#include "DataBase/DB.h" +#include "DataBase/Options.h" #include "Estimate.h" -#include "IOUtil.h" -#include "Iterator.h" -#include "Uncompress.h" -#include "Common/UnorderedMap.h" #include "Graph/Assemble.h" #include "Graph/ContigGraph.h" #include "Graph/ContigGraphAlgorithms.h" @@ -15,6 +13,10 @@ #include "Graph/GraphIO.h" #include "Graph/GraphUtil.h" #include "Graph/PopBubbles.h" +#include "IOUtil.h" +#include "Iterator.h" +#include "Uncompress.h" +#include "config.h" #include #include #include @@ -24,8 +26,6 @@ #include #include #include -#include "DataBase/Options.h" -#include "DataBase/DB.h" using namespace std; using namespace std::rel_ops; @@ -37,130 +37,144 @@ using boost::tie; DB db; static const char VERSION_MESSAGE[] = -PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" -"Written by Shaun Jackman.\n" -"\n" -"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n"; + PROGRAM " (" PACKAGE_NAME ") " VERSION "\n" + "Written by Shaun Jackman.\n" + "\n" + "Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n"; static const char USAGE_MESSAGE[] = -"Usage: " PROGRAM " -k [OPTION]... FASTA|OVERLAP DIST...\n" -"Scaffold contigs using the distance estimate graph.\n" -"\n" -" Arguments:\n" -"\n" -" FASTA contigs in FASTA format\n" -" OVERLAP the contig overlap graph\n" -" DIST estimates of the distance between contigs\n" -"\n" -" Options:\n" -"\n" -" -n, --npairs=N minimum number of pairs [0]\n" -" or -n A-B:S Find the value of n in [A,B] with step size S\n" -" that maximizes the scaffold N50.\n" -" Default value for the step size is 1, if unspecified.\n" -" -s, --seed-length=N minimum contig length [1000]\n" -" or -s A-B Find the value of s in [A,B]\n" -" that maximizes the scaffold N50.\n" -" --grid optimize using a grid search [default]\n" -" --line optimize using a line search\n" -" -k, --kmer=N length of a k-mer\n" -" -G, --genome-size=N expected genome size. Used to calculate NG50\n" -" and associated stats [disabled]\n" -" --min-gap=N minimum scaffold gap length to output [50]\n" -" --max-gap=N maximum scaffold gap length to output [inf]\n" -" --complex remove complex transitive edges\n" -" --no-complex don't remove complex transitive edges [default]\n" -" --SS expect contigs to be oriented correctly\n" -" --no-SS no assumption about contig orientation [default]\n" -" -o, --out=FILE write the paths to FILE\n" -" -g, --graph=FILE write the graph to FILE\n" -" -v, --verbose display verbose output\n" -" --help display this help and exit\n" -" --version output version information and exit\n" -" --db=FILE specify path of database repository in FILE\n" -" --library=NAME specify library NAME for sqlite\n" -" --strain=NAME specify strain NAME for sqlite\n" -" --species=NAME specify species NAME for sqlite\n" -"\n" -"Report bugs to <" PACKAGE_BUGREPORT ">.\n"; + "Usage: " PROGRAM " -k [OPTION]... FASTA|OVERLAP DIST...\n" + "Scaffold contigs using the distance estimate graph.\n" + "\n" + " Arguments:\n" + "\n" + " FASTA contigs in FASTA format\n" + " OVERLAP the contig overlap graph\n" + " DIST estimates of the distance between contigs\n" + "\n" + " Options:\n" + "\n" + " -n, --npairs=N minimum number of pairs [0]\n" + " or -n A-B:S Find the value of n in [A,B] with step size S\n" + " that maximizes the scaffold N50.\n" + " Default value for the step size is 1, if unspecified.\n" + " -s, --seed-length=N minimum contig length [1000]\n" + " or -s A-B Find the value of s in [A,B]\n" + " that maximizes the scaffold N50.\n" + " --grid optimize using a grid search [default]\n" + " --line optimize using a line search\n" + " -k, --kmer=N length of a k-mer\n" + " -G, --genome-size=N expected genome size. Used to calculate NG50\n" + " and associated stats [disabled]\n" + " --min-gap=N minimum scaffold gap length to output [50]\n" + " --max-gap=N maximum scaffold gap length to output [inf]\n" + " --complex remove complex transitive edges\n" + " --no-complex don't remove complex transitive edges [default]\n" + " --SS expect contigs to be oriented correctly\n" + " --no-SS no assumption about contig orientation [default]\n" + " -o, --out=FILE write the paths to FILE\n" + " -g, --graph=FILE write the graph to FILE\n" + " -v, --verbose display verbose output\n" + " --help display this help and exit\n" + " --version output version information and exit\n" + " --db=FILE specify path of database repository in FILE\n" + " --library=NAME specify library NAME for sqlite\n" + " --strain=NAME specify strain NAME for sqlite\n" + " --species=NAME specify species NAME for sqlite\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"; namespace opt { - string db; - dbVars metaVars; +string db; +dbVars metaVars; - unsigned k; // used by ContigProperties +unsigned k; // used by ContigProperties - /** Optimization search strategy. */ - static int searchStrategy; +/** Optimization search strategy. */ +static int searchStrategy; - /** Minimum number of pairs. */ - static unsigned minEdgeWeight; - static unsigned minEdgeWeightEnd; - static unsigned minEdgeWeightStep; +/** Minimum number of pairs. */ +static unsigned minEdgeWeight; +static unsigned minEdgeWeightEnd; +static unsigned minEdgeWeightStep; - /** Minimum contig length. */ - static unsigned minContigLength = 1000; - static unsigned minContigLengthEnd = 1000; +/** Minimum contig length. */ +static unsigned minContigLength = 1000; +static unsigned minContigLengthEnd = 1000; - /** Genome size. Used to calculate NG50. */ - static long long unsigned genomeSize; +/** Genome size. Used to calculate NG50. */ +static long long unsigned genomeSize; - /** Minimum scaffold gap length to output. */ - static int minGap = 50; +/** Minimum scaffold gap length to output. */ +static int minGap = 50; - /** Maximum scaffold gap length to output. - * -ve value means no maximum. */ - static int maxGap = -1; +/** Maximum scaffold gap length to output. + * -ve value means no maximum. */ +static int maxGap = -1; - /** Write the paths to this file. */ - static string out; +/** Write the paths to this file. */ +static string out; - /** Write the graph to this file. */ - static string graphPath; +/** Write the graph to this file. */ +static string graphPath; - /** Run a strand-specific RNA-Seq assembly. */ - static int ss; +/** Run a strand-specific RNA-Seq assembly. */ +static int ss; - /** Verbose output. */ - int verbose; // used by PopBubbles +/** Verbose output. */ +int verbose; // used by PopBubbles - /** Output format */ - int format = DOT; // used by DistanceEst +/** Output format */ +int format = DOT; // used by DistanceEst - /** Remove complex transitive edges */ - static int comp_trans; +/** Remove complex transitive edges */ +static int comp_trans; } static const char shortopts[] = "G:g:k:n:o:s:v"; -enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP, - OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES }; +enum +{ + OPT_HELP = 1, + OPT_VERSION, + OPT_MIN_GAP, + OPT_MAX_GAP, + OPT_COMP, + OPT_DB, + OPT_LIBRARY, + OPT_STRAIN, + OPT_SPECIES +}; /** Optimization search strategy. */ -enum { GRID_SEARCH, LINE_SEARCH }; +enum +{ + GRID_SEARCH, + LINE_SEARCH +}; static const struct option longopts[] = { - { "graph", no_argument, NULL, 'g' }, - { "kmer", required_argument, NULL, 'k' }, + { "graph", no_argument, NULL, 'g' }, + { "kmer", required_argument, NULL, 'k' }, { "genome-size", required_argument, NULL, 'G' }, - { "min-gap", required_argument, NULL, OPT_MIN_GAP }, - { "max-gap", required_argument, NULL, OPT_MAX_GAP }, - { "npairs", required_argument, NULL, 'n' }, - { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH }, - { "line", no_argument, &opt::searchStrategy, LINE_SEARCH }, - { "out", required_argument, NULL, 'o' }, + { "min-gap", required_argument, NULL, OPT_MIN_GAP }, + { "max-gap", required_argument, NULL, OPT_MAX_GAP }, + { "npairs", required_argument, NULL, 'n' }, + { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH }, + { "line", no_argument, &opt::searchStrategy, LINE_SEARCH }, + { "out", required_argument, NULL, 'o' }, { "seed-length", required_argument, NULL, 's' }, - { "complex", no_argument, &opt::comp_trans, 1 }, - { "no-complex", no_argument, &opt::comp_trans, 0 }, - { "SS", no_argument, &opt::ss, 1 }, - { "no-SS", no_argument, &opt::ss, 0 }, - { "verbose", no_argument, NULL, 'v' }, - { "help", no_argument, NULL, OPT_HELP }, - { "version", no_argument, NULL, OPT_VERSION }, - { "db", required_argument, NULL, OPT_DB }, - { "library", required_argument, NULL, OPT_LIBRARY }, - { "strain", required_argument, NULL, OPT_STRAIN }, - { "species", required_argument, NULL, OPT_SPECIES }, + { "complex", no_argument, &opt::comp_trans, 1 }, + { "no-complex", no_argument, &opt::comp_trans, 0 }, + { "SS", no_argument, &opt::ss, 1 }, + { "no-SS", no_argument, &opt::ss, 0 }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { "version", no_argument, NULL, OPT_VERSION }, + { "db", required_argument, NULL, OPT_DB }, + { "library", required_argument, NULL, OPT_LIBRARY }, + { "strain", required_argument, NULL, OPT_STRAIN }, + { "species", required_argument, NULL, OPT_SPECIES }, { NULL, 0, NULL, 0 } }; @@ -172,8 +186,11 @@ typedef ContigGraph Graph; * An edge is invalid when the overlap is larger than the length of * either of its incident sequences. */ -struct InvalidEdge { - InvalidEdge(Graph& g) : m_g(g) { } +struct InvalidEdge +{ + InvalidEdge(Graph& g) + : m_g(g) + {} bool operator()(graph_traits::edge_descriptor e) const { int d = m_g[e].distance; @@ -185,8 +202,12 @@ struct InvalidEdge { }; /** Return whether the specified edges has sufficient support. */ -struct PoorSupport { - PoorSupport(Graph& g, unsigned minEdgeWeight) : m_g(g), m_minEdgeWeight(minEdgeWeight) { } +struct PoorSupport +{ + PoorSupport(Graph& g, unsigned minEdgeWeight) + : m_g(g) + , m_minEdgeWeight(minEdgeWeight) + {} bool operator()(graph_traits::edge_descriptor e) const { return m_g[e].numPairs < m_minEdgeWeight; @@ -196,7 +217,8 @@ struct PoorSupport { }; /** Remove short vertices and unsupported edges from the graph. */ -static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength) +static void +filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength) { typedef graph_traits GTraits; typedef GTraits::vertex_descriptor V; @@ -230,13 +252,15 @@ static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLeng } /** Return true if the specified edge is a cycle. */ -static bool isCycle(Graph& g, graph_traits::edge_descriptor e) +static bool +isCycle(Graph& g, graph_traits::edge_descriptor e) { return edge(target(e, g), source(e, g), g).second; } /** Remove simple cycles of length two from the graph. */ -static void removeCycles(Graph& g) +static void +removeCycles(Graph& g) { typedef graph_traits::edge_descriptor E; typedef graph_traits::edge_iterator Eit; @@ -265,7 +289,8 @@ static void removeCycles(Graph& g) * For a pair of edges (u,v1) and (u,v2) in g, if exactly one of the * edges (v1,v2) or (v2,v1) exists in g0, add that edge to g. */ -static void resolveForks(Graph& g, const Graph& g0) +static void +resolveForks(Graph& g, const Graph& g0) { typedef graph_traits::adjacency_iterator Vit; typedef graph_traits::edge_descriptor E; @@ -293,25 +318,22 @@ static void resolveForks(Graph& g, const Graph& g0) pair e21 = edge(v2, v1, g0); if (e12.second && e21.second) { if (opt::verbose > 1) - cerr << "cycle: " << get(vertex_name, g, v1) - << ' ' << get(vertex_name, g, v2) << '\n'; + cerr << "cycle: " << get(vertex_name, g, v1) << ' ' + << get(vertex_name, g, v2) << '\n'; } else if (e12.second || e21.second) { E e = e12.second ? e12.first : e21.first; V v = source(e, g0), w = target(e, g0); add_edge(v, w, g0[e], g); numEdges++; if (opt::verbose > 1) - cerr << get(vertex_name, g, u) - << " -> " << get(vertex_name, g, v) - << " -> " << get(vertex_name, g, w) - << " [" << g0[e] << "]\n"; + cerr << get(vertex_name, g, u) << " -> " << get(vertex_name, g, v) << " -> " + << get(vertex_name, g, w) << " [" << g0[e] << "]\n"; } } } } if (opt::verbose > 0) - cerr << "Added " << numEdges - << " edges to ambiguous vertices.\n"; + cerr << "Added " << numEdges << " edges to ambiguous vertices.\n"; if (!opt::db.empty()) addToDb(db, "E_added_ambig", numEdges); } @@ -320,7 +342,8 @@ static void resolveForks(Graph& g, const Graph& g0) * For an edge (u,v), remove the vertex v if deg+(u) > 1 * and deg-(v) = 1 and deg+(v) = 0. */ -static void pruneTips(Graph& g) +static void +pruneTips(Graph& g) { /** Identify the tips. */ size_t n = 0; @@ -340,7 +363,8 @@ static void pruneTips(Graph& g) * operation: remove vertex u * output: digraph g { t1->v1 t2->v2 } */ -static void removeRepeats(Graph& g) +static void +removeRepeats(Graph& g) { typedef graph_traits::adjacency_iterator Ait; typedef graph_traits::edge_descriptor E; @@ -349,28 +373,23 @@ static void removeRepeats(Graph& g) vector repeats; vector transitive; find_transitive_edges(g, back_inserter(transitive)); - for (vector::const_iterator it = transitive.begin(); - it != transitive.end(); ++it) { + for (vector::const_iterator it = transitive.begin(); it != transitive.end(); ++it) { // Iterate through the transitive edges, u->w1. V u = source(*it, g), w1 = target(*it, g); Ait vit, vlast; - for (tie(vit, vlast) = adjacent_vertices(u, g); - vit != vlast; ++vit) { + for (tie(vit, vlast) = adjacent_vertices(u, g); vit != vlast; ++vit) { V v = *vit; assert(u != v); // no self loops if (!edge(v, w1, g).second) continue; // u->w1 is a transitive edge spanning u->v->w1. Ait wit, wlast; - for (tie(wit, wlast) = adjacent_vertices(v, g); - wit != wlast; ++wit) { + for (tie(wit, wlast) = adjacent_vertices(v, g); wit != wlast; ++wit) { // For each edge v->w2, check that an edge // w1->w2 or w2->w1 exists. If not, v is a repeat. V w2 = *wit; assert(v != w2); // no self loops - if (w1 != w2 - && !edge(w1, w2, g).second - && !edge(w2, w1, g).second) { + if (w1 != w2 && !edge(w1, w2, g).second && !edge(w2, w1, g).second) { repeats.push_back(v); break; } @@ -379,20 +398,17 @@ static void removeRepeats(Graph& g) } sort(repeats.begin(), repeats.end()); - repeats.erase(unique(repeats.begin(), repeats.end()), - repeats.end()); + repeats.erase(unique(repeats.begin(), repeats.end()), repeats.end()); if (opt::verbose > 1) { cerr << "Ambiguous:"; - for (vector::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) + for (vector::const_iterator it = repeats.begin(); it != repeats.end(); ++it) cerr << ' ' << get(vertex_name, g, *it); cerr << '\n'; } // Remove the repetitive vertices. unsigned numRemoved = 0; - for (vector::const_iterator it = repeats.begin(); - it != repeats.end(); ++it) { + for (vector::const_iterator it = repeats.begin(); it != repeats.end(); ++it) { V u = *it; V uc = get(vertex_complement, g, u); clear_out_edges(u, g); @@ -403,10 +419,8 @@ static void removeRepeats(Graph& g) } if (opt::verbose > 0) { - cerr << "Cleared " - << repeats.size() << " ambiguous vertices.\n" - << "Removed " - << numRemoved << " ambiguous vertices.\n"; + cerr << "Cleared " << repeats.size() << " ambiguous vertices.\n" + << "Removed " << numRemoved << " ambiguous vertices.\n"; printGraphStats(cerr, g); } if (!opt::db.empty()) { @@ -421,7 +435,8 @@ static void removeRepeats(Graph& g) * operation: remove edge u1->v2 * output: digraph g {u1->v1 u2->v2 } */ -static void removeWeakEdges(Graph& g) +static void +removeWeakEdges(Graph& g) { typedef graph_traits::edge_descriptor E; typedef graph_traits::edge_iterator Eit; @@ -478,11 +493,9 @@ static void removeWeakEdges(Graph& g) if (opt::verbose > 1) { cerr << "Weak edges:\n"; - for (vector::const_iterator it = weak.begin(); - it != weak.end(); ++it) { + for (vector::const_iterator it = weak.begin(); it != weak.end(); ++it) { E e = *it; - cerr << '\t' << get(edge_name, g, e) - << " [" << g[e] << "]\n"; + cerr << '\t' << get(edge_name, g, e) << " [" << g[e] << "]\n"; } } @@ -496,7 +509,8 @@ static void removeWeakEdges(Graph& g) addToDb(db, "E_removed_weak", weak.size()); } -static void removeLongEdges(Graph& g) +static void +removeLongEdges(Graph& g) { typedef graph_traits::edge_descriptor E; typedef graph_traits::edge_iterator Eit; @@ -514,7 +528,8 @@ static void removeLongEdges(Graph& g) /** Return whether the specified distance estimate is an exact * overlap. */ -static bool isOverlap(const DistanceEst& d) +static bool +isOverlap(const DistanceEst& d) { if (d.stdDev == 0) { assert(d.distance < 0); @@ -527,8 +542,8 @@ static bool isOverlap(const DistanceEst& d) * @param g0 the original graph * @param g1 the transformed graph */ -static ContigPath addDistEst(const Graph& g0, const Graph& g1, - const ContigPath& path) +static ContigPath +addDistEst(const Graph& g0, const Graph& g1, const ContigPath& path) { typedef graph_traits::edge_descriptor E; typedef edge_bundle_type::type EP; @@ -537,14 +552,14 @@ static ContigPath addDistEst(const Graph& g0, const Graph& g1, out.reserve(2 * path.size()); ContigNode u = path.front(); out.push_back(u); - for (ContigPath::const_iterator it = path.begin() + 1; - it != path.end(); ++it) { + for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) { ContigNode v = *it; assert(!v.ambiguous()); pair e0 = edge(u, v, g0); pair e1 = edge(u, v, g1); if (!e0.second && !e1.second) - std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " << get(vertex_name, g0, v) << '\n'; + std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " + << get(vertex_name, g0, v) << '\n'; assert(e0.second || e1.second); const EP& ep = e0.second ? g0[e0.first] : g1[e1.first]; if (!isOverlap(ep)) { @@ -561,7 +576,8 @@ static ContigPath addDistEst(const Graph& g0, const Graph& g1, } /** Read a graph from the specified file. */ -static void readGraph(const string& path, Graph& g) +static void +readGraph(const string& path, Graph& g) { if (opt::verbose > 0) cerr << "Reading `" << path << "'...\n"; @@ -574,17 +590,16 @@ static void readGraph(const string& path, Graph& g) printGraphStats(cerr, g); vector vals = passGraphStatsVal(g); - vector keys = make_vector() - << "V_readGraph" - << "E_readGraph" - << "degree0_readGraph" - << "degree1_readGraph" - << "degree234_readGraph" - << "degree5_readGraph" - << "max_readGraph"; + vector keys = make_vector() << "V_readGraph" + << "E_readGraph" + << "degree0_readGraph" + << "degree1_readGraph" + << "degree234_readGraph" + << "degree5_readGraph" + << "max_readGraph"; if (!opt::db.empty()) { - for(unsigned i=0; i -unsigned addLength(const Graph& g, It first, It last) +unsigned +addLength(const Graph& g, It first, It last) { typedef typename graph_traits::vertex_descriptor V; assert(first != last); @@ -610,11 +626,11 @@ unsigned addLength(const Graph& g, It first, It last) typedef vector ContigPaths; /** - * Build the scaffold length histogram. - * @param g The graph g is destroyed. - */ -static Histogram buildScaffoldLengthHistogram( - Graph& g, const ContigPaths& paths) + * Build the scaffold length histogram. + * @param g The graph g is destroyed. + */ +static Histogram +buildScaffoldLengthHistogram(Graph& g, const ContigPaths& paths) { Histogram h; @@ -626,11 +642,10 @@ static Histogram buildScaffoldLengthHistogram( // Remove the vertices that are used in paths // and add the lengths of the scaffolds. - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) { + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) { h.insert(addLength(g, it->begin(), it->end())); - remove_vertex_if(g, it->begin(), it->end(), - not1(std::mem_fun_ref(&ContigNode::ambiguous))); + remove_vertex_if( + g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); }); } // Add the contigs that were not used in paths. @@ -645,51 +660,63 @@ static Histogram buildScaffoldLengthHistogram( } /** Add contiguity stats to database */ -static void addCntgStatsToDb( - const Histogram h, const unsigned min) +static void +addCntgStatsToDb(const Histogram h, const unsigned min) { vector vals = passContiguityStatsVal(h, min); - vector keys = make_vector() - << "n" - << "n200" - << "nN50" - << "min" - << "N75" - << "N50" - << "N25" - << "Esize" - << "max" - << "sum" - << "nNG50" - << "NG50"; + vector keys = make_vector() << "n" + << "n200" + << "nN50" + << "min" + << "N75" + << "N50" + << "N25" + << "Esize" + << "max" + << "sum" + << "nNG50" + << "NG50"; if (!opt::db.empty()) { - for(unsigned i=0; i struct hash { - size_t operator()(const ScaffoldParam& param) const - { - return hash()(param.n) ^ hash()(param.s); - } - }; +template<> +struct hash +{ + size_t operator()(const ScaffoldParam& param) const + { + return hash()(param.n) ^ hash()(param.s); + } +}; NAMESPACE_STD_HASH_END /** Result of scaffolding. */ -struct ScaffoldResult : ScaffoldParam{ - ScaffoldResult() : ScaffoldParam(0, 0), n50(0) { } +struct ScaffoldResult : ScaffoldParam +{ + ScaffoldResult() + : ScaffoldParam(0, 0) + , n50(0) + {} ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics) - : ScaffoldParam(n, s), n50(n50), metrics(metrics) { } + : ScaffoldParam(n, s) + , n50(n50) + , metrics(metrics) + {} unsigned n50; std::string metrics; }; @@ -699,9 +726,7 @@ struct ScaffoldResult : ScaffoldParam{ * @return the scaffold N50 */ ScaffoldResult -scaffold(const Graph& g0, - unsigned minEdgeWeight, unsigned minContigLength, - bool output) +scaffold(const Graph& g0, unsigned minEdgeWeight, unsigned minContigLength, bool output) { Graph g(g0); @@ -744,8 +769,7 @@ scaffold(const Graph& g0, typedef graph_traits::vertex_descriptor V; vector popped = popBubbles(g); if (opt::verbose > 0) { - cerr << "Removed " << popped.size() - << " vertices in bubbles.\n"; + cerr << "Removed " << popped.size() << " vertices in bubbles.\n"; printGraphStats(cerr, g); } @@ -754,8 +778,7 @@ scaffold(const Graph& g0, if (opt::verbose > 1) { cerr << "Popped:"; - for (vector::const_iterator it = popped.begin(); - it != popped.end(); ++it) + for (vector::const_iterator it = popped.begin(); it != popped.end(); ++it) cerr << ' ' << get(vertex_name, g, *it); cerr << '\n'; } @@ -773,11 +796,9 @@ scaffold(const Graph& g0, sort(paths.begin(), paths.end()); unsigned n = 0; if (opt::verbose > 0) { - for (ContigPaths::const_iterator it = paths.begin(); - it != paths.end(); ++it) + for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) n += it->size(); - cerr << "Assembled " << n << " contigs in " - << paths.size() << " scaffolds.\n"; + cerr << "Assembled " << n << " contigs in " << paths.size() << " scaffolds.\n"; printGraphStats(cerr, g); } @@ -792,10 +813,8 @@ scaffold(const Graph& g0, ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout; assert_good(out, opt::out); g_contigNames.unlock(); - for (vector::const_iterator it = paths.begin(); - it != paths.end(); ++it) - out << createContigName() << '\t' - << addDistEst(g0, g, *it) << '\n'; + for (vector::const_iterator it = paths.begin(); it != paths.end(); ++it) + out << createContigName() << '\t' << addDistEst(g0, g, *it) << '\n'; assert_good(out, opt::out); // Output the graph. @@ -811,15 +830,16 @@ scaffold(const Graph& g0, const unsigned STATS_MIN_LENGTH = opt::minContigLength; std::ostringstream ss; Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths); - printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, - false, "\t", opt::genomeSize) - << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n'; + printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, false, "\t", opt::genomeSize) + << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n'; std::string metrics = ss.str(); addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH); - return ScaffoldResult(minEdgeWeight, minContigLength, - scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(), - metrics); + return ScaffoldResult( + minEdgeWeight, + minContigLength, + scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(), + metrics); } /** Memoize the optimization results so far. */ @@ -858,10 +878,11 @@ scaffold_memoized(const Graph& g, unsigned n, unsigned s, ScaffoldMemo& memo) /** Find the value of n that maximizes the scaffold N50. */ static ScaffoldResult -optimize_n(const Graph& g, - std::pair minEdgeWeight, - unsigned minContigLength, - ScaffoldMemo& memo) +optimize_n( + const Graph& g, + std::pair minEdgeWeight, + unsigned minContigLength, + ScaffoldMemo& memo) { std::string metrics_table; unsigned bestn = 0, bestN50 = 0; @@ -879,19 +900,17 @@ optimize_n(const Graph& g, /** Find the value of s that maximizes the scaffold N50. */ static ScaffoldResult -optimize_s(const Graph& g, - unsigned minEdgeWeight, - std::pair minContigLength, - ScaffoldMemo& memo) +optimize_s( + const Graph& g, + unsigned minEdgeWeight, + std::pair minContigLength, + ScaffoldMemo& memo) { std::string metrics_table; unsigned bests = 0, bestN50 = 0; const double STEP = cbrt(10); // Three steps per decade. - unsigned ilast = (unsigned)round( - log(minContigLength.second) / log(STEP)); - for (unsigned i = (unsigned)round( - log(minContigLength.first) / log(STEP)); - i <= ilast; ++i) { + unsigned ilast = (unsigned)round(log(minContigLength.second) / log(STEP)); + for (unsigned i = (unsigned)round(log(minContigLength.first) / log(STEP)); i <= ilast; ++i) { unsigned s = (unsigned)pow(STEP, (int)i); // Round to 1 figure. @@ -911,9 +930,10 @@ optimize_s(const Graph& g, /** Find the values of n and s that maximizes the scaffold N50. */ static ScaffoldResult -optimize_grid_search(const Graph& g, - std::pair minEdgeWeight, - std::pair minContigLength) +optimize_grid_search( + const Graph& g, + std::pair minEdgeWeight, + std::pair minContigLength) { const unsigned STATS_MIN_LENGTH = opt::minContigLength; if (opt::verbose == 0) @@ -935,9 +955,10 @@ optimize_grid_search(const Graph& g, /** Find the values of n and s that maximizes the scaffold N50. */ static ScaffoldResult -optimize_line_search(const Graph& g, - std::pair minEdgeWeight, - std::pair minContigLength) +optimize_line_search( + const Graph& g, + std::pair minEdgeWeight, + std::pair minContigLength) { const unsigned STATS_MIN_LENGTH = opt::minContigLength; if (opt::verbose == 0) @@ -946,10 +967,10 @@ optimize_line_search(const Graph& g, ScaffoldMemo memo; std::string metrics_table; ScaffoldResult best( - (minEdgeWeight.first + minEdgeWeight.second) / 2, - minContigLength.second, 0, ""); + (minEdgeWeight.first + minEdgeWeight.second) / 2, minContigLength.second, 0, ""); // An upper limit on the number of iterations. - const unsigned MAX_ITERATIONS = 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep; + const unsigned MAX_ITERATIONS = + 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep; for (unsigned i = 0; i < MAX_ITERATIONS; ++i) { // Optimize s. if (opt::verbose > 0) { @@ -981,33 +1002,32 @@ optimize_line_search(const Graph& g, } /** Run abyss-scaffold. */ -int main(int argc, char** argv) +int +main(int argc, char** argv) { if (!opt::db.empty()) opt::metaVars.resize(3); bool die = false; - for (int c; (c = getopt_long(argc, argv, - shortopts, longopts, NULL)) != -1;) { + for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case '?': + case '?': die = true; break; - case 'k': + case 'k': arg >> opt::k; break; - case 'G': - { - double x; - arg >> x; - opt::genomeSize = x; - break; - } - case 'g': + case 'G': { + double x; + arg >> x; + opt::genomeSize = x; + break; + } + case 'g': arg >> opt::graphPath; break; - case 'n': + case 'n': arg >> opt::minEdgeWeight; if (arg.peek() == '-') { arg >> expect("-") >> opt::minEdgeWeightEnd; @@ -1019,56 +1039,55 @@ int main(int argc, char** argv) else opt::minEdgeWeightStep = 1; break; - case 'o': + case 'o': arg >> opt::out; break; - case 's': + case 's': arg >> opt::minContigLength; if (arg.peek() == '-') { opt::minContigLengthEnd = 100 * opt::minContigLength; arg >> expect("-") >> opt::minContigLengthEnd; - assert(opt::minContigLength - <= opt::minContigLengthEnd); + assert(opt::minContigLength <= opt::minContigLengthEnd); } else opt::minContigLengthEnd = opt::minContigLength; break; - case 'v': + case 'v': opt::verbose++; break; - case OPT_MIN_GAP: + case OPT_MIN_GAP: arg >> opt::minGap; break; - case OPT_MAX_GAP: + case OPT_MAX_GAP: arg >> opt::maxGap; break; - case OPT_HELP: + case OPT_HELP: cout << USAGE_MESSAGE; exit(EXIT_SUCCESS); - case OPT_VERSION: + case OPT_VERSION: cout << VERSION_MESSAGE; exit(EXIT_SUCCESS); - case OPT_DB: + case OPT_DB: arg >> opt::db; break; - case OPT_LIBRARY: + case OPT_LIBRARY: arg >> opt::metaVars[0]; break; - case OPT_STRAIN: + case OPT_STRAIN: arg >> opt::metaVars[1]; break; - case OPT_SPECIES: + case OPT_SPECIES: arg >> opt::metaVars[2]; break; } if (optarg != NULL && !arg.eof()) { - cerr << PROGRAM ": invalid option: `-" - << (char)c << optarg << "'\n"; + cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n"; exit(EXIT_FAILURE); } } if (opt::k <= 0) { - cerr << PROGRAM ": " << "missing -k,--kmer option\n"; + cerr << PROGRAM ": " + << "missing -k,--kmer option\n"; die = true; } @@ -1078,17 +1097,11 @@ int main(int argc, char** argv) } if (die) { - cerr << "Try `" << PROGRAM - << " --help' for more information.\n"; + cerr << "Try `" << PROGRAM << " --help' for more information.\n"; exit(EXIT_FAILURE); } if (!opt::db.empty()) { - init(db, - opt::db, - opt::verbose, - PROGRAM, - opt::getCommand(argc, argv), - opt::metaVars); + init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars); addToDb(db, "K", opt::k); } @@ -1114,15 +1127,14 @@ int main(int argc, char** argv) remove_edge_if(InvalidEdge(g), static_cast(g)); unsigned numRemoved = numBefore - num_edges(g); if (numRemoved > 0) - cerr << "warning: Removed " - << numRemoved << " invalid edges.\n"; + cerr << "warning: Removed " << numRemoved << " invalid edges.\n"; if (!opt::db.empty()) addToDb(db, "Edges_invalid", numRemoved); const unsigned STATS_MIN_LENGTH = opt::minContigLength; - if (opt::minEdgeWeight == opt::minEdgeWeightEnd - && opt::minContigLength == opt::minContigLengthEnd) { + if (opt::minEdgeWeight == opt::minEdgeWeightEnd && + opt::minContigLength == opt::minContigLengthEnd) { ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true); // Print assembly contiguity statistics. if (opt::verbose > 0) @@ -1132,19 +1144,21 @@ int main(int argc, char** argv) } else { ScaffoldResult best(0, 0, 0, ""); switch (opt::searchStrategy) { - case GRID_SEARCH: - best = optimize_grid_search(g, - std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), - std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); - break; - case LINE_SEARCH: - best = optimize_line_search(g, - std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), - std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); - break; - default: - abort(); - break; + case GRID_SEARCH: + best = optimize_grid_search( + g, + std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), + std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); + break; + case LINE_SEARCH: + best = optimize_line_search( + g, + std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd), + std::make_pair(opt::minContigLength, opt::minContigLengthEnd)); + break; + default: + abort(); + break; } if (opt::verbose > 0) @@ -1158,7 +1172,9 @@ int main(int argc, char** argv) std::cerr << best.metrics; } - std::cerr << '\n' << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s << ".\n"; + std::cerr << '\n' + << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s + << ".\n"; // Print assembly contiguity statistics. std::cerr << '\n'; diff --git a/Sealer/sealer.cc b/Sealer/sealer.cc index 9da7eeb28..796fc83a7 100644 --- a/Sealer/sealer.cc +++ b/Sealer/sealer.cc @@ -71,7 +71,6 @@ static const char USAGE_MESSAGE[] = " --print-flanks outputs flank files\n" " -S, --input-scaffold=FILE load scaffold from FILE\n" " -L, --flank-length=N length of flanks to be used as pseudoreads [100]\n" -" -D, --flank-distance=N distance of flank from gap [0]\n" " -G, --max-gap-length=N max gap size to fill in bp [800]; runtime increases\n" " exponentially with respect to this parameter\n" " -j, --threads=N use N parallel threads [1]\n" diff --git a/bin/abyss-pe b/bin/abyss-pe index 2caead3f9..99176300a 100755 --- a/bin/abyss-pe +++ b/bin/abyss-pe @@ -398,7 +398,7 @@ help: @echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.' version: - @echo "abyss-pe (ABySS) 2.2.3" + @echo "abyss-pe (ABySS) 2.2.4" @echo "Written by Shaun Jackman and Anthony Raymond." @echo @echo "Copyright 2012 Canada's Michael Smith Genome Science Centre" diff --git a/configure.ac b/configure.ac index 40c8e7c64..e5a821955 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ AC_PREREQ(2.62) -AC_INIT(ABySS, 2.2.3, abyss-users@bcgsc.ca, abyss, +AC_INIT(ABySS, 2.2.4, abyss-users@bcgsc.ca, abyss, http://www.bcgsc.ca/platform/bioinfo/software/abyss) AC_CONFIG_MACRO_DIR([m4]) diff --git a/doc/ABYSS.1 b/doc/ABYSS.1 index 1515db755..b39841adf 100644 --- a/doc/ABYSS.1 +++ b/doc/ABYSS.1 @@ -1,4 +1,4 @@ -.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.2.3" "User Commands" +.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.2.4" "User Commands" .SH NAME ABYSS \- assemble short reads into contigs .SH SYNOPSIS diff --git a/doc/abyss-pe.1 b/doc/abyss-pe.1 index c31a1cbd1..ea7652852 100644 --- a/doc/abyss-pe.1 +++ b/doc/abyss-pe.1 @@ -1,4 +1,4 @@ -.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.2.3" "User Commands" +.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.2.4" "User Commands" .SH NAME abyss-pe - assemble reads into contigs .SH SYNOPSIS diff --git a/doc/abyss-tofastq.1 b/doc/abyss-tofastq.1 index 55609ee75..97e005f24 100644 --- a/doc/abyss-tofastq.1 +++ b/doc/abyss-tofastq.1 @@ -1,4 +1,4 @@ -.TH abyss-tofastq "1" "2015-May" "ABySS 2.2.3" "User Commands" +.TH abyss-tofastq "1" "2015-May" "ABySS 2.2.4" "User Commands" .SH NAME abyss-tofastq \- convert various file formats to FASTQ format .br