Skip to content

Commit

Permalink
changes correspond to 1.0rc1 in beta and article submission
Browse files Browse the repository at this point in the history
  • Loading branch information
joh committed Aug 19, 2013
1 parent f26e1e9 commit 1d1a7e6
Show file tree
Hide file tree
Showing 13 changed files with 292 additions and 237 deletions.
20 changes: 10 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set( project_name taxator-tk )

# cmake requirements detection
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake-modules)
find_package( Sqlite3 )
# find_package( Sqlite3 )
find_package( Boost "${boost_version_required}" REQUIRED COMPONENTS program_options filesystem )
if( Boost_FOUND )
if( Boost_VERSION LESS "${boost_version_required_enc}" )
Expand All @@ -22,11 +22,11 @@ else()
message( FATAL_ERROR "Unable to find a Boost version. Did you set BOOST_ROOT?" )
endif()
include_directories( "includes-external" )
set(CMAKE_CXX_FLAGS "-Wall -pedantic -Wno-long-long -Wno-variadic-macros -fpermissive") #-g for debuggin, -m32 for x32
set(CMAKE_CXX_FLAGS "-Wall -pedantic -Wno-long-long -Wno-variadic-macros -fpermissive -O3") #-g for debuggin, -m32 for x32

# unittest: constructs the taxonomy from NCBI dump files and tests the structure thoroughly
add_executable( unittest_ncbitaxonomy unittest_ncbitaxonomy.cpp src/ncbidata.cpp src/accessconv.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/sqlite3pp.cpp )
target_link_libraries( unittest_ncbitaxonomy boost_system sqlite3 boost_filesystem )
add_executable( unittest_ncbitaxonomy unittest_ncbitaxonomy.cpp src/ncbidata.cpp src/accessconv.cpp src/taxontree.cpp src/taxonomyinterface.cpp )
target_link_libraries( unittest_ncbitaxonomy boost_system boost_filesystem )

# traverse nodes up to a given rank
add_executable( rank-filter rank-filter.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp )
Expand All @@ -37,20 +37,20 @@ add_executable( name-filter name-filter.cpp src/taxontree.cpp src/taxonomyinterf
target_link_libraries( name-filter boost_program_options boost_system boost_filesystem )

# general converter for string or id mappings
add_executable( acc2taxid acc2taxid.cpp src/accessconv.cpp src/sqlite3pp )
target_link_libraries( acc2taxid sqlite3 boost_program_options boost_system boost_filesystem )
add_executable( acc2taxid acc2taxid.cpp src/accessconv.cpp )
target_link_libraries( acc2taxid boost_program_options boost_system boost_filesystem )

# parses NCBI-style identifiers in the FASTA comment header
add_executable( extract-fastacomment-ncbifield extract-fastacomment-ncbifield.cpp src/ncbidata.cpp src/taxontree.cpp )
target_link_libraries( extract-fastacomment-ncbifield boost_program_options boost_system boost_filesystem )

# apply filtering to alignments file
add_executable( alignments-filter alignments-filter.cpp src/alignmentrecord.cpp src/accessconv.cpp src/sqlite3pp.cpp )
target_link_libraries( alignments-filter boost_program_options boost_system boost_filesystem sqlite3 )
add_executable( alignments-filter alignments-filter.cpp src/alignmentrecord.cpp src/accessconv.cpp )
target_link_libraries( alignments-filter boost_program_options boost_system boost_filesystem )

# takes input alignments and predicts a taxon for each query id using various methods and parameters
add_executable( taxator taxator.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp src/accessconv.cpp src/sqlite3pp.cpp src/taxonpredictionmodel.cpp src/predictionrecord.cpp )
target_link_libraries( taxator sqlite3 boost_program_options boost_system boost_filesystem boost_thread )
add_executable( taxator taxator.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp src/accessconv.cpp src/taxonpredictionmodel.cpp src/predictionrecord.cpp )
target_link_libraries( taxator boost_program_options boost_system boost_filesystem boost_thread )

# apply filtering to predictions file
add_executable( binner binner.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp src/predictionrecord.cpp )
Expand Down
288 changes: 165 additions & 123 deletions README

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions alignments-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ int main( int argc, char** argv ) {

float minscore, toppercent, minpid;
double maxevalue;
unsigned int numbestbitscore, minsupport;
unsigned int numbestscore, minsupport;

std::string tax_map1_filename, tax_map2_filename;

Expand All @@ -95,8 +95,8 @@ int main( int argc, char** argv ) {
( "min-pid,p", po::value< float >( &minpid )->default_value( 0.0 ), "set minimal PID to consider" )
( "top-percent,t", po::value< float >( &toppercent )->default_value( 1.0 ), "set top-percent filter value" )
( "max-evalue,e", po::value< double >( &maxevalue )->default_value( -1.0 ), "set maximum evalue for filtering" )
( "best-alignments,b", po::value< unsigned int >( &numbestbitscore )->default_value( 0 ), "set number of top bitscore alignments to consider (after toppercent filter)" )
( "sort-bitscore,s", "sort alignments by decreasing bitscore" )
( "best-alignments,b", po::value< unsigned int >( &numbestscore )->default_value( 0 ), "set number of top score alignments to consider (after toppercent filter)" )
( "sort-score,s", "sort alignments by decreasing score" )
( "keep-best-per-ref,k", "for each combination of query and reference sequence id all but the best scoring alignment are removed" )
( "min-support,c", po::value< unsigned int >( &minsupport )->default_value( 1 ), "set minimum number of hits an alignment needs to have (after filtering)" )
( "remove-ref-from-query-taxon,r", "remove alignments for labeled data to test different degrees of taxonomic distance" )
Expand All @@ -113,7 +113,7 @@ int main( int argc, char** argv ) {
return EXIT_SUCCESS;
}

bool sort_by_bitscore = vm.count( "sort-bitscore" );
bool sort_by_score = vm.count( "sort-score" );
bool keep_best_per_gi = vm.count( "keep-best-per-ref" );
bool mask_by_star = vm.count( "mask-by-star" );
bool remove_same_taxon = vm.count( "remove-ref-from-query-taxon" );
Expand All @@ -139,7 +139,7 @@ int main( int argc, char** argv ) {
if( keep_best_per_gi ) {
filters.push_back( new BestScorePerReferenceSeqIDFilter< RecordSetType >() );
}
if( sort_by_bitscore ) {
if( sort_by_score ) {
filters.push_back( new SortByBitscoreFilter< RecordSetType >() );
}
if( minpid > 0.0 ) {
Expand All @@ -152,8 +152,8 @@ int main( int argc, char** argv ) {
filters.push_back( new MinScoreTopPercentFilter< RecordSetType >( minscore, toppercent ) );
}
}
if( numbestbitscore ) {
filters.push_back( new NumBestBitscoreFilter< RecordSetType >( numbestbitscore ) );
if( numbestscore ) {
filters.push_back( new NumBestBitscoreFilter< RecordSetType >( numbestscore ) );
}
if( minsupport ) {
filters.push_back( new MinSupportFilter< RecordSetType >( minsupport ) );
Expand Down
6 changes: 4 additions & 2 deletions binner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ int main ( int argc, char** argv ) {
po::options_description visible_options ( "Allowed options" );
visible_options.add_options()
( "help,h", "show help message" )
( "sample-min-support,m", po::value< std::string >( &min_support_in_sample_str )->default_value( "0.01" ), "minimum support in positions (>=1) or fraction of total support (<1) for any taxon" )
( "sequence-min-support,s", po::value< medium_unsigned_int >( &min_support_per_sequence )->default_value( 50 ), "minimum number of positions supporting a taxonomic signal for any single sequence. If not reached, a fall-back on a more robust algorthm will be used" )
( "signal-majority,j", po::value< float >( &signal_majority_per_sequence )->default_value( .7 ), "minimum combined fraction of support for any single sequence (> 0.5 to be stable)" )
( "identity-constrain,i", po::value< vector< string > >(), "minimum required identity for this rank (e.g. -i species:0.8 -i genus:0.7)")
Expand All @@ -66,6 +65,7 @@ int main ( int argc, char** argv ) {

po::options_description hidden_options("Hidden options");
hidden_options.add_options()
( "sample-min-support,m", po::value< std::string >( &min_support_in_sample_str )->default_value( "0" ), "minimum support in positions (>=1) or fraction of total support (<1) for any taxon" )
( "preallocate-num-queries", po::value< boost::ptr_vector< boost::ptr_list< PredictionRecordBinning > >::size_type >( & num_queries_preallocation )->default_value( 5000 ), "advanced parameter for better memory allocation, set to number of query sequences or similar (no need to be set)" )
( "delete-notranks,d", po::value< bool >( &delete_unmarked )->default_value( true ), "delete all nodes that don't have any of the given ranks (make sure that input taxons are at those ranks)" );

Expand Down Expand Up @@ -315,12 +315,14 @@ int main ( int argc, char** argv ) {
map< const string*, float >::const_iterator find_it;
const TaxonNode* predict_node = root_node;
const TaxonNode* target_node = prec->getUpperNode();
const float rank_pid = prec->getSupportAt( target_node )/seqlen;
Taxonomy::CPathDownIterator pit = taxinter.traverseDown<Taxonomy::CPathDownIterator>( target_node );
do {
pit++;
find_it = pid_per_rank.find( &(pit->data->annotation->rank) );
if ( find_it != pid_per_rank.end() ) min_pid = max( min_pid, find_it->second );
if ( prec->getSupportAt( &*pit )/seqlen < min_pid ) break;
binning_debug_output << "constraint ctrl: " << rank_pid << " >= " << min_pid << " ?" << endl;
if ( rank_pid < min_pid ) break;
predict_node = &*pit;
} while ( pit != target_node );
std::cout << prec->getQueryIdentifier() << tab << predict_node->data->taxid << endline;
Expand Down
8 changes: 3 additions & 5 deletions releasefiles.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,16 @@
build.sh
Changelog.txt
CMakeLists.txt
cmake-modules/FindSqlite3.cmake
#cmake-modules/FindSqlite3.cmake
gpl-3.0.txt
README

# taxator-tk executables
acc2taxid.cpp
alignments-prefilter.cpp
alignments-prefilter-training.cpp
alignments-filter.cpp
binner.cpp
extract-fastacomment-ncbifield.cpp
name-filter.cpp
prediction2distances.cpp
prediction-error.cpp
rank-filter.cpp
taxator.cpp
unittest_ncbitaxonomy.cpp
Expand All @@ -29,6 +26,7 @@ src/alignmentrecord.cpp
src/alignmentrecord.hh
src/alignmentsfilter.hh
src/boundedbuffer.hh
src/concurrentoutstream.hh
src/constants.hh
src/fastnodemap.hh
src/losses.hh
Expand Down
165 changes: 82 additions & 83 deletions src/accessconv.hh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <boost/format.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/filesystem.hpp>
#include "sqlite3pp.hh"
#include "constants.hh"
#include "types.hh"
#include "utils.hh"
Expand All @@ -47,81 +46,81 @@ class AccessIDConverter {



template< typename TypeT >
class AccessIDConverterSQLite : public AccessIDConverter< TypeT > {
public:
AccessIDConverterSQLite( const std::string& database_filename ) {
db = new sqlite3pp::database( database_filename.c_str() );
initialization_error = db->error_code();
};

~AccessIDConverterSQLite() {
delete db;
};

TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range)*/ {
const std::string sql_statement = boost::str( boost::format("select ncbi_taxon_id from ncbi_gitaxid_mapping where ncbi_gi = %1%" ) % acc );
sqlite3pp::query qry( *db, sql_statement.c_str() );
sqlite3pp::query::iterator q_it = qry.begin();
if( q_it == qry.end() ) {
throw std::out_of_range( sql_statement );
}

return q_it->get< int >( 0 );
};

bool fail( bool print_message = false ) {
bool code = db->error_code();
if( code && print_message ) {
std::cerr << db->error_msg() << std::endl;
return true;
}
return false;
};

private:
sqlite3pp::database* db;
bool initialization_error;
};



template< typename TypeT >
class AccessIDConverterSQLiteCache : public AccessIDConverterSQLite< TypeT > {
public:
AccessIDConverterSQLiteCache( const std::string& database_filename, unsigned int cachesize = 0 ) : AccessIDConverterSQLite< TypeT >( database_filename ), last_lookup( std::make_pair(TypeT(),0) ) {}; //TODO: initialize differently

TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range )*/ {
// check for repeated lookup
if( acc == last_lookup.first ) {
return last_lookup.second;
}

// search in cache
typename std::map< TypeT, TaxonID >::iterator cache_it = cache.find( acc );
if( cache_it != cache.end() ) { //if found in cache
return cache_it->second;
}

// fall back on subclass operator[]
TaxonID taxid = AccessIDConverterSQLite< TypeT >::operator[]( acc ); //can throw exception if not found

// update cache
if( max_cache_size && history.size() >= max_cache_size ) {
cache.erase( history.back() );
history.pop();
}

history.push( cache.insert( std::make_pair( acc, taxid ) ).first );
return taxid;
}

private:
unsigned int max_cache_size;
typename std::pair< TypeT, TaxonID > last_lookup; //constant lookup time
typename std::map< TypeT, TaxonID > cache; //logarithmic lookup time (in memory)
typename std::queue< typename std::map< TypeT, TaxonID >::iterator > history;
};
// template< typename TypeT >
// class AccessIDConverterSQLite : public AccessIDConverter< TypeT > {
// public:
// AccessIDConverterSQLite( const std::string& database_filename ) {
// db = new sqlite3pp::database( database_filename.c_str() );
// initialization_error = db->error_code();
// };
//
// ~AccessIDConverterSQLite() {
// delete db;
// };
//
// TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range)*/ {
// const std::string sql_statement = boost::str( boost::format("select ncbi_taxon_id from ncbi_gitaxid_mapping where ncbi_gi = %1%" ) % acc );
// sqlite3pp::query qry( *db, sql_statement.c_str() );
// sqlite3pp::query::iterator q_it = qry.begin();
// if( q_it == qry.end() ) {
// throw std::out_of_range( sql_statement );
// }
//
// return q_it->get< int >( 0 );
// };
//
// bool fail( bool print_message = false ) {
// bool code = db->error_code();
// if( code && print_message ) {
// std::cerr << db->error_msg() << std::endl;
// return true;
// }
// return false;
// };
//
// private:
// sqlite3pp::database* db;
// bool initialization_error;
// };



// template< typename TypeT >
// class AccessIDConverterSQLiteCache : public AccessIDConverterSQLite< TypeT > {
// public:
// AccessIDConverterSQLiteCache( const std::string& database_filename, unsigned int cachesize = 0 ) : AccessIDConverterSQLite< TypeT >( database_filename ), last_lookup( std::make_pair(TypeT(),0) ) {}; //TODO: initialize differently
//
// TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range )*/ {
// // check for repeated lookup
// if( acc == last_lookup.first ) {
// return last_lookup.second;
// }
//
// // search in cache
// typename std::map< TypeT, TaxonID >::iterator cache_it = cache.find( acc );
// if( cache_it != cache.end() ) { //if found in cache
// return cache_it->second;
// }
//
// // fall back on subclass operator[]
// TaxonID taxid = AccessIDConverterSQLite< TypeT >::operator[]( acc ); //can throw exception if not found
//
// // update cache
// if( max_cache_size && history.size() >= max_cache_size ) {
// cache.erase( history.back() );
// history.pop();
// }
//
// history.push( cache.insert( std::make_pair( acc, taxid ) ).first );
// return taxid;
// }
//
// private:
// unsigned int max_cache_size;
// typename std::pair< TypeT, TaxonID > last_lookup; //constant lookup time
// typename std::map< TypeT, TaxonID > cache; //logarithmic lookup time (in memory)
// typename std::queue< typename std::map< TypeT, TaxonID >::iterator > history;
// };



Expand All @@ -135,7 +134,7 @@ class AccessIDConverterFlatfileMemory : public AccessIDConverter< TypeT > {
TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range )*/ {
typename std::map< TypeT, TaxonID >::iterator it = accessidconv.find( acc );
if( it == accessidconv.end() ) {
std::cerr << "sequence accession key \"" << acc << "\" not found" << std::endl;
//std::cerr << "sequence accession key \"" << acc << "\" not found" << std::endl;
throw std::out_of_range( boost::lexical_cast<std::string>( acc ) );
}
return it->second;
Expand Down Expand Up @@ -187,11 +186,11 @@ AccessIDConverter< TypeT >* loadAccessIDConverterFromFile( const std::string& fi
}

std::cerr << "loading accession to taxonomic id converter file...";
if( filename.substr( filename.size() - 9 ) == ".sqlitedb" ) {
accidconv = new AccessIDConverterSQLiteCache< TypeT >( filename, cachesize );
} else {
// if( filename.substr( filename.size() - 9 ) == ".sqlitedb" ) {
// accidconv = new AccessIDConverterSQLiteCache< TypeT >( filename, cachesize );
// } else {
accidconv = new AccessIDConverterFlatfileMemory< TypeT >( filename );
}
// }
std::cerr << " done" << std::endl;
return accidconv;
}
Expand All @@ -200,9 +199,9 @@ AccessIDConverter< TypeT >* loadAccessIDConverterFromFile( const std::string& fi
// converts general string sequence identifier to taxonomic id
typedef AccessIDConverter< std::string > StrIDConverter;

typedef AccessIDConverterSQLite< std::string > StrIDConverterSQLite;
// typedef AccessIDConverterSQLite< std::string > StrIDConverterSQLite;

typedef AccessIDConverterSQLiteCache< std::string > StrIDConverterSQLiteCache;
// typedef AccessIDConverterSQLiteCache< std::string > StrIDConverterSQLiteCache;

typedef AccessIDConverterFlatfileMemory< std::string > StrIDConverterFlatfileMemory;

Expand Down
16 changes: 14 additions & 2 deletions src/alignmentsfilter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -824,10 +824,22 @@ class TaxonMaskingFilter : public AlignmentsFilter< ContainerT > {
void filter( ContainerT& recordset ) {
if( ! recordset.empty() ) {
typename ContainerT::iterator record_it = recordset.begin();
const TaxonID qtax = staxon_[ (*record_it)->getQueryIdentifier() ];
TaxonID qtax;
try {
qtax = staxon_[ (*record_it)->getQueryIdentifier() ];
} catch ( std::out_of_range ) {
std::cerr << "No mapping for query identifier \"" << (*record_it)->getQueryIdentifier() << "\", masking all alignments..." << std::endl;
while( record_it != recordset.end() ) (*record_it++)->filterOut();
return;
}
while( record_it != recordset.end() ) {
if( qtax == rtaxon_[ (*record_it)->getReferenceIdentifier() ] ) {
try {
if( qtax == rtaxon_[ (*record_it)->getReferenceIdentifier() ] ) {
(*record_it)->filterOut();
}
} catch ( std::out_of_range ) {
(*record_it)->filterOut();
std::cerr << "No mapping for reference identifier \"" << (*record_it)->getReferenceIdentifier() << "\", masking alignment..." << std::endl;
}
++record_it;
}
Expand Down
1 change: 0 additions & 1 deletion src/ncbidata.hh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#define ncbidata_hh_

#include "taxontree.hh"
#include "sqlite3pp.hh"
#include "utils.hh"


Expand Down
Loading

0 comments on commit 1d1a7e6

Please sign in to comment.