changes correspond to 1.0rc1 in beta and article submission

fungs · Aug 19, 2013 · 1d1a7e6 · 1d1a7e6
1 parent f26e1e9
commit 1d1a7e6
Show file tree

Hide file tree

Showing 13 changed files with 292 additions and 237 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,7 +8,7 @@ set( project_name taxator-tk )
 
 # cmake requirements detection
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake-modules)
-find_package( Sqlite3 )
+# find_package( Sqlite3 )
 find_package( Boost "${boost_version_required}" REQUIRED COMPONENTS program_options filesystem )
 if( Boost_FOUND )
 	if( Boost_VERSION LESS "${boost_version_required_enc}" )
@@ -22,11 +22,11 @@ else()
   message( FATAL_ERROR "Unable to find a Boost version. Did you set BOOST_ROOT?" )
 endif()
 include_directories( "includes-external" )
-set(CMAKE_CXX_FLAGS "-Wall -pedantic -Wno-long-long -Wno-variadic-macros -fpermissive") #-g for debuggin, -m32 for x32
+set(CMAKE_CXX_FLAGS "-Wall -pedantic -Wno-long-long -Wno-variadic-macros -fpermissive -O3") #-g for debuggin, -m32 for x32
 
 # unittest: constructs the taxonomy from NCBI dump files and tests the structure thoroughly
-add_executable( unittest_ncbitaxonomy unittest_ncbitaxonomy.cpp src/ncbidata.cpp src/accessconv.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/sqlite3pp.cpp )
-target_link_libraries( unittest_ncbitaxonomy boost_system sqlite3 boost_filesystem )
+add_executable( unittest_ncbitaxonomy unittest_ncbitaxonomy.cpp src/ncbidata.cpp src/accessconv.cpp src/taxontree.cpp src/taxonomyinterface.cpp )
+target_link_libraries( unittest_ncbitaxonomy boost_system boost_filesystem )
 
 # traverse nodes up to a given rank
 add_executable( rank-filter rank-filter.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp )
@@ -37,20 +37,20 @@ add_executable( name-filter name-filter.cpp src/taxontree.cpp src/taxonomyinterf
 target_link_libraries( name-filter boost_program_options boost_system boost_filesystem )
 
 # general converter for string or id mappings
-add_executable( acc2taxid acc2taxid.cpp src/accessconv.cpp src/sqlite3pp )
-target_link_libraries( acc2taxid sqlite3 boost_program_options boost_system boost_filesystem )
+add_executable( acc2taxid acc2taxid.cpp src/accessconv.cpp )
+target_link_libraries( acc2taxid boost_program_options boost_system boost_filesystem )
 
 # parses NCBI-style identifiers in the FASTA comment header
 add_executable( extract-fastacomment-ncbifield extract-fastacomment-ncbifield.cpp src/ncbidata.cpp src/taxontree.cpp )
 target_link_libraries( extract-fastacomment-ncbifield boost_program_options boost_system boost_filesystem )
 
 # apply filtering to alignments file
-add_executable( alignments-filter alignments-filter.cpp src/alignmentrecord.cpp src/accessconv.cpp src/sqlite3pp.cpp )
-target_link_libraries( alignments-filter boost_program_options boost_system boost_filesystem sqlite3 )
+add_executable( alignments-filter alignments-filter.cpp src/alignmentrecord.cpp src/accessconv.cpp )
+target_link_libraries( alignments-filter boost_program_options boost_system boost_filesystem )
 
 # takes input alignments and predicts a taxon for each query id using various methods and parameters
-add_executable( taxator taxator.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp src/accessconv.cpp src/sqlite3pp.cpp src/taxonpredictionmodel.cpp src/predictionrecord.cpp )
-target_link_libraries( taxator sqlite3 boost_program_options boost_system boost_filesystem boost_thread )
+add_executable( taxator taxator.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp src/accessconv.cpp src/taxonpredictionmodel.cpp src/predictionrecord.cpp )
+target_link_libraries( taxator boost_program_options boost_system boost_filesystem boost_thread )
 
 # apply filtering to predictions file
 add_executable( binner binner.cpp src/taxontree.cpp src/taxonomyinterface.cpp src/ncbidata.cpp src/predictionrecord.cpp )

diff --git a/README b/README
diff --git a/alignments-filter.cpp b/alignments-filter.cpp
@@ -83,7 +83,7 @@ int main( int argc, char** argv ) {
 
 	float minscore, toppercent, minpid;
 	double maxevalue;
-	unsigned int numbestbitscore, minsupport;
+	unsigned int numbestscore, minsupport;
 
 	std::string tax_map1_filename, tax_map2_filename;
 
@@ -95,8 +95,8 @@ int main( int argc, char** argv ) {
 		( "min-pid,p", po::value< float >( &minpid )->default_value( 0.0 ), "set minimal PID to consider" )
 		( "top-percent,t", po::value< float >( &toppercent )->default_value( 1.0 ), "set top-percent filter value" )
 		( "max-evalue,e", po::value< double >( &maxevalue )->default_value( -1.0 ), "set maximum evalue for filtering" )
-		( "best-alignments,b", po::value< unsigned int >( &numbestbitscore )->default_value( 0 ), "set number of top bitscore alignments to consider (after toppercent filter)" )
-		( "sort-bitscore,s", "sort alignments by decreasing bitscore" )
+		( "best-alignments,b", po::value< unsigned int >( &numbestscore )->default_value( 0 ), "set number of top score alignments to consider (after toppercent filter)" )
+		( "sort-score,s", "sort alignments by decreasing score" )
 		( "keep-best-per-ref,k", "for each combination of query and reference sequence id all but the best scoring alignment are removed" )
 		( "min-support,c", po::value< unsigned int >( &minsupport )->default_value( 1 ), "set minimum number of hits an alignment needs to have (after filtering)" )
 		( "remove-ref-from-query-taxon,r", "remove alignments for labeled data to test different degrees of taxonomic distance" )
@@ -113,7 +113,7 @@ int main( int argc, char** argv ) {
 		return EXIT_SUCCESS;
 	}
 
-	bool sort_by_bitscore = vm.count( "sort-bitscore" );
+	bool sort_by_score = vm.count( "sort-score" );
 	bool keep_best_per_gi = vm.count( "keep-best-per-ref" );
 	bool mask_by_star = vm.count( "mask-by-star" );
 	bool remove_same_taxon = vm.count( "remove-ref-from-query-taxon" );
@@ -139,7 +139,7 @@ int main( int argc, char** argv ) {
 	if( keep_best_per_gi ) {
 		filters.push_back( new BestScorePerReferenceSeqIDFilter< RecordSetType >() );
 	}
-	if( sort_by_bitscore ) {
+	if( sort_by_score ) {
 		filters.push_back( new SortByBitscoreFilter< RecordSetType >() );
 	}
 	if( minpid > 0.0 ) {
@@ -152,8 +152,8 @@ int main( int argc, char** argv ) {
 			filters.push_back( new MinScoreTopPercentFilter< RecordSetType >( minscore, toppercent ) );
 		}
 	}
-	if( numbestbitscore ) {
-		filters.push_back( new NumBestBitscoreFilter< RecordSetType >( numbestbitscore ) );
+	if( numbestscore ) {
+		filters.push_back( new NumBestBitscoreFilter< RecordSetType >( numbestscore ) );
 	}
 	if( minsupport ) {
 	  filters.push_back( new MinSupportFilter< RecordSetType >( minsupport ) );

diff --git a/binner.cpp b/binner.cpp
@@ -56,7 +56,6 @@ int main ( int argc, char** argv ) {
 	po::options_description visible_options ( "Allowed options" );
 	visible_options.add_options()
 	( "help,h", "show help message" )
-	( "sample-min-support,m", po::value< std::string >( &min_support_in_sample_str )->default_value( "0.01" ), "minimum support in positions (>=1) or fraction of total support (<1) for any taxon" )
 	( "sequence-min-support,s", po::value< medium_unsigned_int >( &min_support_per_sequence )->default_value( 50 ), "minimum number of positions supporting a taxonomic signal for any single sequence. If not reached, a fall-back on a more robust algorthm will be used" )
 	( "signal-majority,j", po::value< float >( &signal_majority_per_sequence )->default_value( .7 ), "minimum combined fraction of support for any single sequence (> 0.5 to be stable)" )
 	( "identity-constrain,i", po::value< vector< string > >(), "minimum required identity for this rank (e.g. -i species:0.8 -i genus:0.7)")
@@ -66,6 +65,7 @@ int main ( int argc, char** argv ) {
 
 	po::options_description hidden_options("Hidden options");
 	hidden_options.add_options()
+	( "sample-min-support,m", po::value< std::string >( &min_support_in_sample_str )->default_value( "0" ), "minimum support in positions (>=1) or fraction of total support (<1) for any taxon" )
 	( "preallocate-num-queries", po::value< boost::ptr_vector< boost::ptr_list< PredictionRecordBinning > >::size_type >( & num_queries_preallocation )->default_value( 5000 ), "advanced parameter for better memory allocation, set to number of query sequences or similar (no need to be set)" )
 	( "delete-notranks,d", po::value< bool >( &delete_unmarked )->default_value( true ), "delete all nodes that don't have any of the given ranks (make sure that input taxons are at those ranks)" );
 
@@ -315,12 +315,14 @@ int main ( int argc, char** argv ) {
 			map< const string*, float >::const_iterator find_it;
 			const TaxonNode* predict_node = root_node;
 			const TaxonNode* target_node = prec->getUpperNode();
+			const float rank_pid = prec->getSupportAt( target_node )/seqlen;
 			Taxonomy::CPathDownIterator pit = taxinter.traverseDown<Taxonomy::CPathDownIterator>( target_node );
 			do {
 				pit++;
 				find_it = pid_per_rank.find( &(pit->data->annotation->rank) );
 				if ( find_it != pid_per_rank.end() ) min_pid = max( min_pid, find_it->second );
-				if ( prec->getSupportAt( &*pit )/seqlen < min_pid ) break;
+				binning_debug_output << "constraint ctrl: " << rank_pid << " >= " << min_pid << " ?" << endl;
+				if ( rank_pid < min_pid ) break;
 				predict_node = &*pit;
 			} while ( pit != target_node );
 			std::cout << prec->getQueryIdentifier() << tab << predict_node->data->taxid << endline;

diff --git a/releasefiles.txt b/releasefiles.txt
@@ -5,19 +5,16 @@
 build.sh
 Changelog.txt
 CMakeLists.txt
-cmake-modules/FindSqlite3.cmake
+#cmake-modules/FindSqlite3.cmake
 gpl-3.0.txt
 README
 
 # taxator-tk executables
 acc2taxid.cpp
-alignments-prefilter.cpp
-alignments-prefilter-training.cpp
+alignments-filter.cpp
 binner.cpp
 extract-fastacomment-ncbifield.cpp
 name-filter.cpp
-prediction2distances.cpp
-prediction-error.cpp
 rank-filter.cpp
 taxator.cpp
 unittest_ncbitaxonomy.cpp
@@ -29,6 +26,7 @@ src/alignmentrecord.cpp
 src/alignmentrecord.hh
 src/alignmentsfilter.hh
 src/boundedbuffer.hh
+src/concurrentoutstream.hh
 src/constants.hh
 src/fastnodemap.hh
 src/losses.hh

diff --git a/src/accessconv.hh b/src/accessconv.hh
@@ -30,7 +30,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include <boost/format.hpp>
 #include <boost/lexical_cast.hpp>
 #include <boost/filesystem.hpp>
-#include "sqlite3pp.hh"
 #include "constants.hh"
 #include "types.hh"
 #include "utils.hh"
@@ -47,81 +46,81 @@ class AccessIDConverter {
 
 
 
-template< typename TypeT >
-class AccessIDConverterSQLite : public AccessIDConverter< TypeT > {
-	public:
-		AccessIDConverterSQLite( const std::string& database_filename ) {
-      db = new sqlite3pp::database( database_filename.c_str() );
-      initialization_error = db->error_code();
-    };
-
-		~AccessIDConverterSQLite() {
-		  delete db;
-    };
-
-		TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range)*/ {
-      const std::string sql_statement = boost::str( boost::format("select ncbi_taxon_id from ncbi_gitaxid_mapping where ncbi_gi = %1%" ) % acc );
-      sqlite3pp::query qry( *db, sql_statement.c_str() );
-      sqlite3pp::query::iterator q_it = qry.begin();
-      if( q_it == qry.end() ) {
-        throw std::out_of_range( sql_statement );
-      }
-
-      return q_it->get< int >( 0 );
-    };
-
-		bool fail( bool print_message = false ) {
-		  bool code = db->error_code();
-      if( code && print_message ) {
-        std::cerr << db->error_msg() << std::endl;
-        return true;
-      }
-      return false;
-    };
-
- 	private:
-		sqlite3pp::database* db;
-		bool initialization_error;
-};
-
-
-
-template< typename TypeT >
-class AccessIDConverterSQLiteCache : public AccessIDConverterSQLite< TypeT > {
-	public:
-		AccessIDConverterSQLiteCache( const std::string& database_filename, unsigned int cachesize = 0 ) : AccessIDConverterSQLite< TypeT >( database_filename ), last_lookup( std::make_pair(TypeT(),0) ) {}; //TODO: initialize differently
-
-		TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range )*/ {
-      // check for repeated lookup
-      if( acc == last_lookup.first ) {
-        return last_lookup.second;
-      }
-
-      // search in cache
-      typename std::map< TypeT, TaxonID >::iterator cache_it = cache.find( acc );
-      if( cache_it != cache.end() ) { //if found in cache
-        return cache_it->second;
-      }
-
-      // fall back on subclass operator[]
-      TaxonID taxid = AccessIDConverterSQLite< TypeT >::operator[]( acc ); //can throw exception if not found
-
-      // update cache
-      if( max_cache_size && history.size() >= max_cache_size ) {
-        cache.erase( history.back() );
-        history.pop();
-      }
-
-      history.push( cache.insert( std::make_pair( acc, taxid ) ).first );
-      return taxid;
-		}
-
-	private:
-		unsigned int max_cache_size;
-		typename std::pair< TypeT, TaxonID > last_lookup; //constant lookup time
-		typename std::map< TypeT, TaxonID > cache; //logarithmic lookup time (in memory)
-		typename std::queue< typename std::map< TypeT, TaxonID >::iterator > history;
-};
+// template< typename TypeT >
+// class AccessIDConverterSQLite : public AccessIDConverter< TypeT > {
+// 	public:
+// 		AccessIDConverterSQLite( const std::string& database_filename ) {
+//       db = new sqlite3pp::database( database_filename.c_str() );
+//       initialization_error = db->error_code();
+//     };
+// 
+// 		~AccessIDConverterSQLite() {
+// 		  delete db;
+//     };
+// 
+// 		TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range)*/ {
+//       const std::string sql_statement = boost::str( boost::format("select ncbi_taxon_id from ncbi_gitaxid_mapping where ncbi_gi = %1%" ) % acc );
+//       sqlite3pp::query qry( *db, sql_statement.c_str() );
+//       sqlite3pp::query::iterator q_it = qry.begin();
+//       if( q_it == qry.end() ) {
+//         throw std::out_of_range( sql_statement );
+//       }
+// 
+//       return q_it->get< int >( 0 );
+//     };
+// 
+// 		bool fail( bool print_message = false ) {
+// 		  bool code = db->error_code();
+//       if( code && print_message ) {
+//         std::cerr << db->error_msg() << std::endl;
+//         return true;
+//       }
+//       return false;
+//     };
+// 		
+//  	private:
+// 		sqlite3pp::database* db;
+// 		bool initialization_error;
+// };
+
+
+
+// template< typename TypeT >
+// class AccessIDConverterSQLiteCache : public AccessIDConverterSQLite< TypeT > {
+// 	public:
+// 		AccessIDConverterSQLiteCache( const std::string& database_filename, unsigned int cachesize = 0 ) : AccessIDConverterSQLite< TypeT >( database_filename ), last_lookup( std::make_pair(TypeT(),0) ) {}; //TODO: initialize differently
+// 
+// 		TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range )*/ {
+//       // check for repeated lookup
+//       if( acc == last_lookup.first ) {
+//         return last_lookup.second;
+//       }
+// 
+//       // search in cache
+//       typename std::map< TypeT, TaxonID >::iterator cache_it = cache.find( acc );
+//       if( cache_it != cache.end() ) { //if found in cache
+//         return cache_it->second;
+//       }
+// 
+//       // fall back on subclass operator[]
+//       TaxonID taxid = AccessIDConverterSQLite< TypeT >::operator[]( acc ); //can throw exception if not found
+// 
+//       // update cache
+//       if( max_cache_size && history.size() >= max_cache_size ) {
+//         cache.erase( history.back() );
+//         history.pop();
+//       }
+// 
+//       history.push( cache.insert( std::make_pair( acc, taxid ) ).first );
+//       return taxid;
+// 		}
+// 		
+// 	private:
+// 		unsigned int max_cache_size;
+// 		typename std::pair< TypeT, TaxonID > last_lookup; //constant lookup time
+// 		typename std::map< TypeT, TaxonID > cache; //logarithmic lookup time (in memory)
+// 		typename std::queue< typename std::map< TypeT, TaxonID >::iterator > history;
+// };
 
 
 
@@ -135,7 +134,7 @@ class AccessIDConverterFlatfileMemory : public AccessIDConverter< TypeT > {
 		TaxonID operator[]( const TypeT& acc ) /*throw( std::out_of_range )*/ {
       typename std::map< TypeT, TaxonID >::iterator it = accessidconv.find( acc );
       if( it == accessidconv.end() ) {
-				std::cerr << "sequence accession key \"" << acc << "\" not found" << std::endl;
+				//std::cerr << "sequence accession key \"" << acc << "\" not found" << std::endl;
         throw std::out_of_range( boost::lexical_cast<std::string>( acc ) );
       }
       return it->second;
@@ -187,11 +186,11 @@ AccessIDConverter< TypeT >* loadAccessIDConverterFromFile( const std::string& fi
 	}
 
 	std::cerr << "loading accession to taxonomic id converter file...";
-  if( filename.substr( filename.size() - 9 ) == ".sqlitedb" ) {
-    accidconv = new AccessIDConverterSQLiteCache< TypeT >( filename, cachesize );
-  } else {
+//   if( filename.substr( filename.size() - 9 ) == ".sqlitedb" ) {
+//     accidconv = new AccessIDConverterSQLiteCache< TypeT >( filename, cachesize );
+//   } else {
     accidconv = new AccessIDConverterFlatfileMemory< TypeT >( filename );
-  }
+//   }
   std::cerr << " done" << std::endl;
   return accidconv;
 }
@@ -200,9 +199,9 @@ AccessIDConverter< TypeT >* loadAccessIDConverterFromFile( const std::string& fi
 // converts general string sequence identifier to taxonomic id
 typedef AccessIDConverter< std::string > StrIDConverter;
 
-typedef AccessIDConverterSQLite< std::string > StrIDConverterSQLite;
+// typedef AccessIDConverterSQLite< std::string > StrIDConverterSQLite;
 
-typedef AccessIDConverterSQLiteCache< std::string > StrIDConverterSQLiteCache;
+// typedef AccessIDConverterSQLiteCache< std::string > StrIDConverterSQLiteCache;
 
 typedef AccessIDConverterFlatfileMemory< std::string > StrIDConverterFlatfileMemory;
 

diff --git a/src/alignmentsfilter.hh b/src/alignmentsfilter.hh
@@ -824,10 +824,22 @@ class TaxonMaskingFilter : public AlignmentsFilter< ContainerT > {
 		void filter( ContainerT& recordset ) {
 			if( ! recordset.empty() ) {
 				typename ContainerT::iterator record_it = recordset.begin();
-        const TaxonID qtax = staxon_[ (*record_it)->getQueryIdentifier() ];
+				TaxonID qtax;
+				try {
+					qtax = staxon_[ (*record_it)->getQueryIdentifier() ];
+				} catch ( std::out_of_range ) {
+					std::cerr << "No mapping for query identifier \"" << (*record_it)->getQueryIdentifier() << "\", masking all alignments..." << std::endl;
+					while( record_it != recordset.end() ) (*record_it++)->filterOut();
+					return;
+				}
 				while( record_it != recordset.end() ) {
-					if( qtax == rtaxon_[ (*record_it)->getReferenceIdentifier() ] ) {
+					try {
+						if( qtax == rtaxon_[ (*record_it)->getReferenceIdentifier() ] ) {
+							(*record_it)->filterOut();
+						}
+					} catch ( std::out_of_range ) {
 						(*record_it)->filterOut();
+						std::cerr << "No mapping for reference identifier \"" << (*record_it)->getReferenceIdentifier() << "\", masking alignment..." << std::endl;
 					}
 					++record_it;
 				}

diff --git a/src/ncbidata.hh b/src/ncbidata.hh
@@ -22,7 +22,6 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #define ncbidata_hh_
 
 #include "taxontree.hh"
-#include "sqlite3pp.hh"
 #include "utils.hh"