diff --git a/.clang-format b/.clang-format index c5b2882..dbb9719 100644 --- a/.clang-format +++ b/.clang-format @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: CC0-1.0 -# Format all files in include/test folder, including std module, excluding contrib module -# find . \( -iname "*.cpp" -or -iname "*.hpp" \) -and -not -path "./lib/*" -and -not -path "./build/*" | xargs clang-format-18 --style=file -i +# Format all files in include/test folder +# find . \( -iname "*.cpp" -or -iname "*.hpp" \) -and -not -path "./build/*" | xargs clang-format-18 --style=file -i # Staged files: git diff --name-only HEAD --diff-filter=ACMRT | grep -E "(\.cpp|\.hpp)$" | xargs clang-format-18 --style=file -i --- Language: Cpp diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 96038c5..8a29c3d 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -2,18 +2,20 @@ # SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik # SPDX-License-Identifier: CC0-1.0 -# find . \( -iname CMakeLists.txt -o -iname *.cmake \) -a -not -path "./lib/*" -a -not -path "./build/*" | xargs cmake-format -c .cmake-format.yaml -i +# find . 
\( -iname CMakeLists.txt -o -iname *.cmake \) -a -not -path "./build/*" | xargs cmake-format -c .cmake-format.yaml -i _help_parse: Options affecting listfile parsing parse: _help_additional_commands: - Specify structure for custom cmake functions additional_commands: - declare_internal_datasource: + declare_datasource: + pargs: + nargs: '*' + flags: [] kwargs: - FILE: '*' + FILE: 1 URL: '*' - URL_HASH: '*' - CONFIGURE: '*' + URL_HASH: 1 cpmgetpackage: pargs: 1 spelling: CPMGetPackage diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake index baf2d8c..56cac81 100644 --- a/cmake/CPM.cmake +++ b/cmake/CPM.cmake @@ -2,23 +2,22 @@ # # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors -set(CPM_DOWNLOAD_VERSION 0.40.2) -set(CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") +set (CPM_DOWNLOAD_VERSION 0.40.2) +set (CPM_HASH_SUM "c8cdc32c03816538ce22781ed72964dc864b2a34a310d3b7104812a5ca2d835d") -if(CPM_SOURCE_CACHE) - set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") -elseif(DEFINED ENV{CPM_SOURCE_CACHE}) - set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") -else() - set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") -endif() +if (CPM_SOURCE_CACHE) + set (CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif (DEFINED ENV{CPM_SOURCE_CACHE}) + set (CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else () + set (CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif () # Expand relative path. 
This is important if the provided path contains a tilde (~) -get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) +get_filename_component (CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) -file(DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake - ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} +file (DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} ) -include(${CPM_DOWNLOAD_LOCATION}) +include (${CPM_DOWNLOAD_LOCATION}) diff --git a/cmake/test/config.cmake b/cmake/test/config.cmake index 90652fe..31ea3c0 100644 --- a/cmake/test/config.cmake +++ b/cmake/test/config.cmake @@ -19,7 +19,6 @@ if (NOT TARGET ${PROJECT_NAME}_test) add_library (${PROJECT_NAME}_test INTERFACE) target_compile_options (${PROJECT_NAME}_lib PUBLIC "-pedantic" "-Wall" "-Wextra" "-Werror") - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # Disable bogus warnings in GCC12. if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12 AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13) @@ -36,8 +35,8 @@ if (NOT TARGET ${PROJECT_NAME}_test) # !Workaround: Get seqan3 test include dir from seqan3 target find_path (SEQAN3_TEST_INCLUDE_DIR - NAMES seqan3/test/tmp_directory.hpp - HINTS "${seqan3_SOURCE_DIR}/test/include" + NAMES seqan3/test/tmp_directory.hpp + HINTS "${seqan3_SOURCE_DIR}/test/include" ) target_include_directories (${PROJECT_NAME}_test SYSTEM INTERFACE "${SEQAN3_TEST_INCLUDE_DIR}") diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 6580539..fec0d08 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -19,7 +19,8 @@ if (${DOXYGEN_FOUND}) COMMAND ${DOXYGEN_EXECUTABLE} WORKING_DIRECTORY ${APP_TEMPLATE_DOXYGEN_OUTPUT_DIR} COMMENT "Generating (developer) API documentation with Doxygen." 
- VERBATIM) + VERBATIM + ) message (STATUS "${FontBold}You can run `make doc` to build api documentation.${FontReset}") else () message (STATUS "Doxygen not found.") diff --git a/include/estimate.h b/include/estimate.hpp similarity index 98% rename from include/estimate.h rename to include/estimate.hpp index 8f2dc72..cee6977 100644 --- a/include/estimate.h +++ b/include/estimate.hpp @@ -9,7 +9,7 @@ #include -#include "shared.h" +#include "shared.hpp" /*!\brief The arguments necessary for a search. * \param std::filesystem::path search_file The sequence file containing the transcripts to be searched for. diff --git a/include/ibf.h b/include/ibf.hpp similarity index 76% rename from include/ibf.h rename to include/ibf.hpp index 88972c7..911c776 100644 --- a/include/ibf.h +++ b/include/ibf.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -14,30 +15,32 @@ #include #include -#include -#include "shared.h" +#include "shared.hpp" struct minimiser_arguments { - std::filesystem::path include_file; // Needs to be defined when only minimisers appearing in this file should be stored - std::filesystem::path exclude_file; // Needs to be defined when minimisers appearing in this file should NOT be stored - std::vector samples{}; // Can be used to indicate that sequence files belong to the same experiment - bool paired = false; // If true, than experiments are seen as paired-end experiments + std::filesystem::path + include_file; // Needs to be defined when only minimisers appearing in this file should be stored + std::filesystem::path + exclude_file; // Needs to be defined when minimisers appearing in this file should NOT be stored + std::vector samples{}; // Can be used to indicate that sequence files belong to the same experiment + bool paired = false; // If true, than experiments are seen as paired-end experiments bool experiment_names = false; // Flag, if names of experiment should be stored in a txt file bool ram_friendly = false; }; //!\brief Generates 
a random integer not greater than a given maximum -struct RandomGenerator { - int maxi; - RandomGenerator(int max) : - maxi(max) { - } - - int operator()() { - return rand() % maxi; - } +struct RandomGenerator +{ + int maxi; + RandomGenerator(int max) : maxi(max) + {} + + int operator()() + { + return rand() % maxi; + } }; /*!\brief Get the concrete expression values (= median of all counts of one transcript) for given experiments. @@ -48,8 +51,11 @@ struct RandomGenerator { * \param genome_file A "*.genome" file constructed with the command genome. * \param paired Flag to indicate if input data is paired or not. */ -void count(min_arguments const & args, std::vector sequence_files, std::filesystem::path include_file, - std::filesystem::path genome_file, bool paired); +void count(min_arguments const & args, + std::vector sequence_files, + std::filesystem::path include_file, + std::filesystem::path genome_file, + bool paired); /*!\brief Creates a set of minimizers to ignore, which should be used as an input to count. * \param args The minimiser arguments to use (seed, shape, window size). @@ -71,7 +77,10 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map< * \param num_of_minimisers Variable, where to number of minimisers should be stored. * \param cutoff cutoff value. */ -void read_binary_start(min_arguments & args, std::filesystem::path filename, uint64_t & num_of_minimisers, uint8_t & cutoff); +void read_binary_start(min_arguments & args, + std::filesystem::path filename, + uint64_t & num_of_minimisers, + uint8_t & cutoff); /*! \brief Creates IBFs. * \param sequence_files A vector of sequence file paths. @@ -85,8 +94,11 @@ void read_binary_start(min_arguments & args, std::filesystem::path filename, uin * \param num_hash The number of hash functions to use. * \returns The expression thresholds per experiment. 
*/ -std::vector ibf(std::vector const & sequence_files, estimate_ibf_arguments & ibf_args, - minimiser_arguments & minimiser_args, std::vector & fpr, std::vector & cutoffs, +std::vector ibf(std::vector const & sequence_files, + estimate_ibf_arguments & ibf_args, + minimiser_arguments & minimiser_args, + std::vector & fpr, + std::vector & cutoffs, std::filesystem::path const expression_by_genome_file = "", size_t num_hash = 1); @@ -101,7 +113,8 @@ std::vector ibf(std::vector const & sequence_fi * \returns The expression thresholds per experiment. */ std::vector ibf(std::vector const & minimiser_files, - estimate_ibf_arguments & ibf_args, std::vector & fpr, + estimate_ibf_arguments & ibf_args, + std::vector & fpr, std::filesystem::path const expression_by_genome_file = "", size_t num_hash = 1); @@ -111,8 +124,10 @@ std::vector ibf(std::vector const & minimiser_f * \param minimiser_args The minimiser specific arguments to use. * \param cutoffs List of cutoffs. */ -void minimiser(std::vector const & sequence_files, min_arguments const & args, - minimiser_arguments & minimiser_args, std::vector & cutoffs); +void minimiser(std::vector const & sequence_files, + min_arguments const & args, + minimiser_arguments & minimiser_args, + std::vector & cutoffs); /*! \brief Insert into IBFs. * \param sequence_files A vector of sequence file paths. @@ -127,9 +142,12 @@ void minimiser(std::vector const & sequence_files, min_ar * \returns The expression thresholds per experiment. */ std::vector insert(std::vector const & sequence_files, - estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args, + estimate_ibf_arguments & ibf_args, + minimiser_arguments & minimiser_args, std::vector & cutoffs, - std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise); + std::filesystem::path const expression_by_genome_file, + std::filesystem::path path_in, + bool samplewise); /*! 
\brief Insert into IBFs based on the minimiser files * \param minimiser_files A vector of minimiser file paths. @@ -143,7 +161,9 @@ std::vector insert(std::vector const & sequence */ std::vector insert(std::vector const & minimiser_files, estimate_ibf_arguments & ibf_args, - std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise); + std::filesystem::path const expression_by_genome_file, + std::filesystem::path path_in, + bool samplewise); /*! \brief Delete bins from ibfs * \param delete_files A vector of integers specifiying the bins to delete. @@ -152,4 +172,7 @@ std::vector insert(std::vector const & minimise * \param path_in Input directory. * \param samplewise True, if expression levels were set beforehand. */ -void delete_bin(std::vector const & delete_files, estimate_ibf_arguments & ibf_args, std::filesystem::path path_in, bool samplewise); +void delete_bin(std::vector const & delete_files, + estimate_ibf_arguments & ibf_args, + std::filesystem::path path_in, + bool samplewise); diff --git a/include/shared.h b/include/shared.hpp similarity index 89% rename from include/shared.h rename to include/shared.hpp index 5b9c535..53eee22 100644 --- a/include/shared.h +++ b/include/shared.hpp @@ -15,7 +15,8 @@ #include #include -inline constexpr static uint64_t adjust_seed(uint8_t const kmer_size, uint64_t const seed = 0x8F3F73B5CF1C9ADEULL) noexcept +static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size, + uint64_t const seed = 0x8F'3F'73'B5'CF'1C'9A'DEULL) noexcept { return seed >> (64u - 2u * kmer_size); } @@ -31,7 +32,7 @@ struct all_arguments struct min_arguments : all_arguments { uint8_t k{20}; - seqan3::seed s{0x8F3F73B5CF1C9ADEULL}; + seqan3::seed s{0x8F'3F'73'B5'CF'1C'9A'DEULL}; seqan3::shape shape = seqan3::ungapped{k}; seqan3::window_size w_size{60}; }; @@ -41,10 +42,10 @@ struct estimate_ibf_arguments : min_arguments { bool compressed = false; std::vector expression_thresholds{}; // Expression 
levels which should be created - uint8_t number_expression_thresholds{}; // If set, the expression levels are determined by the program. + uint8_t number_expression_thresholds{}; // If set, the expression levels are determined by the program. bool samplewise{false}; - template + template void save(Archive & archive) const { archive(k); @@ -57,7 +58,7 @@ struct estimate_ibf_arguments : min_arguments archive(samplewise); } - template + template void load(Archive & archive) { archive(k); @@ -119,8 +120,7 @@ void load_ibf(IBFType & ibf, std::filesystem::path ipath) * \param opath Path, where the IBF should be stored. */ template -void store_ibf(IBFType const & ibf, - std::filesystem::path opath) +void store_ibf(IBFType const & ibf, std::filesystem::path opath) { std::ofstream os{opath, std::ios::binary}; cereal::BinaryOutputArchive oarchive{os}; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e001b75..517f7ca 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,10 +1,10 @@ cmake_minimum_required (VERSION 3.25) -find_package(OpenMP REQUIRED COMPONENTS CXX) -add_library ("${PROJECT_NAME}_lib" STATIC ibf.cpp estimate.cpp) +find_package (OpenMP REQUIRED COMPONENTS CXX) +add_library ("${PROJECT_NAME}_lib" STATIC estimate.cpp ibf.cpp) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC seqan3::seqan3) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC robin_hood::robin_hood) -target_link_libraries("${PROJECT_NAME}_lib" PUBLIC OpenMP::OpenMP_CXX) +target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC OpenMP::OpenMP_CXX) target_include_directories ("${PROJECT_NAME}_lib" PUBLIC ../include) add_executable ("${PROJECT_NAME}" main.cpp) diff --git a/src/estimate.cpp b/src/estimate.cpp index 1680bbf..ff93f38 100644 --- a/src/estimate.cpp +++ b/src/estimate.cpp @@ -5,20 +5,19 @@ // shipped with this file and also available at: https://github.com/seqan/needle/blob/master/LICENSE.md // 
----------------------------------------------------------------------------------------------------- +#include #include #include #include #include #include +#include #include #include #include -#include - -#include #if SEQAN3_WITH_CEREAL -#include +# include #endif // SEQAN3_WITH_CEREAL #include @@ -26,13 +25,19 @@ #include #include -#include "estimate.h" +#include "estimate.hpp" // Actual estimation template -void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector & estimations_i, - seqan3::dna4_vector const seq, std::vector & prev_counts, - exp_t const & expressions, uint16_t const k, std::vector const fprs, std::vector & deleted) +void check_ibf(min_arguments const & args, + IBFType const & ibf, + std::vector & estimations_i, + seqan3::dna4_vector const seq, + std::vector & prev_counts, + exp_t const & expressions, + uint16_t const k, + std::vector const fprs, + std::vector & deleted) { // Check, if one expression threshold for all or individual thresholds static constexpr bool multiple_expressions = std::same_as>>; @@ -44,28 +49,31 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector()); + std::transform(counter.begin(), + counter.end(), + agent.bulk_contains(minHash).begin(), + counter.begin(), + std::plus()); ++minimiser_length; } // Defines, where the median should be - float minimiser_pos = minimiser_length/2.0; + float minimiser_pos = minimiser_length / 2.0; // Check every experiment by going over the number of bins in the ibf. 
- for(size_t j = 0; j < counter.size(); j++) + for (size_t j = 0; j < counter.size(); j++) { if (std::find(deleted.begin(), deleted.end(), j) != deleted.end()) continue; // Correction by substracting the expected number of false positives - counter[j] = std::max((double) 0.0, (double) ((counter[j]-(minimiser_length*fprs[j]))/(1.0-fprs[j]))); + counter[j] = std::max((double)0.0, (double)((counter[j] - (minimiser_length * fprs[j])) / (1.0 - fprs[j]))); // Check, if considering previously seen minimisers and minimisers found ar current level equal to or are greater // than the minimiser_pow, which gives the median position. // If ań estimation took already place (estimations_i[j]!=0), a second estimation is not performed. if (((prev_counts[j] + counter[j]) >= minimiser_pos) && (estimations_i[j] == 0)) { // If there was no previous level, because we are looking at the last level. - if constexpr(last_exp) + if constexpr (last_exp) { if constexpr (multiple_expressions) estimations_i[j] = expressions[k][j]; @@ -74,16 +82,21 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector +template void read_levels(std::vector> & expressions, std::filesystem::path filename) { std::ifstream fin; @@ -112,13 +125,13 @@ void read_levels(std::vector> & expressions, std::file if (j == expressions.size()) expressions.push_back(empty_vector); std::ranges::copy(stream_view | seqan3::detail::take_until_or_throw(seqan3::is_char<' '>), - std::back_inserter(buffer)); - if constexpr(std::same_as) - expressions[j].push_back((uint16_t) std::stoi(buffer)); + std::back_inserter(buffer)); + if constexpr (std::same_as) + expressions[j].push_back((uint16_t)std::stoi(buffer)); else - expressions[j].push_back((double) std::stod(buffer)); + expressions[j].push_back((double)std::stod(buffer)); buffer.clear(); - if(*stream_it != '/') + if (*stream_it != '/') ++stream_it; if (*stream_it == '\n') @@ -126,7 +139,8 @@ void read_levels(std::vector> & expressions, std::file ++stream_it; 
j++; } - } while (*stream_it != '/'); + } + while (*stream_it != '/'); ++stream_it; fin.close(); @@ -139,7 +153,9 @@ void read_levels(std::vector> & expressions, std::file * \param estimate_args The estimate arguments. */ template -void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::path file_out, +void estimate(estimate_ibf_arguments & args, + IBFType & ibf, + std::filesystem::path file_out, estimate_arguments const & estimate_args) { std::vector ids; @@ -156,7 +172,8 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat omp_set_num_threads(args.threads); seqan3::contrib::bgzf_thread_count = args.threads; - seqan3::sequence_file_input> fin{estimate_args.search_file}; + seqan3::sequence_file_input> fin{ + estimate_args.search_file}; for (auto & [id, seq] : fin) { ids.push_back(id); @@ -189,9 +206,12 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat // Initialse last expression. if constexpr (samplewise) - load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1)); + load_ibf(ibf, + estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds - 1)); else - load_ibf(ibf, estimate_args.path_in.string() + "IBF_" + std::to_string(args.expression_thresholds[args.expression_thresholds.size()-1])); + load_ibf(ibf, + estimate_args.path_in.string() + "IBF_" + + std::to_string(args.expression_thresholds[args.expression_thresholds.size() - 1])); counter.assign(ibf.bin_count(), 0); counter_est.assign(ibf.bin_count(), 0); @@ -203,22 +223,40 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat counter_est.clear(); counter.clear(); - // Go over the sequences - #pragma omp parallel for +// Go over the sequences +#pragma omp parallel for for (size_t i = 0; i < seqs.size(); ++i) { if constexpr (samplewise && normalization_method) - check_ibf(args, ibf, estimations[i], 
seqs[i], prev_counts[i], - expressions,args.number_expression_thresholds - 1, - fprs[args.number_expression_thresholds - 1], deleted); + check_ibf(args, + ibf, + estimations[i], + seqs[i], + prev_counts[i], + expressions, + args.number_expression_thresholds - 1, + fprs[args.number_expression_thresholds - 1], + deleted); else if constexpr (samplewise) - check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - expressions, args.number_expression_thresholds - 1, - fprs[args.number_expression_thresholds - 1], deleted); + check_ibf(args, + ibf, + estimations[i], + seqs[i], + prev_counts[i], + expressions, + args.number_expression_thresholds - 1, + fprs[args.number_expression_thresholds - 1], + deleted); else - check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - args.expression_thresholds[args.expression_thresholds.size() - 1], prev_expression, - fprs[args.expression_thresholds.size() - 1], deleted); + check_ibf(args, + ibf, + estimations[i], + seqs[i], + prev_counts[i], + args.expression_thresholds[args.expression_thresholds.size() - 1], + prev_expression, + fprs[args.expression_thresholds.size() - 1], + deleted); } if constexpr (!samplewise) @@ -232,19 +270,40 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat else load_ibf(ibf, estimate_args.path_in.string() + "IBF_" + std::to_string(args.expression_thresholds[j])); - // Go over the sequences - #pragma omp parallel for +// Go over the sequences +#pragma omp parallel for for (size_t i = 0; i < seqs.size(); ++i) { if constexpr (samplewise && normalization_method) - check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - expressions, j, fprs[j], deleted); + check_ibf(args, + ibf, + estimations[i], + seqs[i], + prev_counts[i], + expressions, + j, + fprs[j], + deleted); else if constexpr (samplewise) - check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - expressions, j, fprs[j], deleted); + check_ibf(args, + ibf, + estimations[i], + seqs[i], + 
prev_counts[i], + expressions, + j, + fprs[j], + deleted); else - check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - args.expression_thresholds[j], prev_expression, fprs[j], deleted); + check_ibf(args, + ibf, + estimations[i], + seqs[i], + prev_counts[i], + args.expression_thresholds[j], + prev_expression, + fprs[j], + deleted); } if (!samplewise) @@ -254,16 +313,15 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat // Write output file. std::ofstream outfile; outfile.open(std::string{file_out}); - for (size_t i = 0; i < seqs.size(); ++i) + for (size_t i = 0; i < seqs.size(); ++i) { outfile << ids[i] << "\t"; for (size_t j = 0; j < ibf.bin_count(); ++j) - outfile << estimations[i][j] << "\t"; + outfile << estimations[i][j] << "\t"; outfile << "\n"; } outfile.close(); - } // Calls the correct form of estimate @@ -277,13 +335,22 @@ void call_estimate(estimate_ibf_arguments & args, estimate_arguments & estimate_ if (args.samplewise) { if (estimate_args.normalization_method) - estimate, true, true>(args, ibf, args.path_out, estimate_args); + estimate, true, true>(args, + ibf, + args.path_out, + estimate_args); else - estimate, true>(args, ibf, args.path_out, estimate_args); + estimate, true>(args, + ibf, + args.path_out, + estimate_args); } else { - estimate, false>(args, ibf, args.path_out, estimate_args); + estimate, false>(args, + ibf, + args.path_out, + estimate_args); } } else @@ -292,13 +359,23 @@ void call_estimate(estimate_ibf_arguments & args, estimate_arguments & estimate_ if (args.samplewise) { if (estimate_args.normalization_method) - estimate, true, true>(args, ibf, args.path_out, estimate_args); + estimate, true, true>( + args, + ibf, + args.path_out, + estimate_args); else - estimate, true>(args, ibf, args.path_out, estimate_args); + estimate, true>(args, + ibf, + args.path_out, + estimate_args); } else { - estimate, false>(args, ibf, args.path_out, estimate_args); + estimate, false>(args, + ibf, + 
args.path_out, + estimate_args); } } } diff --git a/src/ibf.cpp b/src/ibf.cpp index 67706ab..3be21d3 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -5,22 +5,21 @@ // shipped with this file and also available at: https://github.com/seqan/needle/blob/master/LICENSE.md // ----------------------------------------------------------------------------------------------------- +#include //reorded because of this error:https://github.com/Homebrew/homebrew-core/issues/44579 #include #include +#include #include #include #include #include #include +#include #include #include -#include //reorded because of this error:https://github.com/Homebrew/homebrew-core/issues/44579 - -#include -#include #if SEQAN3_WITH_CEREAL -#include +# include #endif // SEQAN3_WITH_CEREAL #include @@ -34,14 +33,15 @@ #include #include -#include "ibf.h" -#include "shared.h" +#include "ibf.hpp" +#include "shared.hpp" // Create set with hashes from the minimisers from an include or exclude file. -void get_include_set_table(min_arguments const & args, std::filesystem::path const include_file, +void get_include_set_table(min_arguments const & args, + std::filesystem::path const include_file, robin_hood::unordered_set & include_table) { - seqan3::sequence_file_input> fin3{include_file}; + seqan3::sequence_file_input> fin3{include_file}; for (auto & [seq] : fin3) { if (seq.size() >= args.w_size.get()) @@ -56,19 +56,20 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con inline bool check_for_fasta_format(std::vector const & valid_extensions, std::string const & file_path) { - auto case_insensitive_string_ends_with = [&] (std::string_view str, std::string_view suffix) + auto case_insensitive_string_ends_with = [&](std::string_view str, std::string_view suffix) { size_t const suffix_length{suffix.size()}; size_t const str_length{str.size()}; - return suffix_length > str_length ? 
- false : - std::ranges::equal(str.substr(str_length - suffix_length), suffix, [] (char const chr1, char const chr2) - { - return std::tolower(chr1) == std::tolower(chr2); - }); + return suffix_length > str_length ? false + : std::ranges::equal(str.substr(str_length - suffix_length), + suffix, + [](char const chr1, char const chr2) + { + return std::tolower(chr1) == std::tolower(chr2); + }); }; - auto case_insensitive_ends_with = [&] (std::string const & ext) + auto case_insensitive_ends_with = [&](std::string const & ext) { return case_insensitive_string_ends_with(file_path, ext); }; @@ -89,10 +90,13 @@ uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples) // Since the curoffs are based on the filesize of a gzipped fastq file, we try account for the other cases: // We multiply by two if we have fasta input. // We divide by 3 if the input is not compressed. - bool const is_compressed = sequence_file.extension() == ".gz" || sequence_file.extension() == ".bgzf" || sequence_file.extension() == ".bz2"; - bool const is_fasta = is_compressed ? check_for_fasta_format(seqan3::format_fasta::file_extensions, sequence_file.stem()) - : check_for_fasta_format(seqan3::format_fasta::file_extensions, sequence_file.extension()); - size_t const filesize = std::filesystem::file_size(sequence_file) * samples * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3); + bool const is_compressed = sequence_file.extension() == ".gz" || sequence_file.extension() == ".bgzf" + || sequence_file.extension() == ".bz2"; + bool const is_fasta = is_compressed + ? check_for_fasta_format(seqan3::format_fasta::file_extensions, sequence_file.stem()) + : check_for_fasta_format(seqan3::format_fasta::file_extensions, sequence_file.extension()); + size_t const filesize = + std::filesystem::file_size(sequence_file) * samples * (is_fasta ? 2 : 1) / (is_compressed ? 
1 : 3); for (size_t k = 0; k < cutoff_bounds.size(); ++k) { @@ -107,18 +111,20 @@ uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples) // Fill hash table with minimisers greater than the cutoff. void fill_hash_table(min_arguments const & args, - seqan3::sequence_file_input> & fin, + seqan3::sequence_file_input> & fin, robin_hood::unordered_node_map & hash_table, robin_hood::unordered_node_map & cutoff_table, robin_hood::unordered_set const & include_set_table, robin_hood::unordered_set const & exclude_set_table, - bool const only_include = false, uint8_t cutoff = 0) + bool const only_include = false, + uint8_t cutoff = 0) { for (auto & [seq] : fin) { for (auto && minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s)) { - if ((only_include && include_set_table.contains(minHash)) || (!only_include && !exclude_set_table.contains(minHash))) + if ((only_include && include_set_table.contains(minHash)) + || (!only_include && !exclude_set_table.contains(minHash))) { auto it = hash_table.find(minHash); // If minHash is already in hash table, increase count in hash table @@ -149,7 +155,7 @@ void fill_hash_table(min_arguments const & args, } void fill_hash_table_parallel(min_arguments const & args, - seqan3::sequence_file_input> & fin, + seqan3::sequence_file_input> & fin, robin_hood::unordered_node_map & hash_table, robin_hood::unordered_node_map & cutoff_table, robin_hood::unordered_set const & include_set_table, @@ -172,7 +178,7 @@ void fill_hash_table_parallel(min_arguments const & args, auto seq_file_it = std::ranges::begin(fin); using sequence_t = seqan3::dna4_vector; - auto load_next_chunk = [&] () + auto load_next_chunk = [&]() { constexpr size_t batch_size = 100000; @@ -188,7 +194,7 @@ void fill_hash_table_parallel(min_arguments const & args, return sequence_batch; }; - auto count_minimiser = [&] (auto & local_hash_table, std::vector minimisers) + auto count_minimiser = [&](auto & local_hash_table, std::vector minimisers) 
{ // Sort the minimiser by their value. std::ranges::sort(minimisers, std::less{}); @@ -202,12 +208,16 @@ void fill_hash_table_parallel(min_arguments const & args, while (minimiser_it != minimiser_end) { uint64_t current_minimiser = *minimiser_it; - auto predicate = [=] (uint64_t const other_hash) { return other_hash == current_minimiser; }; + auto predicate = [=](uint64_t const other_hash) + { + return other_hash == current_minimiser; + }; auto next_minimiser_it = std::ranges::find_if_not(minimiser_it, minimiser_end, predicate); // minimiser_it now points to the first non equal position size_t const minimiser_count = std::ranges::distance(minimiser_it, next_minimiser_it); - if ((only_include && (include_set_table.contains(current_minimiser))) || (!only_include && !exclude_set_table.contains(current_minimiser))) + if ((only_include && (include_set_table.contains(current_minimiser))) + || (!only_include && !exclude_set_table.contains(current_minimiser))) { if (auto it = local_hash_table.find(current_minimiser); it != local_hash_table.end()) // update { @@ -215,7 +225,7 @@ void fill_hash_table_parallel(min_arguments const & args, } else if (minimiser_count > cutoff) { - // insert first. + // insert first. local_hash_table[current_minimiser] = minimiser_count; } else // not above cutoff. @@ -241,7 +251,7 @@ void fill_hash_table_parallel(min_arguments const & args, std::vector> intervals{}; std::optional>> queue; - auto job = [&] (size_t const thread_id) + auto job = [&](size_t const thread_id) { while (true) { @@ -268,7 +278,7 @@ void fill_hash_table_parallel(min_arguments const & args, sync_point.arrive_and_wait(); - {// sequential phase to merge sub tables. + { // sequential phase to merge sub tables. 
std::scoped_lock lk{load_mutex}; if (!is_merged) { @@ -278,7 +288,8 @@ void fill_hash_table_parallel(min_arguments const & args, for (auto && [key, counter] : local_hash_table) { if (auto it = hash_table.find(key); it != hash_table.end()) - it->second = static_cast(std::min(65534ul, it->second + counter.load())); + it->second = + static_cast(std::min(65534ul, it->second + counter.load())); else hash_table.insert(value_t{key, counter.load()}); } @@ -290,8 +301,10 @@ void fill_hash_table_parallel(min_arguments const & args, for (auto & local_remaining_minimisers : thread_local_remaining_minimisers) { std::vector local_remaining_minimisers2 = - count_minimiser(hash_table, std::move(local_remaining_minimisers)); - remaining_minimisers.insert(remaining_minimisers.end(), local_remaining_minimisers2.begin(), local_remaining_minimisers2.end()); + count_minimiser(hash_table, std::move(local_remaining_minimisers)); + remaining_minimisers.insert(remaining_minimisers.end(), + local_remaining_minimisers2.begin(), + local_remaining_minimisers2.end()); } std::ranges::sort(remaining_minimisers, std::less{}); @@ -301,12 +314,16 @@ void fill_hash_table_parallel(min_arguments const & args, while (minimiser_it != minimiser_end) { uint64_t current_minimiser = *minimiser_it; - auto predicate = [=] (uint64_t const other_hash) { return other_hash == current_minimiser; }; + auto predicate = [=](uint64_t const other_hash) + { + return other_hash == current_minimiser; + }; auto next_minimiser_it = std::ranges::find_if_not(minimiser_it, minimiser_end, predicate); // minimiser_it now points to the first non equal position size_t const minimiser_count = std::ranges::distance(minimiser_it, next_minimiser_it); - if ((only_include && (include_set_table.contains(current_minimiser))) || (!only_include && !exclude_set_table.contains(current_minimiser))) + if ((only_include && (include_set_table.contains(current_minimiser))) + || (!only_include && !exclude_set_table.contains(current_minimiser))) { if 
(auto it = hash_table.find(current_minimiser); it != hash_table.end()) // update { @@ -314,7 +331,7 @@ void fill_hash_table_parallel(min_arguments const & args, } else if (minimiser_count > cutoff) { - // insert first. + // insert first. hash_table[current_minimiser] = minimiser_count; } else if (auto it = cutoff_table.find(current_minimiser); it != cutoff_table.end()) @@ -351,8 +368,7 @@ void fill_hash_table_parallel(min_arguments const & args, thread.join(); } -void count_genome(min_arguments const & args, std::filesystem::path include_file, - std::filesystem::path exclude_file) +void count_genome(min_arguments const & args, std::filesystem::path include_file, std::filesystem::path exclude_file) { robin_hood::unordered_set include_set_table{}; robin_hood::unordered_set exclude_set_table{}; @@ -360,7 +376,7 @@ void count_genome(min_arguments const & args, std::filesystem::path include_file if (exclude_file != "") { - seqan3::sequence_file_input> fin{exclude_file}; + seqan3::sequence_file_input> fin{exclude_file}; for (auto & [seq] : fin) { if (seq.size() >= args.w_size.get()) @@ -371,37 +387,39 @@ void count_genome(min_arguments const & args, std::filesystem::path include_file } } - seqan3::sequence_file_input> fin2{include_file}; + seqan3::sequence_file_input> fin2{include_file}; for (auto & [seq] : fin2) { if (seq.size() >= args.w_size.get()) { for (auto && minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s)) { - if ( !(exclude_set_table.contains(minHash))) + if (!(exclude_set_table.contains(minHash))) include_set_table.insert(minHash); } } } // Write minimiser to binary - outfile.open(std::string{args.path_out} + std::string{include_file.stem()} - + ".genome", std::ios::binary); + outfile.open(std::string{args.path_out} + std::string{include_file.stem()} + ".genome", std::ios::binary); for (auto && hash : include_set_table) { - outfile.write(reinterpret_cast(&hash), sizeof(hash)); + outfile.write(reinterpret_cast(&hash), sizeof(hash)); 
} outfile.close(); } -void count(min_arguments const & args, std::vector sequence_files, std::filesystem::path include_file, - std::filesystem::path genome_file, bool paired) +void count(min_arguments const & args, + std::vector sequence_files, + std::filesystem::path include_file, + std::filesystem::path genome_file, + bool paired) { robin_hood::unordered_node_map hash_table{}; // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table // and afterwards discarded. - robin_hood::unordered_node_map cutoff_table; + robin_hood::unordered_node_map cutoff_table; robin_hood::unordered_set include_set_table{}; robin_hood::unordered_set exclude_set_table{}; std::vector counter{}; @@ -412,7 +430,7 @@ void count(min_arguments const & args, std::vector sequen // Read minimiser from binary infile.open(genome_file, std::ios::binary); uint64_t minimiser; - while(infile.read((char*)&minimiser, sizeof(minimiser))) + while (infile.read((char *)&minimiser, sizeof(minimiser))) { include_set_table.insert(minimiser); } @@ -437,15 +455,16 @@ void count(min_arguments const & args, std::vector sequen cutoff_table.clear(); outfile.open(std::string{args.path_out} + std::string{sequence_files[i].stem()} + ".count.out"); - seqan3::sequence_file_input> fin2{include_file}; + seqan3::sequence_file_input> fin2{ + include_file}; for (auto & [id, seq] : fin2) { if (seq.size() >= args.w_size.get()) { for (auto && minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s)) counter.push_back(hash_table[minHash]); - std::nth_element(counter.begin(), counter.begin() + counter.size()/2, counter.end()); - expression = counter[counter.size()/2]; + std::nth_element(counter.begin(), counter.begin() + counter.size() / 2, counter.end()); + expression = counter[counter.size() / 2]; outfile << id << "\t" << expression << "\n"; counter.clear(); } @@ -468,7 +487,7 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map< 
fin.ignore(22); bool ungapped; - fin.read((char*)&ungapped, sizeof(ungapped)); + fin.read((char *)&ungapped, sizeof(ungapped)); if (!ungapped) { fin.ignore(8); // args.shape @@ -477,9 +496,9 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map< uint64_t minimiser; uint16_t minimiser_count; - while(fin.read((char*)&minimiser, sizeof(minimiser))) + while (fin.read((char *)&minimiser, sizeof(minimiser))) { - fin.read((char*)&minimiser_count, sizeof(minimiser_count)); + fin.read((char *)&minimiser_count, sizeof(minimiser_count)); hash_table[minimiser] = minimiser_count; } @@ -487,48 +506,51 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map< } void read_binary_start(min_arguments & args, - std::filesystem::path filename, - uint64_t & num_of_minimisers, uint8_t & cutoff) + std::filesystem::path filename, + uint64_t & num_of_minimisers, + uint8_t & cutoff) { std::ifstream fin{filename, std::ios::binary}; - fin.read((char*)&num_of_minimisers, sizeof(num_of_minimisers)); - fin.read((char*)&cutoff, sizeof(cutoff)); - fin.read((char*)&args.k, sizeof(args.k)); - fin.read((char*)&args.w_size, sizeof(args.w_size)); - fin.read((char*)&args.s, sizeof(args.s)); + fin.read((char *)&num_of_minimisers, sizeof(num_of_minimisers)); + fin.read((char *)&cutoff, sizeof(cutoff)); + fin.read((char *)&args.k, sizeof(args.k)); + fin.read((char *)&args.w_size, sizeof(args.w_size)); + fin.read((char *)&args.s, sizeof(args.s)); bool ungapped; - fin.read((char*)&ungapped, sizeof(ungapped)); + fin.read((char *)&ungapped, sizeof(ungapped)); if (ungapped) { args.shape = seqan3::ungapped{args.k}; } else { - fin.read((char*)&args.shape, sizeof(args.shape)); + fin.read((char *)&args.shape, sizeof(args.shape)); } fin.close(); } // Check number of expression levels, sort expression levels -void check_expression(std::vector & expression_thresholds, uint8_t & number_expression_thresholds, +void check_expression(std::vector & 
expression_thresholds, + uint8_t & number_expression_thresholds, std::filesystem::path const expression_by_genome_file) { // Sort given expression rates sort(expression_thresholds.begin(), expression_thresholds.end()); - // If no expression levels are given and the no number of expression levels is specified, throw. + // If no expression levels are given and the no number of expression levels is specified, throw. if ((number_expression_thresholds == 0) && (expression_thresholds.size() == 0)) { throw std::invalid_argument{"Error. Please set the expression levels OR give the number of expression levels."}; } else if ((expression_by_genome_file != "") && (expression_thresholds.size() > 0)) { - throw std::invalid_argument{"Error. The determination of expression levels can not be used with individual levels" - " already given. Please set the expression levels without the option " - "--level-by-genome OR use the number of expression levels with that option."}; + throw std::invalid_argument{ + "Error. The determination of expression levels can not be used with individual levels" + " already given. Please set the expression levels without the option " + "--level-by-genome OR use the number of expression levels with that option."}; } else if (number_expression_thresholds == 0) { @@ -538,18 +560,18 @@ void check_expression(std::vector & expression_thresholds, uint8_t & n { throw std::invalid_argument{"Error. 
Please set the expression levels OR give the number of expression levels."}; } - } // Check and set samples and cutoffs void check_cutoffs_samples(std::vector const & sequence_files, - bool const paired, std::vector & samples, + bool const paired, + std::vector & samples, std::vector & cutoffs) { if (paired) // If paired is true, a pair is seen as one sample - samples.assign(sequence_files.size()/2,2); + samples.assign(sequence_files.size() / 2, 2); if (samples.empty()) // If no samples are given and not paired, every file is seen as one experiment - samples.assign(sequence_files.size(),1); + samples.assign(sequence_files.size(), 1); if (cutoffs.size() == 1) // If one cutoff is given, every experiment gets this cutoff. cutoffs.assign(samples.size(), cutoffs[0]); @@ -574,16 +596,20 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector & } else if (fprs.size() != number_expression_thresholds) { - throw std::invalid_argument{"Error. Length of false positive rates for IBFs is not equal to length of expression " - "thresholds."}; + throw std::invalid_argument{ + "Error. 
Length of false positive rates for IBFs is not equal to length of expression " + "thresholds."}; } } // Calculate expression thresholds and sizes void get_expression_thresholds(uint8_t const number_expression_thresholds, - robin_hood::unordered_node_map const & hash_table, - std::vector & expression_thresholds, std::vector & sizes, - robin_hood::unordered_set const & genome, uint8_t cutoff, bool all = true) + robin_hood::unordered_node_map const & hash_table, + std::vector & expression_thresholds, + std::vector & sizes, + robin_hood::unordered_set const & genome, + uint8_t cutoff, + bool all = true) { // Calculate expression thresholds by taking median recursively std::vector counts; @@ -601,19 +627,20 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds, // Zero Level = cutoff + 1 expression_thresholds.push_back(cutoff + 1); // First Level - std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size()/dev, counts.end()); - exp = counts[prev_pos + counts.size()/dev]; - prev_pos = prev_pos + counts.size()/dev; - dev = dev*2; + std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size() / dev, counts.end()); + exp = counts[prev_pos + counts.size() / dev]; + prev_pos = prev_pos + counts.size() / dev; + dev = dev * 2; expression_thresholds.push_back(exp); sizes.push_back(prev_pos); - while((expression_thresholds.size() < number_expression_thresholds) && (prev_exp < max_elem) && (dev < counts.size())) + while ((expression_thresholds.size() < number_expression_thresholds) && (prev_exp < max_elem) + && (dev < counts.size())) { - std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size()/dev, counts.end()); - exp = counts[prev_pos + counts.size()/dev]; - prev_pos = prev_pos + counts.size()/dev; - dev = dev*2; + std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size() / dev, counts.end()); + exp = counts[prev_pos + counts.size() / dev]; + 
prev_pos = prev_pos + counts.size() / dev; + dev = dev * 2; // If expression does not change compared to previous one, do not store it again as an expression threshold. if ((exp - prev_exp) > 1) @@ -626,48 +653,51 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds, } sizes.push_back(prev_pos); // In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser. - while(expression_thresholds.size() < number_expression_thresholds) + while (expression_thresholds.size() < number_expression_thresholds) expression_thresholds.push_back(max_elem + 1); counts.clear(); } // Estimate the file size for every expression level, necessary when samplewise=false, because then it is completly // unclear how many minimisers are to store per file. -void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds, - std::vector const & expression_thresholds, std::vector & sizes, - robin_hood::unordered_set const & genome, bool all = true) +void get_filsize_per_expression_level(std::filesystem::path filename, + uint8_t const number_expression_thresholds, + std::vector const & expression_thresholds, + std::vector & sizes, + robin_hood::unordered_set const & genome, + bool all = true) { std::ifstream fin; uint8_t small_buffer; uint32_t window; uint64_t buffer; fin.open(filename, std::ios::binary); - fin.read((char*)&buffer, sizeof(buffer)); - fin.read((char*)&small_buffer, sizeof(small_buffer)); - fin.read((char*)&small_buffer, sizeof(small_buffer)); - fin.read((char*)&window, sizeof(window)); - fin.read((char*)&buffer, sizeof(buffer)); + fin.read((char *)&buffer, sizeof(buffer)); + fin.read((char *)&small_buffer, sizeof(small_buffer)); + fin.read((char *)&small_buffer, sizeof(small_buffer)); + fin.read((char *)&window, sizeof(window)); + fin.read((char *)&buffer, sizeof(buffer)); bool ungapped; - fin.read((char*)&ungapped, sizeof(ungapped)); + fin.read((char 
*)&ungapped, sizeof(ungapped)); if (!ungapped) { - fin.read((char*)&buffer, sizeof(buffer)); + fin.read((char *)&buffer, sizeof(buffer)); } uint64_t minimiser; uint16_t minimiser_count; sizes.assign(number_expression_thresholds, 0); - while(fin.read((char*)&minimiser, sizeof(minimiser))) + while (fin.read((char *)&minimiser, sizeof(minimiser))) { - fin.read((char*)&minimiser_count, sizeof(minimiser_count)); + fin.read((char *)&minimiser_count, sizeof(minimiser_count)); if (all || genome.contains(minimiser)) { // Find the level with the smallest greater value than the minimiser occurrence, in the level before that the // minimiser is going to be stored. auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count); - if(p != expression_thresholds.begin()) - sizes[(p-expression_thresholds.begin())-1]++; + if (p != expression_thresholds.begin()) + sizes[(p - expression_thresholds.begin()) - 1]++; } } @@ -675,11 +705,13 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co } // Actual ibf construction -template +template void ibf_helper(std::vector const & minimiser_files, std::vector const & fprs, - estimate_ibf_arguments & ibf_args, std::vector & cutoffs, - size_t num_hash = 1, std::filesystem::path expression_by_genome_file = "", + estimate_ibf_arguments & ibf_args, + std::vector & cutoffs, + size_t num_hash = 1, + std::filesystem::path expression_by_genome_file = "", minimiser_arguments const & minimiser_args = {}) { @@ -699,7 +731,7 @@ void ibf_helper(std::vector const & minimiser_files, robin_hood::unordered_set include_set_table; // Storage for minimisers in include file robin_hood::unordered_set exclude_set_table; // Storage for minimisers in exclude file - if constexpr(samplewise) + if constexpr (samplewise) { std::vector zero_vector(ibf_args.number_expression_thresholds); for (unsigned j = 0; j < num_files; j++) @@ -734,9 +766,10 @@ void ibf_helper(std::vector const & minimiser_files, // 
Get expression levels and sizes for (unsigned i = 0; i < num_files; i++) { - uint64_t filesize{}; // Store filesize(minimiser_files_given=false) or number of minimisers(minimiser_files_given=true) + uint64_t + filesize{}; // Store filesize(minimiser_files_given=false) or number of minimisers(minimiser_files_given=true) - if constexpr(minimiser_files_given) + if constexpr (minimiser_files_given) { uint8_t cutoff; read_binary_start(ibf_args, minimiser_files[i], filesize, cutoff); @@ -746,17 +779,23 @@ void ibf_helper(std::vector const & minimiser_files, { // Estimate sizes on filesize, assuming every byte translates to one letter (which is obiously not true, // because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation! - unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); + unsigned file_iterator = + std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); // Determine cutoffs if (calculate_cutoffs) cutoffs.push_back(calculate_cutoff(minimiser_files[file_iterator], minimiser_args.samples[i])); - bool const is_compressed = minimiser_files[file_iterator].extension() == ".gz" || minimiser_files[file_iterator].extension() == ".bgzf" || minimiser_files[file_iterator].extension() == ".bz2"; - bool const is_fasta = is_compressed ? check_for_fasta_format(seqan3::format_fasta::file_extensions,minimiser_files[file_iterator].stem()) - : check_for_fasta_format(seqan3::format_fasta::file_extensions, minimiser_files[file_iterator].extension()); - filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i] * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3); - filesize = filesize/((cutoffs[i] + 1) * (is_fasta ? 
1 : 2)); + bool const is_compressed = minimiser_files[file_iterator].extension() == ".gz" + || minimiser_files[file_iterator].extension() == ".bgzf" + || minimiser_files[file_iterator].extension() == ".bz2"; + bool const is_fasta = is_compressed ? check_for_fasta_format(seqan3::format_fasta::file_extensions, + minimiser_files[file_iterator].stem()) + : check_for_fasta_format(seqan3::format_fasta::file_extensions, + minimiser_files[file_iterator].extension()); + filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i] + * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3); + filesize = filesize / ((cutoffs[i] + 1) * (is_fasta ? 1 : 2)); } // If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for // all files. @@ -766,24 +805,28 @@ void ibf_helper(std::vector const & minimiser_files, for (int c = 0; c < ibf_args.number_expression_thresholds - 1; c++) { diff = diff * 2; - sizes[i].push_back(filesize/diff); + sizes[i].push_back(filesize / diff); } - sizes[i].push_back(filesize/diff); + sizes[i].push_back(filesize / diff); } else if constexpr (minimiser_files_given) { - get_filsize_per_expression_level(minimiser_files[i], ibf_args.number_expression_thresholds, ibf_args.expression_thresholds, sizes[i], - genome, expression_by_genome); + get_filsize_per_expression_level(minimiser_files[i], + ibf_args.number_expression_thresholds, + ibf_args.expression_thresholds, + sizes[i], + genome, + expression_by_genome); } else { float diff{1}; for (int c = 0; c < ibf_args.number_expression_thresholds - 1; c++) { - diff = ibf_args.expression_thresholds[c+1]/ibf_args.expression_thresholds[c]; - sizes[i].push_back(filesize/diff); + diff = ibf_args.expression_thresholds[c + 1] / ibf_args.expression_thresholds[c]; + sizes[i].push_back(filesize / diff); } - sizes[i].push_back(filesize/diff); + sizes[i].push_back(filesize / diff); } } @@ -797,28 +840,30 @@ void ibf_helper(std::vector const & 
minimiser_files, if (size < 1) { - throw std::invalid_argument{std::string("[Error]. The chosen expression threshold is not well picked. If you use the automatic ") + - std::string("expression threshold determination, please decrease the number of levels. If you use ") + - std::string("your own expression thresholds, decrease the thresholds from level ") + - std::to_string(ibf_args.expression_thresholds[j]) + - std::string(" on.\n")}; + throw std::invalid_argument{ + std::string("[Error]. The chosen expression threshold is not well picked. If you use the automatic ") + + std::string("expression threshold determination, please decrease the number of levels. If you use ") + + std::string("your own expression thresholds, decrease the thresholds from level ") + + std::to_string(ibf_args.expression_thresholds[j]) + std::string(" on.\n")}; } // m = -hn/ln(1-p^(1/h)) - size = static_cast((-1.0*num_hash*((1.0*size)/num_files))/(std::log(1.0-std::pow(fprs[j], 1.0/num_hash)))); + size = static_cast((-1.0 * num_hash * ((1.0 * size) / num_files)) + / (std::log(1.0 - std::pow(fprs[j], 1.0 / num_hash)))); sizes_ibf.push_back(size); - ibfs.push_back(seqan3::interleaved_bloom_filter( - seqan3::bin_count{num_files}, seqan3::bin_size{size}, - seqan3::hash_function_count{num_hash})); + ibfs.push_back( + seqan3::interleaved_bloom_filter(seqan3::bin_count{num_files}, + seqan3::bin_size{size}, + seqan3::hash_function_count{num_hash})); } - // Add minimisers to ibf - #pragma omp parallel for schedule(dynamic, chunk_size) +// Add minimisers to ibf +#pragma omp parallel for schedule(dynamic, chunk_size) for (unsigned i = 0; i < num_files; i++) { robin_hood::unordered_node_map hash_table{}; // Storage for minimisers // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table // and afterwards discarded. 
- robin_hood::unordered_node_map cutoff_table; + robin_hood::unordered_node_map cutoff_table; std::vector expression_thresholds; // Fill hash table with minimisers. @@ -828,16 +873,30 @@ void ibf_helper(std::vector const & minimiser_files, } else { - unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); + unsigned file_iterator = + std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); for (int f = 0; f < minimiser_args.samples[i]; f++) { - seqan3::sequence_file_input> fin{minimiser_files[file_iterator+f]}; - if (minimiser_args.ram_friendly) - fill_hash_table_parallel(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table, - (minimiser_args.include_file != ""), cutoffs[i]); - else - fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table, - (minimiser_args.include_file != ""), cutoffs[i]); + seqan3::sequence_file_input> fin{ + minimiser_files[file_iterator + f]}; + if (minimiser_args.ram_friendly) + fill_hash_table_parallel(ibf_args, + fin, + hash_table, + cutoff_table, + include_set_table, + exclude_set_table, + (minimiser_args.include_file != ""), + cutoffs[i]); + else + fill_hash_table(ibf_args, + fin, + hash_table, + cutoff_table, + include_set_table, + exclude_set_table, + (minimiser_args.include_file != ""), + cutoffs[i]); } cutoff_table.clear(); } @@ -846,20 +905,20 @@ void ibf_helper(std::vector const & minimiser_files, // all files. 
if constexpr (samplewise) { - get_expression_thresholds(ibf_args.number_expression_thresholds, - hash_table, - expression_thresholds, - sizes[i], - genome, - cutoffs[i], - expression_by_genome); - expressions[i] = expression_thresholds; + get_expression_thresholds(ibf_args.number_expression_thresholds, + hash_table, + expression_thresholds, + sizes[i], + genome, + cutoffs[i], + expression_by_genome); + expressions[i] = expression_thresholds; } // Every minimiser is stored in IBF, if it occurence is greater than or equal to the expression level for (auto && elem : hash_table) { - for (int j = ibf_args.number_expression_thresholds - 1; j >= 0 ; --j) + for (int j = ibf_args.number_expression_thresholds - 1; j >= 0; --j) { if constexpr (samplewise) { @@ -887,8 +946,8 @@ void ibf_helper(std::vector const & minimiser_files, for (unsigned i = 0; i < ibf_args.number_expression_thresholds; i++) { std::filesystem::path filename; - if constexpr(samplewise) - filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i); + if constexpr (samplewise) + filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i); else filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]); @@ -904,14 +963,14 @@ void ibf_helper(std::vector const & minimiser_files, } // Store all expression thresholds per level. 
- if constexpr(samplewise) + if constexpr (samplewise) { std::ofstream outfile; - outfile.open(std::string{ibf_args.path_out} + "IBF_Levels.levels"); + outfile.open(std::string{ibf_args.path_out} + "IBF_Levels.levels"); for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) { for (unsigned i = 0; i < num_files; i++) - outfile << expressions[i][j] << " "; + outfile << expressions[i][j] << " "; outfile << "\n"; } outfile << "/\n"; @@ -919,13 +978,14 @@ void ibf_helper(std::vector const & minimiser_files, } std::ofstream outfile_fpr; - outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); + outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) { for (unsigned i = 0; i < num_files; i++) { // m = -hn/ln(1-p^(1/h)) - double fpr = std::pow(1.0- std::pow(1.0-(1.0/sizes_ibf[j]), num_hash *counts_per_level[i][j]), num_hash); + double fpr = + std::pow(1.0 - std::pow(1.0 - (1.0 / sizes_ibf[j]), num_hash * counts_per_level[i][j]), num_hash); outfile_fpr << fpr << " "; } outfile_fpr << "\n"; @@ -936,17 +996,19 @@ void ibf_helper(std::vector const & minimiser_files, // Create ibfs std::vector ibf(std::vector const & sequence_files, - estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args, - std::vector & fpr, std::vector & cutoffs, - std::filesystem::path const expression_by_genome_file, size_t num_hash) + estimate_ibf_arguments & ibf_args, + minimiser_arguments & minimiser_args, + std::vector & fpr, + std::vector & cutoffs, + std::filesystem::path const expression_by_genome_file, + size_t num_hash) { // Declarations robin_hood::unordered_node_map hash_table{}; // Storage for minimisers - seqan3::concatenated_sequences sequences; // Storage for sequences in experiment files + seqan3::concatenated_sequences sequences; // Storage for sequences in experiment files check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, cutoffs); 
- check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file); check_fpr(ibf_args.number_expression_thresholds, fpr); @@ -959,16 +1021,30 @@ std::vector ibf(std::vector const & sequence_fi outfile.open(std::string{ibf_args.path_out} + "Stored_Files.txt"); for (unsigned i = 0; i < minimiser_args.samples.size(); i++) { - outfile << sequence_files[std::accumulate(minimiser_args.samples.begin(), - minimiser_args.samples.begin()+i, 0)] << "\n"; + outfile << sequence_files[std::accumulate(minimiser_args.samples.begin(), + minimiser_args.samples.begin() + i, + 0)] + << "\n"; } outfile.close(); } if (ibf_args.samplewise) - ibf_helper(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args); + ibf_helper(sequence_files, + fpr, + ibf_args, + cutoffs, + num_hash, + expression_by_genome_file, + minimiser_args); else - ibf_helper(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args); + ibf_helper(sequence_files, + fpr, + ibf_args, + cutoffs, + num_hash, + expression_by_genome_file, + minimiser_args); store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data"); @@ -977,7 +1053,8 @@ std::vector ibf(std::vector const & sequence_fi // Create ibfs based on the minimiser file std::vector ibf(std::vector const & minimiser_files, - estimate_ibf_arguments & ibf_args, std::vector & fpr, + estimate_ibf_arguments & ibf_args, + std::vector & fpr, std::filesystem::path const expression_by_genome_file, size_t num_hash) { @@ -998,7 +1075,7 @@ std::vector ibf(std::vector const & minimiser_f } // Reads the level file ibf creates -template +template void read_levels(std::vector> & expressions, std::filesystem::path filename) { std::ifstream fin; @@ -1016,13 +1093,13 @@ void read_levels(std::vector> & expressions, std::file if (j == expressions.size()) expressions.push_back(empty_vector); std::ranges::copy(stream_view | 
seqan3::detail::take_until_or_throw(seqan3::is_char<' '>), - std::back_inserter(buffer)); - if constexpr(std::same_as) - expressions[j].push_back((uint16_t) std::stoi(buffer)); + std::back_inserter(buffer)); + if constexpr (std::same_as) + expressions[j].push_back((uint16_t)std::stoi(buffer)); else - expressions[j].push_back((double) std::stod(buffer)); + expressions[j].push_back((double)std::stod(buffer)); buffer.clear(); - if(*stream_it != '/') + if (*stream_it != '/') ++stream_it; if (*stream_it == '\n') @@ -1030,16 +1107,19 @@ void read_levels(std::vector> & expressions, std::file ++stream_it; j++; } - } while (*stream_it != '/'); + } + while (*stream_it != '/'); ++stream_it; fin.close(); } // Actual insertion -template +template void insert_helper(std::vector const & minimiser_files, - estimate_ibf_arguments & ibf_args, std::filesystem::path path_in, std::vector & cutoffs, + estimate_ibf_arguments & ibf_args, + std::filesystem::path path_in, + std::vector & cutoffs, std::filesystem::path expression_by_genome_file = "", minimiser_arguments const & minimiser_args = {}) { @@ -1061,7 +1141,7 @@ void insert_helper(std::vector const & minimiser_files, robin_hood::unordered_set include_set_table; // Storage for minimisers in include file robin_hood::unordered_set exclude_set_table; // Storage for minimisers in exclude file - if constexpr(samplewise) + if constexpr (samplewise) { std::vector zero_vector(ibf_args.number_expression_thresholds); for (unsigned j = 0; j < num_files; j++) @@ -1133,14 +1213,14 @@ void insert_helper(std::vector const & minimiser_files, for (unsigned j = old_bin_number; j < new_bin_number; j++) pos_insert.push_back(j); - // Add minimisers to ibf - #pragma omp parallel for schedule(dynamic, chunk_size) +// Add minimisers to ibf +#pragma omp parallel for schedule(dynamic, chunk_size) for (unsigned i = 0; i < num_files; i++) { robin_hood::unordered_node_map hash_table{}; // Storage for minimisers // Create a smaller cutoff table to save RAM, this 
cutoff table is only used for constructing the hash table // and afterwards discarded. - robin_hood::unordered_node_map cutoff_table; + robin_hood::unordered_node_map cutoff_table; std::vector expression_thresholds; // Fill hash table with minimisers. @@ -1156,27 +1236,46 @@ void insert_helper(std::vector const & minimiser_files, { // Estimate sizes on filesize, assuming every byte translates to one letter (which is obiously not true, // because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation! - unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); + unsigned file_iterator = + std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); uint64_t filesize{}; // Determine cutoffs if (calculate_cutoffs) cutoffs.push_back(calculate_cutoff(minimiser_files[file_iterator], minimiser_args.samples[i])); - bool const is_compressed = minimiser_files[file_iterator].extension() == ".gz" || minimiser_files[file_iterator].extension() == ".bgzf" || minimiser_files[file_iterator].extension() == ".bz2"; - bool const is_fasta = is_compressed ? check_for_fasta_format(seqan3::format_fasta::file_extensions,minimiser_files[file_iterator].stem()) - : check_for_fasta_format(seqan3::format_fasta::file_extensions, minimiser_files[file_iterator].extension()); - filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i] * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3); - filesize = filesize/((cutoffs[i] + 1) * (is_fasta ? 1 : 2)); + bool const is_compressed = minimiser_files[file_iterator].extension() == ".gz" + || minimiser_files[file_iterator].extension() == ".bgzf" + || minimiser_files[file_iterator].extension() == ".bz2"; + bool const is_fasta = is_compressed ? 
check_for_fasta_format(seqan3::format_fasta::file_extensions, + minimiser_files[file_iterator].stem()) + : check_for_fasta_format(seqan3::format_fasta::file_extensions, + minimiser_files[file_iterator].extension()); + filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i] + * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3); + filesize = filesize / ((cutoffs[i] + 1) * (is_fasta ? 1 : 2)); for (int f = 0; f < minimiser_args.samples[i]; f++) { - seqan3::sequence_file_input> fin{minimiser_files[file_iterator+f]}; - if (minimiser_args.ram_friendly) - fill_hash_table_parallel(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table, - (minimiser_args.include_file != ""), cutoffs[i]); - else - fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table, - (minimiser_args.include_file != ""), cutoffs[i]); + seqan3::sequence_file_input> fin{ + minimiser_files[file_iterator + f]}; + if (minimiser_args.ram_friendly) + fill_hash_table_parallel(ibf_args, + fin, + hash_table, + cutoff_table, + include_set_table, + exclude_set_table, + (minimiser_args.include_file != ""), + cutoffs[i]); + else + fill_hash_table(ibf_args, + fin, + hash_table, + cutoff_table, + include_set_table, + exclude_set_table, + (minimiser_args.include_file != ""), + cutoffs[i]); } cutoff_table.clear(); } @@ -1185,22 +1284,22 @@ void insert_helper(std::vector const & minimiser_files, // all files. 
if constexpr (samplewise) { - std::vector sizes_tmp{}; - get_expression_thresholds(ibf_args.number_expression_thresholds, - hash_table, - expression_thresholds, - sizes_tmp, - genome, - cutoffs[i], - expression_by_genome); - expressions[i] = expression_thresholds; - sizes_tmp.clear(); + std::vector sizes_tmp{}; + get_expression_thresholds(ibf_args.number_expression_thresholds, + hash_table, + expression_thresholds, + sizes_tmp, + genome, + cutoffs[i], + expression_by_genome); + expressions[i] = expression_thresholds; + sizes_tmp.clear(); } // Every minimiser is stored in IBF, if it occurence is greater than or equal to the expression level for (auto && elem : hash_table) { - for (int j = ibf_args.number_expression_thresholds - 1; j >= 0 ; --j) + for (int j = ibf_args.number_expression_thresholds - 1; j >= 0; --j) { if constexpr (samplewise) { @@ -1228,8 +1327,8 @@ void insert_helper(std::vector const & minimiser_files, for (unsigned i = 0; i < ibf_args.number_expression_thresholds; i++) { std::filesystem::path filename; - if constexpr(samplewise) - filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i); + if constexpr (samplewise) + filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i); else filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]); @@ -1242,17 +1341,16 @@ void insert_helper(std::vector const & minimiser_files, { store_ibf(ibfs[i], filename); } - } // Store all expression thresholds per level. 
- if constexpr(samplewise) + if constexpr (samplewise) { std::vector> expressions_prev{}; read_levels(expressions_prev, path_in.string() + "IBF_Levels.levels"); std::ofstream outfile; - outfile.open(std::string{ibf_args.path_out} + "IBF_Levels.levels"); + outfile.open(std::string{ibf_args.path_out} + "IBF_Levels.levels"); for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) { int exp_i = 0; @@ -1280,7 +1378,7 @@ void insert_helper(std::vector const & minimiser_files, read_levels(fprs_prev, path_in.string() + "IBF_FPRs.fprs"); std::ofstream outfile_fpr; - outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); + outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) { int exp_i = 0; @@ -1289,7 +1387,9 @@ void insert_helper(std::vector const & minimiser_files, if (std::find(pos_insert.begin(), pos_insert.end(), i) != pos_insert.end()) { // m = -hn/ln(1-p^(1/h)) - double fpr = std::pow(1.0- std::pow(1.0-(1.0/sizes[j]), num_hash_functions *counts_per_level[exp_i][j]), num_hash_functions); + double fpr = + std::pow(1.0 - std::pow(1.0 - (1.0 / sizes[j]), num_hash_functions * counts_per_level[exp_i][j]), + num_hash_functions); outfile_fpr << fpr << " "; exp_i++; } @@ -1306,21 +1406,34 @@ void insert_helper(std::vector const & minimiser_files, // Insert into ibfs std::vector insert(std::vector const & sequence_files, - estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args, + estimate_ibf_arguments & ibf_args, + minimiser_arguments & minimiser_args, std::vector & cutoffs, - std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise) + std::filesystem::path const expression_by_genome_file, + std::filesystem::path path_in, + bool samplewise) { // Declarations robin_hood::unordered_node_map hash_table{}; // Storage for minimisers - seqan3::concatenated_sequences sequences; // Storage for sequences in 
experiment files + seqan3::concatenated_sequences sequences; // Storage for sequences in experiment files check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, cutoffs); load_args(ibf_args, std::string{path_in} + "IBF_Data"); if (samplewise) - insert_helper(sequence_files, ibf_args, path_in, cutoffs, expression_by_genome_file, minimiser_args); + insert_helper(sequence_files, + ibf_args, + path_in, + cutoffs, + expression_by_genome_file, + minimiser_args); else - insert_helper(sequence_files,ibf_args, path_in, cutoffs, expression_by_genome_file, minimiser_args); + insert_helper(sequence_files, + ibf_args, + path_in, + cutoffs, + expression_by_genome_file, + minimiser_args); store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data"); return ibf_args.expression_thresholds; @@ -1329,7 +1442,9 @@ std::vector insert(std::vector const & sequence // Insert into ibfs based on the minimiser file std::vector insert(std::vector const & minimiser_files, estimate_ibf_arguments & ibf_args, - std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise) + std::filesystem::path const expression_by_genome_file, + std::filesystem::path path_in, + bool samplewise) { std::vector cutoffs{}; load_args(ibf_args, std::string{path_in} + "IBF_Data"); @@ -1345,20 +1460,20 @@ std::vector insert(std::vector const & minimise // Delete from ibfs void delete_bin(std::vector const & delete_files, - estimate_ibf_arguments & ibf_args, - std::filesystem::path path_in, - bool samplewise) + estimate_ibf_arguments & ibf_args, + std::filesystem::path path_in, + bool samplewise) { load_args(ibf_args, std::string{path_in} + "IBF_Data"); std::vector bins_to_delete{}; - for (size_t i = 0; i< delete_files.size(); i++) + for (size_t i = 0; i < delete_files.size(); i++) bins_to_delete.push_back(seqan3::bin_index{delete_files[i]}); omp_set_num_threads(ibf_args.threads); - // Delete bins from ibfs - #pragma omp parallel +// Delete bins 
from ibfs +#pragma omp parallel for (unsigned i = 0; i < ibf_args.number_expression_thresholds; i++) { std::filesystem::path filename; @@ -1376,7 +1491,7 @@ void delete_bin(std::vector const & delete_files, else filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]); - if (ibf_args.compressed) + if (ibf_args.compressed) { seqan3::interleaved_bloom_filter ibfc{std::move(ibf)}; store_ibf(ibfc, filename); @@ -1398,9 +1513,8 @@ void delete_bin(std::vector const & delete_files, outfile.close(); } - // Actuall minimiser calculation -template +template void calculate_minimiser(std::vector const & sequence_files, robin_hood::unordered_set const & include_set_table, robin_hood::unordered_set const & exclude_set_table, @@ -1414,7 +1528,7 @@ void calculate_minimiser(std::vector const & sequence_fil // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table // and afterwards discarded. - robin_hood::unordered_node_map cutoff_table; + robin_hood::unordered_node_map cutoff_table; std::ofstream outfile; unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0); @@ -1428,46 +1542,63 @@ void calculate_minimiser(std::vector const & sequence_fil // Fill hash_table with minimisers. 
for (int f = 0; f < minimiser_args.samples[i]; f++) { - seqan3::sequence_file_input> fin{sequence_files[file_iterator+f]}; + seqan3::sequence_file_input> fin{ + sequence_files[file_iterator + f]}; if constexpr (parallel) { - fill_hash_table_parallel(args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table, (minimiser_args.include_file != ""), cutoff); + fill_hash_table_parallel(args, + fin, + hash_table, + cutoff_table, + include_set_table, + exclude_set_table, + (minimiser_args.include_file != ""), + cutoff); } else { - fill_hash_table(args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table, (minimiser_args.include_file != ""), cutoff); + fill_hash_table(args, + fin, + hash_table, + cutoff_table, + include_set_table, + exclude_set_table, + (minimiser_args.include_file != ""), + cutoff); } } cutoff_table.clear(); // Write minimiser and their counts to binary - outfile.open(std::string{args.path_out} + std::string{sequence_files[file_iterator].stem()} - + ".minimiser", std::ios::binary); + outfile.open(std::string{args.path_out} + std::string{sequence_files[file_iterator].stem()} + ".minimiser", + std::ios::binary); auto hash_size = hash_table.size(); - outfile.write(reinterpret_cast(&hash_size), sizeof(hash_size)); - outfile.write(reinterpret_cast(&cutoff), sizeof(cutoff)); - outfile.write(reinterpret_cast(&args.k), sizeof(args.k)); - outfile.write(reinterpret_cast(&args.w_size), sizeof(args.w_size)); - outfile.write(reinterpret_cast(&args.s), sizeof(args.s)); + outfile.write(reinterpret_cast(&hash_size), sizeof(hash_size)); + outfile.write(reinterpret_cast(&cutoff), sizeof(cutoff)); + outfile.write(reinterpret_cast(&args.k), sizeof(args.k)); + outfile.write(reinterpret_cast(&args.w_size), sizeof(args.w_size)); + outfile.write(reinterpret_cast(&args.s), sizeof(args.s)); bool ungapped = args.shape.all(); - outfile.write(reinterpret_cast(&ungapped), sizeof(ungapped)); + outfile.write(reinterpret_cast(&ungapped), sizeof(ungapped)); 
if (!ungapped) { - outfile.write(reinterpret_cast(&args.shape), sizeof(args.shape)); + outfile.write(reinterpret_cast(&args.shape), sizeof(args.shape)); } for (auto && hash : hash_table) { - outfile.write(reinterpret_cast(&hash.first), sizeof(hash.first)); - outfile.write(reinterpret_cast(&hash.second), sizeof(hash.second)); + outfile.write(reinterpret_cast(&hash.first), sizeof(hash.first)); + outfile.write(reinterpret_cast(&hash.second), sizeof(hash.second)); } outfile.close(); } -void minimiser(std::vector const & sequence_files, min_arguments const & args, - minimiser_arguments & minimiser_args, std::vector & cutoffs) +void minimiser(std::vector const & sequence_files, + min_arguments const & args, + minimiser_arguments & minimiser_args, + std::vector & cutoffs) { // Declarations robin_hood::unordered_set include_set_table{}; // Storage for minimisers in include file @@ -1486,20 +1617,25 @@ void minimiser(std::vector const & sequence_files, min_ar // Add minimisers to ibf if (minimiser_args.ram_friendly) { - for(unsigned i = 0; i < minimiser_args.samples.size(); i++) + for (unsigned i = 0; i < minimiser_args.samples.size(); i++) { - calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs); + calculate_minimiser(sequence_files, + include_set_table, + exclude_set_table, + args, + minimiser_args, + i, + cutoffs); } } else { omp_set_num_threads(args.threads); - #pragma omp parallel for schedule(dynamic, chunk_size) - for(unsigned i = 0; i < minimiser_args.samples.size(); i++) +#pragma omp parallel for schedule(dynamic, chunk_size) + for (unsigned i = 0; i < minimiser_args.samples.size(); i++) { calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs); } } - } diff --git a/src/main.cpp b/src/main.cpp index 3c2927f..e1db188 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,13 +5,14 @@ // shipped with this file and also available at: 
https://github.com/seqan/needle/blob/master/LICENSE.md // ----------------------------------------------------------------------------------------------------- -#include #include + +#include #include -#include "shared.h" -#include "ibf.h" -#include "estimate.h" +#include "estimate.hpp" +#include "ibf.hpp" +#include "shared.hpp" uint32_t w_size; uint64_t shape{}; @@ -21,26 +22,46 @@ void initialise_min_arguments(seqan3::argument_parser & parser, min_arguments & { parser.add_option(args.k, 'k', "kmer", "Define k-mer size for the minimisers. Default: 20."); parser.add_option(w_size, 'w', "window", "Define window size for the minimisers. Default: 60."); - parser.add_option(shape, '\0', "shape", "Define a shape for the minimisers by the decimal of a bitvector, where 0 symbolizes a " - "position to be ignored, 1 a position considered. Default: ungapped."); + parser.add_option(shape, + '\0', + "shape", + "Define a shape for the minimisers by the decimal of a bitvector, where 0 symbolizes a " + "position to be ignored, 1 a position considered. Default: ungapped."); parser.add_option(se, '\0', "seed", "Define seed for the minimisers."); parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_option(args.threads, 't', "threads", "Number of threads to use. Default: 1."); } -void initialise_arguments_ibf(seqan3::argument_parser & parser, estimate_ibf_arguments & ibf_args, size_t & num_hash, +void initialise_arguments_ibf(seqan3::argument_parser & parser, + estimate_ibf_arguments & ibf_args, + size_t & num_hash, std::vector & fpr) { - parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, the IBFS are compressed. Default: Not compressed."); - parser.add_option(fpr, 'f', "fpr", "List of bin false positive rate per expression level. 
If only one is given" - ", then that fpr is used for all expression levels."); - parser.add_option(ibf_args.expression_thresholds, 'e', "expression_thresholds", "Which expression thresholds should be used for" - " constructing the IBFs."); - parser.add_option(ibf_args.number_expression_thresholds, 'l', "number_expression_thresholds", "Number of expression thresholds. " - "Can be set alternatively to expression_thresholds, then " - "the expression thresholds are determined automatically."); - parser.add_option(num_hash, 'n', "hash", "Number of hash functions that should be used when constructing " - "one IBF."); + parser.add_flag(ibf_args.compressed, + 'c', + "compressed", + "If c is set, the IBFS are compressed. Default: Not compressed."); + parser.add_option(fpr, + 'f', + "fpr", + "List of bin false positive rate per expression level. If only one is given" + ", then that fpr is used for all expression levels."); + parser.add_option(ibf_args.expression_thresholds, + 'e', + "expression_thresholds", + "Which expression thresholds should be used for" + " constructing the IBFs."); + parser.add_option(ibf_args.number_expression_thresholds, + 'l', + "number_expression_thresholds", + "Number of expression thresholds. 
" + "Can be set alternatively to expression_thresholds, then " + "the expression thresholds are determined automatically."); + parser.add_option(num_hash, + 'n', + "hash", + "Number of hash functions that should be used when constructing " + "one IBF."); } void parsing(seqan3::argument_parser & parser, min_arguments & args) @@ -50,29 +71,42 @@ void parsing(seqan3::argument_parser & parser, min_arguments & args) parser.parse(); args.w_size = seqan3::window_size{w_size}; if (shape == 0) - args.shape = seqan3::ungapped{args.k}; + args.shape = seqan3::ungapped{args.k}; else - args.shape = seqan3::bin_literal{shape}; + args.shape = seqan3::bin_literal{shape}; args.s = seqan3::seed{adjust_seed(args.k, se)}; } // Initialize arguments for ibf and minimiser -void initialise_arguments_minimiser(seqan3::argument_parser & parser, minimiser_arguments & minimiser_args, std::vector & cutoffs) +void initialise_arguments_minimiser(seqan3::argument_parser & parser, + minimiser_arguments & minimiser_args, + std::vector & cutoffs) { - parser.add_option(minimiser_args.include_file, '\0', "include", "Sequence file containing minimizers, only those " - "minimizers will be considered."); - parser.add_option(minimiser_args.exclude_file, '\0', "exclude", "Sequence file containing minimizers that should " - "not be stored."); - parser.add_option(minimiser_args.samples, '\0', "samples", "Define which samples belong together, sum has to be " - "equal to number of sequence files. 
Default: Every" - " sequence file is one sample from one experiment."); + parser.add_option(minimiser_args.include_file, + '\0', + "include", + "Sequence file containing minimizers, only those " + "minimizers will be considered."); + parser.add_option(minimiser_args.exclude_file, + '\0', + "exclude", + "Sequence file containing minimizers that should " + "not be stored."); + parser.add_option(minimiser_args.samples, + '\0', + "samples", + "Define which samples belong together, sum has to be " + "equal to number of sequence files. Default: Every" + " sequence file is one sample from one experiment."); parser.add_flag(minimiser_args.paired, 'p', "paired", "If set, experiments are paired. Default: Not paired."); - parser.add_option(cutoffs, '\0', "cutoff", "Define for each sample, what number of found minimisers " - "should be considered the result of a sequencing error " - "and therefore be ignored. Default: Every sample has an" - "automatically generated cutoff, which is based on the " - "file size."); - + parser.add_option(cutoffs, + '\0', + "cutoff", + "Define for each sample, what number of found minimisers " + "should be considered the result of a sequencing error " + "and therefore be ignored. Default: Every sample has an" + "automatically generated cutoff, which is based on the " + "file size."); } void read_input_file_list(std::vector & sequence_files, std::filesystem::path input_file) @@ -104,7 +138,11 @@ int run_needle_count(seqan3::argument_parser & parser) "for all sequences in the genome file based on the exact minimiser occurrences of " "the given sequence files. 
Please run genome beforehand to create the genome file."; parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); - parser.add_option(include_file, '\0', "include", "Please provide one sequence file with transcripts.",seqan3::option_spec::required); + parser.add_option(include_file, + '\0', + "include", + "Please provide one sequence file with transcripts.", + seqan3::option_spec::required); parser.add_option(genome_file, '\0', "genome", "Please provide one *.genome file created with the genome command."); parser.add_flag(paired, 'p', "paired", "If set, experiments are paired. Default: Not paired."); @@ -164,19 +202,21 @@ int run_needle_estimate(seqan3::argument_parser & parser) parser.add_option(estimate_args.path_in, 'i', "in", "Directory where input files can be found."); parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_option(args.threads, 't', "threads", "Number of threads to use. Default: 1."); - parser.add_flag(estimate_args.normalization_method, 'm', "normalization-mode", - "Set, if normalization is wanted. Normalization is achieved by" - "dividing the expression value with the expression threshold of the first" - " ibf. Only make sense if every bin has its own expression " - "thresholds (which is the case if expression thresholds " - "were generated automatically)." - "Default: False."); + parser.add_flag(estimate_args.normalization_method, + 'm', + "normalization-mode", + "Set, if normalization is wanted. Normalization is achieved by" + "dividing the expression value with the expression threshold of the first" + " ibf. Only make sense if every bin has its own expression " + "thresholds (which is the case if expression thresholds " + "were generated automatically)." 
+ "Default: False."); try { parsing(parser, args); } - catch (seqan3::argument_parser_error const & ext) // catch user errors + catch (seqan3::argument_parser_error const & ext) // catch user errors { seqan3::debug_stream << "Error. Incorrect command line input for estimate. " << ext.what() << "\n"; return -1; @@ -187,7 +227,6 @@ int run_needle_estimate(seqan3::argument_parser & parser) return 0; } - int run_needle_ibf(seqan3::argument_parser & parser) { estimate_ibf_arguments ibf_args{}; @@ -205,16 +244,26 @@ int run_needle_ibf(seqan3::argument_parser & parser) parser.info.short_description = "Constructs the Needle index."; - parser.add_positional_option(sequence_files, "Please provide at least one sequence file OR provide one file " - "containing all sequence files with the extension '.lst'."); - parser.add_option(minimiser_args.experiment_names, '\0', "experiment-names", "If set, names of the experiments are stored" - " in a txt file."); - parser.add_option(expression_by_genome_file, '\0', "levels-by-genome", "Sequence file containing minimizers, only " - "those minimizers will be considered for " - "determining the expression thresholds."); - - parser.add_flag(minimiser_args.ram_friendly, '\0', "ram", "If ram is set and multiple threads are used, the multithreading" - " is more RAM friendly at the cost of being slower."); + parser.add_positional_option(sequence_files, + "Please provide at least one sequence file OR provide one file " + "containing all sequence files with the extension '.lst'."); + parser.add_option(minimiser_args.experiment_names, + '\0', + "experiment-names", + "If set, names of the experiments are stored" + " in a txt file."); + parser.add_option(expression_by_genome_file, + '\0', + "levels-by-genome", + "Sequence file containing minimizers, only " + "those minimizers will be considered for " + "determining the expression thresholds."); + + parser.add_flag(minimiser_args.ram_friendly, + '\0', + "ram", + "If ram is set and multiple threads are 
used, the multithreading" + " is more RAM friendly at the cost of being slower."); try { @@ -225,7 +274,6 @@ int run_needle_ibf(seqan3::argument_parser & parser) sequence_files = {}; read_input_file_list(sequence_files, input_file); } - } catch (seqan3::argument_parser_error const & ext) { @@ -237,7 +285,7 @@ int run_needle_ibf(seqan3::argument_parser & parser) { ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash); } - catch (const std::invalid_argument & e) + catch (std::invalid_argument const & e) { std::cerr << e.what() << std::endl; return -1; @@ -260,19 +308,32 @@ int run_needle_insert(seqan3::argument_parser & parser) initialise_arguments_minimiser(parser, minimiser_args, cutoffs); parser.info.short_description = "Inserts into a given uncompressed Needle index."; - parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, the IBFS are compressed. Default: Not compressed."); + parser.add_flag(ibf_args.compressed, + 'c', + "compressed", + "If c is set, the IBFS are compressed. Default: Not compressed."); parser.add_option(ibf_args.threads, 't', "threads", "Number of threads to use. 
Default: 1."); parser.add_option(path_in, 'i', "in", "Directory where input files can be found."); - parser.add_positional_option(sequence_files, "Please provide at least one sequence file OR provide one file " - "containing all sequence files with the extension '.lst'."); - parser.add_option(minimiser_args.experiment_names, '\0', "experiment-names", "If set, names of the experiments are stored" - " in a txt file."); - parser.add_option(expression_by_genome_file, '\0', "levels-by-genome", "Sequence file containing minimizers, only " - "those minimizers will be considered for " - "determining the expression thresholds."); - - parser.add_flag(minimiser_args.ram_friendly, '\0', "ram", "If ram is set and multiple threads are used, the multithreading" - " is more RAM friendly at the cost of being slower."); + parser.add_positional_option(sequence_files, + "Please provide at least one sequence file OR provide one file " + "containing all sequence files with the extension '.lst'."); + parser.add_option(minimiser_args.experiment_names, + '\0', + "experiment-names", + "If set, names of the experiments are stored" + " in a txt file."); + parser.add_option(expression_by_genome_file, + '\0', + "levels-by-genome", + "Sequence file containing minimizers, only " + "those minimizers will be considered for " + "determining the expression thresholds."); + + parser.add_flag(minimiser_args.ram_friendly, + '\0', + "ram", + "If ram is set and multiple threads are used, the multithreading" + " is more RAM friendly at the cost of being slower."); try { @@ -285,7 +346,6 @@ int run_needle_insert(seqan3::argument_parser & parser) sequence_files = {}; read_input_file_list(sequence_files, input_file); } - } catch (seqan3::argument_parser_error const & ext) { @@ -297,7 +357,7 @@ int run_needle_insert(seqan3::argument_parser & parser) { insert(sequence_files, ibf_args, minimiser_args, cutoffs, expression_by_genome_file, path_in, samplewise); } - catch (const std::invalid_argument & e) + catch 
(std::invalid_argument const & e) { std::cerr << e.what() << std::endl; return -1; @@ -317,14 +377,18 @@ int run_needle_ibf_min(seqan3::argument_parser & parser) parser.info.short_description = "Constructs the Needle index from the minimiser files created by needle minimiser."; - parser.add_positional_option(minimiser_files, "Please provide at least one minimiser file OR provide one file " - "containing all minimiser files with the extension '.lst'."); + parser.add_positional_option(minimiser_files, + "Please provide at least one minimiser file OR provide one file " + "containing all minimiser files with the extension '.lst'."); parser.add_option(ibf_args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_option(ibf_args.threads, 't', "threads", "Number of threads to use. Default: 1."); - parser.add_option(expression_by_genome_file, '\0', "levels-by-genome", "Sequence file containing minimizers, only " - "those minimizers will be considered for " - "determining the expression thresholds."); + parser.add_option(expression_by_genome_file, + '\0', + "levels-by-genome", + "Sequence file containing minimizers, only " + "those minimizers will be considered for " + "determining the expression thresholds."); initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); @@ -348,7 +412,7 @@ int run_needle_ibf_min(seqan3::argument_parser & parser) { ibf(minimiser_files, ibf_args, fpr, expression_by_genome_file, num_hash); } - catch (const std::invalid_argument & e) + catch (std::invalid_argument const & e) { std::cerr << e.what() << std::endl; return -1; @@ -368,16 +432,23 @@ int run_needle_insert_min(seqan3::argument_parser & parser) parser.info.short_description = "Constructs the Needle index from the minimiser files created by needle minimiser."; - parser.add_positional_option(minimiser_files, "Please provide at least one minimiser file OR provide one file " - "containing all minimiser files with the extension '.lst'."); + 
parser.add_positional_option(minimiser_files, + "Please provide at least one minimiser file OR provide one file " + "containing all minimiser files with the extension '.lst'."); parser.add_option(ibf_args.path_out, 'o', "out", "Directory, where output files should be saved."); - parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, the IBFS are compressed. Default: Not compressed."); + parser.add_flag(ibf_args.compressed, + 'c', + "compressed", + "If c is set, the IBFS are compressed. Default: Not compressed."); parser.add_option(ibf_args.threads, 't', "threads", "Number of threads to use. Default: 1."); parser.add_option(path_in, 'i', "in", "Directory where input files can be found."); - parser.add_option(expression_by_genome_file, '\0', "levels-by-genome", "Sequence file containing minimizers, only " - "those minimizers will be considered for " - "determining the expression thresholds."); + parser.add_option(expression_by_genome_file, + '\0', + "levels-by-genome", + "Sequence file containing minimizers, only " + "those minimizers will be considered for " + "determining the expression thresholds."); try { @@ -399,9 +470,9 @@ int run_needle_insert_min(seqan3::argument_parser & parser) try { - insert(minimiser_files, ibf_args,expression_by_genome_file, path_in, samplewise); + insert(minimiser_files, ibf_args, expression_by_genome_file, path_in, samplewise); } - catch (const std::invalid_argument & e) + catch (std::invalid_argument const & e) { std::cerr << e.what() << std::endl; return -1; @@ -422,7 +493,10 @@ int run_needle_delete_bin(seqan3::argument_parser & parser) parser.add_positional_option(delete_files, "Please provide at one position to be deleted."); parser.add_option(ibf_args.path_out, 'o', "out", "Directory, where output files should be saved."); - parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, the IBFS are compressed. 
Default: Not compressed."); + parser.add_flag(ibf_args.compressed, + 'c', + "compressed", + "If c is set, the IBFS are compressed. Default: Not compressed."); parser.add_option(ibf_args.threads, 't', "threads", "Number of threads to use. Default: 1."); parser.add_option(path_in, 'i', "in", "Directory where input files can be found."); @@ -442,7 +516,7 @@ int run_needle_delete_bin(seqan3::argument_parser & parser) samplewise = true; delete_bin(delete_files, ibf_args, path_in, samplewise); } - catch (const std::invalid_argument & e) + catch (std::invalid_argument const & e) { std::cerr << e.what() << std::endl; return -1; @@ -451,7 +525,6 @@ int run_needle_delete_bin(seqan3::argument_parser & parser) return 0; } - int run_needle_minimiser(seqan3::argument_parser & parser) { min_arguments args{}; @@ -463,11 +536,15 @@ int run_needle_minimiser(seqan3::argument_parser & parser) std::filesystem::path input_file{}; parser.info.short_description = "Calculates minimiser for given experiments."; - parser.add_positional_option(sequence_files, "Please provide at least one sequence file OR provide one file " - "containing all sequence files with the extension '.lst'."); + parser.add_positional_option(sequence_files, + "Please provide at least one sequence file OR provide one file " + "containing all sequence files with the extension '.lst'."); - parser.add_flag(minimiser_args.ram_friendly, '\0', "ram", "If ram is set and multiple threads are used, the multithreading" - " is more RAM friendly at the cost of being slower."); + parser.add_flag(minimiser_args.ram_friendly, + '\0', + "ram", + "If ram is set and multiple threads are used, the multithreading" + " is more RAM friendly at the cost of being slower."); try { @@ -488,7 +565,7 @@ int run_needle_minimiser(seqan3::argument_parser & parser) { minimiser(sequence_files, args, minimiser_args, cutoffs); } - catch (const std::invalid_argument & e) + catch (std::invalid_argument const & e) { std::cerr << e.what() << std::endl; 
return -1; @@ -499,8 +576,12 @@ int run_needle_minimiser(seqan3::argument_parser & parser) int main(int argc, char const ** argv) { - seqan3::argument_parser needle_parser{"needle", argc, argv, seqan3::update_notifications::on, - {"count", "delete","estimate", "genome", "ibf", "ibfmin", "insert", "insertmin", "minimiser"}}; + seqan3::argument_parser needle_parser{ + "needle", + argc, + argv, + seqan3::update_notifications::on, + {"count", "delete", "estimate", "genome", "ibf", "ibfmin", "insert", "insertmin", "minimiser"}}; needle_parser.info.description.push_back("Needle allows you to build an Interleaved Bloom Filter (IBF) with the " "command ibf or estimate the expression of transcripts with the command " "estimate."); diff --git a/test/api/count_test.cpp b/test/api/count_test.cpp index 0b43380..565b6f8 100644 --- a/test/api/count_test.cpp +++ b/test/api/count_test.cpp @@ -1,11 +1,12 @@ #include + #include #include -#include "ibf.h" -#include "shared.h" #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" // To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: struct count_test : public app_test @@ -26,17 +27,16 @@ TEST_F(count_test, small_example) estimate_ibf_arguments args{}; initialization_args(args); - count(args, {data("mini_example.fasta")}, data("mini_gen.fasta"), - data("mini_gen.genome"), false); + count(args, {data("mini_example.fasta")}, data("mini_gen.fasta"), data("mini_gen.genome"), false); std::ifstream output_file("mini_example.count.out"); std::string line; std::string expected{"gen1\t3"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -47,17 +47,20 @@ TEST_F(count_test, small_example_paired) estimate_ibf_arguments args{}; initialization_args(args); - count(args, {data("mini_example.fasta"), data("mini_example.fasta")}, - 
data("mini_gen.fasta"), data("mini_gen.genome"), true); + count(args, + {data("mini_example.fasta"), data("mini_example.fasta")}, + data("mini_gen.fasta"), + data("mini_gen.genome"), + true); std::ifstream output_file("mini_example.count.out"); std::string line; std::string expected{"gen1\t6"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -68,21 +71,19 @@ TEST_F(count_test, small_example_exclude) estimate_ibf_arguments args{}; initialization_args(args); - count(args, {data("mini_example.fasta")}, data("mini_gen.fasta"), - data("mini_gen2.genome"), false); + count(args, {data("mini_example.fasta")}, data("mini_gen.fasta"), data("mini_gen2.genome"), false); std::ifstream output_file("mini_example.count.out"); std::string line; std::string expected{"gen1\t3"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } - } TEST_F(count_test, genome_small_example) @@ -96,7 +97,7 @@ TEST_F(count_test, genome_small_example) uint64_t expected{192}; output_file.open("mini_gen.genome", std::ios::binary); uint64_t minimiser; - while(output_file.read((char*)&minimiser, sizeof(minimiser))) + while (output_file.read((char *)&minimiser, sizeof(minimiser))) { EXPECT_EQ(expected, minimiser); } diff --git a/test/api/estimate_test.cpp b/test/api/estimate_test.cpp index d0441c6..6d7ad28 100644 --- a/test/api/estimate_test.cpp +++ b/test/api/estimate_test.cpp @@ -1,12 +1,13 @@ #include + #include #include -#include "ibf.h" -#include "shared.h" -#include "estimate.h" #include "../app_test.hpp" +#include "estimate.hpp" +#include "ibf.hpp" +#include "shared.hpp" // To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: struct estimate_test : public app_test 
@@ -44,9 +45,9 @@ TEST_F(estimate_test, small_example) std::string expected{"gen1\t3\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -75,9 +76,9 @@ TEST_F(estimate_test, small_example_uncompressed) std::string expected{"gen1\t3\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -105,9 +106,9 @@ TEST_F(estimate_test, small_example_gene_not_found) std::string expected{"gen2\t0\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -127,7 +128,7 @@ TEST_F(estimate_test, small_example_different_expressions_per_level) minimiser(sequence_files, ibf_args, minimiser_args, cutoffs); std::vector minimiser_files{"Estimate_Test_mini_example.minimiser"}; ASSERT_TRUE(std::filesystem::exists(minimiser_files[0])); - ibf_args.expression_thresholds= {}; + ibf_args.expression_thresholds = {}; ibf(minimiser_files, ibf_args, fpr); ibf_args.expression_thresholds = {0, 1, 2}; @@ -141,9 +142,9 @@ TEST_F(estimate_test, small_example_different_expressions_per_level) std::string expected{"gen1\t3\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -164,7 +165,7 @@ TEST_F(estimate_test, small_example_different_expressions_per_level_normalizatio minimiser(sequence_files, ibf_args, minimiser_args, cutoffs); std::vector minimiser_files{"Estimate_Test_mini_example.minimiser"}; ASSERT_TRUE(std::filesystem::exists(minimiser_files[0])); - ibf_args.expression_thresholds= {}; + 
ibf_args.expression_thresholds = {}; ibf(minimiser_files, ibf_args, fpr); ibf_args.expression_thresholds = {0, 1, 2}; @@ -178,9 +179,9 @@ TEST_F(estimate_test, small_example_different_expressions_per_level_normalizatio std::string expected{"gen1\t1\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -202,7 +203,7 @@ TEST_F(estimate_test, small_example_different_expressions_per_level_normalizatio minimiser(sequence_files, ibf_args, minimiser_args, cutoffs); std::vector minimiser_files{"Estimate_Test_mini_example.minimiser"}; ASSERT_TRUE(std::filesystem::exists(minimiser_files[0])); - ibf_args.expression_thresholds= {}; + ibf_args.expression_thresholds = {}; ibf(minimiser_files, ibf_args, fpr); ibf_args.expression_thresholds = {0, 1, 2}; @@ -216,9 +217,9 @@ TEST_F(estimate_test, small_example_different_expressions_per_level_normalizatio std::string expected{"gen1\t1\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -230,8 +231,10 @@ TEST_F(estimate_test, example) minimiser_arguments minimiser_args{}; estimate_arguments estimate_args{}; std::vector fpr = {0.05}; - std::vector sequence_files = {data("exp_01.fasta"), data("exp_02.fasta"), - data("exp_11.fasta"), data("exp_12.fasta")}; + std::vector sequence_files = {data("exp_01.fasta"), + data("exp_02.fasta"), + data("exp_11.fasta"), + data("exp_12.fasta")}; minimiser_args.samples = {2, 2}; ibf_args.expression_thresholds = {4, 32}; ibf_args.compressed = false; @@ -248,9 +251,9 @@ TEST_F(estimate_test, example) std::string expected{"GeneA\t9\t32\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, 
line); } output_file.close(); } @@ -261,9 +264,11 @@ TEST_F(estimate_test, example_multiple_threads) estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; estimate_arguments estimate_args{}; - std::vector sequence_files = {data("exp_01.fasta"), data("exp_02.fasta"), - data("exp_11.fasta"), data("exp_12.fasta")}; - minimiser_args.samples = {2,2}; + std::vector sequence_files = {data("exp_01.fasta"), + data("exp_02.fasta"), + data("exp_11.fasta"), + data("exp_12.fasta")}; + minimiser_args.samples = {2, 2}; ibf_args.expression_thresholds = {4, 32}; std::vector fpr = {0.05}; ibf_args.compressed = false; @@ -281,9 +286,9 @@ TEST_F(estimate_test, example_multiple_threads) std::string expected{"GeneA\t9\t32\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected,line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -294,10 +299,12 @@ TEST_F(estimate_test, example_different_expressions_per_level) estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; estimate_arguments estimate_args{}; - std::vector sequence_files = {data("exp_01.fasta"), data("exp_02.fasta"), - data("exp_11.fasta"), data("exp_12.fasta")}; + std::vector sequence_files = {data("exp_01.fasta"), + data("exp_02.fasta"), + data("exp_11.fasta"), + data("exp_12.fasta")}; std::vector cutoffs = {0, 0}; - minimiser_args.samples = {2,2}; + minimiser_args.samples = {2, 2}; ibf_args.number_expression_thresholds = 4; std::vector fpr = {0.05}; ibf_args.compressed = false; @@ -319,9 +326,9 @@ TEST_F(estimate_test, example_different_expressions_per_level) std::string expected{"GeneA\t7\t26\t"}; if (output_file.is_open()) { - while ( std::getline (output_file, line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected, line); + EXPECT_EQ(expected, line); } output_file.close(); } @@ -332,10 +339,12 @@ TEST_F(estimate_test, example_different_expressions_per_level_multiple_threads) 
estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; estimate_arguments estimate_args{}; - std::vector sequence_files = {data("exp_01.fasta"), data("exp_02.fasta"), - data("exp_11.fasta"), data("exp_12.fasta")}; + std::vector sequence_files = {data("exp_01.fasta"), + data("exp_02.fasta"), + data("exp_11.fasta"), + data("exp_12.fasta")}; std::vector cutoffs = {0, 0}; - minimiser_args.samples = {2,2}; + minimiser_args.samples = {2, 2}; ibf_args.number_expression_thresholds = 4; std::vector fpr = {0.05}; ibf_args.compressed = false; @@ -343,7 +352,7 @@ TEST_F(estimate_test, example_different_expressions_per_level_multiple_threads) std::vector minimiser_files{"exp_01.minimiser", "exp_11.minimiser"}; ASSERT_TRUE(std::filesystem::exists(minimiser_files[0])); ASSERT_TRUE(std::filesystem::exists(minimiser_files[1])); - ibf_args.expression_thresholds= {}; + ibf_args.expression_thresholds = {}; ibf(minimiser_files, ibf_args, fpr); ibf_args.threads = 2; @@ -359,9 +368,9 @@ TEST_F(estimate_test, example_different_expressions_per_level_multiple_threads) std::string expected{"GeneA\t7\t26\t"}; if (output_file.is_open()) { - while ( std::getline (output_file,line) ) + while (std::getline(output_file, line)) { - EXPECT_EQ(expected, line); + EXPECT_EQ(expected, line); } output_file.close(); } diff --git a/test/api/ibf_test.cpp b/test/api/ibf_test.cpp index d5de94e..740f837 100644 --- a/test/api/ibf_test.cpp +++ b/test/api/ibf_test.cpp @@ -1,11 +1,12 @@ #include + #include #include -#include "ibf.h" -#include "shared.h" #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" // To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: struct ibf_test : public app_test @@ -46,10 +47,10 @@ TEST_F(ibf_test, given_expression_thresholds) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto 
& res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } ASSERT_TRUE(std::filesystem::exists("IBF_Test_Exp_IBF_Data")); @@ -93,10 +94,10 @@ TEST_F(ibf_test, given_expression_thresholds_include_file) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } } @@ -126,10 +127,10 @@ TEST_F(ibf_test, given_expression_thresholds_exclude_file) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } } @@ -159,10 +160,10 @@ TEST_F(ibf_test, no_given_expression_thresholds) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } } @@ -179,8 +180,7 @@ TEST_F(ibf_test, expression_thresholds_by_genome) std::vector expected{}; std::vector cutoffs{}; - std::vector medians = ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, - data("mini_gen.fasta")); + std::vector medians = ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, data("mini_gen.fasta")); EXPECT_EQ(expected, medians); @@ -193,10 +193,10 @@ TEST_F(ibf_test, expression_thresholds_by_genome) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(192); - 
EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } } @@ -248,10 +248,10 @@ TEST_F(ibf_test, given_cutoffs) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } estimate_ibf_arguments args{}; @@ -295,10 +295,10 @@ TEST_F(ibf_test, different_file_sizes) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; expected_result[1] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } } diff --git a/test/api/ibfmin_test.cpp b/test/api/ibfmin_test.cpp index 6f0d3c4..ce03d07 100644 --- a/test/api/ibfmin_test.cpp +++ b/test/api/ibfmin_test.cpp @@ -1,11 +1,12 @@ #include + #include #include -#include "ibf.h" -#include "shared.h" #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" // To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: struct ibfmin_test : public app_test @@ -44,12 +45,11 @@ TEST_F(ibfmin_test, given_expression_thresholds) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(97); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } - } TEST_F(ibfmin_test, given_expression_thresholds_multiple_threads) @@ -75,10 +75,10 @@ TEST_F(ibfmin_test, given_expression_thresholds_multiple_threads) std::vector expected_result(128, 0); auto & res = agent.bulk_contains(97); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); 
std::vector expected_result2(128, 1); auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result2, res2); + EXPECT_RANGE_EQ(expected_result2, res2); } TEST_F(ibfmin_test, no_given_expression_thresholds) @@ -103,12 +103,11 @@ TEST_F(ibfmin_test, no_given_expression_thresholds) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } - } TEST_F(ibfmin_test, expression_thresholds_by_genome) @@ -133,12 +132,11 @@ TEST_F(ibfmin_test, expression_thresholds_by_genome) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } - } TEST_F(ibfmin_test, no_given_expression_thresholds_multiple_threads) @@ -164,10 +162,10 @@ TEST_F(ibfmin_test, no_given_expression_thresholds_multiple_threads) std::vector expected_result(128, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); std::vector expected_result2(128, 1); auto & res2 = agent.bulk_contains(24); - EXPECT_RANGE_EQ(expected_result2, res2); + EXPECT_RANGE_EQ(expected_result2, res2); } TEST_F(ibfmin_test, different_shape) @@ -198,10 +196,9 @@ TEST_F(ibfmin_test, different_shape) std::vector expected_result(1, 0); auto & res = agent.bulk_contains(97); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(4); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); } - } diff --git a/test/api/insert_delete_test.cpp b/test/api/insert_delete_test.cpp 
index 3c68b56..bdd0172 100644 --- a/test/api/insert_delete_test.cpp +++ b/test/api/insert_delete_test.cpp @@ -1,11 +1,12 @@ #include + #include #include -#include "ibf.h" -#include "shared.h" #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" // To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: struct delete_test : public app_test @@ -23,7 +24,7 @@ struct delete_test : public app_test struct insert_test : public delete_test { // Reads the level file ibf creates - template + template void read_levels(std::vector> & expressions, std::filesystem::path filename) { ASSERT_TRUE(std::filesystem::exists(filename)) << filename; @@ -41,13 +42,13 @@ struct insert_test : public delete_test if (j == expressions.size()) expressions.push_back(empty_vector); std::ranges::copy(stream_view | seqan3::detail::take_until_or_throw(seqan3::is_char<' '>), - std::back_inserter(buffer)); - if constexpr(std::same_as) - expressions[j].push_back((uint16_t) std::stoi(buffer)); + std::back_inserter(buffer)); + if constexpr (std::same_as) + expressions[j].push_back((uint16_t)std::stoi(buffer)); else - expressions[j].push_back((double) std::stod(buffer)); + expressions[j].push_back((double)std::stod(buffer)); buffer.clear(); - if(*stream_it != '/') + if (*stream_it != '/') ++stream_it; if (*stream_it == '\n') @@ -55,7 +56,8 @@ struct insert_test : public delete_test ++stream_it; j++; } - } while (*stream_it != '/'); + } + while (*stream_it != '/'); ++stream_it; fin.close(); @@ -65,7 +67,7 @@ struct insert_test : public delete_test TEST_F(delete_test, no_given_thresholds) { std::vector fpr = {0.05}; - std::vector cutoffs_delete{0,0}; + std::vector cutoffs_delete{0, 0}; estimate_ibf_arguments ibf_args_delete{}; minimiser_arguments minimiser_args_delete{}; initialization_args(ibf_args_delete); @@ -73,19 +75,19 @@ TEST_F(delete_test, no_given_thresholds) ibf_args_delete.number_expression_thresholds = 2; 
minimiser_args_delete.experiment_names = false; ibf_args_delete.path_out = "IBF_delete_Exp_"; - std::vector sequence_files_delete = {data("mini_example.fasta"),data("mini_example.fasta")}; + std::vector sequence_files_delete = {data("mini_example.fasta"), data("mini_example.fasta")}; ibf(sequence_files_delete, ibf_args_delete, minimiser_args_delete, fpr, cutoffs_delete); seqan3::interleaved_bloom_filter ibf{}; load_ibf(ibf, "IBF_delete_Exp_IBF_Level_0"); seqan3::interleaved_bloom_filter ibf_0{seqan3::bin_count{2u}, - seqan3::bin_size{ibf.bin_size()}, - seqan3::hash_function_count{1u}}; + seqan3::bin_size{ibf.bin_size()}, + seqan3::hash_function_count{1u}}; load_ibf(ibf, "IBF_delete_Exp_IBF_Level_1"); seqan3::interleaved_bloom_filter ibf_1{seqan3::bin_count{2u}, - seqan3::bin_size{ibf.bin_size()}, - seqan3::hash_function_count{1u}}; + seqan3::bin_size{ibf.bin_size()}, + seqan3::hash_function_count{1u}}; - delete_bin({0,1}, ibf_args_delete, "IBF_delete_Exp_", true); + delete_bin({0, 1}, ibf_args_delete, "IBF_delete_Exp_", true); seqan3::interleaved_bloom_filter ibf_delete{}; @@ -107,7 +109,7 @@ TEST_F(insert_test, ibf) minimiser_args.experiment_names = false; std::vector sequence_files = {data("mini_example.fasta"), data("mini_example.fasta")}; std::vector fpr = {0.05}; - std::vector cutoffs{0,0}; + std::vector cutoffs{0, 0}; ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); @@ -140,7 +142,7 @@ TEST_F(insert_test, ibf) read_levels(fpr_ibf, "IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } TEST_F(insert_test, ibf_no_given_thresholds) @@ -156,7 +158,7 @@ TEST_F(insert_test, ibf_no_given_thresholds) std::vector fpr = {0.05}; std::vector expected{}; - std::vector cutoffs{0,0}; + std::vector cutoffs{0, 0}; std::vector medians = ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); @@ -169,7 +171,8 @@ TEST_F(insert_test, 
ibf_no_given_thresholds) minimiser_args_insert.experiment_names = false; ibf_args_insert.path_out = "IBF_Insert_Exp_"; std::vector sequence_files_insert = {data("mini_example.fasta")}; - std::vector medians_insert = ibf(sequence_files_insert, ibf_args_insert, minimiser_args_insert, fpr, cutoffs_insert); + std::vector medians_insert = + ibf(sequence_files_insert, ibf_args_insert, minimiser_args_insert, fpr, cutoffs_insert); insert(sequence_files_insert, ibf_args_insert, minimiser_args_insert, cutoffs_insert, "", "IBF_Insert_Exp_", true); seqan3::interleaved_bloom_filter ibf; seqan3::interleaved_bloom_filter ibf_insert; @@ -186,13 +189,13 @@ TEST_F(insert_test, ibf_no_given_thresholds) read_levels(expressions_ibf, "IBF_True_Exp_IBF_Levels.levels"); std::vector> expressions_insert{}; read_levels(expressions_insert, "IBF_Insert_Exp_IBF_Levels.levels"); - EXPECT_EQ(expressions_ibf,expressions_insert); + EXPECT_EQ(expressions_ibf, expressions_insert); std::vector> fpr_ibf{}; read_levels(fpr_ibf, "IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } TEST_F(insert_test, ibf_delete) @@ -204,9 +207,11 @@ TEST_F(insert_test, ibf_delete) ibf_args.path_out = "IBF_True_Exp_"; ibf_args.expression_thresholds = {1, 2}; minimiser_args.experiment_names = false; - std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta"), data("mini_example.fasta")}; - std::vector fpr = {0.05,0.05}; - std::vector cutoffs{0,0,0}; + std::vector sequence_files = {data("mini_example.fasta"), + data("mini_example2.fasta"), + data("mini_example.fasta")}; + std::vector fpr = {0.05, 0.05}; + std::vector cutoffs{0, 0, 0}; ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); @@ -219,7 +224,9 @@ TEST_F(insert_test, ibf_delete) ibf_args_insert.expression_thresholds = {1, 2}; minimiser_args_insert.experiment_names = false; std::vector 
sequence_files_insert = {data("mini_example2.fasta")}; - std::vector sequence_files_test = {data("mini_example.fasta"), data("mini_example2.fasta"), data("mini_example.fasta")}; + std::vector sequence_files_test = {data("mini_example.fasta"), + data("mini_example2.fasta"), + data("mini_example.fasta")}; ibf(sequence_files_test, ibf_args_insert, minimiser_args, fpr, cutoffs); delete_bin({1}, ibf_args_insert, ibf_args_insert.path_out, false); @@ -240,10 +247,9 @@ TEST_F(insert_test, ibf_delete) read_levels(fpr_ibf, "IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } - TEST_F(insert_test, ibf_delete_no_given_threshold) { estimate_ibf_arguments ibf_args{}; @@ -253,9 +259,11 @@ TEST_F(insert_test, ibf_delete_no_given_threshold) ibf_args.path_out = "IBF_True_Exp_"; ibf_args.number_expression_thresholds = 2; minimiser_args.experiment_names = false; - std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta"), data("mini_example.fasta")}; - std::vector fpr = {0.05,0.05}; - std::vector cutoffs{0,0,0}; + std::vector sequence_files = {data("mini_example.fasta"), + data("mini_example2.fasta"), + data("mini_example.fasta")}; + std::vector fpr = {0.05, 0.05}; + std::vector cutoffs{0, 0, 0}; ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); @@ -268,7 +276,9 @@ TEST_F(insert_test, ibf_delete_no_given_threshold) ibf_args_insert.number_expression_thresholds = 2; minimiser_args_insert.experiment_names = false; std::vector sequence_files_insert = {data("mini_example2.fasta")}; - std::vector sequence_files_test = {data("mini_example.fasta"), data("mini_example2.fasta"), data("mini_example.fasta")}; + std::vector sequence_files_test = {data("mini_example.fasta"), + data("mini_example2.fasta"), + data("mini_example.fasta")}; ibf(sequence_files_test, ibf_args_insert, minimiser_args, fpr, cutoffs); delete_bin({1}, 
ibf_args_insert, ibf_args_insert.path_out, true); @@ -290,13 +300,13 @@ TEST_F(insert_test, ibf_delete_no_given_threshold) read_levels(expressions_ibf, "IBF_True_Exp_IBF_Levels.levels"); std::vector> expressions_insert{}; read_levels(expressions_insert, "IBF_Insert_Exp_IBF_Levels.levels"); - EXPECT_EQ(expressions_ibf,expressions_insert); + EXPECT_EQ(expressions_ibf, expressions_insert); std::vector> fpr_ibf{}; read_levels(fpr_ibf, "IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } TEST_F(insert_test, ibfmin) @@ -307,7 +317,8 @@ TEST_F(insert_test, ibfmin) std::vector fpr = {0.05, 0.05}; ibf_args.path_out = "IBFMIN_Test_Given_"; ibf_args.compressed = false; - std::vector minimiser_file = {data("mini_example.minimiser"), data("mini_example.minimiser")}; + std::vector minimiser_file = {data("mini_example.minimiser"), + data("mini_example.minimiser")}; ibf(minimiser_file, ibf_args, fpr); estimate_ibf_arguments ibf_args_insert{}; @@ -317,7 +328,7 @@ TEST_F(insert_test, ibfmin) ibf_args_insert.compressed = false; std::vector minimiser_file_insert = {data("mini_example.minimiser")}; ibf(minimiser_file_insert, ibf_args_insert, fpr); - insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_", false); + insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_", false); seqan3::interleaved_bloom_filter ibf; seqan3::interleaved_bloom_filter ibf_insert; @@ -334,7 +345,7 @@ TEST_F(insert_test, ibfmin) read_levels(fpr_ibf, "IBFMIN_Test_Given_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBFMIN_Insert_Given_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } TEST_F(insert_test, ibfmin_delete) @@ -345,7 +356,9 @@ TEST_F(insert_test, ibfmin_delete) std::vector fpr = {0.05, 0.05}; ibf_args.path_out = "IBFMIN_Test_Given_"; ibf_args.compressed = false; - std::vector 
minimiser_file = {data("mini_example.minimiser"), data("mini_example.minimiser"), data("mini_example.minimiser")}; + std::vector minimiser_file = {data("mini_example.minimiser"), + data("mini_example.minimiser"), + data("mini_example.minimiser")}; ibf(minimiser_file, ibf_args, fpr); estimate_ibf_arguments ibf_args_insert{}; @@ -356,7 +369,7 @@ TEST_F(insert_test, ibfmin_delete) std::vector minimiser_file_insert = {data("mini_example.minimiser")}; ibf(minimiser_file, ibf_args_insert, fpr); delete_bin({1}, ibf_args_insert, ibf_args_insert.path_out, false); - insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_", false); + insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_", false); seqan3::interleaved_bloom_filter ibf; seqan3::interleaved_bloom_filter ibf_insert; @@ -373,7 +386,7 @@ TEST_F(insert_test, ibfmin_delete) read_levels(fpr_ibf, "IBFMIN_Test_Given_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBFMIN_Insert_Given_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } TEST_F(insert_test, ibfmin_no_given_thresholds) @@ -384,7 +397,8 @@ TEST_F(insert_test, ibfmin_no_given_thresholds) std::vector fpr = {0.05, 0.05}; ibf_args.path_out = "IBFMIN_Test_Given_"; ibf_args.compressed = false; - std::vector minimiser_file = {data("mini_example.minimiser"), data("mini_example.minimiser")}; + std::vector minimiser_file = {data("mini_example.minimiser"), + data("mini_example.minimiser")}; ibf(minimiser_file, ibf_args, fpr); @@ -396,7 +410,7 @@ TEST_F(insert_test, ibfmin_no_given_thresholds) fpr = {0.05}; std::vector minimiser_file_insert = {data("mini_example.minimiser")}; ibf(minimiser_file_insert, ibf_args_insert, fpr); - insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_", true); + insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_", true); seqan3::interleaved_bloom_filter ibf; seqan3::interleaved_bloom_filter ibf_insert; @@ -413,13 
+427,13 @@ TEST_F(insert_test, ibfmin_no_given_thresholds) read_levels(expressions_ibf, "IBFMIN_Test_Given_IBF_Levels.levels"); std::vector> expressions_insert{}; read_levels(expressions_insert, "IBFMIN_Insert_Given_IBF_Levels.levels"); - EXPECT_EQ(expressions_ibf,expressions_insert); + EXPECT_EQ(expressions_ibf, expressions_insert); std::vector> fpr_ibf{}; read_levels(fpr_ibf, "IBFMIN_Test_Given_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBFMIN_Insert_Given_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } TEST_F(insert_test, delete_ibfmin_no_given_thresholds) @@ -430,7 +444,9 @@ TEST_F(insert_test, delete_ibfmin_no_given_thresholds) std::vector fpr = {0.05, 0.05}; ibf_args.path_out = "IBFMIN_Test_Given_Del_"; ibf_args.compressed = false; - std::vector minimiser_file = {data("mini_example.minimiser"), data("mini_example.minimiser"), data("mini_example.minimiser")}; + std::vector minimiser_file = {data("mini_example.minimiser"), + data("mini_example.minimiser"), + data("mini_example.minimiser")}; ibf(minimiser_file, ibf_args, fpr); @@ -443,7 +459,7 @@ TEST_F(insert_test, delete_ibfmin_no_given_thresholds) std::vector minimiser_file_insert = {data("mini_example.minimiser")}; ibf(minimiser_file, ibf_args_insert, fpr); delete_bin({1}, ibf_args_insert, ibf_args_insert.path_out, true); - insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_Del_", true); + insert(minimiser_file_insert, ibf_args_insert, "", "IBFMIN_Insert_Given_Del_", true); seqan3::interleaved_bloom_filter ibf; seqan3::interleaved_bloom_filter ibf_insert; @@ -460,11 +476,11 @@ TEST_F(insert_test, delete_ibfmin_no_given_thresholds) read_levels(expressions_ibf, "IBFMIN_Test_Given_Del_IBF_Levels.levels"); std::vector> expressions_insert{}; read_levels(expressions_insert, "IBFMIN_Insert_Given_Del_IBF_Levels.levels"); - EXPECT_EQ(expressions_ibf,expressions_insert); + EXPECT_EQ(expressions_ibf, expressions_insert); std::vector> 
fpr_ibf{}; read_levels(fpr_ibf, "IBFMIN_Test_Given_Del_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, "IBFMIN_Insert_Given_Del_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert); + EXPECT_EQ(fpr_ibf, fpr_insert); } diff --git a/test/api/minimiser_test.cpp b/test/api/minimiser_test.cpp index 6c8d352..cf48248 100644 --- a/test/api/minimiser_test.cpp +++ b/test/api/minimiser_test.cpp @@ -1,11 +1,12 @@ #include + #include #include -#include "ibf.h" -#include "shared.h" #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" // To prevent issues when running multiple CLI tests in parallel, give each CLI test unique names: struct minimiser_test : public app_test @@ -20,33 +21,37 @@ struct minimiser_test : public app_test args.compressed = true; } - std::vector> expected_hash_tables{ // minimisers: - {{0,2}, // AAAA - {1,4}, // AAAC - {6,4}, // AACG - {24,1}, // ACGA - {27,5}, // ACGT - {97,3}, // CGAC - {108,2}, // CGTA - {109,3}, // CGTC - {112,3}, // CTAA - {177,1}, // GTAC - {192,3}, // TAAA - {216,1}, // TCGA - }, - {{27,1}, // ACGT - {42,1}, // AGGG - {74,1}, // CAGG - {82,1}, // CCAG - {84,1}, // CCCA - {85,19}, // CCCC - {86,1}, // CCCG - {109,1}, // CGTC - {149,2}, // GCCC - {161,1}, // GGAC - {165,1}, // GGCC - {168,1}, // GGGA - },}; + std::vector> expected_hash_tables{ + // minimisers: + { + {0, 2}, // AAAA + {1, 4}, // AAAC + {6, 4}, // AACG + {24, 1}, // ACGA + {27, 5}, // ACGT + {97, 3}, // CGAC + {108, 2}, // CGTA + {109, 3}, // CGTC + {112, 3}, // CTAA + {177, 1}, // GTAC + {192, 3}, // TAAA + {216, 1}, // TCGA + }, + { + {27, 1}, // ACGT + {42, 1}, // AGGG + {74, 1}, // CAGG + {82, 1}, // CCAG + {84, 1}, // CCCA + {85, 19}, // CCCC + {86, 1}, // CCCG + {109, 1}, // CGTC + {149, 2}, // GCCC + {161, 1}, // GGAC + {165, 1}, // GGCC + {168, 1}, // GGGA + }, + }; }; TEST_F(minimiser_test, small_example) @@ -57,8 +62,7 @@ TEST_F(minimiser_test, small_example) std::vector cutoffs = {0, 0}; args.expression_thresholds = {0}; 
std::vector fpr = {0.05}; - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); robin_hood::unordered_node_map result_hash_table{}; std::vector minimiser_files{}; @@ -69,7 +73,10 @@ TEST_F(minimiser_test, small_example) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -96,14 +103,13 @@ TEST_F(minimiser_test, small_example) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(0); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); expected_result[1] = 1; auto & res3 = agent.bulk_contains(27); - EXPECT_RANGE_EQ(expected_result, res3); - + EXPECT_RANGE_EQ(expected_result, res3); } TEST_F(minimiser_test, small_example_different_shape) @@ -114,8 +120,7 @@ TEST_F(minimiser_test, small_example_different_shape) std::vector cutoffs = {0, 0}; args.shape = seqan3::bin_literal{0b1101}; EXPECT_EQ(13, args.shape.to_ulong()); - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); uint64_t num_of_minimisers{}; @@ -125,7 +130,10 @@ TEST_F(minimiser_test, small_example_different_shape) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + 
".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -134,7 +142,6 @@ TEST_F(minimiser_test, small_example_different_shape) EXPECT_EQ(expected_nums[i], num_of_minimisers); EXPECT_EQ(0, cutoff); } - } TEST_F(minimiser_test, small_example_samplewise) @@ -146,8 +153,7 @@ TEST_F(minimiser_test, small_example_samplewise) std::vector cutoffs = {0, 0}; args.number_expression_thresholds = 1; std::vector fpr = {0.05}; - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); std::vector> expected_counts{{7}, {12}}; @@ -162,7 +168,10 @@ TEST_F(minimiser_test, small_example_samplewise) uint8_t cutoff{}; // Test Header file args.expression_thresholds = {}; - read_binary_start(args, ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); EXPECT_EQ(0, args.s.get()); @@ -189,13 +198,13 @@ TEST_F(minimiser_test, small_example_samplewise) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); auto & res2 = agent.bulk_contains(0); expected_result[0] = 1; - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); expected_result[1] = 1; auto & res3 = agent.bulk_contains(27); - EXPECT_RANGE_EQ(expected_result, res3); + EXPECT_RANGE_EQ(expected_result, res3); } TEST_F(minimiser_test, cutoff_by_filesize) @@ -206,8 +215,7 @@ TEST_F(minimiser_test, cutoff_by_filesize) 
args.expression_thresholds = {0}; std::vector fpr = {0.05}; std::vector cutoffs{}; - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); @@ -219,7 +227,10 @@ TEST_F(minimiser_test, cutoff_by_filesize) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -238,7 +249,7 @@ TEST_F(minimiser_test, cutoff_by_filesize) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(0); EXPECT_RANGE_EQ(expected_result, res2); @@ -246,7 +257,6 @@ TEST_F(minimiser_test, cutoff_by_filesize) expected_result[1] = 1; auto & res3 = agent.bulk_contains(85); EXPECT_RANGE_EQ(expected_result, res3); - } TEST_F(minimiser_test, small_example_two_threads) @@ -258,8 +268,7 @@ TEST_F(minimiser_test, small_example_two_threads) std::vector cutoffs = {0, 0}; args.expression_thresholds = {0}; std::vector fpr = {0.05}; - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); args.threads = 1; robin_hood::unordered_node_map result_hash_table{}; @@ -271,7 +280,10 @@ TEST_F(minimiser_test, small_example_two_threads) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + 
read_binary_start(args, + ("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -299,14 +311,13 @@ TEST_F(minimiser_test, small_example_two_threads) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(0); EXPECT_RANGE_EQ(expected_result, res2); expected_result[1] = 1; auto & res3 = agent.bulk_contains(27); EXPECT_RANGE_EQ(expected_result, res3); - } TEST_F(minimiser_test, small_example_include) @@ -317,8 +328,7 @@ TEST_F(minimiser_test, small_example_include) args.path_out = "Minimiser_Test_In_"; std::vector cutoffs = {0, 0}; minimiser_args.include_file = data("mini_gen.fasta"); - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); robin_hood::unordered_node_map result_hash_table{}; std::vector minimiser_files{}; @@ -329,7 +339,10 @@ TEST_F(minimiser_test, small_example_include) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_In_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_In_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -341,7 +354,7 @@ TEST_F(minimiser_test, small_example_include) // Test binary file read_binary(("Minimiser_Test_In_" + std::string{sequence_files[i].stem()} + ".minimiser"), result_hash_table); minimiser_files.push_back(("Minimiser_Test_In_" + std::string{sequence_files[i].stem()} + ".minimiser")); - if (i==0) + if (i == 0) { for (auto & hash : result_hash_table) { @@ -356,7 
+369,6 @@ TEST_F(minimiser_test, small_example_include) result_hash_table.clear(); } - } TEST_F(minimiser_test, small_example_exclude) @@ -369,8 +381,7 @@ TEST_F(minimiser_test, small_example_exclude) minimiser_args.exclude_file = data("mini_gen2.fasta"); args.expression_thresholds = {0}; std::vector fpr = {0.05}; - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); robin_hood::unordered_node_map result_hash_table{}; std::vector minimiser_files{}; @@ -381,7 +392,10 @@ TEST_F(minimiser_test, small_example_exclude) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_Ex_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_Ex_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -412,37 +426,39 @@ TEST_F(minimiser_test, small_example_exclude) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(0); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); expected_result[1] = 1; auto & res3 = agent.bulk_contains(27); - EXPECT_RANGE_EQ(expected_result, res3); - + EXPECT_RANGE_EQ(expected_result, res3); } TEST_F(minimiser_test, small_example_shape) { - std::vector> expected_hash_tables_shape{ // minimisers: - { - {0,3}, // AA - {1,4}, // AC - {2,4}, // AG - {3,5}, // AT - {4,5}, // CA - {5,6}, // CC - {9,1}, // GC - {12,4}, // TA - }, - {{2,1}, // AT - {3,1}, // AG - {4,1}, // CA - {5,20}, // CC - {6,3}, // CG - {8,1}, // GA - {9,4}, // GC - },}; + std::vector> expected_hash_tables_shape{ + // minimisers: 
+ { + {0, 3}, // AA + {1, 4}, // AC + {2, 4}, // AG + {3, 5}, // AT + {4, 5}, // CA + {5, 6}, // CC + {9, 1}, // GC + {12, 4}, // TA + }, + { + {2, 1}, // AG + {3, 1}, // AT + {4, 1}, // CA + {5, 20}, // CC + {6, 3}, // CG + {8, 1}, // GA + {9, 4}, // GC + }, + }; estimate_ibf_arguments args{}; minimiser_arguments minimiser_args{}; @@ -453,8 +469,7 @@ TEST_F(minimiser_test, small_example_shape) std::vector cutoffs = {0, 0}; args.expression_thresholds = {0}; std::vector fpr = {0.05}; - std::vector sequence_files = {data("mini_example.fasta"), - data("mini_example2.fasta")}; + std::vector sequence_files = {data("mini_example.fasta"), data("mini_example2.fasta")}; minimiser(sequence_files, args, minimiser_args, cutoffs); robin_hood::unordered_node_map result_hash_table{}; std::vector minimiser_files{}; @@ -465,7 +480,10 @@ TEST_F(minimiser_test, small_example_shape) { uint8_t cutoff{}; // Test Header file - read_binary_start(args, ("Minimiser_Test_Shape_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers, cutoff); + read_binary_start(args, + ("Minimiser_Test_Shape_" + std::string{sequence_files[i].stem()} + ".minimiser"), + num_of_minimisers, + cutoff); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -475,7 +493,8 @@ TEST_F(minimiser_test, small_example_shape) EXPECT_EQ(0, cutoff); // Test binary file - read_binary(("Minimiser_Test_Shape_" + std::string{sequence_files[i].stem()} + ".minimiser"), result_hash_table); + read_binary(("Minimiser_Test_Shape_" + std::string{sequence_files[i].stem()} + ".minimiser"), + result_hash_table); minimiser_files.push_back(("Minimiser_Test_Shape_" + std::string{sequence_files[i].stem()} + ".minimiser")); for (auto & hash : expected_hash_tables_shape[i]) EXPECT_EQ(expected_hash_tables_shape[i][hash.first], result_hash_table[hash.first]); @@ -490,12 +509,11 @@ TEST_F(minimiser_test, small_example_shape) std::vector expected_result(2, 0); auto & res = agent.bulk_contains(7); -
EXPECT_RANGE_EQ(expected_result, res); + EXPECT_RANGE_EQ(expected_result, res); expected_result[0] = 1; auto & res2 = agent.bulk_contains(12); - EXPECT_RANGE_EQ(expected_result, res2); + EXPECT_RANGE_EQ(expected_result, res2); expected_result[1] = 1; auto & res3 = agent.bulk_contains(2); - EXPECT_RANGE_EQ(expected_result, res3); - + EXPECT_RANGE_EQ(expected_result, res3); } diff --git a/test/cli/cli_test.hpp b/test/cli/cli_test.hpp index 1e78a6a..80dcf03 100644 --- a/test/cli/cli_test.hpp +++ b/test/cli/cli_test.hpp @@ -1,9 +1,9 @@ #include -#include // system calls -#include // test directory creation -#include // ostringstream -#include // strings +#include // system calls +#include // test directory creation +#include // ostringstream +#include // strings // Include the EXPECT_RANGE_EQ macro for better information if range elements differ. #include @@ -14,12 +14,10 @@ struct cli_test : public ::testing::Test { private: - // Holds the original work directory where Gtest has been started. std::filesystem::path original_workdir{}; protected: - // Result struct for captured streams and exit code. struct cli_test_result { @@ -37,8 +35,8 @@ struct cli_test : public ::testing::Test // Assemble the command string and disable version check. std::ostringstream command{}; command << "SEQAN3_NO_VERSION_CHECK=1 " << BINDIR; - int a[] = {0, ((void)(command << command_items << ' '), 0) ... }; - (void) a; + int a[] = {0, ((void)(command << command_items << ' '), 0)...}; + (void)a; // Always capture the output streams. testing::internal::CaptureStdout(); @@ -62,10 +60,8 @@ struct cli_test : public ::testing::Test { // Assemble the directory name. 
::testing::TestInfo const * const info = ::testing::UnitTest::GetInstance()->current_test_info(); - std::filesystem::path const test_dir{std::string{OUTPUTDIR} + - std::string{info->test_case_name()} + - std::string{"."} + - std::string{info->name()}}; + std::filesystem::path const test_dir{std::string{OUTPUTDIR} + std::string{info->test_case_name()} + + std::string{"."} + std::string{info->name()}}; try { std::filesystem::remove_all(test_dir); // delete the directory if it exists @@ -84,7 +80,7 @@ struct cli_test : public ::testing::Test { try { - std::filesystem::current_path(original_workdir); // restore the original work dir + std::filesystem::current_path(original_workdir); // restore the original work dir } catch (std::exception const & exc) { diff --git a/test/cli/count_options_test.cpp b/test/cli/count_options_test.cpp index 43e3310..957a201 100644 --- a/test/cli/count_options_test.cpp +++ b/test/cli/count_options_test.cpp @@ -1,26 +1,24 @@ -#include // strings +#include // strings #include "../app_test.hpp" -struct count_options_test : public app_test {}; +struct count_options_test : public app_test +{}; TEST_F(count_options_test, no_options) { app_test_result result = execute_app("count"); - std::string expected - { - "needle-count - Get expression value depending on minimizers. " - "This function is an alternative to pseudoaligners like kallisto. It " - "estimates the expression value for all sequences in the genome file " - "based on the exact minimiser occurrences of the given sequence files. 
" - "Please run genome beforehand to create the genome file.\n" - "======================================================================" - "======================================================================" - "======================================================================" - "======================================================================" - "==========================================\n " - "Try -h or --help for more information.\n" - }; + std::string expected{"needle-count - Get expression value depending on minimizers. " + "This function is an alternative to pseudoaligners like kallisto. It " + "estimates the expression value for all sequences in the genome file " + "based on the exact minimiser occurrences of the given sequence files. " + "Please run genome beforehand to create the genome file.\n" + "======================================================================" + "======================================================================" + "======================================================================" + "======================================================================" + "==========================================\n " + "Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -28,12 +26,10 @@ TEST_F(count_options_test, no_options) TEST_F(count_options_test, fail_no_argument) { - app_test_result result = execute_app("count", "--seed 0 --genome", data("mini_gen.genome")," --include", data("mini_gen.fasta")); - std::string expected - { - "Error. Incorrect command line input for count. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + app_test_result result = + execute_app("count", "--seed 0 --genome", data("mini_gen.genome"), " --include", data("mini_gen.fasta")); + std::string expected{"Error. Incorrect command line input for count. 
Not enough positional arguments provided " + "(Need at least 1). See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, expected); @@ -41,7 +37,11 @@ TEST_F(count_options_test, fail_no_argument) TEST_F(count_options_test, with_arguments) { - app_test_result result = execute_app("count -k 4 -w 4 --seed 0 --genome", data("mini_gen.genome")," --include", data("mini_gen.fasta"), data("mini_example.fasta")); + app_test_result result = execute_app("count -k 4 -w 4 --seed 0 --genome", + data("mini_gen.genome"), + " --include", + data("mini_gen.fasta"), + data("mini_example.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); @@ -50,11 +50,11 @@ TEST_F(count_options_test, with_arguments) TEST_F(count_options_test, multithreads) { app_test_result result = execute_app("count -k 4 -w 8 -t 2 --genome", - data("mini_gen.genome"), - " --include", - data("mini_gen.fasta"), - data("mini_example.fasta"), - data("mini_example.fasta")); + data("mini_gen.genome"), + " --include", + data("mini_gen.fasta"), + data("mini_example.fasta"), + data("mini_example.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); @@ -63,27 +63,25 @@ TEST_F(count_options_test, multithreads) TEST_F(count_options_test, paired) { app_test_result result = execute_app("count -k 4 -w 8 -p --genome", - data("mini_gen.genome"), - " --include", - data("mini_gen.fasta"), - data("mini_example.fasta"), - data("mini_example.fasta")); + data("mini_gen.genome"), + " --include", + data("mini_gen.fasta"), + data("mini_example.fasta"), + data("mini_example.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); } -struct genome_options_test : public app_test {}; +struct genome_options_test : public app_test +{}; TEST_F(genome_options_test, no_options) { app_test_result result = execute_app("genome"); - std::string expected - 
{ - "needle-genome - Creates the genome file necessary as an input to count.\n" - "=======================================================================\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle-genome - Creates the genome file necessary as an input to count.\n" + "=======================================================================\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -92,11 +90,8 @@ TEST_F(genome_options_test, no_options) TEST_F(genome_options_test, fail_no_argument) { app_test_result result = execute_app("genome", "--seed 0"); - std::string expected - { - "Error. Incorrect command line input for count. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + std::string expected{"Error. Incorrect command line input for count. Not enough positional arguments provided " + "(Need at least 1). 
See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, expected); diff --git a/test/cli/delete_options_test.cpp b/test/cli/delete_options_test.cpp index fbbe668..0933df2 100644 --- a/test/cli/delete_options_test.cpp +++ b/test/cli/delete_options_test.cpp @@ -1,21 +1,18 @@ -#include // strings +#include // strings #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" -#include "ibf.h" -#include "shared.h" - -struct delete_options_test : public app_test {}; +struct delete_options_test : public app_test +{}; TEST_F(delete_options_test, delete_no_options) { app_test_result result = execute_app("delete"); - std::string expected - { - "needle-delete - Delete experiments specified by their position from the Needle index.\n" - "=====================================================================================\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle-delete - Delete experiments specified by their position from the Needle index.\n" + "=====================================================================================\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -27,7 +24,7 @@ TEST_F(delete_options_test, with_argument) minimiser_arguments minimiser_args{}; ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05}; - std::vector sequence_files = {data("exp_01.fasta"),data("exp_01.fasta")}; + std::vector sequence_files = {data("exp_01.fasta"), data("exp_01.fasta")}; ibf_args.path_out = "Test_"; std::vector cutoffs{}; ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); diff --git a/test/cli/estimate_options_test.cpp b/test/cli/estimate_options_test.cpp index 1781f6c..2d363c4 100644 --- a/test/cli/estimate_options_test.cpp +++ b/test/cli/estimate_options_test.cpp @@ -1,21 +1,18 @@ -#include // strings +#include // strings 
#include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" -#include "ibf.h" -#include "shared.h" - -struct estimate_options_test : public app_test {}; +struct estimate_options_test : public app_test +{}; TEST_F(estimate_options_test, no_options) { app_test_result result = execute_app("estimate"); - std::string expected - { - "needle-estimate - Estimate expression value of transcript based on the Needle index.\n" - "====================================================================================\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle-estimate - Estimate expression value of transcript based on the Needle index.\n" + "====================================================================================\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -24,11 +21,8 @@ TEST_F(estimate_options_test, no_options) TEST_F(estimate_options_test, fail_no_argument) { app_test_result result = execute_app("estimate", "-m"); - std::string expected - { - "Error. Incorrect command line input for estimate. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + std::string expected{"Error. Incorrect command line input for estimate. Not enough positional arguments provided " + "(Need at least 1). 
See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, expected); @@ -79,7 +73,7 @@ TEST_F(estimate_options_test, with_argument_out) std::vector cutoffs{}; ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); - app_test_result result = execute_app("estimate -o ", "expressions.out","-i ", "Test_", data("mini_gen.fasta")); + app_test_result result = execute_app("estimate -o ", "expressions.out", "-i ", "Test_", data("mini_gen.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); diff --git a/test/cli/ibf_options_test.cpp b/test/cli/ibf_options_test.cpp index 03178e3..1e6d5dd 100644 --- a/test/cli/ibf_options_test.cpp +++ b/test/cli/ibf_options_test.cpp @@ -1,18 +1,16 @@ -#include // strings +#include // strings #include "../app_test.hpp" -struct ibf_options_test : public app_test {}; +struct ibf_options_test : public app_test +{}; TEST_F(ibf_options_test, ibf_no_options) { app_test_result result = execute_app("ibf"); - std::string expected - { - "needle-ibf - Constructs the Needle index.\n" - "=========================================\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle-ibf - Constructs the Needle index.\n" + "=========================================\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -21,11 +19,8 @@ TEST_F(ibf_options_test, ibf_no_options) TEST_F(ibf_options_test, ibf_fail_no_argument) { app_test_result result = execute_app("ibf", "-c"); - std::string expected - { - "Error. Incorrect command line input for ibf. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + std::string expected{"Error. Incorrect command line input for ibf. Not enough positional arguments provided " + "(Need at least 1). 
See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -34,10 +29,7 @@ TEST_F(ibf_options_test, ibf_fail_no_argument) TEST_F(ibf_options_test, ibf_fail_contradiction) { app_test_result result = execute_app("ibf -f 0.05 -e 1 -e 2 -l 1", data("exp_01.fasta")); - std::string expected - { - "Error. Please set the expression levels OR give the number of expression levels.\n" - }; + std::string expected{"Error. Please set the expression levels OR give the number of expression levels.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -45,14 +37,13 @@ TEST_F(ibf_options_test, ibf_fail_contradiction) TEST_F(ibf_options_test, ibf_fail_contradiction2) { - app_test_result result = execute_app("ibf -f 0.05 -e 1 -e 2 --levels-by-genome ", data("exp_01.fasta"), - data("exp_01.fasta")); - std::string expected - { - "Error. The determination of expression levels can not be used with individual levels already given. Please set " - "the expression levels without the option --level-by-genome OR use the number of expression levels with that option." - "\n" - }; + app_test_result result = + execute_app("ibf -f 0.05 -e 1 -e 2 --levels-by-genome ", data("exp_01.fasta"), data("exp_01.fasta")); + std::string expected{"Error. The determination of expression levels can not be used with individual levels already " + "given. Please set " + "the expression levels without the option --level-by-genome OR use the number of expression " + "levels with that option." + "\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -61,10 +52,7 @@ TEST_F(ibf_options_test, ibf_fail_contradiction2) TEST_F(ibf_options_test, ibf_fail_no_fpr) { app_test_result result = execute_app("ibf -l 2", data("exp_01.fasta")); - std::string expected - { - "Error. 
Please give a false positive rate for the IBFs.\n" - }; + std::string expected{"Error. Please give a false positive rate for the IBFs.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -73,10 +61,8 @@ TEST_F(ibf_options_test, ibf_fail_no_fpr) TEST_F(ibf_options_test, ibf_fail_incorrect_number_of_fprs) { app_test_result result = execute_app("ibf -f 0.05 -f 0.01 -f 0.03 -l 2", data("exp_01.fasta")); - std::string expected - { - "Error. Length of false positive rates for IBFs is not equal to length of expression thresholds.\n" - }; + std::string expected{ + "Error. Length of false positive rates for IBFs is not equal to length of expression thresholds.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -109,12 +95,10 @@ TEST_F(ibf_options_test, ibf_with_argument_with_userdefined_shape) TEST_F(ibf_options_test, ibfmin_no_options) { app_test_result result = execute_app("ibfmin"); - std::string expected - { + std::string expected{ "needle-ibfmin - Constructs the Needle index from the minimiser files created by needle minimiser.\n" "=================================================================================================\n" - " Try -h or --help for more information.\n" - }; + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -123,11 +107,8 @@ TEST_F(ibf_options_test, ibfmin_no_options) TEST_F(ibf_options_test, ibfmin_fail_no_argument) { app_test_result result = execute_app("ibfmin -c"); - std::string expected - { - "Error. Incorrect command line input for ibfmin. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + std::string expected{"Error. Incorrect command line input for ibfmin. Not enough positional arguments provided " + "(Need at least 1). 
See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, expected); @@ -136,10 +117,7 @@ TEST_F(ibf_options_test, ibfmin_fail_no_argument) TEST_F(ibf_options_test, ibfmin_fail_contradiction) { app_test_result result = execute_app("ibfmin -f 0.05 -e 1 -e 2 -l 1", data("mini_example.minimiser")); - std::string expected - { - "Error. Please set the expression levels OR give the number of expression levels.\n" - }; + std::string expected{"Error. Please set the expression levels OR give the number of expression levels.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -147,14 +125,14 @@ TEST_F(ibf_options_test, ibfmin_fail_contradiction) TEST_F(ibf_options_test, ibfmin_fail_contradiction2) { - app_test_result result = execute_app("ibfmin -f 0.05 -e 1 -e 2 --levels-by-genome ", data("exp_01.fasta"), + app_test_result result = execute_app("ibfmin -f 0.05 -e 1 -e 2 --levels-by-genome ", + data("exp_01.fasta"), data("mini_example.minimiser")); - std::string expected - { - "Error. The determination of expression levels can not be used with individual levels already given. Please set " - "the expression levels without the option --level-by-genome OR use the number of expression levels with that option." - "\n" - }; + std::string expected{"Error. The determination of expression levels can not be used with individual levels already " + "given. Please set " + "the expression levels without the option --level-by-genome OR use the number of expression " + "levels with that option." + "\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -163,10 +141,7 @@ TEST_F(ibf_options_test, ibfmin_fail_contradiction2) TEST_F(ibf_options_test, ibfmin_fail_no_fpr) { app_test_result result = execute_app("ibfmin -l 2", data("mini_example.minimiser")); - std::string expected - { - "Error. 
Please give a false positive rate for the IBFs.\n" - }; + std::string expected{"Error. Please give a false positive rate for the IBFs.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -175,10 +150,8 @@ TEST_F(ibf_options_test, ibfmin_fail_no_fpr) TEST_F(ibf_options_test, ibfmin_fail_incorrect_number_of_fprs) { app_test_result result = execute_app("ibfmin -f 0.05 -f 0.01 -f 0.03 -l 2", data("mini_example.minimiser")); - std::string expected - { - "Error. Length of false positive rates for IBFs is not equal to length of expression thresholds.\n" - }; + std::string expected{ + "Error. Length of false positive rates for IBFs is not equal to length of expression thresholds.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); diff --git a/test/cli/insert_options_test.cpp b/test/cli/insert_options_test.cpp index 3fe3610..d6a1b9e 100644 --- a/test/cli/insert_options_test.cpp +++ b/test/cli/insert_options_test.cpp @@ -1,21 +1,18 @@ -#include // strings +#include // strings #include "../app_test.hpp" +#include "ibf.hpp" +#include "shared.hpp" -#include "ibf.h" -#include "shared.h" - -struct insert_options_test : public app_test {}; +struct insert_options_test : public app_test +{}; TEST_F(insert_options_test, insert_no_options) { app_test_result result = execute_app("insert"); - std::string expected - { - "needle-insert - Inserts into a given uncompressed Needle index.\n" - "===============================================================\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle-insert - Inserts into a given uncompressed Needle index.\n" + "===============================================================\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -24,11 +21,8 @@ TEST_F(insert_options_test, insert_no_options) 
TEST_F(insert_options_test, insert_fail_no_argument) { app_test_result result = execute_app("insert", "-c"); - std::string expected - { - "Error. Incorrect command line input for insert. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + std::string expected{"Error. Incorrect command line input for insert. Not enough positional arguments provided " + "(Need at least 1). See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{""}); EXPECT_EQ(result.err, expected); @@ -42,11 +36,11 @@ TEST_F(insert_options_test, with_argument) ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05}; std::vector sequence_files = {data("exp_01.fasta")}; - ibf_args.path_out = tmp_dir/"Test_"; + ibf_args.path_out = tmp_dir / "Test_"; std::vector cutoffs{1}; ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs); - app_test_result result = execute_app("insert -i ", tmp_dir/"Test_", data("exp_01.fasta")); + app_test_result result = execute_app("insert -i ", tmp_dir / "Test_", data("exp_01.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); diff --git a/test/cli/minimiser_options_test.cpp b/test/cli/minimiser_options_test.cpp index d823a6d..e0c03ac 100644 --- a/test/cli/minimiser_options_test.cpp +++ b/test/cli/minimiser_options_test.cpp @@ -1,18 +1,16 @@ -#include // strings +#include // strings #include "../app_test.hpp" -struct minimiser_options_test : public app_test {}; +struct minimiser_options_test : public app_test +{}; TEST_F(minimiser_options_test, no_options) { app_test_result result = execute_app("minimiser"); - std::string expected - { - "needle-minimiser - Calculates minimiser for given experiments.\n" - "==============================================================\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle-minimiser - Calculates minimiser for given experiments.\n" + 
"==============================================================\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -21,11 +19,8 @@ TEST_F(minimiser_options_test, no_options) TEST_F(minimiser_options_test, fail_no_argument) { app_test_result result = execute_app("minimiser", "--seed 0"); - std::string expected - { - "Error. Incorrect command line input for minimiser. Not enough positional arguments provided " - "(Need at least 1). See -h/--help for more information.\n" - }; + std::string expected{"Error. Incorrect command line input for minimiser. Not enough positional arguments provided " + "(Need at least 1). See -h/--help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, expected); @@ -49,8 +44,8 @@ TEST_F(minimiser_options_test, cutoff) TEST_F(minimiser_options_test, multiple_sample) { - app_test_result result = execute_app("minimiser -k 4 -w 8 --samples 2 ", data("mini_example.fasta"), - data("mini_example.fasta")); + app_test_result result = + execute_app("minimiser -k 4 -w 8 --samples 2 ", data("mini_example.fasta"), data("mini_example.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); @@ -58,8 +53,8 @@ TEST_F(minimiser_options_test, multiple_sample) TEST_F(minimiser_options_test, multithreads) { - app_test_result result = execute_app("minimiser -k 4 -w 8 -t 2", data("mini_example.fasta"), - data("mini_example.fasta")); + app_test_result result = + execute_app("minimiser -k 4 -w 8 -t 2", data("mini_example.fasta"), data("mini_example.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); @@ -67,8 +62,8 @@ TEST_F(minimiser_options_test, multithreads) TEST_F(minimiser_options_test, paired) { - app_test_result result = execute_app("minimiser -k 4 -w 8 -p ", data("mini_example.fasta"), - data("mini_example.fasta")); 
+ app_test_result result = + execute_app("minimiser -k 4 -w 8 -p ", data("mini_example.fasta"), data("mini_example.fasta")); EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, std::string{}); @@ -76,12 +71,9 @@ TEST_F(minimiser_options_test, paired) TEST_F(minimiser_options_test, invalid_argument) { - app_test_result result = execute_app("minimiser -k 4 -w 8 --samples 3 ", data("mini_example.fasta"), - data("mini_example.fasta")); - std::string expected - { - "Error. Incorrect command line input for multiple-samples.\n" - }; + app_test_result result = + execute_app("minimiser -k 4 -w 8 --samples 3 ", data("mini_example.fasta"), data("mini_example.fasta")); + std::string expected{"Error. Incorrect command line input for multiple-samples.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, ""); EXPECT_EQ(result.err, expected); diff --git a/test/cli/needle_options_test.cpp b/test/cli/needle_options_test.cpp index df40aca..5decb12 100644 --- a/test/cli/needle_options_test.cpp +++ b/test/cli/needle_options_test.cpp @@ -1,18 +1,16 @@ -#include // strings +#include // strings #include "../app_test.hpp" -struct needle_options_test : public app_test {}; +struct needle_options_test : public app_test +{}; TEST_F(needle_options_test, no_options) { app_test_result result = execute_app(); - std::string expected - { - "needle\n" - "======\n" - " Try -h or --help for more information.\n" - }; + std::string expected{"needle\n" + "======\n" + " Try -h or --help for more information.\n"}; EXPECT_SUCCESS(result); EXPECT_EQ(result.out, expected); EXPECT_EQ(result.err, std::string{}); @@ -21,12 +19,11 @@ TEST_F(needle_options_test, no_options) TEST_F(needle_options_test, fail_no_argument) { app_test_result result = execute_app("-v"); - std::string expected - { + std::string expected{ "Error. Incorrect command. See needle help for more information.You either forgot or misspelled the subcommand!" 
- " Please specify which sub-program you want to use: one of [count,delete,estimate,genome,ibf,ibfmin,insert,insertmin,minimiser]. " - "Use -h/--help for more information.\n" - }; + " Please specify which sub-program you want to use: one of " + "[count,delete,estimate,genome,ibf,ibfmin,insert,insertmin,minimiser]. " + "Use -h/--help for more information.\n"}; EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_TRUE(result.err == expected); diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index daed835..34d873f 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -9,4 +9,3 @@ include (test/declare_datasource) # `datasources.cmake`, `README.md`, and files ending in `.license` are ignored. # You may organise your data in subdirectories, but each file must have a unique name. include (test/add_local_data) -