From 470942196def50bc812a4b2fe6650a3529318daf Mon Sep 17 00:00:00 2001 From: Massimiliano Rossi Date: Wed, 18 Sep 2024 22:38:21 -0400 Subject: [PATCH] Feature: Add SAM output to `moni mems` (#11) * Add SAM output option for moni mems * Bump version * Fix missing declarations * Extract MEM pos and length * convert string to to_string * Add -s option to option parser * Pass SAM output parameter to mem_c * Update README --- CMakeLists.txt | 4 +- README.md | 8 +- include/extender/extender_klib.hpp | 2 +- include/extender/extender_ksw2.hpp | 2 +- pipeline/moni.in | 5 +- src/mems.cpp | 128 +++++++++++++++++++++++------ 6 files changed, 114 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b4b9bd..2a75926 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}") project(moni) SET(VERSION_MAJOR "0") SET(VERSION_MINOR "2") -SET(VERSION_PATCH "1") +SET(VERSION_PATCH "2") SET(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") message("version: ${VERSION}") @@ -83,7 +83,7 @@ include(InstallRequiredSystemLibraries) set(CPACK_GENERATOR "STGZ;TGZ;DEB") set(CPACK_SOURCE_GENERATOR "TGZ") set(CPACK_PACKAGE_VENDOR "University of Florida") -set(CPACK_PACKAGE_CONTACT "rossi.m@ufl.edu") +set(CPACK_PACKAGE_CONTACT "maxrossi91@gmail.com") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "MONI - Pangenomic index for finding MEMs") set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md") diff --git a/README.md b/README.md index 2ecee7b..0d4a976 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![Release](https://img.shields.io/github/release/maxrossi91/moni.svg)](https://github.com/maxrossi91/moni/releases) -[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.1/moni_0.2.1_amd64.deb) +[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_0.2.2_amd64.deb) # MONI ```console @@ -9,7 +9,7 @@ | |\/| | | | | . ` | | | | | | | |__| | |\ |_| |_ |_| |_|\____/|_| \_|_____| - ver 0.2.1 + ver 0.2.2 ``` A Pangenomics Index for Finding MEMs. @@ -57,7 +57,7 @@ usage: moni ms [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] ### Computing the matching statistics with MONI: ``` -usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-t THREADS] +usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-s] [-t THREADS] -h, --help show this help message and exit -i INDEX, --index INDEX reference index base name (default: None) @@ -67,6 +67,8 @@ usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-t THREADS] output directory path (default: .) -e, --extended-output output MEM occurrence in the reference (default: False) + -s, --sam-output + output MEM in a SAM formatted file. (default: False) -t THREADS, --threads THREADS number of helper threads (default: 1) -g GRAMMAR, --grammar GRAMMAR diff --git a/include/extender/extender_klib.hpp b/include/extender/extender_klib.hpp index 39169ee..8b62da6 100644 --- a/include/extender/extender_klib.hpp +++ b/include/extender/extender_klib.hpp @@ -344,7 +344,7 @@ class extender { std::string res = "@HD\tVN:1.6\tSO:unknown\n"; res += idx.to_sam(); - res += "@PG\tID:moni\tPN:moni\tVN:0.1.0\n"; + res += "@PG\tID:moni\tPN:moni\tVN:0.2.2\n"; return res; } diff --git a/include/extender/extender_ksw2.hpp b/include/extender/extender_ksw2.hpp index fb89ce6..4048769 100644 --- a/include/extender/extender_ksw2.hpp +++ b/include/extender/extender_ksw2.hpp @@ -740,7 +740,7 @@ class extender { std::string res = "@HD\tVN:1.6\tSO:unknown\n"; res += idx.to_sam(); - res += "@PG\tID:moni\tPN:moni\tVN:0.1.0\n"; + res += "@PG\tID:moni\tPN:moni\tVN:0.2.2\n"; return res; } diff --git a/pipeline/moni.in b/pipeline/moni.in index f885696..89922fe 100755 --- a/pipeline/moni.in +++ b/pipeline/moni.in @@ -11,7 +11,7 @@ Description = """ | |\/| | | | | . ` | | | | | | | |__| | |\ |_| |_ |_| |_|\____/|_| \_|_____| - ver 0.2.1 + ver 0.2.2 A Pangenomics Index for Finding MEMs. Build the index for highly repetive files using the approach @@ -558,6 +558,8 @@ class run_helper(threading.Thread): command += f" -l {args.min_length}" if args.extended_output: command += " -e" + if args.sam_output: + command += " -s" if args.output != ".": command += " -o {}".format(args.output) @@ -655,6 +657,7 @@ def main(): mems_parser.add_argument('-p', '--pattern', help='the input query', type=str, required=True) mems_parser.add_argument('-o', '--output', help='output file prefix', type=str, default='.') mems_parser.add_argument('-e', '--extended-output', help='output MEM occurrence in the reference', action='store_true') + mems_parser.add_argument('-s', '--sam-output', help='output MEM in a SAM formatted file.', action='store_true') mems_parser.add_argument('-l', '--minimum-MEM-length', help='minimum MEM length', type=int, default=35, dest='min_length') mems_parser.add_argument('-t', '--threads', help='number of helper threads', default=1, type=int) mems_parser.add_argument('-g', '--grammar', help='select the grammar [plain, shaped]', type=str, default='plain') diff --git a/src/mems.cpp b/src/mems.cpp index 9bf9e07..e5d1b4d 100644 --- a/src/mems.cpp +++ b/src/mems.cpp @@ -189,7 +189,7 @@ std::string get_slp_file_extension() } //////////////////////////////////////////////////////////////////////////////// -template +template class mems_c { public: @@ -240,7 +240,7 @@ class mems_c // the read, the next size_t integer stores the number m of MEMs, and the // following m size_t pairs of integers stores the positions and lengths of // the MEMs. - void maxrimal_exact_matches(kseq_t *read, FILE* out) + void maximal_exact_matches(kseq_t *read, FILE* out) { auto pointers = ms.query(read->seq.s, read->seq.l); std::vector lengths(pointers.size()); @@ -276,6 +276,13 @@ class mems_c size_t h_length = read->name.l; fwrite(&h_length, sizeof(size_t), 1,out); fwrite(read->name.s, sizeof(char),h_length,out); + if(sam_output) + { + size_t s_length = read->seq.l; + fwrite(&s_length, sizeof(size_t), 1,out); + fwrite(read->seq.s, sizeof(char),s_length,out); + fwrite(read->qual.s, sizeof(char),s_length,out); + } size_t q_length = mems.size(); fwrite(&q_length, sizeof(size_t), 1,out); fwrite(mems.data(), sizeof(std::tuple),q_length,out); @@ -343,7 +350,7 @@ void *mt_ms_worker(void *param) while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0)) { - p->ms->maxrimal_exact_matches(seq,out_fd); + p->ms->maximal_exact_matches(seq,out_fd); } @@ -405,7 +412,7 @@ size_t st_ms(ms_t *ms, std::string pattern_filename, std::string out_filename) while ((l = kseq_read(seq)) >= 0) { - ms->maxrimal_exact_matches(seq, out_fd); + ms->maximal_exact_matches(seq, out_fd); } @@ -431,7 +438,8 @@ struct Args size_t l = 25; // minumum MEM length size_t th = 1; // number of threads bool shaped_slp = false; // use shaped slp - bool extended_output = false; // use shaped slp + bool extended_output = false; // print one MEM occurrence in the reference + bool sam_output = false; // output MEMs in SAM format }; void parseArgs(int argc, char *const argv[], Args &arg) @@ -440,17 +448,18 @@ void parseArgs(int argc, char *const argv[], Args &arg) extern char *optarg; extern int optind; - std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-b batch]\n\n" + + std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-s sam_output] [-b batch]\n\n" + "Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" + " shaped_slp: [boolean] - use shaped slp. (def. false)\n" + "extended_output: [boolean] - print one MEM occurrence in ref. (def. false)\n" + + " sam_output: [boolean] - print output in SAM format. (def. false)\n" + " pattens: [string] - path to patterns file.\n" + " output: [string] - output file prefix.\n" + " len: [integer] - minimum MEM lengt (def. 25)\n" + " thread: [integer] - number of threads (def. 1)\n"); std::string sarg; - while ((c = getopt(argc, argv, "l:hp:o:t:qe")) != -1) + while ((c = getopt(argc, argv, "l:hp:o:t:qes")) != -1) { switch (c) { @@ -474,6 +483,9 @@ void parseArgs(int argc, char *const argv[], Args &arg) case 'e': arg.extended_output = true; break; + case 's': + arg.sam_output = true; + break; case 'h': error(usage); case '?': @@ -490,6 +502,10 @@ void parseArgs(int argc, char *const argv[], Args &arg) { error("Invalid number of arguments\n", usage); } + + if (arg.extended_output && arg.sam_output) { + error("Cannot specify both extended_output and sam_output flags.\n", usage); + } } //********** end argument options ******************** @@ -554,10 +570,18 @@ void dispatcher(Args &args) verbose("Printing plain output"); t_insert_start = std::chrono::high_resolution_clock::now(); - std::ofstream f_mems(out_filename + ".mems"); + std::string mems_file_suffix = args.sam_output ? ".sam" : ".mems"; + std::ofstream f_mems(out_filename + mems_file_suffix); if (!f_mems.is_open()) - error("open() file " + std::string(out_filename) + ".mems failed"); + error("open() file " + std::string(out_filename) + mems_file_suffix + " failed"); + + if(args.sam_output) + { + f_mems << "@HD\tVN:1.6\tSO:unknown\n"; + f_mems << idx.to_sam(); + f_mems << "@PG\tID:moni\tPN:moni\tVN:0.2.2\n"; + } size_t n_seq = 0; for (size_t i = 0; i < args.th; ++i) @@ -569,25 +593,49 @@ void dispatcher(Args &args) error("open() file " + tmp_filename + " failed"); size_t length = 0; + size_t rname_l = 0; + size_t s_length = 0; size_t m = 100; // Reserved size for pointers and lengths std::vector> mem(m); size_t s = 100; // Reserved size for read name + size_t rseq_l = 100; // Reserved size for seq and qual char* rname = (char *)malloc(s * sizeof(char)); - while (!feof(in_fd) and fread(&length, sizeof(size_t), 1, in_fd) > 0) + char *rseq = (char *)malloc(rseq_l * sizeof(char)); + char *rqual = (char *)malloc(rseq_l * sizeof(char)); + while (!feof(in_fd) and fread(&rname_l, sizeof(size_t), 1, in_fd) > 0) { // Reading read name - if (s < length) + if (s < rname_l) { // Resize lengths and pointers - s = length; - rname = (char *)realloc(rname, m * sizeof(char)); + s = rname_l; + rname = (char *)realloc(rname, s * sizeof(char)); } - if ((fread(rname, sizeof(char), length, in_fd)) != length) + if ((fread(rname, sizeof(char), rname_l, in_fd)) != rname_l) error("fread() file " + std::string(tmp_filename) + " failed"); - - // TODO: Store the fasta headers somewhere - f_mems << ">" + std::string(rname,length) << endl; + + // In case of SAM output read also the sequence and quals + if (args.sam_output) + { + if ((fread(&s_length, sizeof(size_t), 1, in_fd)) != 1) + error("fread() file " + std::string(tmp_filename) + " failed"); + if (rseq_l < s_length) + { + // Resize s_lengths and pointers + rseq_l = s_length; + rseq = (char *)realloc(rseq, rseq_l * sizeof(char)); + rqual = (char *)realloc(rqual, rseq_l * sizeof(char)); + } + if ((fread(rseq, sizeof(char), s_length, in_fd)) != s_length) + error("fread() file " + std::string(tmp_filename) + " failed"); + if ((fread(rqual, sizeof(char), s_length, in_fd)) != s_length) + error("fread() file " + std::string(tmp_filename) + " failed"); + } + else + { + f_mems << ">" + std::string(rname, rname_l) << endl; + } // Reading MEMs if ((fread(&length, sizeof(size_t), 1, in_fd)) != 1) @@ -605,20 +653,40 @@ void dispatcher(Args &args) // TODO: Store the fasta headers somewhere // f_mems << ">" + std::to_string(n_seq) << endl; - if (args.extended_output){ + if (args.sam_output){ for (size_t i = 0; i < length; ++i) { + size_t mem_pos = std::get<0>(mem[i]); + size_t mem_len = std::get<1>(mem[i]); std::pair pos = idx.index(std::get<2>(mem[i])); - f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") "; - } - } else { - for (size_t i = 0; i < length; ++i) - { - f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") "; + f_mems << std::string(rname,rname_l) + "\t"; + // First MEM is primary, all other MEMs are non primary + f_mems << (i?"256\t":"0\t"); + f_mems << pos.first << "\t" << pos.second + 1<< "\t60\t"; + std::string cigar = ""; + if (mem_pos > 0) cigar += std::to_string(mem_pos) + "S"; + cigar += std::to_string(mem_len) + "M"; + size_t suff_length = s_length - (mem_pos + mem_len); + if (suff_length > 0) cigar += std::to_string(suff_length) + "S"; + f_mems << cigar + "\t" + std::string(rseq, s_length) + "\t" + std::string(rqual, s_length) + "\n"; } + } else + { + if (args.extended_output){ + for (size_t i = 0; i < length; ++i) + { + std::pair pos = idx.index(std::get<2>(mem[i])); + f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") "; + } + } else { + for (size_t i = 0; i < length; ++i) + { + f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") "; + } + } + f_mems << endl; } - f_mems << endl; n_seq++; } @@ -645,11 +713,17 @@ int main(int argc, char *const argv[]) if (args.shaped_slp) { - dispatcher>(args); + if (args.sam_output) + dispatcher>(args); + else + dispatcher>(args); } else { - dispatcher>(args); + if(args.sam_output) + dispatcher>(args); + else + dispatcher>(args); } return 0; } \ No newline at end of file