Skip to content

Commit

Permalink
Feature: Add SAM output to moni mems (#11)
Browse files Browse the repository at this point in the history
* Add SAM output option for moni mems

* Bump version

* Fix missing declarations

* Extract MEM pos and length

* convert string to to_string

* Add -s option to option parser

* Pass SAM output parameter to mem_c

* Update README
  • Loading branch information
maxrossi91 authored Sep 19, 2024
1 parent 9c36dcd commit 4709421
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 35 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}")
project(moni)
SET(VERSION_MAJOR "0")
SET(VERSION_MINOR "2")
SET(VERSION_PATCH "1")
SET(VERSION_PATCH "2")
SET(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
message("version: ${VERSION}")

Expand Down Expand Up @@ -83,7 +83,7 @@ include(InstallRequiredSystemLibraries)
set(CPACK_GENERATOR "STGZ;TGZ;DEB")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_PACKAGE_VENDOR "University of Florida")
set(CPACK_PACKAGE_CONTACT "[email protected]")
set(CPACK_PACKAGE_CONTACT "[email protected]")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "MONI - Pangenomic index for finding MEMs")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[![Release](https://img.shields.io/github/release/maxrossi91/moni.svg)](https://github.com/maxrossi91/moni/releases)
[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.1/moni_0.2.1_amd64.deb)
[![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_0.2.2_amd64.deb)

# MONI
```console
Expand All @@ -9,7 +9,7 @@
| |\/| | | | | . ` | | |
| | | | |__| | |\ |_| |_
|_| |_|\____/|_| \_|_____|
ver 0.2.1
ver 0.2.2
```
A Pangenomics Index for Finding MEMs.

Expand Down Expand Up @@ -57,7 +57,7 @@ usage: moni ms [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS]

### Computing the matching statistics with MONI:
```
usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-t THREADS]
usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-s] [-t THREADS]
-h, --help show this help message and exit
-i INDEX, --index INDEX
reference index base name (default: None)
Expand All @@ -67,6 +67,8 @@ usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-t THREADS]
output directory path (default: .)
-e, --extended-output
output MEM occurrence in the reference (default: False)
-s, --sam-output
output MEM in a SAM formatted file. (default: False)
-t THREADS, --threads THREADS
number of helper threads (default: 1)
-g GRAMMAR, --grammar GRAMMAR
Expand Down
2 changes: 1 addition & 1 deletion include/extender/extender_klib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ class extender
{
std::string res = "@HD\tVN:1.6\tSO:unknown\n";
res += idx.to_sam();
res += "@PG\tID:moni\tPN:moni\tVN:0.1.0\n";
res += "@PG\tID:moni\tPN:moni\tVN:0.2.2\n";
return res;
}

Expand Down
2 changes: 1 addition & 1 deletion include/extender/extender_ksw2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ class extender
{
std::string res = "@HD\tVN:1.6\tSO:unknown\n";
res += idx.to_sam();
res += "@PG\tID:moni\tPN:moni\tVN:0.1.0\n";
res += "@PG\tID:moni\tPN:moni\tVN:0.2.2\n";
return res;
}

Expand Down
5 changes: 4 additions & 1 deletion pipeline/moni.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Description = """
| |\/| | | | | . ` | | |
| | | | |__| | |\ |_| |_
|_| |_|\____/|_| \_|_____|
ver 0.2.1
ver 0.2.2
A Pangenomics Index for Finding MEMs.
Build the index for highly repetive files using the approach
Expand Down Expand Up @@ -558,6 +558,8 @@ class run_helper(threading.Thread):
command += f" -l {args.min_length}"
if args.extended_output:
command += " -e"
if args.sam_output:
command += " -s"
if args.output != ".":
command += " -o {}".format(args.output)

Expand Down Expand Up @@ -655,6 +657,7 @@ def main():
mems_parser.add_argument('-p', '--pattern', help='the input query', type=str, required=True)
mems_parser.add_argument('-o', '--output', help='output file prefix', type=str, default='.')
mems_parser.add_argument('-e', '--extended-output', help='output MEM occurrence in the reference', action='store_true')
mems_parser.add_argument('-s', '--sam-output', help='output MEM in a SAM formatted file.', action='store_true')
mems_parser.add_argument('-l', '--minimum-MEM-length', help='minimum MEM length', type=int, default=35, dest='min_length')
mems_parser.add_argument('-t', '--threads', help='number of helper threads', default=1, type=int)
mems_parser.add_argument('-g', '--grammar', help='select the grammar [plain, shaped]', type=str, default='plain')
Expand Down
128 changes: 101 additions & 27 deletions src/mems.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ std::string get_slp_file_extension<plain_slp_t>()
}
////////////////////////////////////////////////////////////////////////////////

template <typename slp_t>
template <typename slp_t, bool sam_output = false>
class mems_c
{
public:
Expand Down Expand Up @@ -240,7 +240,7 @@ class mems_c
// the read, the next size_t integer stores the number m of MEMs, and the
// following m size_t pairs of integers stores the positions and lengths of
// the MEMs.
void maxrimal_exact_matches(kseq_t *read, FILE* out)
void maximal_exact_matches(kseq_t *read, FILE* out)
{
auto pointers = ms.query(read->seq.s, read->seq.l);
std::vector<size_t> lengths(pointers.size());
Expand Down Expand Up @@ -276,6 +276,13 @@ class mems_c
size_t h_length = read->name.l;
fwrite(&h_length, sizeof(size_t), 1,out);
fwrite(read->name.s, sizeof(char),h_length,out);
if(sam_output)
{
size_t s_length = read->seq.l;
fwrite(&s_length, sizeof(size_t), 1,out);
fwrite(read->seq.s, sizeof(char),s_length,out);
fwrite(read->qual.s, sizeof(char),s_length,out);
}
size_t q_length = mems.size();
fwrite(&q_length, sizeof(size_t), 1,out);
fwrite(mems.data(), sizeof(std::tuple<size_t,size_t,size_t>),q_length,out);
Expand Down Expand Up @@ -343,7 +350,7 @@ void *mt_ms_worker(void *param)
while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0))
{

p->ms->maxrimal_exact_matches(seq,out_fd);
p->ms->maximal_exact_matches(seq,out_fd);

}

Expand Down Expand Up @@ -405,7 +412,7 @@ size_t st_ms(ms_t *ms, std::string pattern_filename, std::string out_filename)
while ((l = kseq_read(seq)) >= 0)
{

ms->maxrimal_exact_matches(seq, out_fd);
ms->maximal_exact_matches(seq, out_fd);

}

Expand All @@ -431,7 +438,8 @@ struct Args
size_t l = 25; // minumum MEM length
size_t th = 1; // number of threads
bool shaped_slp = false; // use shaped slp
bool extended_output = false; // use shaped slp
bool extended_output = false; // print one MEM occurrence in the reference
bool sam_output = false; // output MEMs in SAM format
};

void parseArgs(int argc, char *const argv[], Args &arg)
Expand All @@ -440,17 +448,18 @@ void parseArgs(int argc, char *const argv[], Args &arg)
extern char *optarg;
extern int optind;

std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-b batch]\n\n" +
std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-s sam_output] [-b batch]\n\n" +
"Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" +
" shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
"extended_output: [boolean] - print one MEM occurrence in ref. (def. false)\n" +
" sam_output: [boolean] - print output in SAM format. (def. false)\n" +
" pattens: [string] - path to patterns file.\n" +
" output: [string] - output file prefix.\n" +
" len: [integer] - minimum MEM lengt (def. 25)\n" +
" thread: [integer] - number of threads (def. 1)\n");

std::string sarg;
while ((c = getopt(argc, argv, "l:hp:o:t:qe")) != -1)
while ((c = getopt(argc, argv, "l:hp:o:t:qes")) != -1)
{
switch (c)
{
Expand All @@ -474,6 +483,9 @@ void parseArgs(int argc, char *const argv[], Args &arg)
case 'e':
arg.extended_output = true;
break;
case 's':
arg.sam_output = true;
break;
case 'h':
error(usage);
case '?':
Expand All @@ -490,6 +502,10 @@ void parseArgs(int argc, char *const argv[], Args &arg)
{
error("Invalid number of arguments\n", usage);
}

if (arg.extended_output && arg.sam_output) {
error("Cannot specify both extended_output and sam_output flags.\n", usage);
}
}

//********** end argument options ********************
Expand Down Expand Up @@ -554,10 +570,18 @@ void dispatcher(Args &args)
verbose("Printing plain output");
t_insert_start = std::chrono::high_resolution_clock::now();

std::ofstream f_mems(out_filename + ".mems");
std::string mems_file_suffix = args.sam_output ? ".sam" : ".mems";
std::ofstream f_mems(out_filename + mems_file_suffix);

if (!f_mems.is_open())
error("open() file " + std::string(out_filename) + ".mems failed");
error("open() file " + std::string(out_filename) + mems_file_suffix + " failed");

if(args.sam_output)
{
f_mems << "@HD\tVN:1.6\tSO:unknown\n";
f_mems << idx.to_sam();
f_mems << "@PG\tID:moni\tPN:moni\tVN:0.2.2\n";
}

size_t n_seq = 0;
for (size_t i = 0; i < args.th; ++i)
Expand All @@ -569,25 +593,49 @@ void dispatcher(Args &args)
error("open() file " + tmp_filename + " failed");

size_t length = 0;
size_t rname_l = 0;
size_t s_length = 0;
size_t m = 100; // Reserved size for pointers and lengths
std::vector<std::tuple<size_t,size_t,size_t>> mem(m);
size_t s = 100; // Reserved size for read name
size_t rseq_l = 100; // Reserved size for seq and qual
char* rname = (char *)malloc(s * sizeof(char));
while (!feof(in_fd) and fread(&length, sizeof(size_t), 1, in_fd) > 0)
char *rseq = (char *)malloc(rseq_l * sizeof(char));
char *rqual = (char *)malloc(rseq_l * sizeof(char));
while (!feof(in_fd) and fread(&rname_l, sizeof(size_t), 1, in_fd) > 0)
{
// Reading read name
if (s < length)
if (s < rname_l)
{
// Resize lengths and pointers
s = length;
rname = (char *)realloc(rname, m * sizeof(char));
s = rname_l;
rname = (char *)realloc(rname, s * sizeof(char));
}

if ((fread(rname, sizeof(char), length, in_fd)) != length)
if ((fread(rname, sizeof(char), rname_l, in_fd)) != rname_l)
error("fread() file " + std::string(tmp_filename) + " failed");

// TODO: Store the fasta headers somewhere
f_mems << ">" + std::string(rname,length) << endl;

// In case of SAM output read also the sequence and quals
if (args.sam_output)
{
if ((fread(&s_length, sizeof(size_t), 1, in_fd)) != 1)
error("fread() file " + std::string(tmp_filename) + " failed");
if (rseq_l < s_length)
{
// Resize s_lengths and pointers
rseq_l = s_length;
rseq = (char *)realloc(rseq, rseq_l * sizeof(char));
rqual = (char *)realloc(rqual, rseq_l * sizeof(char));
}
if ((fread(rseq, sizeof(char), s_length, in_fd)) != s_length)
error("fread() file " + std::string(tmp_filename) + " failed");
if ((fread(rqual, sizeof(char), s_length, in_fd)) != s_length)
error("fread() file " + std::string(tmp_filename) + " failed");
}
else
{
f_mems << ">" + std::string(rname, rname_l) << endl;
}

// Reading MEMs
if ((fread(&length, sizeof(size_t), 1, in_fd)) != 1)
Expand All @@ -605,20 +653,40 @@ void dispatcher(Args &args)

// TODO: Store the fasta headers somewhere
// f_mems << ">" + std::to_string(n_seq) << endl;
if (args.extended_output){
if (args.sam_output){
for (size_t i = 0; i < length; ++i)
{
size_t mem_pos = std::get<0>(mem[i]);
size_t mem_len = std::get<1>(mem[i]);
std::pair<std::string, size_t> pos = idx.index(std::get<2>(mem[i]));
f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") ";
}
} else {
for (size_t i = 0; i < length; ++i)
{
f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") ";
f_mems << std::string(rname,rname_l) + "\t";
// First MEM is primary, all other MEMs are non primary
f_mems << (i?"256\t":"0\t");
f_mems << pos.first << "\t" << pos.second + 1<< "\t60\t";
std::string cigar = "";
if (mem_pos > 0) cigar += std::to_string(mem_pos) + "S";
cigar += std::to_string(mem_len) + "M";
size_t suff_length = s_length - (mem_pos + mem_len);
if (suff_length > 0) cigar += std::to_string(suff_length) + "S";
f_mems << cigar + "\t" + std::string(rseq, s_length) + "\t" + std::string(rqual, s_length) + "\n";
}
} else
{
if (args.extended_output){
for (size_t i = 0; i < length; ++i)
{
std::pair<std::string, size_t> pos = idx.index(std::get<2>(mem[i]));
f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") ";
}
} else {
for (size_t i = 0; i < length; ++i)
{
f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") ";
}

}
f_mems << endl;
}
f_mems << endl;

n_seq++;
}
Expand All @@ -645,11 +713,17 @@ int main(int argc, char *const argv[])

if (args.shaped_slp)
{
dispatcher<mems_c<shaped_slp_t>>(args);
if (args.sam_output)
dispatcher<mems_c<shaped_slp_t,true>>(args);
else
dispatcher<mems_c<shaped_slp_t,false>>(args);
}
else
{
dispatcher<mems_c<plain_slp_t>>(args);
if(args.sam_output)
dispatcher<mems_c<plain_slp_t,true>>(args);
else
dispatcher<mems_c<plain_slp_t,false>>(args);
}
return 0;
}

0 comments on commit 4709421

Please sign in to comment.