Skip to content

Commit

Permalink
gff input support
Browse files Browse the repository at this point in the history
  • Loading branch information
mikolmogorov committed Oct 25, 2020
1 parent 3a3aa6b commit 357c971
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 12 deletions.
21 changes: 17 additions & 4 deletions src/breakpoint_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ BreakpointGraph::~BreakpointGraph()
BreakpointGraph::BreakpointGraph(const std::vector<Permutation>& permutations)
{
DEBUG_PRINT("Building breakpoint graph");
int maxBlockOverlap = 0;
for (auto &perm : permutations)
{
assert(!perm.blocks.empty());
Expand Down Expand Up @@ -62,11 +63,14 @@ BreakpointGraph::BreakpointGraph(const std::vector<Permutation>& permutations)
int leftPos = blockPair.first.end;
int rightPos = blockPair.second.start;

if (rightPos < leftPos)
maxBlockOverlap = std::max(maxBlockOverlap, leftPos - rightPos);
if (rightPos < leftPos) leftPos = rightPos;

/*if (rightPos < leftPos)
std::cerr << "WARNING: overlapping blocks\n"
<< blockPair.first.start << " " << blockPair.first.end
<< " " << blockPair.second.start << " "
<< blockPair.second.end << " | " << perm.seqId << "\n";
<< blockPair.second.end << " | " << perm.seqId << "\n";*/

curEdge = this->addEdge(-blockPair.first.signedId(),
blockPair.second.signedId(), perm.seqId);
Expand All @@ -82,10 +86,19 @@ BreakpointGraph::BreakpointGraph(const std::vector<Permutation>& permutations)
tailEdge->prevEdge = curEdge;
}

if (maxBlockOverlap > 0)
{
std::cerr << "\n\tWARNING: some alignment blocks were overlapping "
"by at most " << maxBlockOverlap << "\n" <<
"\tIt is unexpected and might result into nonsense synteny blocks.\n" <<
"\tPlease note that this tool currently support alignments\n" <<
"\tproduced by either Cactus or SibeliaZ.\n\n";

}
DEBUG_PRINT("Constructed graph with " << _nodes.size() << " nodes");
}

namespace
/*namespace
{
std::unordered_map<Edge*, int> getConjunctionEdges(BreakpointGraph& bg)
{
Expand Down Expand Up @@ -128,7 +141,7 @@ namespace
DEBUG_PRINT("Getting conjunction edges - done");
return edgeToGroup;
}
}
}*/

void BreakpointGraph::getPermutations(PermVec& permutations,
BlockGroups& blockGroups)
Expand Down
79 changes: 78 additions & 1 deletion src/maf_tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,28 @@
#include <sstream>
#include <iostream>

namespace
{
std::vector<std::string> split(const std::string& str, const std::string& delim)
{
std::vector<std::string> tokens;
size_t prev = 0, pos = 0;
do
{
pos = str.find(delim, prev);
if (pos == std::string::npos) pos = str.length();
std::string token = str.substr(prev, pos-prev);
if (!token.empty()) tokens.push_back(token);
prev = pos + delim.length();
}
while (pos < str.length() && prev < str.length());
return tokens;
}
}

PermVec mafToPermutations(const std::string& mafFile, int minBlockLen)
{
DEBUG_PRINT("Reading maf file");
std::cerr << "\tReading maf file" << std::endl;
const float MAX_GAP_RATE = 0.3;

int blockId = 1;
Expand Down Expand Up @@ -99,3 +118,61 @@ PermVec mafToPermutations(const std::string& mafFile, int minBlockLen)
DEBUG_PRINT("Finished reading");
return permutations;
}

PermVec parseGff(const std::string& filename, int minBlockLen)
{
std::cerr << "\tReading GFF file" << std::endl;

std::unordered_map<std::string, Permutation> permBySeqId;
//std::unordered_map<std::string, std::vector<Block>> newBlocks;

std::ifstream fin(filename);
if (!fin) throw std::runtime_error("Cannot open " + filename);

std::string line;
while(!fin.eof())
{
std::getline(fin, line);
if (line.empty()) continue;
if (line[0] == '#') continue;

auto tokens = split(line, "\t");
if (tokens.size() < 9) throw std::runtime_error("Error reading GFF");

std::string seqName = tokens[0];
int start = atoi(tokens[3].c_str());
int end = atoi(tokens[4].c_str());
int sign = tokens[6] == "+" ? 1 : -1;
std::string attributes = tokens[8];

int blockId = 0;
for (auto& attr : split(attributes, ";"))
{
if (attr.substr(0, 2) == "id") blockId = atoi(attr.substr(3).c_str());
//std::cout << attr << " " << blockId << std::endl;
}
if (blockId == 0) throw std::runtime_error("Error getting block ID");

//std::cerr << seqName << " " << blockId << " " << start << " " << end << std::endl;

permBySeqId[seqName].blocks.push_back(Block(blockId, sign, start, end));
permBySeqId[seqName].nucLength = std::max(permBySeqId[seqName].nucLength, end);
permBySeqId[seqName].seqName = seqName;
}

std::vector<Permutation> permutations;
int seqId = 1;
auto cmp = [](const Block& a, const Block& b) {return a.start < b.start;};
for (auto p : permBySeqId)
{
if (!p.second.blocks.empty())
{
permutations.push_back(std::move(p.second));
permutations.back().seqId = seqId++;
std::sort(permutations.back().blocks.begin(),
permutations.back().blocks.end(), cmp);
}
}
DEBUG_PRINT("Finished reading");
return permutations;
}
2 changes: 2 additions & 0 deletions src/maf_tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@
#include "breakpoint_graph.h"

PermVec mafToPermutations(const std::string& mafFile, int minBlockLen);

PermVec parseGff(const std::string& filename, int minBlockLen);
41 changes: 34 additions & 7 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,28 @@ std::vector<ParamPair> parseSimplParamsFile(const std::string& filename)
return out;
}

bool isMaf(const std::string& filename)
{
size_t dotPos = filename.rfind(".");
if (dotPos == std::string::npos)
{
throw std::runtime_error("Can't identify input file type");
}
std::string suffix = filename.substr(dotPos + 1);

if (suffix == "maf")
{
return true;
}
else if (suffix == "gff")
{
return false;
}
throw std::runtime_error("Can't identify input file type");
}

static std::vector<ParamPair> DEFAULT_PARAMS =
{{10, 10}, {30, 100}, {50, 5000}, {100, 7000}, {500, 10000},
{1500, 50000}, {5000, 100000}, {10000, 500000}, {50000, 1000000}};
{{30, 10}, {100, 100}, {500, 1000}, {1000, 5000}, {5000, 15000}};

void doJob(const std::string& inputMaf, const std::string& outDir,
std::vector<ParamPair> simplParams, std::vector<int> minBlockSizes)
Expand All @@ -118,9 +137,17 @@ void doJob(const std::string& inputMaf, const std::string& outDir,
std::sort(minBlockSizes.begin(), minBlockSizes.end(), std::greater<int>());
makeDirectory(outDir);

//read maf alignment and join adjacent columns
std::cerr << "\tParsing MAF file\n";
PermVec mafBlocks = mafToPermutations(inputMaf, MIN_ALIGNMENT);
//read block coordinates from file (either maf or gff)
PermVec mafBlocks;
if (isMaf(inputMaf))
{
mafBlocks = mafToPermutations(inputMaf, MIN_ALIGNMENT);
}
else
{
mafBlocks = parseGff(inputMaf, MIN_ALIGNMENT);
}

compressPaths(mafBlocks, MAX_ALIGNMENT_GAP, currentBlocks, blockGroups);

//iterative simplification
Expand Down Expand Up @@ -161,9 +188,9 @@ bool parseArgs(int argc, char** argv, std::string& mafFile, std::string& outDir,
auto printUsage = []()
{
std::cerr << "Usage: maf2synteny [-o out_dir] [-s simpl_params] "
<< "[-m block_sizes] maf_file\n\n"
<< "[-m block_sizes] alignment_file\n\n"
<< "positional arguments:\n"
<< "\tmaf_file\tpath to maf file\n"
<< "\talignment_file\tpath to alignment file in maf or gff format\n"
<< "\noptional arguments:\n"
<< "\t-o out_dir\tpath to the output directory [default = .]\n"
<< "\t-s simpl_params\tpath to a file with custom "
Expand Down

0 comments on commit 357c971

Please sign in to comment.