Skip to content

Commit

Permalink
Merge commit '68c8e7b92ad975d80702785d1509ba53f08be73f'
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Apr 21, 2021
2 parents ca97725 + 68c8e7b commit c12da22
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 85 deletions.
55 changes: 39 additions & 16 deletions lib/mmseqs/data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -185,21 +185,38 @@ case "${SELECTION}" in
"eggNOG")
if notExists "${TMP_PATH}/download.done"; then
date "+%s" > "${TMP_PATH}/version"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2/2_raw_algs.tar" "${TMP_PATH}/bacteria"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2157/2157_raw_algs.tar" "${TMP_PATH}/archea"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2759/2759_raw_algs.tar" "${TMP_PATH}/eukaryota"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/10239/10239_raw_algs.tar" "${TMP_PATH}/viruses"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2/2_raw_algs.tar" "${TMP_PATH}/bacteria.tar"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2157/2157_raw_algs.tar" "${TMP_PATH}/archea.tar"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2759/2759_raw_algs.tar" "${TMP_PATH}/eukaryota.tar"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/10239/10239_raw_algs.tar" "${TMP_PATH}/viruses.tar"
touch "${TMP_PATH}/download.done"
fi
INPUT_TYPE="eggNOG"
push_back "${TMP_PATH}/bacteria.tar"
push_back "${TMP_PATH}/archea.tar"
push_back "${TMP_PATH}/eukaryota.tar"
push_back "${TMP_PATH}/viruses.tar"
TAR2DB_INCLUDE='\.raw_alg\.faa\.gz$'
SED_FIX_LOOKUP='s|\.raw_alg\.faa\.gz||g'
;;
"VOGDB")
if notExists "${TMP_PATH}/download.done"; then
downloadFile "http://fileshare.csb.univie.ac.at/vog/latest/release.txt" "${TMP_PATH}/version"
downloadFile "http://fileshare.csb.univie.ac.at/vog/latest/vog.raw_algs.tar.gz" "${TMP_PATH}/vog.tar.gz"
touch "${TMP_PATH}/download.done"
fi
INPUT_TYPE="eggNOG"
push_back "${TMP_PATH}/vog.tar.gz"
TAR2DB_INCLUDE='\.msa$'
SED_FIX_LOOKUP='s|\.msa||g'
;;
"CDD")
if notExists "${TMP_PATH}/msa.msa.gz"; then
downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version"
downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz"
fi
INPUT_TYPE="FASTA_MSA"
FASTA_MSA_SED='s|\.FASTA||g'
SED_FIX_LOOKUP='s|\.FASTA||g'
FASTA_MSA_MSA2PROFILE_PAR="--skip-query"
;;
"Resfinder")
Expand All @@ -214,11 +231,12 @@ case "${SELECTION}" in
;;
"dbCAN2")
if notExists "${TMP_PATH}/download.done"; then
downloadFile "http://bcb.unl.edu/dbCAN2/download/dbCAN-fam-aln-V8.tar.gz" "${TMP_PATH}/msa.tar.gz"
printf "8 %s\n" "$(date "+%s")" > "${TMP_PATH}/version"
downloadFile "http://bcb.unl.edu/dbCAN2/download/dbCAN-fam-aln-V9.tar.gz" "${TMP_PATH}/msa.tar.gz"
printf "9 %s\n" "$(date "+%s")" > "${TMP_PATH}/version"
touch "${TMP_PATH}/download.done"
fi
INPUT_TYPE="FASTA_MSA"
SED_FIX_LOOKUP='s|\.aln||g'
;;
"SILVA")
if notExists "${TMP_PATH}/download.done"; then
Expand Down Expand Up @@ -311,11 +329,11 @@ case "${INPUT_TYPE}" in
# shellcheck disable=SC2086
"${MMSEQS}" tar2db "${TMP_PATH}/msa.tar.gz" "${TMP_PATH}/msa" --output-dbtype 11 ${THREADS_PAR} \
|| fail "tar2db died"
if [ -n "${FASTA_MSA_SED}" ]; then
sed "${FASTA_MSA_SED}" "${TMP_PATH}/msa.lookup" > "${TMP_PATH}/msa.lookup_tmp"
mv -f "${TMP_PATH}/msa.lookup_tmp" "${TMP_PATH}/msa.lookup"
if [ -n "${SED_FIX_LOOKUP}" ]; then
sed "${SED_FIX_LOOKUP}" "${TMP_PATH}/msa.lookup" > "${TMP_PATH}/msa.lookup_tmp"
mv -f -- "${TMP_PATH}/msa.lookup_tmp" "${TMP_PATH}/msa.lookup"
fi
rm -f "${TMP_PATH}/msa.tar.gz"
rm -f -- "${TMP_PATH}/msa.tar.gz"
# shellcheck disable=SC2086
"${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 ${FASTA_MSA_MSA2PROFILE_PAR} ${THREADS_PAR} \
|| fail "msa2profile died"
Expand All @@ -326,17 +344,22 @@ case "${INPUT_TYPE}" in
fi
;;
"eggNOG")
eval "set -- $ARR"
# shellcheck disable=SC2086
"${MMSEQS}" tar2db "${TMP_PATH}/bacteria" "${TMP_PATH}/archea" "${TMP_PATH}/eukaryota" "${TMP_PATH}/viruses" "${TMP_PATH}/msa" --output-dbtype 11 --tar-include '\.raw_alg\.faa\.gz$' ${THREADS_PAR} \
"${MMSEQS}" tar2db "${@}" "${TMP_PATH}/msa" --output-dbtype 11 --tar-include "${TAR2DB_INCLUDE}" ${THREADS_PAR} \
|| fail "tar2db died"
rm -f "${TMP_PATH}/bacteria.tar" "${TMP_PATH}/archea.tar" "${TMP_PATH}/eukaryota.tar" "${TMP_PATH}/viruses.tar"
rm -f -- "${@}"
if [ -n "${SED_FIX_LOOKUP}" ]; then
sed "${SED_FIX_LOOKUP}" "${TMP_PATH}/msa.lookup" > "${TMP_PATH}/msa.lookup_tmp"
mv -f -- "${TMP_PATH}/msa.lookup_tmp" "${TMP_PATH}/msa.lookup"
fi
sed 's|\.raw_alg\.faa\.gz||g' "${TMP_PATH}/msa.lookup" > "${TMP_PATH}/msa.lookup.tmp"
mv -f "${TMP_PATH}/msa.lookup.tmp" "${TMP_PATH}/msa.lookup"
mv -f -- "${TMP_PATH}/msa.lookup.tmp" "${TMP_PATH}/msa.lookup"
# shellcheck disable=SC2086
"${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 ${THREADS_PAR} \
|| fail "msa2profile died"
mv -f "${TMP_PATH}/msa.lookup" "${OUTDB}.lookup"
mv -f "${TMP_PATH}/msa.source" "${OUTDB}.source"
mv -f -- "${TMP_PATH}/msa.lookup" "${OUTDB}.lookup"
mv -f -- "${TMP_PATH}/msa.source" "${OUTDB}.source"
if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"${MMSEQS}" rmdb "${TMP_PATH}/msa" ${VERB_PAR} \
Expand Down
4 changes: 2 additions & 2 deletions lib/mmseqs/src/alignment/Alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,16 @@ Alignment::Alignment(const std::string &querySeqDB, const std::string &targetSeq
EXIT(EXIT_FAILURE);
}

if (addBacktrace == true) {
if (lcaAlign == false && addBacktrace == true) {
alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
}


if (lcaAlign == true) {
lcaSwMode = initSWMode(std::max(alignmentMode, (unsigned int)Parameters::ALIGNMENT_MODE_SCORE_ONLY), 0.0f, 0.0f);
realign = true;
realignScoreBias = 0.0f;
realignMaxSeqs = 1;
addBacktrace = false;
}

if (realign == true) {
Expand Down
8 changes: 6 additions & 2 deletions lib/mmseqs/src/alignment/rescorediagonal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,12 @@ int doRescorediagonal(Parameters &par,
if (totalMemory > resultReader.getTotalDataSize()) {
flushSize = resultReader.getSize();
}
size_t iterations = static_cast<int>(ceil(static_cast<double>(dbSize) / static_cast<double>(flushSize)));


size_t iterations = 1;
if(flushSize > 0){
iterations = static_cast<int>(ceil(static_cast<double>(dbSize) / static_cast<double>(flushSize)));
}

for (size_t i = 0; i < iterations; i++) {
size_t start = dbFrom + (i * flushSize);
size_t bucketSize = std::min(dbSize - (i * flushSize), flushSize);
Expand Down
30 changes: 13 additions & 17 deletions lib/mmseqs/src/commons/DBReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
FILE* dataFile = fopen(dataFileNames[fileIdx].c_str(), "r");
if (dataFile == NULL) {
Debug(Debug::ERROR) << "Can not open data file " << dataFileName << "!\n";
Debug(Debug::ERROR) << "Cannot open data file " << dataFileName << "!\n";
EXIT(EXIT_FAILURE);
}
size_t dataSize;
Expand All @@ -141,13 +141,13 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) {
std::string lookupFilename = (std::string(dataFileName) + ".lookup");
if(FileUtil::fileExists(lookupFilename.c_str()) == false){
Debug(Debug::ERROR) << "Can not open lookup file " << lookupFilename << "!\n";
MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
if (lookupData.isValid() == false) {
Debug(Debug::ERROR) << "Cannot open lookup file " << lookupFilename << "!\n";
EXIT(EXIT_FAILURE);
}
MemoryMapped indexData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
char* lookupDataChar = (char *) indexData.getData();
size_t lookupDataSize = indexData.size();
char* lookupDataChar = (char *) lookupData.getData();
size_t lookupDataSize = lookupData.size();
lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads);
lookup = new(std::nothrow) LookupEntry[this->lookupSize];
incrementMemory(sizeof(LookupEntry) * this->lookupSize);
Expand All @@ -157,25 +157,21 @@ template <typename T> bool DBReader<T>::open(int accessType){
} else {
SORT_PARALLEL(lookup, lookup + lookupSize, LookupEntry::compareByAccession);
}
indexData.close();
lookupData.close();
}
bool isSortedById = false;
if (externalData == false) {
if(FileUtil::fileExists(indexFileName)==false){
Debug(Debug::ERROR) << "Can not open index file " << indexFileName << "!\n";
EXIT(EXIT_FAILURE);
}
MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
if (!indexData.isValid()){
Debug(Debug::ERROR) << "Can map open index file " << indexFileName << "\n";
Debug(Debug::ERROR) << "Cannot open index file " << indexFileName << "\n";
EXIT(EXIT_FAILURE);
}
char* indexDataChar = (char *) indexData.getData();
size_t indexDataSize = indexData.size();
size = Util::ompCountLines(indexDataChar, indexDataSize, threads);

index = new(std::nothrow) Index[this->size];
Util::checkAllocation(index, "Can not allocate index memory in DBReader");
index = new(std::nothrow) Index[size];
Util::checkAllocation(index, "Cannot allocate index memory in DBReader");
incrementMemory(sizeof(Index) * size);

bool isSortedById = readIndex(indexDataChar, indexDataSize, index, dataSize);
Expand Down Expand Up @@ -203,7 +199,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
compressedBuffers[i] = (char*) malloc(compressedBufferSizes[i]);
incrementMemory(compressedBufferSizes[i]);
if(compressedBuffers[i]==NULL){
Debug(Debug::ERROR) << "Can not allocate compressedBuffer!\n";
Debug(Debug::ERROR) << "Cannot allocate compressedBuffer!\n";
EXIT(EXIT_FAILURE);
}
dstream[i] = ZSTD_createDStream();
Expand Down Expand Up @@ -236,7 +232,7 @@ void DBReader<std::string>::sortIndex(bool isSortedById) {
SORT_PARALLEL(index, index + size, Index::compareById);
} else {
if(accessType != NOSORT && accessType != HARDNOSORT){
Debug(Debug::ERROR) << "DBReader<std::string> can not be opened in sort mode\n";
Debug(Debug::ERROR) << "DBReader<std::string> cannot be opened in sort mode\n";
EXIT(EXIT_FAILURE);
}
}
Expand Down Expand Up @@ -456,7 +452,7 @@ template <typename T> void DBReader<T>::remapData(){
for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
FILE* dataFile = fopen(dataFileNames[fileIdx].c_str(), "r");
if (dataFile == NULL) {
Debug(Debug::ERROR) << "Can not open data file " << dataFileNames[fileIdx] << "!\n";
Debug(Debug::ERROR) << "Cannot open data file " << dataFileNames[fileIdx] << "!\n";
EXIT(EXIT_FAILURE);
}
size_t dataSize = 0;
Expand Down
50 changes: 24 additions & 26 deletions lib/mmseqs/src/commons/MemoryMapped.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ MemoryMapped::MemoryMapped()
#ifdef _MSC_VER
_mappedFile (NULL),
#endif
_mappedView (NULL)
{
}
_mappedView (NULL),
_emptyView ("") {}


/// open file, mappedBytes = 0 maps the whole file
Expand All @@ -66,8 +65,7 @@ MemoryMapped::MemoryMapped(const std::string& filename, size_t mappedBytes, Cach
_mappedFile (NULL),
#endif
_mappedView (NULL),
openned(false)
{
_emptyView ("") {
open(filename, mappedBytes, hint);
}

Expand All @@ -82,13 +80,12 @@ MemoryMapped::~MemoryMapped()
/// open file
bool MemoryMapped::open(const std::string& filename, size_t mappedBytes, CacheHint hint)
{

// already open ?
if (openned)
if (isValid())
return false;

_filesize = 0;

_file = 0;
_filesize = 0;
_hint = hint;
#ifdef _MSC_VER
_mappedFile = NULL;
Expand Down Expand Up @@ -150,7 +147,6 @@ bool MemoryMapped::open(const std::string& filename, size_t mappedBytes, CacheHi
if (!_mappedView)
return false;

openned = true;
// everything's fine
return true;
}
Expand All @@ -160,13 +156,14 @@ bool MemoryMapped::open(const std::string& filename, size_t mappedBytes, CacheHi
void MemoryMapped::close()
{
// kill pointer
if (_mappedView)
{
if (_mappedView) {
if (_mappedView != _emptyView) {
#ifdef _MSC_VER
::UnmapViewOfFile(_mappedView);
::UnmapViewOfFile(_mappedView);
#else
::munmap(_mappedView, _filesize);
::munmap(_mappedView, _filesize);
#endif
}
_mappedView = NULL;
}

Expand Down Expand Up @@ -222,14 +219,7 @@ const unsigned char* MemoryMapped::getData() const
/// true, if file successfully opened
bool MemoryMapped::isValid() const
{
if (_filesize)
{
return _mappedView != NULL;
}
else
{
return true;
}
return _mappedView != NULL;
}


Expand Down Expand Up @@ -257,13 +247,14 @@ bool MemoryMapped::remap(uint64_t offset, size_t mappedBytes)
mappedBytes = _filesize;

// close old mapping
if (_mappedView)
{
if (_mappedView) {
if (_mappedView != _emptyView) {
#ifdef _MSC_VER
::UnmapViewOfFile(_mappedView);
::UnmapViewOfFile(_mappedView);
#else
::munmap(_mappedView, _mappedBytes);
::munmap(_mappedView, _mappedBytes);
#endif
}
_mappedView = NULL;
}

Expand All @@ -273,6 +264,13 @@ bool MemoryMapped::remap(uint64_t offset, size_t mappedBytes)
if (offset + mappedBytes > _filesize)
mappedBytes = size_t(_filesize - offset);

// mmap doesn't support mapping 0-byte files;
// point the view at a fake empty view to support this case
if (mappedBytes == 0) {
_mappedView = (void*)_emptyView;
return true;
}

#ifdef _MSC_VER
// Windows

Expand Down
4 changes: 2 additions & 2 deletions lib/mmseqs/src/commons/MemoryMapped.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class MemoryMapped
FileHandle _file;
/// pointer to the file contents mapped into memory
void* _mappedView;

bool openned;
/// pointer to an empty string (to support mapping 0-byte size files)
const char* _emptyView;
};
#endif
3 changes: 3 additions & 0 deletions lib/mmseqs/src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1207,6 +1207,9 @@ Parameters::Parameters():
taxonomy = combineList(taxonomy, aggregatetaxweights);
taxonomy = combineList(taxonomy, lca);
taxonomy = combineList(taxonomy, searchworkflow);
taxonomy = removeParameter(taxonomy, PARAM_NUM_ITERATIONS);
taxonomy = removeParameter(taxonomy, PARAM_START_SENS);
taxonomy = removeParameter(taxonomy, PARAM_SENS_STEPS);
// easy taxonomy
easytaxonomy = combineList(taxonomy, addtaxonomy);
Expand Down
9 changes: 9 additions & 0 deletions lib/mmseqs/src/prefiltering/CacheFriendlyOperations.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ struct __attribute__((__packed__)) CounterResult {
unsigned int id;
unsigned short diagonal;
unsigned char count;

// Comparator ordering CounterResult entries by ascending id.
// Only the id field is compared; entries with equal ids are treated as
// equivalent (neither is ordered before the other).
static bool sortById(const CounterResult &first, const CounterResult &second) {
    return first.id < second.id;
}

};

template<unsigned int BINSIZE>
Expand Down
4 changes: 2 additions & 2 deletions lib/mmseqs/src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
Sequence seq(qdbr->getMaxSeqLen(), querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);
QueryMatcher matcher(indexTable, sequenceLookup, kmerSubMat, ungappedSubMat,
kmerThr, kmerSize, dbSize, std::max(tdbr->getMaxSeqLen(),qdbr->getMaxSeqLen()), maxResListLen, aaBiasCorrection,
diagonalScoring, minDiagScoreThr, takeOnlyBestKmer);
diagonalScoring, minDiagScoreThr, takeOnlyBestKmer, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES);

if (seq.profile_matrix != NULL) {
matcher.setProfileMatrix(seq.profile_matrix);
Expand Down Expand Up @@ -818,7 +818,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
}
}
// calculate prefiltering results
std::pair<hit_t *, size_t> prefResults = matcher.matchQuery(&seq, targetSeqId);
std::pair<hit_t *, size_t> prefResults = matcher.matchQuery(&seq, targetSeqId, targetSeqType==Parameters::DBTYPE_NUCLEOTIDES);
size_t resultSize = prefResults.second;
const float queryLength = static_cast<float>(qdbr->getSeqLen(id));
for (size_t i = 0; i < resultSize; i++) {
Expand Down
Loading

0 comments on commit c12da22

Please sign in to comment.