Skip to content

Commit

Permalink
more robust sequence id extraction; allow user to select preferred fo…
Browse files Browse the repository at this point in the history
…rmat
  • Loading branch information
muellan committed Mar 11, 2024
1 parent d35f957 commit 7368290
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 36 deletions.
12 changes: 12 additions & 0 deletions docs/mode_build.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,18 @@ BASIC OPTIONS
alternative source in a post processing step.
default: 'nucl_(gb|wgs|est|gss).accession2taxid'

-sequence-id-format (smart|ncbi|gi|filename|leadingword)
Method used for extracting sequence IDs from filenames and
sequence headers.Sequence IDs are also used to assign taxa
to reference sequences.
Available types are:
smart : try NCBI > genbank > filename
ncbi : NCBI-style accession/accession.version
gi : genbank identifier
filename : filename without extension
leadingword : first stretch of non-whitespace characters
default: smart

-silent|-verbose information level during build:
silent => none / verbose => most detailed
default: neither => only errors/important info
Expand Down
12 changes: 12 additions & 0 deletions docs/mode_build_query.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ BASIC OPTIONS
alternative source in a post processing step.
default: 'nucl_(gb|wgs|est|gss).accession2taxid'

-sequence-id-format (smart|ncbi|gi|filename|leadingword)
Method used for extracting sequence IDs from filenames and
sequence headers.Sequence IDs are also used to assign taxa
to reference sequences.
Available types are:
smart : try NCBI > genbank > filename
ncbi : NCBI-style accession/accession.version
gi : genbank identifier
filename : filename without extension
leadingword : first stretch of non-whitespace characters
default: smart

-silent|-verbose information level during build:
silent => none / verbose => most detailed
default: neither => only errors/important info
Expand Down
30 changes: 21 additions & 9 deletions src/building.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ bool add_targets_to_database(
int dbPart,
input_batch& batch,
const std::map<string,taxon_id>& sequ2taxid,
sequence_id_type seqIdType,
info_level infoLvl)
{
for (auto& seq : batch) {
if (!db.check_load_factor(dbPart)) return false;
if (db.add_target_failed(dbPart)) return false;

if (!seq.data.empty()) {
auto seqId = extract_accession_string(
seq.header, sequence_id_type::any);
auto seqId = extract_accession_string(seq.header, seqIdType);

// make sure sequence id is not empty,
// use entire header if neccessary
Expand Down Expand Up @@ -312,6 +312,7 @@ bool add_targets_to_database(
void add_targets_to_database(database& db,
const std::vector<string>& infiles,
const std::map<string,taxon_id>& sequ2taxid,
sequence_id_type seqIdType,
info_level infoLvl)
{
// make executor that runs database insertion (concurrently) in batches
Expand Down Expand Up @@ -342,7 +343,7 @@ void add_targets_to_database(database& db,

batch_executor<input_sequence> executor { execOpt,
[&] (int id, auto& batch) {
return add_targets_to_database(db, id, batch, sequ2taxid, infoLvl);
return add_targets_to_database(db, id, batch, sequ2taxid, seqIdType, infoLvl);
}};

// spawn threads to read sequences
Expand All @@ -364,10 +365,15 @@ void add_targets_to_database(database& db,
}

try {
const auto fileAccession = extract_accession_string(
filename, sequence_id_type::acc_ver);

const taxon_id fileTaxId = find_taxon_id(sequ2taxid, fileAccession);
auto fileAccession = extract_accession_string(filename, seqIdType);
taxon_id fileTaxId = find_taxon_id(sequ2taxid, fileAccession);

if (fileTaxId == taxonomy::none_id() &&
seqIdType == sequence_id_type::smart)
{
fileAccession = extract_accession_string(filename, sequence_id_type::filename);
fileTaxId = find_taxon_id(sequ2taxid, fileAccession);
}

if (infoLvl == info_level::verbose) {
std::lock_guard<std::mutex> lock(outputMtx);
Expand Down Expand Up @@ -571,9 +577,15 @@ void add_to_database(database& db, const build_options& opt, timer& time)
opt.taxonomy.mappingPreFilesGlobal,
opt.infiles, opt.infoLevel);

if (notSilent) cout << "Processing reference sequences." << endl;
if (notSilent) {
cout << "Sequence ID extraction method: "
<< to_string(opt.sequenceIdType)
<< "\nProcessing reference sequences." << endl;
}


add_targets_to_database(db, opt.infiles, taxonMap, opt.infoLevel);
add_targets_to_database(db, opt.infiles, taxonMap,
opt.sequenceIdType, opt.infoLevel);

if (notSilent) {
clear_current_line(cout);
Expand Down
14 changes: 11 additions & 3 deletions src/classification.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,28 @@ ground_truth (const taxonomy_cache& taxonomy, const string& header)
{
// try to extract query id and find the corresponding target in database
const taxon* tax = nullptr;
tax = taxonomy.taxon_with_name(extract_accession_string(header, sequence_id_type::acc_ver));
tax = taxonomy.taxon_with_name(extract_accession_string(header, sequence_id_type::ncbi_acc_ver));
if (tax) return taxonomy.cached_next_ranked_ancestor(tax);

tax = taxonomy.taxon_with_similar_name(extract_accession_string(header, sequence_id_type::acc));
tax = taxonomy.taxon_with_similar_name(extract_accession_string(header, sequence_id_type::ncbi_acc));
if (tax) return taxonomy.cached_next_ranked_ancestor(tax);

// try to extract id from header
// try to extract id directly from header
tax = taxonomy.taxon_with_id(extract_taxon_id(header));
if (tax) return taxonomy.cached_next_ranked_ancestor(tax);

// try to find entire header as sequence identifier
tax = taxonomy.taxon_with_name(header);
if (tax) return taxonomy.cached_next_ranked_ancestor(tax);

// try first word of header
tax = taxonomy.taxon_with_name(extract_accession_string(header, sequence_id_type::leading_word));
if (tax) return taxonomy.cached_next_ranked_ancestor(tax);

// try to extract filename in header
tax = taxonomy.taxon_with_name(extract_accession_string(header, sequence_id_type::filename));
if (tax) return taxonomy.cached_next_ranked_ancestor(tax);

return nullptr;
}

Expand Down
31 changes: 31 additions & 0 deletions src/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,36 @@ info_level_cli(info_level& lvl, error_messages& err)



//-------------------------------------------------------------------
// / @brief shared command-line options for taxonomy
clipp::group
sequence_id_format_cli(sequence_id_type& type, error_messages&)
{
using namespace clipp;

return (
( option("-sequence-id-format") & one_of(
command("smart" ).set(type, sequence_id_type::smart),
command("ncbi" ).set(type, sequence_id_type::ncbi),
command("gi" ).set(type, sequence_id_type::genbank),
command("filename" ).set(type, sequence_id_type::filename),
command("leadingword").set(type, sequence_id_type::leading_word)
)
)
% "Method used for extracting sequence IDs from filenames and sequence headers."
"Sequence IDs are also used to assign taxa to reference sequences.\n"
"Available types are:\n"
" 'smart' : try NCBI accession > genbank identifier > filename\n"
" 'ncbi' : NCBI-style accession or accession.version ID\n"
" 'gi' : genbank identifier (number prefixed by 'gi|' or 'gi:' or 'gi=') \n"
" 'filename' : use string between first path separator and file extension\n"
" 'leadingword' : extracts first contiguous stretch of non-whitespace characters\n"
"default: smart\n"
);
}



//-------------------------------------------------------------------
// / @brief shared command-line options for taxonomy
clipp::group
Expand Down Expand Up @@ -500,6 +530,7 @@ build_mode_cli(build_options& opt, error_messages& err)
"BASIC OPTIONS" %
(
taxonomy_cli(opt.taxonomy, err),
sequence_id_format_cli(opt.sequenceIdType, err),
info_level_cli(opt.infoLevel, err)
),
"SKETCHING (SUBSAMPLING)" %
Expand Down
3 changes: 3 additions & 0 deletions src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ struct build_options
taxonomy_options taxonomy;
bool resetParents = false;

// default sequence id format (used for parsing sequence headers and filenames)
sequence_id_type sequenceIdType = sequence_id_type::smart;

info_level infoLevel = info_level::moderate;
};

Expand Down
132 changes: 115 additions & 17 deletions src/sequence_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,24 @@ void sequence_pair_reader::index_offset (index_type index)



/*************************************************************************//**
*
*****************************************************************************/
std::string to_string (sequence_id_type id) noexcept
{
switch (id) {
case sequence_id_type::smart: return "smart (NCBI accession > genbank > filename)";
case sequence_id_type::ncbi: return "NCBI accession";
case sequence_id_type::ncbi_acc: return "NCBI accession";
case sequence_id_type::ncbi_acc_ver: return "NCBI accession.version";
case sequence_id_type::filename: return "filename (without extension)";
case sequence_id_type::leading_word: return "leading contiguous stretch of non-whitespace characters";
case sequence_id_type::genbank: return "genbank";
}
return "";
}




/*************************************************************************//**
Expand All @@ -468,6 +486,61 @@ accession_regex("(^|[^[:alnum:]])(([A-Z][_A-Z]{1,9}[0-9]{5,})(\\.[0-9]+)?)",
std::regex::optimize);



/*************************************************************************//**
*
* @brief extracts first contiguous stretch of non-whitespace characters
*
*****************************************************************************/
string extract_leading_word (const string& text)
{
if (text.empty()) return text;

// skip leading whitespace
auto fst = std::find_if(text.begin(), text.end(), [](char c) {
return not std::isspace(static_cast<unsigned char>(c)); });

// whitespace only
if (fst == text.end()) return text;

// find first whitespace after first contiguous word
auto lst = std::find_if(fst+1, text.end(), [](char c) {
return std::isspace(static_cast<unsigned char>(c)); });

return string{fst,lst};
}



/*************************************************************************//**
*
* @brief extracts string between first path separator (or string start)
* and extension separator (or string end)
*
*****************************************************************************/
string extract_filename_without_extension (const string& text)
{
if (text.empty()) return text;

// find first occurrence of path separator
#ifdef _WIN32
// windows allows backslash *and* slash as path separator
auto fst = std::find(text.rbegin(), text.rend(), '\\').base();
auto sla = std::find(text.rbegin(), text.rend(), '/').base();
// use last position with either slash or backslash
if (sla > fst) fst = sla;
#else
auto fst = std::find(text.rbegin(), text.rend(), '/').base();
#endif

// find extension separator
auto ext = std::find(fst, text.end(), '.');

return string{fst,ext};
}



/*************************************************************************//**
*
* @brief extracts the NCBI accession[.version] number from a string
Expand All @@ -476,26 +549,35 @@ accession_regex("(^|[^[:alnum:]])(([A-Z][_A-Z]{1,9}[0-9]{5,})(\\.[0-9]+)?)",
*****************************************************************************/
string
extract_ncbi_accession_number (const string& text,
sequence_id_type idtype = sequence_id_type::any)
sequence_id_type idtype = sequence_id_type::ncbi)
{
if (text.empty()) return "";

std::smatch match;
std::regex_search(text, match, accession_regex);

switch (idtype) {
case sequence_id_type::acc:
case sequence_id_type::smart: // fallthrough
case sequence_id_type::ncbi:
return match[2];

case sequence_id_type::ncbi_acc:
return match[3];
case sequence_id_type::acc_ver:
if (match[4].length())

case sequence_id_type::ncbi_acc_ver:
if (match[4].length()) {
return match[2];
else
} else {
return "";
case sequence_id_type::gi:
return "";
default:
return match[2];
}

case sequence_id_type::filename: // fallthrough
case sequence_id_type::leading_word: // fallthrough
case sequence_id_type::genbank: // fallthrough
break;
}

return "";
}


Expand All @@ -511,6 +593,9 @@ extract_genbank_identifier (const string& text)
if (text.empty()) return "";

auto i = text.find("gi|");
if (i == string::npos) { i = text.find("gi:"); }
if (i == string::npos) { i = text.find("gi="); }

if (i != string::npos) {
// skip prefix
i += 3;
Expand All @@ -534,21 +619,34 @@ extract_accession_string (const string& text, sequence_id_type idtype)
if (text.empty()) return "";

switch (idtype) {
case sequence_id_type::acc:
// [[fallthrough]]
case sequence_id_type::acc_ver:
case sequence_id_type::ncbi_acc: // fallthrough
case sequence_id_type::ncbi_acc_ver:
return extract_ncbi_accession_number(text, idtype);
case sequence_id_type::gi:

case sequence_id_type::ncbi:
return extract_ncbi_accession_number(text);

case sequence_id_type::genbank:
return extract_genbank_identifier(text);
default: {

case sequence_id_type::leading_word:
return extract_leading_word(text);

case sequence_id_type::filename:
return extract_filename_without_extension(text);

case sequence_id_type::smart:
{
auto s = extract_ncbi_accession_number(text);
if (!s.empty()) return s;

s = extract_genbank_identifier(text);

return s;
if (!s.empty()) return s;
s = extract_filename_without_extension(text);
if (!s.empty()) return s;
}
}

return text;
}


Expand Down
Loading

0 comments on commit 7368290

Please sign in to comment.