Skip to content

Commit

Permalink
more
Browse files (browse the repository at this point in the history)
  • Loading branch information
blackrim committed Jan 25, 2021
1 parent 49eea0b commit e9b0a9d
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 122 deletions.
244 changes: 123 additions & 121 deletions src/GenBankReader.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,142 +58,144 @@ void GenBankReader::parse_file(string fl, string db_name){
sqlite3 *conn;
int rc = sqlite3_open(db_name.c_str(), &conn);
char *zErrMsg = 0;

int nseqs = 0;
sqlite3_exec(conn, "BEGIN TRANSACTION", NULL, NULL, NULL);
while(getline(file,ln)){
vector<string> tokens;
string del(" ");
Tokenize(ln,tokens,del);
for(int j=0;j<tokens.size();j++){
TrimSpaces(tokens[j]);
}
if(tokens.size() >= 1){
if(tokens[0] == "LOCUS"){
locus = tokens[1];
continue;
}
if(tokens[0] == "VERSION"){
ver = tokens[1];
continue;
}
if(tokens[0].find("/db_xref=\"taxon:")!= string::npos){
vector<string> t2;
string del2(":");
Tokenize(tokens[0],t2,del2);
taxid = t2[1].substr(0,t2[1].size()-1);
continue;
}
if(tokens[0] == "DEFINITION"){
descr = true;
string ln2 = ln;
TrimSpaces(ln2);
descrst += ln2.substr(12,ln2.size()-12);
continue;
}
//keep reading the description
if(descr == true){
if(tokens[0] == "ACCESSION"){
acc = tokens[1];
descr = false;
continue;
}else{
string ln2 = ln;
TrimSpaces(ln2);
descrst += " "+ln2;
continue;
vector<string> tokens;
string del(" ");
Tokenize(ln,tokens,del);
for(int j=0;j<tokens.size();j++){
TrimSpaces(tokens[j]);
}
}
if(tokens[0] == "TITLE" && titledone == false){
title = true;
titledone = true;
string ln2 = ln;
TrimSpaces(ln2);
titlest += ln2.substr(10,ln2.size()-10);
std::replace(titlest.begin(),titlest.end(), '\'',' ');
continue;
}
if(title == true){
if (tokens[0] == "JOURNAL"){
title = false;
continue;
}else{
string ln2 = ln;
TrimSpaces(ln2);
titlest += " "+ln2;
std::replace(titlest.begin(),titlest.end(), '\'',' ');
continue;
}
}
if(tokens[0] == "ORIGIN" && tokens.size() == 1){
seq = true;
continue;
}
//keep reading the sequence
//sequence is always the last thing
if(seq == true){
if(tokens[0] == "//"){
bool deposit = true;
if(locus.size() == 0){
cout<<"locus" << endl;
deposit = false;
}else if(taxid.size() ==0){
cout<<"taxid" << endl;
deposit = false;
}else if(acc.size() == 0 || titlest.size() == 0 || ver.size() == 0){
deposit = false;
cout << "acc titlest ver" << endl;
}else if(descrst.size() ==0){
cout<<"descr" << endl;
deposit = false;
}else if(seqst.size() ==0){
cout<<"seqst" << endl;
deposit = false;
if(tokens.size() >= 1){
if(tokens[0] == "LOCUS"){
locus = tokens[1];
continue;
}
string sql = "insert into sequence (ncbi_id,locus,accession_id,version_id,description,title,seq) values (";
sql += taxid+",'";
sql += locus+"','";
sql += acc+"','";
sql += ver+"','";
sql += descrst +"','";
sql += titlest +"','";
std::transform(seqst.begin(), seqst.end(), seqst.begin(), upper);
sql += seqst+"');";
size_t found = seqst.find_first_of("(),.[]@#$%!+=^&*\"'|-_/{}`~<>\\");
if(found != string::npos){
cout << taxid << "," << locus << "," << descrst << endl;
cout << seqst << endl;
exit(0);
if(tokens[0] == "VERSION"){
ver = tokens[1];
continue;
}
if(deposit == true){
rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
if(tokens[0].find("/db_xref=\"taxon:")!= string::npos){
vector<string> t2;
string del2(":");
Tokenize(tokens[0],t2,del2);
taxid = t2[1].substr(0,t2[1].size()-1);
continue;
}
seqst = "";
descrst = "";
locus = "";
taxid = "";
seq = false;
descr = false;
acc = "";
ver = "";
titlest = "";
title = false;
titledone = false;
}else{
for(int j=1;j<tokens.size();j++){
seqst+=tokens[j];
if(tokens[0] == "DEFINITION"){
descr = true;
string ln2 = ln;
TrimSpaces(ln2);
descrst += ln2.substr(12,ln2.size()-12);
continue;
}
//keep reading the description
if(descr == true){
if(tokens[0] == "ACCESSION"){
acc = tokens[1];
descr = false;
continue;
}else{
string ln2 = ln;
TrimSpaces(ln2);
descrst += " "+ln2;
continue;
}
}
if(tokens[0] == "TITLE" && titledone == false){
title = true;
titledone = true;
string ln2 = ln;
TrimSpaces(ln2);
titlest += ln2.substr(10,ln2.size()-10);
std::replace(titlest.begin(),titlest.end(), '\'',' ');
continue;
}
if(title == true){
if (tokens[0] == "JOURNAL"){
title = false;
continue;
}else{
string ln2 = ln;
TrimSpaces(ln2);
titlest += " "+ln2;
std::replace(titlest.begin(),titlest.end(), '\'',' ');
continue;
}
}
if(tokens[0] == "ORIGIN" && tokens.size() == 1){
seq = true;
continue;
}
//keep reading the sequence
//sequence is always the last thing
if(seq == true){
if(tokens[0] == "//"){
bool deposit = true;
if(locus.size() == 0){
cout<<"locus" << endl;
deposit = false;
}else if(taxid.size() ==0){
cout<<"taxid" << endl;
deposit = false;
}else if(acc.size() == 0 || titlest.size() == 0 || ver.size() == 0){
deposit = false;
cout << "acc titlest ver" << endl;
}else if(descrst.size() ==0){
cout<<"descr" << endl;
deposit = false;
}else if(seqst.size() ==0){
cout<<"seqst" << endl;
deposit = false;
}
string sql = "insert into sequence (ncbi_id,locus,accession_id,version_id,description,title,seq) values (";
sql += taxid+",'";
sql += locus+"','";
sql += acc+"','";
sql += ver+"','";
sql += descrst +"','";
sql += titlest +"','";
std::transform(seqst.begin(), seqst.end(), seqst.begin(), upper);
sql += seqst+"');";
size_t found = seqst.find_first_of("(),.[]@#$%!+=^&*\"'|-_/{}`~<>\\");
if(found != string::npos){
cout << taxid << "," << locus << "," << descrst << endl;
cout << seqst << endl;
exit(0);
}
if(deposit == true){
rc = sqlite3_exec(conn, sql.c_str(), 0, 0, 0);
nseqs++;
}
seqst = "";
descrst = "";
locus = "";
taxid = "";
seq = false;
descr = false;
acc = "";
ver = "";
titlest = "";
title = false;
titledone = false;
}else{
for(int j=1;j<tokens.size();j++){
seqst+=tokens[j];
}
continue;
}
}
continue;
}
}
/*
//SHOULDN'T NEED THIS
if(tokens[0] == "//"){
cout << "next seq" << endl;
break;
}
*/
}
}
}
cout << nseqs << " seqs read" << endl;
sqlite3_exec(conn, "COMMIT TRANSACTION", NULL, NULL, NULL);
sqlite3_close(conn);
}
Expand Down
2 changes: 1 addition & 1 deletion src/SQLiteDBController.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -361,7 +361,7 @@ void SQLiteDBController::load_seqs(string div,bool downl){
system(cmd.c_str());
cout << name << endl;
GenBankReader gbr;
gbr.parse_file(name,db_name);
gbr.parse_file(name_ngz,db_name);
remove(name_ngz.c_str());
cur += 1;
}
Expand Down

0 comments on commit e9b0a9d

Please sign in to comment.