diff --git a/nbactions.xml b/nbactions.xml index 56dedd5..e842d11 100644 --- a/nbactions.xml +++ b/nbactions.xml @@ -10,7 +10,7 @@ org.codehaus.mojo:exec-maven-plugin:1.2.1:exec - -Xms4g -classpath %classpath edu.umn.msi.gx.mztosqlite.MzToSQLite -s /Users/jj/Downloads/iTRAQ_Erwinia.sqlite /Users/jj/gx/toolsheds/msi/mzid_sqlite/iTRAQ_Erwinia.mzid /Users/jj/gx/toolsheds/msi/mzid_sqlite/iTRAQ_Erwinia.mzML + -Xms4g -classpath %classpath edu.umn.msi.gx.mztosqlite.MzToSQLite -s /Users/jj/Downloads/GCC2015.sqlite /Users/jj/gxt/gxt/database/files/006/dataset_6617.dat /Users/jj/gxt/gxt/database/files/006/dataset_6616.dat /Users/jj/gxt/gxt/database/files/006/dataset_6620.dat /Users/jj/gxt/gxt/database/files/006/dataset_6618.dat java @@ -24,7 +24,7 @@ org.codehaus.mojo:exec-maven-plugin:1.2.1:exec - -Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -Xms4g -classpath %classpath edu.umn.msi.gx.mztosqlite.MzToSQLite -s /Users/jj/Downloads/iTRAQ_Erwinia.sqlite /Users/jj/gx/toolsheds/msi/mzid_sqlite/iTRAQ_Erwinia.mzid /Users/jj/gx/toolsheds/msi/mzid_sqlite/iTRAQ_Erwinia.mzML + -Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -Xms4g -classpath %classpath edu.umn.msi.gx.mztosqlite.MzToSQLite -s /Users/jj/Downloads/GCC2015.sqlite /Users/jj/gxt/gxt/database/files/006/dataset_6617.dat /Users/jj/gxt/gxt/database/files/006/dataset_6616.dat /Users/jj/gxt/gxt/database/files/006/dataset_6620.dat /Users/jj/gxt/gxt/database/files/006/dataset_6618.dat java true @@ -39,7 +39,7 @@ org.codehaus.mojo:exec-maven-plugin:1.2.1:exec - -Xms4g -classpath %classpath edu.umn.msi.gx.mztosqlite.MzToSQLite -s /Users/jj/Downloads/iTRAQ_Erwinia.sqlite /Users/jj/gx/toolsheds/msi/mzid_sqlite/iTRAQ_Erwinia.mzid /Users/jj/gx/toolsheds/msi/mzid_sqlite/iTRAQ_Erwinia.mzML + -Xms4g -classpath %classpath edu.umn.msi.gx.mztosqlite.MzToSQLite -s /Users/jj/Downloads/GCC2015.sqlite /Users/jj/gxt/gxt/database/files/006/dataset_6617.dat /Users/jj/gxt/gxt/database/files/006/dataset_6616.dat /Users/jj/gxt/gxt/database/files/006/dataset_6620.dat /Users/jj/gxt/gxt/database/files/006/dataset_6618.dat java diff --git a/pom.xml b/pom.xml index a179ae5..5e6a705 100644 --- a/pom.xml +++ b/pom.xml @@ -97,12 +97,61 @@ uk.ac.ebi.pride.tools pride-wrapper 1.2.0 - + + + com.github.samtools + htsjdk + 1.130 + org.tmatesoft.sqljet sqljet 1.1.10 + + com.compomics + utilities + 3.47.2 + + + commons-math + commons-math + + + servlet-api + javax.servlet + + + swingx + org.swinglabs + + + swing-layout + org.swinglabs + + + batik-all + batik + + + pdf-transcoder + batik + + + jfreechart + jfree + + + xercesImpl + xerces + + + ssj + umontreal.iro.lecuyer.gof + + + + diff --git a/src/main/java/edu/umn/msi/gx/mztosqlite/MzIdentParser.java b/src/main/java/edu/umn/msi/gx/mztosqlite/MzIdentParser.java index 239b972..1db2aea 100644 --- a/src/main/java/edu/umn/msi/gx/mztosqlite/MzIdentParser.java +++ b/src/main/java/edu/umn/msi/gx/mztosqlite/MzIdentParser.java @@ -6,12 +6,25 @@ package edu.umn.msi.gx.mztosqlite; +import com.compomics.util.protein.Header; +import java.io.BufferedReader; +import java.io.DataInputStream; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.text.DateFormat; +import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import uk.ac.ebi.jmzidml.MzIdentMLElement; import uk.ac.ebi.jmzidml.model.mzidml.AbstractParam; import uk.ac.ebi.jmzidml.model.mzidml.AnalysisData; @@ -41,8 +54,14 @@ * @version */ public class MzIdentParser { + String[] FASTA_ID_PATTERNS = {"^(?:sp|tr|gi[|][^|]+[|]ref)[|]([a-zA-Z0-9]+[|]*(:[_.][a-zA-Z0-9]+)*).*$", + "^(\\w+)\\s*(.*)$"}; + String filepath; - Map spectrumIdPkidMap; + /* map of spectrum id to database primary key*/ + private Map spectrumIdPkidMap; + private Map accToSeq = new HashMap<>(); + private Map accToDefline = new HashMap<>(); boolean verbose = false; public MzIdentParser() { @@ -69,6 +88,12 @@ public void parseIdent(MzParserHandler handler) { parseIdent(filepath,handler); } public void parseIdent(String filepath, MzParserHandler parseHandler) { + /* + TODO: Generate scores for: + DBSequence: %sequence_coverage + Spectral_identification: + */ + DateFormat dfmt = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.SSS"); File input = new File(filepath); boolean aUseSpectrumCache = true; Map sourceFileIdHashMap = new HashMap<>(); @@ -78,7 +103,8 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { Map spectraDataIdHashMap = new HashMap<>(); Map siiIdHashMap = new HashMap<>(); Map siiIdToSirHashMap = new HashMap<>(); - Map columnToScoreMap = new HashMap<>(); + Map scoreNameToClassMap = new HashMap<>(); + Map columnToScoreMap = new HashMap<>(); Map columnToProtScoreMap = new HashMap<>(); Map dbSequenceIdHashMap = new HashMap<>(); Map pdhIdHashMap = new HashMap<>(); @@ -92,6 +118,8 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { Map fkDBSequenceIdPkid = new HashMap<>(); Map fkSpectrumIdentificationIdPkid = new HashMap<>(); Map fkPeptideIdPkid = new HashMap<>(); + Map> spectrumIdentScores = new HashMap<>(); + Map scoreNameValueClass = new HashMap<>(); // Read mzIdentML file MzIdentMLUnmarshaller unmarshaller = new MzIdentMLUnmarshaller(input); DataCollection dc = unmarshaller.unmarshal(DataCollection.class); @@ -119,6 +147,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } // AnalysisSoftwareList AnalysisSoftware SoftwareName if (verbose) { + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over SourceFile"); } Iterator iterSourceFile = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SourceFile); @@ -133,7 +162,8 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } if (verbose) { System.out.println("...done"); - System.out.print("About to iterate over AnalysisSoftware"); + System.out.print(dfmt.format(new Date())); + System.out.print("About to iterate over AnalysisSoftware"); } Iterator iterAnalysisSoftware = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.AnalysisSoftware); while (iterAnalysisSoftware.hasNext()) { @@ -142,6 +172,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over DBsequence"); } Iterator iterDBSequence = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.DBSequence); @@ -149,6 +180,17 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { DBSequence dbSequence = iterDBSequence.next(); dbSequenceIdHashMap.put(dbSequence.getId(), dbSequence); Object SearchDatabase_pkid = fkSearchDatabaseIdPkid.get(dbSequence.getSearchDatabaseRef()); + if (dbSequence.getSeq() == null) { + String seq = accToSeq.get(dbSequence.getAccession()); + //System.out.println("seq: " + seq); + if(seq!=null){ + dbSequence.setSeq(seq); + if (dbSequence.getLength() == null) { + dbSequence.setLength(seq.length()); + } + } + } + Integer seqLen =dbSequence.getLength(); Map dbMap = new HashMap<>(); dbMap.put("SearchDatabase_pkid", SearchDatabase_pkid); dbMap.put("accession", dbSequence.getAccession()); @@ -166,6 +208,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over Peptide"); } Iterator iterPeptide = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.Peptide); @@ -226,6 +269,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over PepEvid"); } Iterator iterPeptideEvidence = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.PeptideEvidence); @@ -235,6 +279,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over Spectra Data"); } Iterator iterSpectraData = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SpectraData); @@ -244,6 +289,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over PDH"); } Iterator iterPDH = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.ProteinDetectionHypothesis); @@ -278,6 +324,7 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { Integer counter = 0; if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to iterate over SIR"); } Iterator iterSIR = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SpectrumIdentificationResult); @@ -292,17 +339,35 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { siiIdHashMap.put(sii.getId(), sii); siiIdToSirHashMap.put(sii.getId(), sir); for (CvParam cvParam : sii.getCvParam()) { - if (cvParam.getValue() != null) { + String value = cvParam.getValue(); + if (value != null && !value.isEmpty()) { if (!columnToScoreMap.containsValue(cvParam.getName())) { columnToScoreMap.put(counter, cvParam.getName()); + Class colClass = String.class; + try { + int parseInt = Integer.parseInt(value); + colClass = Integer.class; + + } catch (NumberFormatException exi) { + try { + double parseDouble = Double.parseDouble(value); + colClass = Double.class; + } catch (NumberFormatException exd) { + + } + } + scoreNameToClassMap.put(cvParam.getName(), colClass); counter++; } } } } } + // Add new columns to score table + parseHandler.addTableColumns("Score", scoreNameToClassMap); if (verbose) { System.out.println("...done"); + System.out.print(dfmt.format(new Date())); System.out.print("About to create output"); } for (SpectrumIdentificationResult sir : sirList) { @@ -310,9 +375,9 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { SpectraData spectraData = spectraDataIdHashMap.get(sir.getSpectraDataRef()); //String spectrumID = sir.getSpectrumID(); Double rtInSeconds = -1.0; - String spectrumTitle = ""; - // - // + String spectrumTitle = null; + // + // // for (CvParam cvParam : sir.getCvParam()) { // Updated by FG: checking for old CV param 1114 or newer correct CV term 16. @@ -340,7 +405,11 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { String indentificationID = sii.getId(); //psmValues.put("id", spectrumID); // TEXT if (spectrumIdPkidMap != null) { - psmValues.put("Spectrum_pkid",spectrumIdPkidMap.get(spectrumID)); + if ((spectrumTitle != null) && (spectrumIdPkidMap.get(spectrumTitle) != null)) { + psmValues.put("Spectrum_pkid", spectrumIdPkidMap.get(spectrumTitle)); + } else { + psmValues.put("Spectrum_pkid", spectrumIdPkidMap.get(spectrumID)); + } } psmValues.put("spectrum_id", spectrumID); psmValues.put("acquisitionNum", null); // TEXT @@ -351,17 +420,36 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { psmValues.put("experimentalMassToCharge", sii.getCalculatedMassToCharge()); // REAL psmValues.put("calculatedMassToCharge", sii.getCalculatedMassToCharge()); // REAL - Map mapNameToValue = new HashMap<>(); +// Map mapNameToValue = new HashMap<>(); + //Handle scores for (AbstractParam param : sii.getParamGroup()) { - mapNameToValue.put(param.getName(), param.getValue()); - //System.out.println("test1" + param.getName() + "-> " + param.getValue()); - } - //Handle scores - for (int i = 0; i < columnToScoreMap.size(); i++) { - String score = columnToScoreMap.get(i); - Object scoreValue = mapNameToValue.containsKey(score) ? mapNameToValue.get(score) : null; - scoreValues.put(score, scoreValue); + String pname = param.getName(); + String pvalue = param.getValue(); + Object value = null; + if (pvalue != null && !pvalue.isEmpty()) { + Class colClass = scoreNameToClassMap.get(pname); + if (colClass == String.class) { + value = pvalue; + } else { + try { + if (colClass == Double.class) { + value = new Double(pvalue); + } else if (colClass == Integer.class) { + value = new Integer(pvalue); + } + } catch (NumberFormatException exd) { + + } + } + //System.out.println("test1" + param.getName() + "-> " + param.getValue()); + } + scoreValues.put(param.getName(), param.getValue()); } +// for (int i = 0; i < columnToScoreMap.size(); i++) { +// String score = columnToScoreMap.get(i); +// Object scoreValue = mapNameToValue.containsKey(score) ? mapNameToValue.get(score) : null; +// scoreValues.put(score, scoreValue); +// } Object SpectrumIdentification_pkid = parseHandler.handle("SpectrumIdentification", psmValues); List peptideEvidenceRefs = sii.getPeptideEvidenceRef(); @@ -389,7 +477,98 @@ public void parseIdent(String filepath, MzParserHandler parseHandler) { } } - + if (verbose) { + System.out.println("...done"); + System.out.println(dfmt.format(new Date())); + } + } + + public void readFasta(String inputFasta) { + List pats = new ArrayList<>(); + for (String pat : FASTA_ID_PATTERNS) { + pats.add(Pattern.compile(pat)); + } + InputStream fstream = null; + try { + fstream = new FileInputStream(inputFasta); + // Get the object of DataInputStream + InputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String line; + String currSequence = ""; + String currProtAcc = null; + String currDefline = null; + int recordCounter = 0; + while ((line = br.readLine()) != null) { + line = line.replaceAll("\n",""); + + if(line.contains(">")){ + //Insert previous into hash and reset + if(recordCounter != 0){ + currSequence = currSequence.replaceAll(" ",""); + accToSeq.put(currProtAcc, currSequence); + //System.out.println("Inserting:" + currProtAcc + "_" + currSequence); + accToDefline.put(currProtAcc, currDefline); + //System.out.println("Inserting2:" + currProtAcc + "_" + currDefline); + + currSequence = ""; + } + + try { + Header header = Header.parseFromFASTA(line); + currProtAcc = header.getAccession(); + currDefline = header.getDescription(); + } catch (Exception ex) { + line = line.replaceAll(">", ""); + for (Pattern p : pats) { + Matcher m = p.matcher(line); + if (m.matches()) { + switch (m.groupCount()) { + case 2: + currDefline = m.group(2); + case 1: + currProtAcc = m.group(1); + break; + default: + currProtAcc = line; + } + break; + } + } + } + recordCounter++; + } else { + currSequence += line; + } + } + //handle last + accToSeq.put(currProtAcc, currSequence.replaceAll(" ","")); + accToDefline.put(currProtAcc, currDefline); + //Close the input stream + in.close(); + } catch (FileNotFoundException ex) { + String methodName =Thread.currentThread().getStackTrace()[1].getMethodName(); + String className = this.getClass().getName(); + String message= "The task \""+methodName + "\" in the class \""+ className + "\" was not completed because of "+ ex.getMessage()+"."+ + "\nPlease see the reference guide at 01 for more information on this error. https://code.google.com/p/mzidentml-lib/wiki/CommonErrors "; + System.out.println (message); + } catch (IOException ex) { + String methodName =Thread.currentThread().getStackTrace()[1].getMethodName(); + String className = this.getClass().getName(); + String message= "The task \""+methodName + "\" in the class \""+ className + "\" was not completed because of "+ ex.getMessage()+"."+ + "\nPlease see the reference guide at 02 for more information on this error. https://code.google.com/p/mzidentml-lib/wiki/CommonErrors "; + System.out.println (message); + } finally { + try { + fstream.close(); + } catch (IOException ex) { + String methodName =Thread.currentThread().getStackTrace()[1].getMethodName(); + String className = this.getClass().getName(); + String message= "The task \""+methodName + "\" in the class \""+ className + "\" was not completed because of "+ ex.getMessage()+"."+ + "\nPlease see the reference guide at 02 for more information on this error. https://code.google.com/p/mzidentml-lib/wiki/CommonErrors "; + System.out.println (message); + } + } } public static void main(String[] args) { diff --git a/src/main/java/edu/umn/msi/gx/mztosqlite/MzParserHandler.java b/src/main/java/edu/umn/msi/gx/mztosqlite/MzParserHandler.java index 7d0c354..d378c16 100644 --- a/src/main/java/edu/umn/msi/gx/mztosqlite/MzParserHandler.java +++ b/src/main/java/edu/umn/msi/gx/mztosqlite/MzParserHandler.java @@ -23,5 +23,9 @@ public MzParserHandler(MzSQLiteDB mzSQLiteDB) { public Object handle(String tableName, Map values) { return this.mzSQLiteDB.insertOrReplace(tableName, values); } + + public void addTableColumns(String table,Map colTypes) { + mzSQLiteDB.addTableColumns(table, colTypes); + } } diff --git a/src/main/java/edu/umn/msi/gx/mztosqlite/MzSQLiteDB.java b/src/main/java/edu/umn/msi/gx/mztosqlite/MzSQLiteDB.java index cd4b5e1..a60088f 100644 --- a/src/main/java/edu/umn/msi/gx/mztosqlite/MzSQLiteDB.java +++ b/src/main/java/edu/umn/msi/gx/mztosqlite/MzSQLiteDB.java @@ -17,6 +17,7 @@ import org.tmatesoft.sqljet.core.SqlJetTransactionMode; import org.tmatesoft.sqljet.core.schema.ISqlJetColumnDef; import org.tmatesoft.sqljet.core.schema.SqlJetConflictAction; +import org.tmatesoft.sqljet.core.table.ISqlJetCursor; import org.tmatesoft.sqljet.core.table.ISqlJetTable; import org.tmatesoft.sqljet.core.table.SqlJetDb; @@ -37,10 +38,42 @@ public class MzSQLiteDB { public static final String CREATE_SpectrumIdentification_TABLE = "CREATE TABLE SpectrumIdentification (pkid INTEGER PRIMARY KEY, Spectrum_pkid INTEGER, spectrum_id TEXT, acquisitionNum INTEGER, chargeState INTEGER, retentionTime REAL,rank INTEGER, passThreshold INTEGER, experimentalMassToCharge REAL, calculatedMassToCharge REAL)"; // public static final String CREATE_Fragmentation_TABLE = "CREATE TABLE Fragmentation (pkid INTEGER PRIMARY KEY, spectrum_identification_id TEXT, charge INTEGER, index TEXT)"; public static final String CREATE_Spectrum_TABLE = "CREATE TABLE Spectrum (pkid INTEGER PRIMARY KEY, id TEXT, acquisitionNum INTEGER, msLevel INTEGER, polarity INTEGER, peaksCount INTEGER, totIonCurrent REAL, retentionTime REAL, basePeakMZ REAL, basePeakIntensity REAL, collisionEnergy REAL, ionisationEnergy REAL, lowMZ REAL, highMZ REAL, precursorScanNum INTEGER, precursorMZ REAL, precursorCharge INTEGER, precursorIntensity REAL )"; - public static final String CREATE_Peaks_TABLE = "CREATE TABLE Peaks (pkid INTEGER PRIMARY KEY, id TEXT, spectrum_pkid INT REFERENCES Spectrum(pkid), acquisitionNum INTEGER, moz TEXT, intensity TEXT)"; - + public static final String CREATE_Peaks_TABLE = "CREATE TABLE Peaks (pkid INTEGER PRIMARY KEY, Spectrum_pkid INT REFERENCES Spectrum(pkid), acquisitionNum INTEGER, moz TEXT, intensity TEXT)"; public static final String[] TABLE_DEFS = {CREATE_Source_TABLE, CREATE_SpectraData_TABLE, CREATE_SearchDatabase_TABLE, CREATE_DBSequence_TABLE, CREATE_Peptide_TABLE, CREATE_PeptideEvidence_TABLE, CREATE_Modification_TABLE, CREATE_SpectrumIdentification_TABLE, CREATE_Score_TABLE, CREATE_Spectrum_TABLE,CREATE_Peaks_TABLE}; - + // Indexes + public static final String CREATE_DBSequence_DB_Accession_INDEX = "CREATE INDEX DBSequence_db_accession_idx ON DBSequence(SearchDatabase_pkid,accession)"; + public static final String CREATE_DBSequence_Accession_INDEX = "CREATE INDEX DBSequence_acc_idx ON DBSequence(accession)"; + public static final String CREATE_Peptide_sequence_INDEX = "CREATE INDEX Peptide_sequence_idx ON Peptide(sequence)"; + public static final String CREATE_PeptideEvidence_FKEYs_INDEX = "CREATE INDEX PeptideEvidence_fkey_idx on PeptideEvidence (spectrumidentification_pkid,dbsequence_pkid,peptide_pkid)"; + + public static final String[] INDEX_DEFS = {CREATE_DBSequence_Accession_INDEX, CREATE_Peptide_sequence_INDEX, CREATE_PeptideEvidence_FKEYs_INDEX}; + /* Sample Protein query: + SELECT dbs.accession, dbs.description, count(si.pkid) AS ScanCount + FROM spectrum, peptide, DBSequence dbs, peptideevidence pe, spectrumidentification si, score + WHERE pe.dbsequence_pkid = dbs.pkid + AND pe.peptide_pkid = peptide.pkid + AND pe.spectrumidentification_pkid = si.pkid + AND si.spectrum_pkid = spectrum.pkid + AND score.spectrumidentification_pkid = si.pkid + GROUP BY dbs.accession + ORDER BY ScanCount DESC; + */ + /* Sample PeptideSpetralMatch query: + SELECT CAST(Score.'PeptideShaker PSM score' as Number) AS 'PeptideShaker PSM score', + CAST(Score.'OMSSA:evalue' as Number) AS 'OMSSA:evalue',CAST(Score.'MS-GF:SpecEValue' as Number) AS 'MS-GF:SpecEValue', + CAST(Score.'theoretical mass' as Number) AS 'theoretical mass', + CAST(Score.'PeptideShaker PSM confidence' as Number) AS 'PeptideShaker PSM confidence', + Spectrum.acquisitionNum,Spectrum.msLevel,Spectrum.polarity,Spectrum.peaksCount,Spectrum.totIonCurrent, + Spectrum.retentionTime,Spectrum.basePeakMZ,Spectrum.basePeakIntensity,Spectrum.collisionEnergy, + Spectrum.ionisationEnergy,Spectrum.lowMZ,Spectrum.highMZ,Spectrum.precursorScanNum,Spectrum.precursorMZ, + Spectrum.precursorCharge,Spectrum.precursorIntensity,Spectrum.title,Peptide.sequence,Peptide.modNum,dbs.accession, si.pkid + FROM spectrum, peptide, DBSequence dbs, peptideevidence pe, spectrumidentification si, score + WHERE pe.dbsequence_pkid = dbs.pkid + AND pe.peptide_pkid = peptide.pkid + AND pe.spectrumidentification_pkid = si.pkid + AND si.spectrum_pkid = spectrum.pkid + AND score.spectrumidentification_pkid = si.pkid; + */ String dbFilePath = null; File dbFile = null; SqlJetDb sqlJetDb = null; @@ -59,7 +92,7 @@ public MzSQLiteDB(String dbFilePath) { this.dbFilePath = dbFilePath; this.dbFile = new File(dbFilePath); dbFile.delete(); - Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.FINE, dbFile.getAbsolutePath() + " writable " + dbFile.canWrite()); + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.FINE, "{0} writable {1}", new Object[]{dbFile.getAbsolutePath(), dbFile.canWrite()}); } SqlJetDb getDB() throws SqlJetException { @@ -87,6 +120,9 @@ public void createTables(SqlJetDb db) throws SqlJetException { Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.FINE, createTableStmt); db.createTable(createTableStmt); } + for (String indexStmt : INDEX_DEFS) { + db.createIndex(indexStmt); + } } catch (SqlJetException ex) { Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.SEVERE, null, ex); } finally { @@ -134,6 +170,36 @@ public void checkColumns(String tableName, Map values) { } } + public void addTableColumns(String tableName,Map colTypes) { + try { + SqlJetDb db = getDB(); + db.getSchema().getTableNames(); + db.beginTransaction(SqlJetTransactionMode.WRITE); + try { + ISqlJetTable table = db.getTable(tableName); + for (String field : colTypes.keySet()) { + if (table.getDefinition().getColumn(field) == null) { + String columnName = sanitizeColumnName(field); + Class colClass = colTypes.get(field); + String fieldType = colClass.isAssignableFrom(Double.class) ? "REAL" : colClass.isAssignableFrom(Integer.class) ? "INTEGER" : "TEXT"; + String alterTable = "ALTER TABLE " + tableName + " ADD COLUMN " + columnName + " " + fieldType; + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.FINE, alterTable); + db.alterTable(alterTable); + ISqlJetColumnDef column = table.getDefinition().getColumn(columnName); + getSchemaMap().get(tableName).put(field, column); + } + } + } catch (SqlJetException ex) { + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.SEVERE, null, ex); + } + db.commit(); + db.close(); + } catch (SqlJetException ex) { + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.SEVERE, null, ex); + } + + } + public void addColumns(String tableName,Map values) { try { SqlJetDb db = getDB(); @@ -193,8 +259,45 @@ public Object insertOrReplace(String tableName, Map values) { } return rowid; } - + public Object update(String tableName, Object pkid, Map values) { + Object rowid = null; + try { + checkColumns(tableName,values); + SqlJetDb db = getDB(); + db.beginTransaction(SqlJetTransactionMode.WRITE); + try { + ISqlJetTable table = db.getTable(tableName); + ISqlJetCursor lookup = table.lookup(null, pkid); + lookup.updateByFieldNames(values); + rowid = lookup.getRowId(); + } catch (SqlJetException ex) { + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.SEVERE, values.toString(), ex); + } + db.commit(); + db.close(); + } catch (SqlJetException ex) { + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.SEVERE, null, ex); + } + return rowid; + } + + public Map getValues(String tableName, Object pkid, Map values) { + Map rowValues = values != null ? values : new HashMap(); + try { + SqlJetDb db = getDB(); + ISqlJetTable table = db.getTable(tableName); + Map tableMap = getSchemaMap().get(tableName); + ISqlJetCursor lookup = table.lookup(null, pkid); + for (String field : tableMap.keySet()) { + rowValues.put(field, lookup.getValue(field)); + } + } catch (SqlJetException ex) { + Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.SEVERE, null, ex); + } + return rowValues; + } + public static void main(String[] args) { String dbpath = new File(args.length > 0 ? args[0] : "/Users/jj/tmp/mzSQLiteDB.sqlite").getAbsolutePath(); Logger.getLogger(MzSQLiteDB.class.getName()).log(Level.FINE, dbpath); diff --git a/src/main/java/edu/umn/msi/gx/mztosqlite/MzSpectrumParser.java b/src/main/java/edu/umn/msi/gx/mztosqlite/MzSpectrumParser.java index 27cd3d5..f966bfd 100644 --- a/src/main/java/edu/umn/msi/gx/mztosqlite/MzSpectrumParser.java +++ b/src/main/java/edu/umn/msi/gx/mztosqlite/MzSpectrumParser.java @@ -130,13 +130,13 @@ public void parseSpectrum(String filepath, ProteomicsFormat format, MzParserHand case "MS:1000894": spectrumValues.put("retentionTime", cv.getValue()); break; - /* MGF: additional params - CvParam("retention time", retentionTime, "MS", "MS:1000894") + case "MS:1000796": + spectrumValues.put("title", cv.getValue()); + break; + /* Additional params CvParam("peak list scans", scan, "MS", "MS:1000797") - CvParam("spectrum title", title, "MS", "MS:1000796") - CvParam("Fragment mass tolerance setting", tolerance.toString(), "PRIDE", "PRIDE:0000161") CvParam("Fragment mass tolerance setting", tolerance.toString(), "PRIDE", "PRIDE:0000161") - */ + */ default: break; } @@ -144,7 +144,10 @@ public void parseSpectrum(String filepath, ProteomicsFormat format, MzParserHand Object spectrum_pkid = parseHandler.handle("Spectrum", spectrumValues); if (spectrumIdPkidMap != null) { - spectrumIdPkidMap.put(spectrumID, spectrum_pkid); + spectrumIdPkidMap.put(spectrumID, spectrum_pkid); + if (spectrumValues.get("title") != null) { + spectrumIdPkidMap.put((String) spectrumValues.get("title"), spectrum_pkid); + } } List mozArray = new ArrayList<>(); List intensityArray = new ArrayList<>(); @@ -157,11 +160,12 @@ public void parseSpectrum(String filepath, ProteomicsFormat format, MzParserHand String intensity = intensityArray.toString().replaceAll(" ", ""); Map peakValues = new HashMap<>(); peakValues.put("acquisitionNum", acquisitionNum); - peakValues.put("spectrum_pkid", spectrum_pkid); + peakValues.put("Spectrum_pkid", spectrum_pkid); peakValues.put("moz", moz); peakValues.put("intensity", intensity); - Object peak_pkid = parseHandler.handle("Peaks", peakValues); + Object peak_pkid = parseHandler.handle("Peaks", peakValues); } + } private JMzReader getSpectrumParser(File inputFile,ProteomicsFormat format) throws Exception { diff --git a/src/main/java/edu/umn/msi/gx/mztosqlite/MzToSQLite.java b/src/main/java/edu/umn/msi/gx/mztosqlite/MzToSQLite.java index 3111e03..ff3aa5d 100644 --- a/src/main/java/edu/umn/msi/gx/mztosqlite/MzToSQLite.java +++ b/src/main/java/edu/umn/msi/gx/mztosqlite/MzToSQLite.java @@ -30,6 +30,7 @@ public class MzToSQLite { Map scanFiles = new HashMap<>(); Map identFiles = new HashMap<>(); + Map seqDbFiles = new HashMap<>(); String dbPath = null; String jsonPath = null; String tsvPath = null; @@ -37,15 +38,25 @@ public class MzToSQLite { MzSQLiteDB mzSQLiteDB = null; public final void parseOptions(String[] args) { + Integer MAX_INPUTS = 100; Parser parser = new BasicParser(); String dbOpt = "sqlite"; + String inputFileOpt = "input"; + String inputNameOpt = "name"; + String inputIdOpt = "encoded_id"; String verboseOpt = "verbose"; String helpOpt = "help"; Options options = new Options(); options.addOption("s", dbOpt, true, "SQLite output file"); options.addOption("v", verboseOpt, false, "verbose"); options.addOption("h", helpOpt, false, "help"); - + options.addOption("i", inputFileOpt, verbose, "input file"); + options.addOption("n", inputNameOpt, verbose, "name for input file"); + options.addOption("e", inputIdOpt, verbose, "encoded id for input file"); + options.addOption("f", inputIdOpt, verbose, "FASTA Search Database files"); + options.getOption(inputFileOpt).setArgs(MAX_INPUTS); + options.getOption(inputNameOpt).setArgs(MAX_INPUTS); + options.getOption(inputIdOpt).setArgs(MAX_INPUTS); // create the parser try { // parse the command line arguments @@ -89,6 +100,9 @@ public final void parseOptions(String[] args) { case PRIDEXML: scanFiles.put(filePath, format); break; + case FASTA: + seqDbFiles.put(filePath, format); + break; case PEPXML: case UNSUPPORTED: default: @@ -102,7 +116,6 @@ public final void parseOptions(String[] args) { Logger.getLogger(MzToSQLite.class.getName()).log(Level.WARNING, "Unable to read {0}", filePath); } } - } } catch (ParseException exp) { Logger.getLogger(MzToSQLite.class.getName()).log(Level.SEVERE, null, exp); @@ -135,12 +148,14 @@ public void processFiles() { source.put("format", format.toString()); handler.handle("Source", source); MzIdentParser mzIdentParser = new MzIdentParser(filepath,spectrumIdPkidMap); + for (String fastapath : seqDbFiles.keySet()) { + mzIdentParser.readFasta(fastapath); + } mzIdentParser.parseIdent(handler); - } + } } public static void main(String[] args) { - try { MzToSQLite mzToSQLite = new MzToSQLite(); mzToSQLite.parseOptions(args); diff --git a/src/main/java/edu/umn/msi/gx/mztosqlite/ProteomicsFormat.java b/src/main/java/edu/umn/msi/gx/mztosqlite/ProteomicsFormat.java index 1f41f2c..4059020 100644 --- a/src/main/java/edu/umn/msi/gx/mztosqlite/ProteomicsFormat.java +++ b/src/main/java/edu/umn/msi/gx/mztosqlite/ProteomicsFormat.java @@ -22,6 +22,7 @@ public enum ProteomicsFormat { MZDATA(".xml"), PRIDEXML(".xml"), PEPXML(".pep.xml"), + FASTA(".fasta"), UNSUPPORTED("?"); private final String extension; @@ -51,46 +52,49 @@ public static ProteomicsFormat getFormat(String filename) { } return null; } + public static ProteomicsFormat getFormat(File inputFile) throws IOException { - if (checkHeader(inputFile, " 0 && line.startsWith(">")) { + return FASTA; + } + } + } finally { try { if (fr != null) { fr.close(); @@ -99,7 +103,7 @@ public static boolean checkHeader(File file, String string) throws IOException { // ignore } } - return match; + return UNSUPPORTED; } }