diff --git a/src/org/ohdsi/usagi/indexBuilding/BuildIndex.java b/src/org/ohdsi/usagi/indexBuilding/BuildIndex.java index dfb8006..a724739 100644 --- a/src/org/ohdsi/usagi/indexBuilding/BuildIndex.java +++ b/src/org/ohdsi/usagi/indexBuilding/BuildIndex.java @@ -48,7 +48,7 @@ public class BuildIndex { public static void main(String[] args) { Global.folder = "S:/Data/Usagi/"; BuildIndex buildIndex = new BuildIndex(); - buildIndex.buildIndex("S:/Data/OMOP Standard Vocabulary V5/Vocabulary5.0-20141013", "S:/Data/LOINC/loinc.csv"); + buildIndex.buildIndex("S:/Data/OMOP Standard Vocabulary V5/Vocabulary5.0-20150321", "S:/Data/LOINC/loinc.csv"); } public void buildIndex(String vocabFolder, String loincFile) { @@ -115,6 +115,7 @@ public void run() { loincToInfo = loadLoincInfo(loincFile); } report("Sorting vocabulary files"); + FileSorter.delimiter = '\t'; FileSorter.sort(vocabFolder + "/CONCEPT.csv", new String[] { "CONCEPT_ID" }, new boolean[] { true }); FileSorter.sort(vocabFolder + "/CONCEPT_SYNONYM.csv", new String[] { "CONCEPT_ID" }, new boolean[] { true }); @@ -122,8 +123,8 @@ public void run() { UsagiSearchEngine usagiSearchEngine = new UsagiSearchEngine(Global.folder); usagiSearchEngine.createNewMainIndex(); - Iterator conceptIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT.csv").iterator(); - Iterator conceptSynIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT_SYNONYM.csv").iterator(); + Iterator conceptIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT.csv", '\t').iterator(); + Iterator conceptSynIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT_SYNONYM.csv", '\t').iterator(); @SuppressWarnings("unchecked") MultiRowIterator iterator = new MultiRowIterator("CONCEPT_ID", true, new String[] { "concept", "concept_synonym" }, new Iterator[] { conceptIterator, conceptSynIterator }); @@ -132,6 +133,8 @@ public void run() { allowedVocabularies.add(allowedVocabulary); while (iterator.hasNext()) { MultiRowSet multiRowSet = iterator.next(); + if (multiRowSet.get("concept").size() == 0) + System.out.println("No concept found for concept ID " + multiRowSet.linkingId); Row conceptRow = multiRowSet.get("concept").get(0); if (conceptRow.getCells().size() > 2) // Extra check to catch badly formatted rows (which are in a vocab we don't care about) if (conceptRow.get("STANDARD_CONCEPT").equals("S") && allowedVocabularies.contains(conceptRow.get("VOCABULARY_ID"))) { diff --git a/src/org/ohdsi/utilities/files/FileSorter.java b/src/org/ohdsi/utilities/files/FileSorter.java index c556127..cdac991 100644 --- a/src/org/ohdsi/utilities/files/FileSorter.java +++ b/src/org/ohdsi/utilities/files/FileSorter.java @@ -35,6 +35,7 @@ public class FileSorter { public static double minFreeMemFraction = 0.25; public static boolean checkIfAlreadySorted = false; public static int maxCheckRows = 10000000; + public static char delimiter = ','; public static void sort(String filename, String... columnNames) { boolean[] sortNumeric = new boolean[columnNames.length]; @@ -56,7 +57,7 @@ public static void sort(String filename, String[] columnnames, boolean[] sortNum System.out.println("Memory available for sorting: " + availableMem + " bytes. Min free = " + minFreeMem); } - Iterator> iterator = new ReadCSVFile(filename).iterator(); + Iterator> iterator = new ReadCSVFile(filename, delimiter).iterator(); List header = iterator.next(); @@ -66,7 +67,7 @@ public static void sort(String filename, String[] columnnames, boolean[] sortNum if (isSorted(iterator, comparator)) return; else { - iterator = new ReadCSVFile(filename).iterator(); + iterator = new ReadCSVFile(filename, delimiter).iterator(); iterator.next(); // skip header } } @@ -150,7 +151,7 @@ private static void mergeByBatches(int nrOfFiles, String source, Comparator header, List> rows, String filename) { - WriteCSVFile out = new WriteCSVFile(filename); + WriteCSVFile out = new WriteCSVFile(filename, delimiter); if (header != null) out.write(header); for (List row : rows) @@ -242,7 +243,7 @@ private static void mergeFiles(String sourceBase, int start, int end, String tar List header = null; boolean done = true; for (int i = start; i < end; i++) { - ReadCSVFile tempFile = new ReadCSVFile(generateFilename(sourceBase, i)); + ReadCSVFile tempFile = new ReadCSVFile(generateFilename(sourceBase, i), delimiter); Iterator> iterator = tempFile.getIterator(); if (iterator.hasNext()) { if (tempFiles.size() == 0) // its the first one @@ -259,7 +260,7 @@ private static void mergeFiles(String sourceBase, int start, int end, String tar } else filerows.add(null); } - WriteCSVFile out = new WriteCSVFile(target); + WriteCSVFile out = new WriteCSVFile(target, delimiter); out.write(header); while (!done) { // Find best file to pick from: diff --git a/src/org/ohdsi/utilities/files/ReadCSVFile.java b/src/org/ohdsi/utilities/files/ReadCSVFile.java index d578ff3..d80a9f1 100644 --- a/src/org/ohdsi/utilities/files/ReadCSVFile.java +++ b/src/org/ohdsi/utilities/files/ReadCSVFile.java @@ -32,6 +32,12 @@ public class ReadCSVFile implements Iterable> { public boolean EOF = false; private char delimiter = ','; + + public ReadCSVFile(String filename, char delimiter) { + this(filename); + this.delimiter = delimiter; + } + public ReadCSVFile(String filename) { try { FileInputStream textFileStream = new FileInputStream(filename); @@ -43,6 +49,11 @@ public ReadCSVFile(String filename) { } } + public ReadCSVFile(InputStream inputstream, char delimiter) { + this(inputstream); + this.delimiter = delimiter; + } + public ReadCSVFile(InputStream inputstream) { try { bufferedReader = new BufferedReader(new InputStreamReader(inputstream, "ISO-8859-1")); diff --git a/src/org/ohdsi/utilities/files/ReadCSVFileWithHeader.java b/src/org/ohdsi/utilities/files/ReadCSVFileWithHeader.java index 7b3ac39..21256ee 100644 --- a/src/org/ohdsi/utilities/files/ReadCSVFileWithHeader.java +++ b/src/org/ohdsi/utilities/files/ReadCSVFileWithHeader.java @@ -23,39 +23,44 @@ import java.util.List; import java.util.Map; +public class ReadCSVFileWithHeader implements Iterable { + private InputStream inputstream; + private char delimiter = ','; + + public ReadCSVFileWithHeader(String filename, char delimiter) { + this(filename); + this.delimiter = delimiter; + } + + public ReadCSVFileWithHeader(String filename) { + try { + inputstream = new FileInputStream(filename); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + public ReadCSVFileWithHeader(InputStream inputstream) { + this.inputstream = inputstream; + } -public class ReadCSVFileWithHeader implements Iterable{ - private InputStream inputstream; - - public ReadCSVFileWithHeader(String filename) { - try { - inputstream = new FileInputStream(filename); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - } - - public ReadCSVFileWithHeader(InputStream inputstream){ - this.inputstream = inputstream; - } - @Override public Iterator iterator() { return new RowIterator(); } - - public class RowIterator implements Iterator{ - private Iterator> iterator; - private Map fieldName2ColumnIndex; - - public RowIterator(){ - iterator = new ReadCSVFile(inputstream).iterator(); + public class RowIterator implements Iterator { + + private Iterator> iterator; + private Map fieldName2ColumnIndex; + + public RowIterator() { + iterator = new ReadCSVFile(inputstream, delimiter).iterator(); fieldName2ColumnIndex = new HashMap(); for (String header : iterator.next()) fieldName2ColumnIndex.put(header, fieldName2ColumnIndex.size()); } - + @Override public boolean hasNext() { return iterator.hasNext(); @@ -63,13 +68,13 @@ public boolean hasNext() { @Override public Row next() { - return new Row(iterator.next(),fieldName2ColumnIndex); + return new Row(iterator.next(), fieldName2ColumnIndex); } @Override public void remove() { - throw new RuntimeException("Remove not supported"); + throw new RuntimeException("Remove not supported"); } - + } } diff --git a/src/org/ohdsi/utilities/files/WriteCSVFile.java b/src/org/ohdsi/utilities/files/WriteCSVFile.java index 1c0682c..e1b659e 100644 --- a/src/org/ohdsi/utilities/files/WriteCSVFile.java +++ b/src/org/ohdsi/utilities/files/WriteCSVFile.java @@ -25,7 +25,9 @@ import java.util.List; public class WriteCSVFile { - + + private char delimiter = ','; + public WriteCSVFile(String filename, boolean append) { FileOutputStream stream; try { @@ -36,13 +38,18 @@ public WriteCSVFile(String filename, boolean append) { } catch (UnsupportedEncodingException e) { e.printStackTrace(); } - + } - + + public WriteCSVFile(String filename, char delimiter) { + this(filename); + this.delimiter = delimiter; + } + public WriteCSVFile(String filename) { this(filename, false); } - + public void write(List string) { try { bufferedWrite.write(columns2line(string)); @@ -51,8 +58,8 @@ public void write(List string) { e.printStackTrace(); } } - - public static String columns2line(List columns) { + + public String columns2line(List columns) { StringBuilder sb = new StringBuilder(); Iterator iterator = columns.iterator(); while (iterator.hasNext()) { @@ -61,17 +68,17 @@ public static String columns2line(List columns) { column = column.replaceAll("\\\\", "\\\\\\\\"); if (hasQuotes) column = column.replaceAll("\"", "\\\\\""); - column = column.replaceAll("\r", ""); + column = column.replaceAll("\r", ""); column = column.replaceAll("\n", "\\\\n"); - if (hasQuotes || column.contains(",")) + if (hasQuotes || column.contains(Character.toString(delimiter))) column = "\"" + column + "\""; sb.append(column); if (iterator.hasNext()) - sb.append(","); + sb.append(delimiter); } return sb.toString(); } - + public void flush() { try { bufferedWrite.flush(); @@ -79,7 +86,7 @@ public void flush() { e.printStackTrace(); } } - + public void close() { try { bufferedWrite.close(); @@ -87,6 +94,6 @@ public void close() { e.printStackTrace(); } } - + private BufferedWriter bufferedWrite; }