Skip to content

Commit

Permalink
Modified index building routines to handle new format of Vocab files (tab-delimited instead of comma-delimited).
Browse files Browse the repository at this point in the history
  • Loading branch information
schuemie committed Mar 31, 2015
1 parent 468a0c3 commit 7fe1024
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 46 deletions.
9 changes: 6 additions & 3 deletions src/org/ohdsi/usagi/indexBuilding/BuildIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class BuildIndex {
/**
 * Development entry point: points {@code Global.folder} at a hard-coded local directory and builds the index
 * from hard-coded vocabulary and LOINC paths.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    Global.folder = "S:/Data/Usagi/";
    BuildIndex buildIndex = new BuildIndex();
    // Build exactly once, from the 20150321 vocabulary release; the call that used the superseded
    // 20141013 release is dropped so the index is not built twice.
    buildIndex.buildIndex("S:/Data/OMOP Standard Vocabulary V5/Vocabulary5.0-20150321", "S:/Data/LOINC/loinc.csv");
}

public void buildIndex(String vocabFolder, String loincFile) {
Expand Down Expand Up @@ -115,15 +115,16 @@ public void run() {
loincToInfo = loadLoincInfo(loincFile);
}
report("Sorting vocabulary files");
FileSorter.delimiter = '\t';
FileSorter.sort(vocabFolder + "/CONCEPT.csv", new String[] { "CONCEPT_ID" }, new boolean[] { true });
FileSorter.sort(vocabFolder + "/CONCEPT_SYNONYM.csv", new String[] { "CONCEPT_ID" }, new boolean[] { true });

report("Adding concepts to index");
UsagiSearchEngine usagiSearchEngine = new UsagiSearchEngine(Global.folder);
usagiSearchEngine.createNewMainIndex();

Iterator<Row> conceptIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT.csv").iterator();
Iterator<Row> conceptSynIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT_SYNONYM.csv").iterator();
Iterator<Row> conceptIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT.csv", '\t').iterator();
Iterator<Row> conceptSynIterator = new ReadCSVFileWithHeader(vocabFolder + "/CONCEPT_SYNONYM.csv", '\t').iterator();
@SuppressWarnings("unchecked")
MultiRowIterator iterator = new MultiRowIterator("CONCEPT_ID", true, new String[] { "concept", "concept_synonym" }, new Iterator[] {
conceptIterator, conceptSynIterator });
Expand All @@ -132,6 +133,8 @@ public void run() {
allowedVocabularies.add(allowedVocabulary);
while (iterator.hasNext()) {
MultiRowSet multiRowSet = iterator.next();
if (multiRowSet.get("concept").size() == 0)
System.out.println("No concept found for concept ID " + multiRowSet.linkingId);
Row conceptRow = multiRowSet.get("concept").get(0);
if (conceptRow.getCells().size() > 2) // Extra check to catch badly formatted rows (which are in a vocab we don't care about)
if (conceptRow.get("STANDARD_CONCEPT").equals("S") && allowedVocabularies.contains(conceptRow.get("VOCABULARY_ID"))) {
Expand Down
11 changes: 6 additions & 5 deletions src/org/ohdsi/utilities/files/FileSorter.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class FileSorter {
public static double minFreeMemFraction = 0.25;
public static boolean checkIfAlreadySorted = false;
public static int maxCheckRows = 10000000;
// Column delimiter passed to the ReadCSVFile and WriteCSVFile instances created while sorting
// (source file, temporary batch files and merged target). Defaults to ','.
// NOTE(review): this is mutable static state - a caller that sets it (BuildIndex sets '\t') changes the
// delimiter for every later sort in the same JVM until it is set back.
public static char delimiter = ',';

public static void sort(String filename, String... columnNames) {
boolean[] sortNumeric = new boolean[columnNames.length];
Expand All @@ -56,7 +57,7 @@ public static void sort(String filename, String[] columnnames, boolean[] sortNum
System.out.println("Memory available for sorting: " + availableMem + " bytes. Min free = " + minFreeMem);
}

Iterator<List<String>> iterator = new ReadCSVFile(filename).iterator();
Iterator<List<String>> iterator = new ReadCSVFile(filename, delimiter).iterator();

List<String> header = iterator.next();

Expand All @@ -66,7 +67,7 @@ public static void sort(String filename, String[] columnnames, boolean[] sortNum
if (isSorted(iterator, comparator))
return;
else {
iterator = new ReadCSVFile(filename).iterator();
iterator = new ReadCSVFile(filename, delimiter).iterator();
iterator.next(); // skip header
}
}
Expand Down Expand Up @@ -150,7 +151,7 @@ private static void mergeByBatches(int nrOfFiles, String source, Comparator<List
}

private static void writeToDisk(List<String> header, List<List<String>> rows, String filename) {
WriteCSVFile out = new WriteCSVFile(filename);
WriteCSVFile out = new WriteCSVFile(filename, delimiter);
if (header != null)
out.write(header);
for (List<String> row : rows)
Expand Down Expand Up @@ -242,7 +243,7 @@ private static void mergeFiles(String sourceBase, int start, int end, String tar
List<String> header = null;
boolean done = true;
for (int i = start; i < end; i++) {
ReadCSVFile tempFile = new ReadCSVFile(generateFilename(sourceBase, i));
ReadCSVFile tempFile = new ReadCSVFile(generateFilename(sourceBase, i), delimiter);
Iterator<List<String>> iterator = tempFile.getIterator();
if (iterator.hasNext()) {
if (tempFiles.size() == 0) // its the first one
Expand All @@ -259,7 +260,7 @@ private static void mergeFiles(String sourceBase, int start, int end, String tar
} else
filerows.add(null);
}
WriteCSVFile out = new WriteCSVFile(target);
WriteCSVFile out = new WriteCSVFile(target, delimiter);
out.write(header);
while (!done) {
// Find best file to pick from:
Expand Down
11 changes: 11 additions & 0 deletions src/org/ohdsi/utilities/files/ReadCSVFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ public class ReadCSVFile implements Iterable<List<String>> {
public boolean EOF = false;
private char delimiter = ',';


/**
 * Opens the named file through {@link #ReadCSVFile(String)} and then replaces the default ',' delimiter.
 *
 * NOTE(review): the delimiter is assigned only after the delegated constructor has returned, so this relies
 * on that constructor not parsing any lines yet - confirm against the (not shown) reading code.
 *
 * @param filename
 *            path of the file to read
 * @param delimiter
 *            value stored in the delimiter field; presumably the column separator used when lines are split
 */
public ReadCSVFile(String filename, char delimiter) {
this(filename);
this.delimiter = delimiter;
}

public ReadCSVFile(String filename) {
try {
FileInputStream textFileStream = new FileInputStream(filename);
Expand All @@ -43,6 +49,11 @@ public ReadCSVFile(String filename) {
}
}

/**
 * Wraps the given stream through {@link #ReadCSVFile(InputStream)} and then replaces the default ','
 * delimiter.
 *
 * NOTE(review): the delimiter is assigned only after the delegated constructor has returned, so this relies
 * on that constructor not parsing any lines yet - confirm against the (not shown) reading code.
 *
 * @param inputstream
 *            stream to read from
 * @param delimiter
 *            value stored in the delimiter field; presumably the column separator used when lines are split
 */
public ReadCSVFile(InputStream inputstream, char delimiter) {
this(inputstream);
this.delimiter = delimiter;
}

public ReadCSVFile(InputStream inputstream) {
try {
bufferedReader = new BufferedReader(new InputStreamReader(inputstream, "ISO-8859-1"));
Expand Down
57 changes: 31 additions & 26 deletions src/org/ohdsi/utilities/files/ReadCSVFileWithHeader.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,53 +23,58 @@
import java.util.List;
import java.util.Map;

public class ReadCSVFileWithHeader implements Iterable<Row> {
private InputStream inputstream;
private char delimiter = ',';

public ReadCSVFileWithHeader(String filename, char delimiter) {
this(filename);
this.delimiter = delimiter;
}

public ReadCSVFileWithHeader(String filename) {
try {
inputstream = new FileInputStream(filename);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}

public ReadCSVFileWithHeader(InputStream inputstream) {
this.inputstream = inputstream;
}

public class ReadCSVFileWithHeader implements Iterable<Row>{
private InputStream inputstream;

public ReadCSVFileWithHeader(String filename) {
try {
inputstream = new FileInputStream(filename);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}

public ReadCSVFileWithHeader(InputStream inputstream){
this.inputstream = inputstream;
}

@Override
public Iterator<Row> iterator() {
return new RowIterator();
}

public class RowIterator implements Iterator<Row>{

private Iterator<List<String>> iterator;
private Map<String, Integer> fieldName2ColumnIndex;

public RowIterator(){
iterator = new ReadCSVFile(inputstream).iterator();
public class RowIterator implements Iterator<Row> {

private Iterator<List<String>> iterator;
private Map<String, Integer> fieldName2ColumnIndex;

public RowIterator() {
iterator = new ReadCSVFile(inputstream, delimiter).iterator();
fieldName2ColumnIndex = new HashMap<String, Integer>();
for (String header : iterator.next())
fieldName2ColumnIndex.put(header, fieldName2ColumnIndex.size());
}

@Override
public boolean hasNext() {
return iterator.hasNext();
}

@Override
public Row next() {
return new Row(iterator.next(),fieldName2ColumnIndex);
return new Row(iterator.next(), fieldName2ColumnIndex);
}

@Override
public void remove() {
throw new RuntimeException("Remove not supported");
throw new RuntimeException("Remove not supported");
}

}
}
31 changes: 19 additions & 12 deletions src/org/ohdsi/utilities/files/WriteCSVFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import java.util.List;

public class WriteCSVFile {


private char delimiter = ',';

public WriteCSVFile(String filename, boolean append) {
FileOutputStream stream;
try {
Expand All @@ -36,13 +38,18 @@ public WriteCSVFile(String filename, boolean append) {
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}

}


/**
 * Creates a writer for the named file through {@link #WriteCSVFile(String)} (append = false) and sets the
 * delimiter that columns2line places between columns.
 *
 * @param filename
 *            path of the file to write
 * @param delimiter
 *            character written between columns; columns containing it are wrapped in double quotes
 */
public WriteCSVFile(String filename, char delimiter) {
this(filename);
this.delimiter = delimiter;
}

/**
 * Creates a writer for the named file with the default ',' delimiter; delegates to
 * {@link #WriteCSVFile(String, boolean)} with append = false.
 *
 * @param filename
 *            path of the file to write
 */
public WriteCSVFile(String filename) {
this(filename, false);
}

public void write(List<String> string) {
try {
bufferedWrite.write(columns2line(string));
Expand All @@ -51,8 +58,8 @@ public void write(List<String> string) {
e.printStackTrace();
}
}
public static String columns2line(List<String> columns) {

public String columns2line(List<String> columns) {
StringBuilder sb = new StringBuilder();
Iterator<String> iterator = columns.iterator();
while (iterator.hasNext()) {
Expand All @@ -61,32 +68,32 @@ public static String columns2line(List<String> columns) {
column = column.replaceAll("\\\\", "\\\\\\\\");
if (hasQuotes)
column = column.replaceAll("\"", "\\\\\"");
column = column.replaceAll("\r", "");
column = column.replaceAll("\r", "");
column = column.replaceAll("\n", "\\\\n");
if (hasQuotes || column.contains(","))
if (hasQuotes || column.contains(Character.toString(delimiter)))
column = "\"" + column + "\"";
sb.append(column);
if (iterator.hasNext())
sb.append(",");
sb.append(delimiter);
}
return sb.toString();
}

/**
 * Flushes the underlying BufferedWriter. An IOException is not propagated to the caller: only its stack
 * trace is printed, so a failed flush is not visible to calling code.
 */
public void flush() {
try {
bufferedWrite.flush();
} catch (IOException e) {
e.printStackTrace();
}
}

/**
 * Closes the underlying BufferedWriter. An IOException is not propagated to the caller: only its stack
 * trace is printed, so a failed close is not visible to calling code.
 */
public void close() {
try {
bufferedWrite.close();
} catch (IOException e) {
e.printStackTrace();
}
}

private BufferedWriter bufferedWrite;
}

0 comments on commit 7fe1024

Please sign in to comment.