Skip to content
This repository has been archived by the owner on Jun 17, 2019. It is now read-only.

Commit

Permalink
Fix header separation in Pathology
Browse files Browse the repository at this point in the history
  • Loading branch information
trivedigaurav committed Jul 23, 2015
1 parent a17a617 commit b894576
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 45 deletions.
89 changes: 46 additions & 43 deletions src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java
Original file line number Diff line number Diff line change
Expand Up @@ -267,40 +267,42 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except
sb.append("\n");
iLine++;
}
// make header
text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim());

// make header
text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim());

// content starts from here until meet Pathologist (beginning of a sentence)
sb = new StringBuilder();
if(iLine < allLines - 1) {
iLine++;
// text += lines[iLine].trim() + "\n";
sb.append(lines[iLine].trim());
sb.append("\n");
}
else {
iLine = -1;
}
iLine++;
// // content starts from here until meet Pathologist (beginning of a sentence)
// sb = new StringBuilder();
// if(iLine < allLines - 1) {
// iLine++;
// // text += lines[iLine].trim() + "\n";
// sb.append(lines[iLine].trim());
// sb.append("\n");
// }
// else {
// iLine = -1;
// }
// iLine++;

Pattern p = Pattern.compile("^\\s*Pathologist");
Matcher m = p.matcher(lines[iLine]);
while(iLine < allLines && !m.find()) {
if(!lines[iLine].trim().equals("")) {
// text += lines[iLine].trim() + "\n";
sb.append(lines[iLine].trim());
sb.append("\n");
}
iLine++;
m = p.matcher(lines[iLine]);
}
// skip until meet GROSS DESCRIPTION
p = Pattern.compile("^GROSS DESCRIPTION");
m = p.matcher(lines[iLine]);
while(iLine < allLines && !m.find()) {
iLine++;
m = p.matcher(lines[iLine]);
}
// Pattern p = Pattern.compile("^\\s*Pathologist");
// Matcher m = p.matcher(lines[iLine]);
// while(iLine < allLines && !m.find()) {
// if(!lines[iLine].trim().equals("")) {
// // text += lines[iLine].trim() + "\n";
// sb.append(lines[iLine].trim());
// sb.append("\n");
// }
// iLine++;
// m = p.matcher(lines[iLine]);
// }
// // skip until meet GROSS DESCRIPTION
// p = Pattern.compile("^GROSS DESCRIPTION");
// m = p.matcher(lines[iLine]);
// while(iLine < allLines && !m.find()) {
// iLine++;
// m = p.matcher(lines[iLine]);
// }

// keep contain until meet E_O_R
while(iLine < allLines && lines[iLine].indexOf("E_O_R") == -1) {
if(!lines[iLine].trim().equals("")) {
Expand All @@ -310,22 +312,23 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except
}
iLine++;
}
// make content

// make content
// remove stop word
// and **ID-NUM
text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
// and **ID-NUM
text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
text[1] = text[1].replaceAll("\\*\\*ID\\-NUM", "");
text[1] = text[1].replaceAll("\\*\\*INITIALS", "");
text[1] = text[1].replaceAll("_{3,}", "");

// footer starts from here to the end
sb = new StringBuilder();
while(iLine < allLines) {
sb.append(lines[iLine++].trim());
sb.append("\n");
}
// make the footer
text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
// footer starts from here to the end
sb = new StringBuilder();
while(iLine < allLines) {
sb.append(lines[iLine++].trim());
sb.append("\n");
}
// make the footer
text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim());

return text;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -981,13 +981,13 @@ public String wordTreeSkippedNGramPatternString(Map<String, String> spanMap)
patternStr = sb.toString().trim().replaceAll(whiteSpaceBeforePunc, "\\\\s{0,1}");
// in case the first skipped n-gram is a punctuation
// there would be no white space before the n-gram
patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\s*");
patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\W*");
// // quote the string
// patternStr = TextUtil.escapeRegex(patternStr);
// reverse 's
patternStr = patternStr.replaceAll("'s", "' {0,1}s");
// replace whitespace by \s
patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\s+");
patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\W+");

// System.out.println("Search pattern: " + patternStr);

Expand Down

0 comments on commit b894576

Please sign in to comment.