diff --git a/pom.xml b/pom.xml
index 7c4e9f7..f65d41d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
org.ohnlp.medtagger
medtagger
- 1.0.76
+ 1.0.77
The MedTagger biomedical information extraction pipeline
diff --git a/src/main/java/org/ohnlp/medtagger/backbone/MedtatorOutputTransform.java b/src/main/java/org/ohnlp/medtagger/backbone/MedtatorOutputTransform.java
new file mode 100644
index 0000000..4fb0006
--- /dev/null
+++ b/src/main/java/org/ohnlp/medtagger/backbone/MedtatorOutputTransform.java
@@ -0,0 +1,206 @@
+package org.ohnlp.medtagger.backbone;
+
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.coders.RowCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.schemas.transforms.Join;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.values.*;
+import org.checkerframework.checker.initialization.qual.Initialized;
+import org.checkerframework.checker.nullness.qual.NonNull;
+import org.checkerframework.checker.nullness.qual.UnknownKeyFor;
+import org.ohnlp.backbone.api.annotations.ComponentDescription;
+import org.ohnlp.backbone.api.annotations.ConfigurationProperty;
+import org.ohnlp.backbone.api.components.LoadFromMany;
+import org.ohnlp.backbone.api.config.InputColumn;
+import org.ohnlp.backbone.api.exceptions.ComponentInitializationException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.TransformerException;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static org.ohnlp.medtagger.ie.cc.MedTatorWriter.writeXml;
+
+/**
+ * Backbone output component that writes one MedTator-style XML document per note.
+ *
+ * <p>Consumes two tagged inputs (see {@link #getInputTags()}): {@code "Raw Text"}, holding the
+ * note identifier and note text, and {@code "Entity Annotations"}, holding one row per annotation.
+ * Annotations are grouped by note identifier, inner-joined against the raw text on that
+ * identifier, and each joined note is serialized to {@code <workingDir>/<note_id>.xml}.
+ *
+ * <p>Because the join is an inner join, notes without any annotation row produce no file.
+ */
+@ComponentDescription(
+        name = "Medtator Output Transform",
+        desc = "Transforms MedTagger IE Output (and Similar) to MedTator format"
+)
+public class MedtatorOutputTransform extends LoadFromMany {
+    @ConfigurationProperty(
+            path = "fileSystemPath",
+            desc = "The file system path to write to"
+    )
+    private String workingDir;
+
+    @ConfigurationProperty(
+            path = "note_id_raw_col",
+            desc = "The input column to use containing the note identifier from the raw text collection"
+    )
+    private InputColumn note_identifer_raw_col;
+
+    // The note text is read from the raw text side of the join, so the description says so.
+    @ConfigurationProperty(
+            path = "note_text_raw_col",
+            desc = "The input column to use containing the note text from the raw text collection"
+    )
+    private InputColumn note_text_raw_col;
+
+    @ConfigurationProperty(
+            path = "note_id_ann_col",
+            desc = "The input column to use containing the note identifier from the annotated entities"
+    )
+    private InputColumn note_identifer_ann_col;
+
+    @ConfigurationProperty(
+            path = "note_ann_start_col",
+            desc = "The input column to use containing the annotation start index from the annotated entities"
+    )
+    private InputColumn ann_start_ann_col;
+
+    @ConfigurationProperty(
+            path = "note_ann_end_col",
+            desc = "The input column to use containing the annotation end index from the annotated entities"
+    )
+    private InputColumn ann_end_ann_col;
+
+    // NOTE(review): the description promises a default of "ConceptMention", but the code below
+    // falls back to the tag name "CM" when this column is not configured - confirm which is intended.
+    @ConfigurationProperty(
+            path = "note_ann_type_col",
+            desc = "The input column to use containing the annotation type from the annotated entities. " +
+                    "Defaults to \"ConceptMention\" if left blank",
+            required = false
+    )
+    private InputColumn ann_type_col;
+
+    /**
+     * Creates the output directory (and any missing parents) on the machine running this method.
+     *
+     * @throws ComponentInitializationException declared by the overridden method; not thrown here
+     */
+    @Override
+    public void init() throws ComponentInitializationException {
+        // NOTE(review): the boolean result of mkdirs() is ignored, so a failure to create the
+        // directory only surfaces later when the first file is opened.
+        new File(this.workingDir).mkdirs();
+    }
+
+    /**
+     * Builds the pipeline fragment that groups annotations per note, joins them with the raw
+     * text and writes one XML file per joined note.
+     *
+     * @param input tuple holding the {@code "Raw Text"} and {@code "Entity Annotations"} collections
+     * @return a {@link PDone} on the input's pipeline; nothing is emitted downstream
+     */
+    @Override
+    public POutput expand(PCollectionRowTuple input) {
+        PCollection<Row> rawText = input.get("Raw Text");
+        PCollection<Row> entitydf = input.get("Entity Annotations");
+
+        // Compressed annotation schema: one row per note id carrying all of its annotation rows
+        Schema schema = Schema.of(
+                Schema.Field.of(note_identifer_ann_col.getSourceColumnName(), Schema.FieldType.STRING),
+                Schema.Field.of("annotations", Schema.FieldType.iterable(Schema.FieldType.row(entitydf.getSchema())))
+        );
+
+        PCollection<Row> groupedKeyedAnnotations =
+                entitydf.apply("Extract Annotation Keys", ParDo.of(new DoFn<Row, KV<String, Row>>() {
+                    @ProcessElement
+                    public void process(@Element Row input, OutputReceiver<KV<String, Row>> output) {
+                        // Key each annotation by the string form of its note identifier
+                        output.output(KV.of(input.getValue(note_identifer_ann_col.getSourceColumnName()).toString(), input));
+                    }
+                })).setCoder(
+                        KvCoder.of(StringUtf8Coder.of(), RowCoder.of(entitydf.getSchema()))
+                ).apply(
+                        "Group by ID",
+                        GroupByKey.<String, Row>create()
+                ).apply(
+                        "Transform to Row",
+                        ParDo.of(new DoFn<KV<String, Iterable<Row>>, Row>() {
+                            @ProcessElement
+                            public void process(@Element KV<String, Iterable<Row>> input, OutputReceiver<Row> out) {
+                                out.output(
+                                        Row.withSchema(schema)
+                                                .addValues(input.getKey(), input.getValue())
+                                                .build()
+                                );
+                            }
+                        })
+                ).setCoder(RowCoder.of(schema));
+
+        // Join with raw text; grouped annotations are the "lhs" row, raw text the "rhs" row
+        groupedKeyedAnnotations.apply(
+                "Join annotations with raw text",
+                Join.<Row, Row>innerJoin(rawText)
+                        .on(Join.FieldsEqual.left(note_identifer_ann_col.getSourceColumnName()).right(note_identifer_raw_col.getSourceColumnName()))
+        ).apply(
+                "Convert to XML and Write",
+                ParDo.of(new DoFn<Row, String>() {
+                    // Built in @Setup rather than serialized with the DoFn
+                    private transient DocumentBuilder db;
+
+                    @Setup
+                    public void init() {
+                        try {
+                            db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+                        } catch (ParserConfigurationException e) {
+                            throw new RuntimeException(e);
+                        }
+                    }
+
+                    @ProcessElement
+                    public void process(@Element Row input, OutputReceiver<String> out) {
+                        String note_id = input.getRow("rhs").getString(note_identifer_raw_col.getSourceColumnName());
+                        Iterable<Row> anns = input.getRow("lhs").getIterable("annotations");
+                        String rawText = input.getRow("rhs").getString(note_text_raw_col.getSourceColumnName());
+
+                        // <MedTagger><TEXT><![CDATA[...]]></TEXT><TAGS>...</TAGS></MedTagger>
+                        Document doc = db.newDocument();
+                        org.w3c.dom.Element rootElement = doc.createElement("MedTagger");
+                        doc.appendChild(rootElement);
+                        Node cdata = doc.createCDATASection(rawText);
+                        org.w3c.dom.Element textElement = doc.createElement("TEXT");
+                        textElement.appendChild(cdata);
+                        rootElement.appendChild(textElement);
+
+                        org.w3c.dom.Element tagsElement = doc.createElement("TAGS");
+                        int tagId = 0;
+                        for (Row ann : anns) {
+                            // Offsets are read as strings and parsed; they index into the raw text
+                            int start = Integer.parseInt(ann.getString(ann_start_ann_col.getSourceColumnName()));
+                            int end = Integer.parseInt(ann.getString(ann_end_ann_col.getSourceColumnName()));
+                            // Tag name comes from the configured type column, or "CM" when none is configured
+                            String type = "CM";
+                            if (ann_type_col != null) {
+                                type = ann.getString(ann_type_col.getSourceColumnName());
+                            }
+                            org.w3c.dom.Element tagElement = doc.createElement(type);
+                            tagElement.setAttribute("spans", start + "~" + end);
+                            tagElement.setAttribute("text", rawText.substring(start, end));
+                            tagElement.setAttribute("id", "P" + tagId);
+                            tagsElement.appendChild(tagElement);
+                            tagId++;
+                        }
+                        rootElement.appendChild(tagsElement);
+
+                        // try-with-resources so the file handle is released for every note,
+                        // including when serialization fails
+                        File target = new File(workingDir, note_id + ".xml");
+                        try (FileOutputStream output = new FileOutputStream(target)) {
+                            writeXml(doc, output);
+                        } catch (TransformerException | java.io.IOException e) {
+                            throw new RuntimeException(e);
+                        }
+                    }
+                })
+        ).setCoder(StringUtf8Coder.of());
+
+        return PDone.in(input.getPipeline());
+    }
+
+    /**
+     * @return the two input tags this component expects, in the order
+     *         {@code "Raw Text"}, {@code "Entity Annotations"}
+     */
+    @Override
+    public List<String> getInputTags() {
+        return Arrays.asList("Raw Text", "Entity Annotations");
+    }
+}
diff --git a/src/main/java/org/ohnlp/medtagger/ie/cc/MedTatorWriter.java b/src/main/java/org/ohnlp/medtagger/ie/cc/MedTatorWriter.java
index b6b8f48..bc400c7 100644
--- a/src/main/java/org/ohnlp/medtagger/ie/cc/MedTatorWriter.java
+++ b/src/main/java/org/ohnlp/medtagger/ie/cc/MedTatorWriter.java
@@ -155,7 +155,7 @@ public void printAnnotationsInline(JCas jcas) throws IOException {
}
}
- private static void writeXml(org.w3c.dom.Document doc,
+ public static void writeXml(org.w3c.dom.Document doc,
OutputStream output)
throws TransformerException {