Fix jsonl to json conversion (#3419)

* Fix spacy.gold.docs_to_json function
* Fix jsonl2json converter
explosion · Mar 17, 2019 · 47e1103
1 parent 0a4b074
commit 47e1103
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 12 deletions.
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
@@ -3,18 +3,51 @@
 
 import srsly
 
-from ...util import get_lang_class
+from ...gold import docs_to_json
+from ...util import get_lang_class, minibatch
 
 
 def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
     json_docs = []
-    input_tuples = [srsly.json_loads(line) for line in input_data]
+    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
-    for i, (raw_text, ents) in enumerate(input_tuples):
-        doc = nlp.make_doc(raw_text)
-        doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
-        json_docs.append(doc.to_json())
+    sentencizer = nlp.create_pipe("sentencizer")
+    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
+        docs = []
+        for record in batch:
+            raw_text = record["text"]
+            if "entities" in record:
+                ents = record["entities"]
+            else:
+                ents = record["spans"]
+            ents = [(e["start"], e["end"], e["label"]) for e in ents]
+            doc = nlp.make_doc(raw_text)
+            sentencizer(doc)
+            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
+            doc.ents = _cleanup_spans(spans)
+            docs.append(doc)
+        json_docs.append(docs_to_json(docs, id=i))
     return json_docs
+
+
+def _cleanup_spans(spans):
+    output = []
+    seen = set()
+    for span in spans:
+        if span is not None:
+            # Trim whitespace
+            while len(span) and span[0].is_space:
+                span = span[1:]
+            while len(span) and span[-1].is_space:
+                span = span[:-1]
+            if not len(span):
+                continue
+            for i in range(span.start, span.end):
+                if i in seen:
+                    break
+            else:
+                output.append(span)
+                seen.update(range(span.start, span.end))
+    return output
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
@@ -598,19 +598,35 @@ cdef class GoldParse:
                         self.c.sent_start[i] = 0
 
 
-def docs_to_json(docs, underscore=None):
+def docs_to_json(docs, id=0):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
     docs (iterable / Doc): The Doc object(s) to convert.
-    underscore (list): Optional list of string names of custom doc._.
-        attributes. Attribute values need to be JSON-serializable. Values will
-        be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
+    id (int): Id for the JSON.
     RETURNS (list): The data in spaCy's JSON format.
     """
     if isinstance(docs, Doc):
         docs = [docs]
-    return [doc.to_json(underscore=underscore) for doc in docs]
+    json_doc = {"id": id, "paragraphs": []}
+    for i, doc in enumerate(docs):
+        json_para = {'raw': doc.text, "sentences": []}
+        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
+        for j, sent in enumerate(doc.sents):
+            json_sent = {"tokens": [], "brackets": []}
+            for token in sent:
+                json_token = {"id": token.i, "orth": token.text}
+                if doc.is_tagged:
+                    json_token["tag"] = token.tag_
+                if doc.is_parsed:
+                    json_token["head"] = token.head.i-token.i
+                    json_token["dep"] = token.dep_
+                json_token["ner"] = biluo_tags[token.i]
+                json_sent["tokens"].append(json_token)
+            json_para["sentences"].append(json_sent)
+        json_doc["paragraphs"].append(json_para)
+    return json_doc 
 
 
 def biluo_tags_from_offsets(doc, entities, missing="O"):