updated README; moved to v.1.6.3; polished some tests

mromanello · Feb 1, 2018 · ffe60e3 · ffe60e3
1 parent f4f33bc
commit ffe60e3
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 260 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -14,7 +14,7 @@ install:
   - pip install -r requirements_dev.txt
   - pip install .
 # command to run tests
-script: pytest -vv --cov=citation_extractor --ignore=tests/test_eval.py tests/test_citation_extractor.py tests/test_eval.py::test_eval_ner
+script: pytest -vv --cov=citation_extractor --ignore=tests/test_eval.py
 #script: travis_wait 60 pytest -s -vv --cov=citation_extractor
 after_success:
   - codecov
diff --git a/README.md b/README.md
@@ -10,6 +10,34 @@
 
 This software supports Python version 2.7, and it was tested only on POSIX–compliant operating systems (Linux, Mac OS X, FreeBSD, etc.).
 
+### Installing TreeTagger
+
+The `CitationExtractor` relies on TreeTagger for the PoS tagging of input texts.
+
+There is a handy script to install it.
+
+To run it without having to clone this repo:
+
+```bash
+wget -O install_treetagger.sh https://raw.githubusercontent.com/mromanello/CitationExtractor/master/install_treetagger.sh
+chmod a+x install_treetagger.sh
+./install_treetagger.sh
+rm install_treetagger.sh
+```
+
+Otherwise, clone the repository and run the script from there:
+
+```bash
+git clone https://github.com/mromanello/CitationExtractor.git
+cd CitationExtractor
+chmod a+x install_treetagger.sh
+./install_treetagger.sh
+rm install_treetagger.sh
+```
+
+
+### With pip
+
 To install the `CitationExtractor` first run:
 
     $ pip install http://www.antlr3.org/download/Python/antlr_python_runtime-3.1.3.tar.gz#egg=antlr_python_runtime-3.1.3
@@ -22,6 +50,18 @@ followed by:
 **NB:** the installation of all other dependencies is handled by `setup.py` but for some reason
 (that I'm still trying to figure out) it does not pick up these two.
 
+### Verify installation
+
+To double-check that everything was installed correctly, try running the following lines (it should take ~20s):
+
+```python
+from citation_extractor.settings import crfsuite
+from citation_extractor.pipeline import get_extractor
+extractor = get_extractor(crfsuite)
+assert extractor is not None
+```
+
+If the code above runs without throwing exceptions, it means you managed to install the library!
 
 ## Documentation
 

diff --git a/citation_extractor/Utils/actlearn.py b/citation_extractor/Utils/actlearn.py
diff --git a/citation_extractor/__init__.py b/citation_extractor/__init__.py
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__ = '1.6.2'
+__version__ = '1.6.3'
diff --git a/tests/test_Utils.py b/tests/test_Utils.py
@@ -1,13 +1,12 @@
+"""Tests for the package `citation_extractor.Utils`."""
 # -*- coding: utf-8 -*-
 # author: Matteo Romanello, [email protected]
 
-import pytest
-import pdb
 import pkg_resources
 import logging
 import pandas as pd
-from pytest import fixture
-from citation_extractor.Utils.IO import *
+from citation_extractor.Utils.IO import annotations2references
+from citation_extractor.Utils.IO import load_brat_data
 from citation_extractor.Utils.strmatching import StringUtils
 
 logging.basicConfig(level=logging.INFO)
@@ -19,35 +18,66 @@
 
 
 def test_annotations2references(knowledge_base):
-    datadir = ('citation_extractor','data/aph_corpus/goldset/ann/')
+    datadir = ('citation_extractor', 'data/aph_corpus/goldset/ann/')
     dir = pkg_resources.resource_filename(*datadir)
-    files = [file.replace('-doc-1.ann','') for file in pkg_resources.resource_listdir(*datadir) if '.ann' in file]
-    all_annotations = [annotations2references(file, dir, knowledge_base) for file in files]
+    files = [
+        file.replace('-doc-1.ann', '')
+        for file in pkg_resources.resource_listdir(*datadir)
+        if '.ann' in file
+    ]
+    all_annotations = [
+        annotations2references(file, dir, knowledge_base)
+        for file in files[:50]
+    ]
     references = reduce((lambda x, y: x + y), all_annotations)
     assert references is not None
 
-#def test_sort_entities(): #TODO implement
-#    raise NotImplementedError
 
-#@pytest.mark.skip
-def test_load_brat_data(crf_citation_extractor, knowledge_base, postaggers, aph_test_ann_files, aph_titles):
+def test_load_brat_data(
+    crfsuite_citation_extractor,
+    knowledge_base, postaggers,
+    aph_test_ann_files,
+    aph_titles
+):
+    assert crfsuite_citation_extractor is not None
     # load the pandas.DataFrame
-    dataframe = load_brat_data(crf_citation_extractor, knowledge_base, postaggers, aph_test_ann_files, aph_titles)
-    assert dataframe is not None and type(dataframe)==type(pd.DataFrame()) and dataframe.shape[0]>0
+    dataframe = load_brat_data(
+        crfsuite_citation_extractor,
+        knowledge_base,
+        postaggers,
+        aph_test_ann_files,
+        aph_titles
+    )
+    assert dataframe is not None
+    assert isinstance(dataframe, pd.DataFrame)
+    assert dataframe.shape[0] > 0
 
 #####################
 # Utils.strmatching #
 #####################
 
-def test_utils_stringutils():
 
+def test_utils_stringutils():
     strings = [
-        ("de", u"Wie seine Vorgänger verfolgt auch Ammianus die didaktische Absicht,")
-        , ("en", u"Judgement of Paris, with actors playing the bribing goddesses, at the end of Book 10 (11, 3-5 : cf. 10, 30-31).")
-        , ("it", u"Superior e databili tra l'età augustea e il 5° sec. : AE 1952, 16 ; CIL 13, 8648 = ILS 2244 ; AE 1938, 120 ;")
+        (
+            "de",
+            u"Wie seine Vorgänger verfolgt auch\
+            Ammianus die didaktische Absicht,"
+        ),
+        (
+            "en",
+            u"Judgement of Paris, with actors playing the bribing goddesses,\
+            at the end of Book 10 (11, 3-5 : cf. 10, 30-31)."
+        ),
+        (
+            "it",
+            u"Superior e databili tra l'età augustea e il 5° sec. : AE 1952,\
+            16 ; CIL 13, 8648 = ILS 2244 ; AE 1938, 120 ;"
+        )
     ]
 
     for language, text in strings:
         normalized_text = StringUtils.normalize(text)
         normalized_text = StringUtils.normalize(text, language)
         normalized_text = StringUtils.normalize(text, language, keep_dots=True)
+        assert normalized_text is not None