Skip to content

Commit

Permalink
updated README; moved to v.1.6.3; polished some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mromanello committed Feb 1, 2018
1 parent f4f33bc commit ffe60e3
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 260 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ install:
- pip install -r requirements_dev.txt
- pip install .
# command to run tests
script: pytest -vv --cov=citation_extractor --ignore=tests/test_eval.py tests/test_citation_extractor.py tests/test_eval.py::test_eval_ner
script: pytest -vv --cov=citation_extractor --ignore=tests/test_eval.py
#script: travis_wait 60 pytest -s -vv --cov=citation_extractor
after_success:
- codecov
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,34 @@

This software supports Python version 2.7, and it was tested only on POSIX-compliant operating systems (Linux, Mac OS X, FreeBSD, etc.).

### Installing TreeTagger

The `CitationExtractor` relies on TreeTagger for the PoS tagging of input texts.

There is a handy script to install it.

To run it without having to clone this repo:

```bash
wget -O install_treetagger.sh https://raw.githubusercontent.com/mromanello/CitationExtractor/master/install_treetagger.sh
chmod a+x install_treetagger.sh
./install_treetagger.sh
rm install_treetagger.sh
```

Otherwise, clone the repository and run the script from there:

```bash
git clone https://github.com/mromanello/CitationExtractor.git
cd CitationExtractor
chmod a+x install_treetagger.sh
./install_treetagger.sh
rm install_treetagger.sh
```


### With pip

To install the `CitationExtractor` first run:

$ pip install http://www.antlr3.org/download/Python/antlr_python_runtime-3.1.3.tar.gz#egg=antlr_python_runtime-3.1.3
Expand All @@ -22,6 +50,18 @@ followed by:
**NB:** the installation of all other dependencies is handled by `setup.py`, but for some reason
(that I'm still trying to figure out) it does not pick up these two.

### Verify installation

To double check that everything was installed correctly, try running the following lines (it should take ~20s):

```python
from citation_extractor.settings import crfsuite
from citation_extractor.pipeline import get_extractor
extractor = get_extractor(crfsuite)
assert extractor is not None
```

If the code above runs without throwing exceptions, it means you managed to install the library!

## Documentation

Expand Down
222 changes: 0 additions & 222 deletions citation_extractor/Utils/actlearn.py

This file was deleted.

2 changes: 1 addition & 1 deletion citation_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# -*- coding: utf-8 -*-
__version__ = '1.6.2'
__version__ = '1.6.3'
64 changes: 47 additions & 17 deletions tests/test_Utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
"""Tests for the module `citation_extractor.core`."""
# -*- coding: utf-8 -*-
# author: Matteo Romanello, [email protected]

import pytest
import pdb
import pkg_resources
import logging
import pandas as pd
from pytest import fixture
from citation_extractor.Utils.IO import *
from citation_extractor.Utils.IO import annotations2references
from citation_extractor.Utils.IO import load_brat_data
from citation_extractor.Utils.strmatching import StringUtils

logging.basicConfig(level=logging.INFO)
Expand All @@ -19,35 +18,66 @@


def test_annotations2references(knowledge_base):
datadir = ('citation_extractor','data/aph_corpus/goldset/ann/')
datadir = ('citation_extractor', 'data/aph_corpus/goldset/ann/')
dir = pkg_resources.resource_filename(*datadir)
files = [file.replace('-doc-1.ann','') for file in pkg_resources.resource_listdir(*datadir) if '.ann' in file]
all_annotations = [annotations2references(file, dir, knowledge_base) for file in files]
files = [
file.replace('-doc-1.ann', '')
for file in pkg_resources.resource_listdir(*datadir)
if '.ann' in file
]
all_annotations = [
annotations2references(file, dir, knowledge_base)
for file in files[:50]
]
references = reduce((lambda x, y: x + y), all_annotations)
assert references is not None

#def test_sort_entities(): #TODO implement
# raise NotImplementedError

#@pytest.mark.skip
def test_load_brat_data(crf_citation_extractor, knowledge_base, postaggers, aph_test_ann_files, aph_titles):
def test_load_brat_data(
crfsuite_citation_extractor,
knowledge_base, postaggers,
aph_test_ann_files,
aph_titles
):
assert crfsuite_citation_extractor is not None
# load the pandas.DataFrame
dataframe = load_brat_data(crf_citation_extractor, knowledge_base, postaggers, aph_test_ann_files, aph_titles)
assert dataframe is not None and type(dataframe)==type(pd.DataFrame()) and dataframe.shape[0]>0
dataframe = load_brat_data(
crfsuite_citation_extractor,
knowledge_base,
postaggers,
aph_test_ann_files,
aph_titles
)
assert dataframe is not None
assert isinstance(dataframe, pd.DataFrame)
assert dataframe.shape[0] > 0

#####################
# Utils.strmatching #
#####################

def test_utils_stringutils():

def test_utils_stringutils():
strings = [
("de", u"Wie seine Vorgänger verfolgt auch Ammianus die didaktische Absicht,")
, ("en", u"Judgement of Paris, with actors playing the bribing goddesses, at the end of Book 10 (11, 3-5 : cf. 10, 30-31).")
, ("it", u"Superior e databili tra l'età augustea e il 5° sec. : AE 1952, 16 ; CIL 13, 8648 = ILS 2244 ; AE 1938, 120 ;")
(
"de",
u"Wie seine Vorgänger verfolgt auch\
Ammianus die didaktische Absicht,"
),
(
"en",
u"Judgement of Paris, with actors playing the bribing goddesses,\
at the end of Book 10 (11, 3-5 : cf. 10, 30-31)."
),
(
"it",
u"Superior e databili tra l'età augustea e il 5° sec. : AE 1952,\
16 ; CIL 13, 8648 = ILS 2244 ; AE 1938, 120 ;"
)
]

for language, text in strings:
normalized_text = StringUtils.normalize(text)
normalized_text = StringUtils.normalize(text, language)
normalized_text = StringUtils.normalize(text, language, keep_dots=True)
assert normalized_text is not None
Loading

0 comments on commit ffe60e3

Please sign in to comment.