diff --git a/README.md b/README.md
index b85efcf..b280b0f 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,16 @@
 listed and explained below:
 
 * `dev` for being able to develop this package
 
+Note that depending on your system, it may be necessary to install certain system libraries manually
+before activating the contrib dependencies. For example on macOS, `libomp` is required by the `lightgbm`
+contrib dependency:
+
+    brew install libomp
+
 To install for example the `contrib` dependencies run:
 
     pip install -e ".[contrib]"
 
+
 ## Starting a simple recommender
diff --git a/ariadne/contrib/jieba.py b/ariadne/contrib/jieba.py
index 41c9eee..2639218 100644
--- a/ariadne/contrib/jieba.py
+++ b/ariadne/contrib/jieba.py
@@ -26,4 +26,4 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
         result = jieba.tokenize(cas.sofa_string)
         for tk in result:
             prediction = create_prediction(cas, layer, feature, tk[1], tk[2], tk[0])
-            cas.add_annotation(prediction)
+            cas.add(prediction)
diff --git a/ariadne/contrib/sbert.py b/ariadne/contrib/sbert.py
index 0e48580..d1d44a7 100644
--- a/ariadne/contrib/sbert.py
+++ b/ariadne/contrib/sbert.py
@@ -87,7 +87,7 @@ def fit(self, documents: List[TrainingDocument], layer: str, feature: str, proje
                 if label is None:
                     continue
 
-                sentences.append(cas.get_covered_text(sentence))
+                sentences.append(sentence.get_covered_text())
                 targets.append(label)
 
         featurized_sentences = featurizer.featurize(sentences)
@@ -112,7 +112,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
 
         for sentence, featurized_sentence, label in zip(sentences, featurized_sentences, predictions):
             prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
-            cas.add_annotation(prediction)
+            cas.add(prediction)
 
     def _get_featurizer(self):
         return CachedSentenceTransformer("distilbert-base-nli-mean-tokens")
diff --git a/ariadne/contrib/sklearn.py b/ariadne/contrib/sklearn.py
index 8b0a932..c747a30 100644
--- a/ariadne/contrib/sklearn.py
+++ b/ariadne/contrib/sklearn.py
@@ -60,7 +60,7 @@ def fit(self, documents: List[TrainingDocument], layer: str, feature: str, proje
                 if label is None:
                     continue
 
-                sentences.append(cas.get_covered_text(sentence))
+                sentences.append(sentence.get_covered_text())
                 targets.append(label)
 
         model = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf", MultinomialNB())])
@@ -79,7 +79,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
         for sentence in cas.select(SENTENCE_TYPE):
            predicted = model.predict([sentence.get_covered_text()])[0]
            prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, predicted)
-           cas.add_annotation(prediction)
+           cas.add(prediction)
 
 
 # https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
diff --git a/ariadne/contrib/spacy.py b/ariadne/contrib/spacy.py
index e60a204..01413fc 100644
--- a/ariadne/contrib/spacy.py
+++ b/ariadne/contrib/spacy.py
@@ -38,7 +38,7 @@ def __init__(self, model_name: str, model_directory: Path = None):
     def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
         # Extract the tokens from the CAS and create a spacy doc from it
         cas_tokens = cas.select(TOKEN_TYPE)
-        words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]
+        words = [cas_token.get_covered_text() for cas_token in cas_tokens]
 
         doc = Doc(self._model.vocab, words=words)
 
@@ -51,7 +51,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
             end = cas_tokens[named_entity.end - 1].end
             label = named_entity.label_
             prediction = create_prediction(cas, layer, feature, begin, end, label)
-            cas.add_annotation(prediction)
+            cas.add(prediction)
 
 
 class SpacyPosClassifier(Classifier):
@@ -66,7 +66,7 @@ def __init__(self, model_name: str):
 
     def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
         # Extract the tokens from the CAS and create a spacy doc from it
-        words = [cas.get_covered_text(cas_token) for cas_token in cas.select(TOKEN_TYPE)]
+        words = [cas_token.get_covered_text() for cas_token in cas.select(TOKEN_TYPE)]
 
         doc = Doc(self._model.vocab, words=words)
 
@@ -77,4 +77,4 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
         # For every token, extract the POS tag and create an annotation in the CAS
         for cas_token, spacy_token in zip(cas.select(TOKEN_TYPE), doc):
             prediction = create_prediction(cas, layer, feature, cas_token.begin, cas_token.end, spacy_token.tag_)
-            cas.add_annotation(prediction)
+            cas.add(prediction)
diff --git a/setup.py b/setup.py
index b895a0b..0026541 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@
     "nltk~=3.5",
     "jieba~=0.42",
     "sentence-transformers~=2.2.2",
-# "lightgbm~=3.3.5",
+    "lightgbm~=4.2.0",
     "diskcache~=5.2.1"
 ]
 
diff --git a/tests/test_inception_util.py b/tests/test_inception_util.py
index 4e07dbb..585ace1 100644
--- a/tests/test_inception_util.py
+++ b/tests/test_inception_util.py
@@ -21,11 +21,11 @@ def test_create_prediction():
     typesystem = TypeSystem()
     Span = typesystem.create_type("custom.Span")
-    typesystem.add_feature(Span, "inception_internal_predicted", "uima.cas.Boolean")
-    typesystem.add_feature(Span, "value", "uima.cas.String")
-    typesystem.add_feature(Span, "value_score", "uima.cas.Double")
-    typesystem.add_feature(Span, "value_score_explanation", "uima.cas.String")
-    typesystem.add_feature(Span, "value_auto_accept", "uima.cas.Boolean")
+    typesystem.create_feature(Span, "inception_internal_predicted", "uima.cas.Boolean")
+    typesystem.create_feature(Span, "value", "uima.cas.String")
+    typesystem.create_feature(Span, "value_score", "uima.cas.Double")
+    typesystem.create_feature(Span, "value_score_explanation", "uima.cas.String")
+    typesystem.create_feature(Span, "value_auto_accept", "uima.cas.Boolean")
     cas = Cas(typesystem=typesystem)
     prediction = create_prediction(
         cas, "custom.Span", "value", 0, 4, "label", score=0.1, score_explanation="blah", auto_accept=True
     )
diff --git a/tests/test_jieba_segmenter.py b/tests/test_jieba_segmenter.py
index 08d003f..0a99795 100644
--- a/tests/test_jieba_segmenter.py
+++ b/tests/test_jieba_segmenter.py
@@ -56,7 +56,7 @@ def _load_data() -> Cas:
     cas = Cas()
     cas.sofa_string = text.strip()
     predicted_type = cas.typesystem.create_type(_PREDICTED_TYPE)
-    cas.typesystem.add_feature(predicted_type, _PREDICTED_FEATURE, "uima.cas.String")
-    cas.typesystem.add_feature(predicted_type, "inception_internal_predicted", "uima.cas.Boolean")
+    cas.typesystem.create_feature(predicted_type, _PREDICTED_FEATURE, "uima.cas.String")
+    cas.typesystem.create_feature(predicted_type, "inception_internal_predicted", "uima.cas.Boolean")
 
     return cas
diff --git a/tests/test_sbert_sentence_classifier.py b/tests/test_sbert_sentence_classifier.py
index 2800a33..6818a2b 100644
--- a/tests/test_sbert_sentence_classifier.py
+++ b/tests/test_sbert_sentence_classifier.py
@@ -17,7 +17,7 @@
 
 import pytest
 
-pytest.importorskip("lightgbm.LGBMClassifier")
+#pytest.importorskip("lightgbm.LGBMClassifier")
 
 from ariadne.contrib.sbert import SbertSentenceClassifier
 
diff --git a/tests/util.py b/tests/util.py
index 183ace9..c3e2bad 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -97,6 +97,6 @@ def build_typesystem() -> TypeSystem:
     typesystem = TypeSystem()
     typesystem.create_type(SENTENCE_TYPE)
     PredictedType = typesystem.create_type(PREDICTED_TYPE)
-    typesystem.add_feature(PredictedType, PREDICTED_FEATURE, TYPE_NAME_STRING)
-    typesystem.add_feature(PredictedType, IS_PREDICTION, TYPE_NAME_BOOLEAN)
+    typesystem.create_feature(PredictedType, PREDICTED_FEATURE, TYPE_NAME_STRING)
+    typesystem.create_feature(PredictedType, IS_PREDICTION, TYPE_NAME_BOOLEAN)
     return typesystem
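
Note: the recurring substitutions in this patch track the dkpro-cassis API renames: `cas.add_annotation(...)` is now `cas.add(...)`, `cas.get_covered_text(annotation)` is now `annotation.get_covered_text()`, and `TypeSystem.add_feature(...)` is now `TypeSystem.create_feature(...)`. A minimal sketch of the new-style calls (the `custom.Span` type, `value` feature, and sample text are illustrative, mirroring the test above):

    from cassis import Cas, TypeSystem

    # Build a type system using the renamed create_feature API
    typesystem = TypeSystem()
    span_type = typesystem.create_type("custom.Span")
    typesystem.create_feature(span_type, "value", "uima.cas.String")

    # Create a CAS, attach an annotation via cas.add, and read its covered text
    cas = Cas(typesystem=typesystem)
    cas.sofa_string = "Hello world"
    annotation = span_type(begin=0, end=5, value="greeting")
    cas.add(annotation)  # formerly cas.add_annotation(annotation)
    assert annotation.get_covered_text() == "Hello"  # formerly cas.get_covered_text(annotation)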