diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..bb54742 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include dependency_paraphraser/models/natasha_projector.pkl \ No newline at end of file diff --git a/README.md b/README.md index 39088e4..4c71fa3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,45 @@ # dependency-paraphraser -A sentence paraphraser based on dependency parsing and word embedding similarity. +A sentence paraphraser based on dependency parsing +and word embedding similarity. + +How the paraphraser works: +1. The sentence is parsed into a dependency tree, the words under each node are randomly reordered (guided by a bundled word-order model and a temperature), and, if a w2v model is provided, some words are replaced with similar ones in the same grammatical form. + +Basic usage (for the Russian language) is based on the Natasha library: + +```python +import dependency_paraphraser.natasha +import random +random.seed(42) +text = 'каждый охотник желает знать где сидит фазан' +for i in range(3): + print(dependency_paraphraser.natasha.paraphrase(text, tree_temperature=2)) +# желает знать сидит фазан где каждый охотник +# каждый охотник желает знать где фазан сидит +# знать где фазан сидит каждый охотник желает +``` + +You can provide your own w2v model to replace words with similar ones: +```python +import compress_fasttext +small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load( + 'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_100K_20K_pq_100.bin' +) +random.seed(42) +for i in range(3): + print(dependency_paraphraser.natasha.paraphrase(text, w2v=small_model, p_rep=0.8, min_sim=0.55)) +# стремится каждый охотник знать рябчик где усаживается +# каждый охотник хочет узнать фазан где просиживает +# каждый охотник хочет узнать фазан где восседает +``` + +Alternatively, you can expand and use the w2v model from Natasha (aka `navec`): +```python +navec_model = dependency_paraphraser.natasha.emb.as_gensim +random.seed(42) +for i in range(3): + print(dependency_paraphraser.natasha.paraphrase(text, w2v=navec_model, p_rep=0.5, min_sim=0.55)) +# желает каждый охотник помнить фазан где лежит +# каждый охотник желает знать фазан где сидит +# 
def use_news_embeddings():
    """Use Natasha's own embeddings as the default synonym model.

    Stores ``emb.as_gensim`` in the module-level ``gensim_emb`` variable,
    which ``paraphrase`` falls back to when no ``w2v`` model is passed.
    """
    global gensim_emb
    gensim_emb = emb.as_gensim


def paraphrase(text, tree_temperature=0.5, w2v=None, min_sim=0.5, p_rep=0.5):
    """Paraphrase ``text`` by reordering words and optionally swapping synonyms.

    The text is segmented, morphologically tagged and syntactically parsed
    with the module-level Natasha components; every sentence is then
    re-linearised with ``projection.make_tree_projection``.

    :param text: the text to paraphrase.
    :param tree_temperature: temperature forwarded to the tree projection.
    :param w2v: word-vector model used for synonym replacement; when ``None``
        the module-level ``gensim_emb`` is used (itself ``None`` until
        ``use_news_embeddings`` is called).
    :param min_sim: minimal similarity forwarded to ``replace_synonyms``.
    :param p_rep: replacement probability forwarded to ``replace_synonyms``.
    :return: the paraphrased sentences joined with single spaces.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    if w2v is None:
        w2v = gensim_emb

    results = []
    for sent in doc.sents:
        toks = projection.make_tree_projection(
            sent, model=projector, temperature=tree_temperature,
        )
        # Truthiness (not ``is not None``) is intentional: any falsy ``w2v``
        # switches synonym replacement off.
        if w2v:
            words = synonyms.replace_synonyms(
                toks, w2v=w2v, morph_vocab=morph_vocab, min_sim=min_sim, p_rep=p_rep,
            )
        else:
            words = [token.text for token in toks]
        results.append(' '.join(words))
    return ' '.join(results)


def sent2xy(sent, sentence_format='natasha'):
    """Turn a parsed sentence into pairwise word-order examples.

    For every node of the dependency tree, all ordered pairs of distinct
    tokens among the node and its direct children are emitted (for the
    virtual root ``-1`` only the children take part).

    :param sent: a Natasha sentence when ``sentence_format == 'natasha'``;
        otherwise a sequence of tokens exposing ``deprel`` / ``upos`` plus
        whatever ``conll2ch`` reads.
    :param sentence_format: ``'natasha'`` or anything else for the CoNLL-like
        token interface.
    :return: ``(pairs_x, pairs_y, pairs_ids)`` - feature dicts, 0/1 labels
        (1 when the left token precedes the right one in the sentence) and
        the ``(l_id, r_id)`` index pairs, all aligned.
    """
    is_natasha = sentence_format == 'natasha'
    tree = nat2ch(sent) if is_natasha else conll2ch(sent)
    tokens = sent.tokens if is_natasha else sent

    def tok2features(tok, idx, parent_id):
        # The head of the group is marked with a dedicated 'parent' relation;
        # parent_id is passed explicitly instead of being captured from the
        # enclosing loop.
        if idx == parent_id:
            dep = 'parent'
        else:
            dep = tok.rel if is_natasha else tok.deprel
        pos = tok.pos if is_natasha else tok.upos
        return {
            'deprel_{}'.format(dep): 1,
            'pos_{}'.format(pos): 1,
        }

    pairs_x = []
    pairs_y = []
    pairs_ids = []
    for parent_id, children_ids in tree.items():
        ids = children_ids if parent_id == -1 else [parent_id] + children_ids
        # Token features do not depend on the pair, so compute them once per group.
        features = {idx: tok2features(tokens[idx], idx, parent_id) for idx in ids}
        for l_id in ids:
            for r_id in ids:
                if l_id == r_id:
                    continue
                pair_features = {}
                for side, idx in (('r', r_id), ('l', l_id)):
                    for key, value in features[idx].items():
                        pair_features['{}_{}'.format(side, key)] = value
                pairs_x.append(pair_features)
                pairs_y.append(int(l_id < r_id))
                pairs_ids.append((l_id, r_id))
    return pairs_x, pairs_y, pairs_ids


def project_tree_randomly(id_to_children, pair_to_proba=None, root_id=-1, temperature=1, shuffle=True):
    """
    Project a tree into a sequence, optionally with a random order of children.

    :param id_to_children: mapping from a token index to the list of its
        children's indices; ``-1`` is the virtual root. Any mapping works,
        it is never modified.
    :param pair_to_proba: optional mapping ``(l, r) -> probability that l
        precedes r``; when given (and non-empty) the order of each head with
        its children is sampled from these probabilities.
    :param root_id: index of the subtree root to project.
    :param temperature: temperature applied to the pair probabilities.
    :param shuffle: without ``pair_to_proba``: shuffle uniformly if true,
        otherwise keep ascending index order.
    :return: the list of token indices in projected order.
    """
    if root_id not in id_to_children and root_id != -1:
        return [root_id]
    # .get() keeps plain dicts without a root entry working and avoids
    # inserting keys into a caller's defaultdict.
    children_ids = list(id_to_children.get(root_id, ()))

    ids = children_ids if root_id == -1 else [root_id] + children_ids

    def comparator(left, right):
        # Average the two directional estimates, soften/sharpen them with the
        # temperature, then put `left` first with that probability.
        raw_proba = (pair_to_proba[(left, right)] + 1 - pair_to_proba[(right, left)]) / 2
        new_proba = anneal(raw_proba, t=temperature)
        return random.random() - new_proba

    if pair_to_proba:
        random.shuffle(ids)
        ids = sorted(ids, key=cmp_to_key(comparator))
    elif shuffle:
        random.shuffle(ids)
    else:
        ids = sorted(ids)

    result = []
    for tok_id in ids:
        if tok_id == root_id:
            result.append(tok_id)
        else:
            result.extend(project_tree_randomly(
                id_to_children,
                root_id=tok_id,
                pair_to_proba=pair_to_proba,
                temperature=temperature,
                shuffle=shuffle,
            ))
    return result


def make_tree_projection(sent, model, sentence_format='natasha', temperature=1):
    """Re-linearise a parsed sentence using a pairwise word-order model.

    :param sent: the parsed sentence (see ``sent2xy`` for accepted formats).
    :param model: object whose ``predict_proba(list_of_feature_dicts)``
        returns a 2-D array; column 1 is read as the probability that the
        left token of a pair precedes the right one.
    :param sentence_format: ``'natasha'`` or a CoNLL-like token sequence.
    :param temperature: temperature forwarded to ``project_tree_randomly``.
    :return: the sentence tokens in their new order.
    """
    is_natasha = sentence_format == 'natasha'
    px, py, pids = sent2xy(sent, sentence_format=sentence_format)
    if px:
        preds = model.predict_proba(px)[:, 1]
        pair2proba = dict(zip(pids, preds))
    else:
        # Sentences with at most one token yield no pairs; skip the model,
        # which may reject empty input (NOTE(review): assumed for
        # scikit-learn style estimators - confirm for the bundled model).
        pair2proba = None
    ch = nat2ch(sent) if is_natasha else conll2ch(sent)
    tokens = sent.tokens if is_natasha else sent
    order = project_tree_randomly(ch, pair_to_proba=pair2proba, temperature=temperature)
    return [tokens[t] for t in order]
# Morphological features (keys of ``token.feats``) copied from the original
# token onto its replacement, per part of speech.
INFLECTABLES = {
    'NOUN': ['Number', 'Case'],
    'ADJ': ['Number', 'Case', 'Gender'],  # 'Degree' ??
    'VERB': ['Number', 'VerbForm', 'Person', 'Tense', 'Mood', 'Gender'],
}
# Fallback for every other part of speech.
DEFAULT_INFLECTABLES = ['Number', 'Case', 'VerbForm', 'Person', 'Tense', 'Mood']  # 'Gender',


def morph_synonyms(token, w2v, morph_vocab, initial_k=30, k=10, threshold=0.0):
    """Find embedding neighbours of ``token`` inflected into the token's form.

    The token is lemmatized in place (``token.lemmatize(morph_vocab)``) and
    the neighbours of its lemma are requested from ``w2v``.

    :param token: object with ``lemmatize``, ``lemma``, ``pos`` and ``feats``.
    :param w2v: model supporting ``in`` and ``most_similar(word, topn=...)``
        returning ``(word, similarity)`` pairs.
    :param morph_vocab: object whose ``parse(word)`` yields parses with
        ``normal``, ``pos`` and ``inflect(grammemes)``.
    :param initial_k: how many neighbours to request from ``w2v``.
    :param k: maximal number of neighbours to return.
    :param threshold: stop at the first neighbour with a lower similarity
        (assumes ``most_similar`` returns pairs in descending similarity
        order - TODO confirm for the model in use).
    :return: list of ``(neighbour, inflected_form, similarity)`` tuples;
        empty when the lemma is not in ``w2v``.
    """
    token.lemmatize(morph_vocab)
    lemma = token.lemma
    neighbours = []
    if lemma not in w2v:
        return neighbours
    for candidate, similarity in w2v.most_similar(lemma, topn=initial_k):
        if len(neighbours) >= k or similarity < threshold:
            break
        for parse in morph_vocab.parse(candidate):
            if parse.normal == lemma:
                continue
            if parse.pos != token.pos:
                continue
            inflectables = INFLECTABLES.get(token.pos, DEFAULT_INFLECTABLES)
            # `feat` (not `k`) so the loop variable cannot be confused with
            # the `k` parameter.
            grammemes = {token.feats[feat] for feat in inflectables if feat in token.feats}
            word = parse.inflect(grammemes)
            if not word:
                continue
            if word.word != lemma:
                neighbours.append((candidate, word.word, similarity))
            # Only the first inflectable parse with a matching POS is used.
            break
    return neighbours


def replace_synonyms(tokens, w2v, morph_vocab, min_sim=0.6, p_rep=0.5):
    """Randomly replace tokens with similar words in the same form.

    :param tokens: tokens accepted by ``morph_synonyms``, also exposing ``text``.
    :param w2v: word-vector model, see ``morph_synonyms``.
    :param morph_vocab: morphological vocabulary, see ``morph_synonyms``.
    :param min_sim: minimal similarity of a replacement candidate.
    :param p_rep: probability of attempting a replacement for each token.
    :return: list of strings, one per token, in the input order.
    """
    result = []
    for token in tokens:
        if random.random() > p_rep:
            result.append(token.text)
            continue
        neighbors = morph_synonyms(token, w2v, morph_vocab, threshold=min_sim)
        if neighbors:
            result.append(random.choice(neighbors)[1])
        else:
            result.append(token.text)
    return result


def conll2ch(sentence):
    """Map each head index to its children for a CoNLL-like sentence.

    Reads ``tok.head`` and ``tok.id`` of every token, converts both to
    ``int`` and shifts them by one, so a head of 0 becomes the root ``-1``.

    :return: ``defaultdict(list)`` ``{parent_index: [child_index, ...]}``.
    """
    id2children = defaultdict(list)
    for tok in sentence:
        id2children[int(tok.head) - 1].append(int(tok.id) - 1)
    return id2children


def nat2ch(sentence):
    """Map each head index to its children for a Natasha sentence.

    Indices are positions in ``sentence.tokens``; a token whose ``head_id``
    is not the ``id`` of any token of the sentence is attached to ``-1``.

    :return: ``defaultdict(list)`` ``{parent_index: [child_index, ...]}``.
    """
    id2idx = {token.id: i for i, token in enumerate(sentence.tokens)}
    id2children = defaultdict(list)
    for tok in sentence.tokens:
        id2children[id2idx.get(tok.head_id, -1)].append(id2idx[tok.id])
    return id2children


def anneal(p, t=1):
    """Modify a probability by applying temperature to it.

    Computes ``sigmoid(logit(p) / t)``: ``t > 1`` pulls ``p`` towards 0.5,
    ``0 < t < 1`` pushes it towards 0 or 1.

    :param p: probability in ``[0, 1]``.
    :param t: temperature; ``1`` returns ``p`` unchanged, ``math.inf``
        returns 0.5 and ``0`` returns a hard 0/1 decision.
    :return: the annealed probability.
    """
    if t == 1:
        return p
    if t == math.inf:
        return 0.5
    if t == 0:
        return int(p > 0.5)
    if p <= 0 or p >= 1:
        # logit(p) is infinite here; return the limit of the sigmoid instead
        # of failing in log() or dividing by zero.
        saturated = 1.0 if p >= 1 else 0.0
        return saturated if t > 0 else 1.0 - saturated
    scaled_logit = math.log(p / (1 - p)) / t
    try:
        return 1 / (1 + math.exp(-scaled_logit))
    except OverflowError:
        # exp() overflows only for a hugely negative scaled logit, where the
        # sigmoid is 0 to machine precision.
        return 0.0