-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
313 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
include dependency_paraphraser/models/natasha_projector.pkl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,45 @@ | ||
# dependency-paraphraser | ||
A sentence paraphraser based on dependency parsing and word embedding similarity. | ||
A sentence paraphraser based on dependency parsing | ||
and word embedding similarity. | ||
|
||
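How the paraphraser works:
1. The sentence is parsed into a dependency tree.
2. The tree is projected back into a word sequence: each head and its children are reordered at random, guided by a pairwise model that estimates which of two tokens should come first (`tree_temperature` controls the amount of randomness).
3. Optionally, words are replaced with similar ones from a word embedding model and inflected to match the grammatical features of the original word.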
|
||
Basic usage (for the Russian language) is based on the Natasha library:
|
||
```python | ||
import dependency_paraphraser.natasha | ||
import random | ||
random.seed(42) | ||
text = 'каждый охотник желает знать где сидит фазан' | ||
for i in range(3): | ||
print(dependency_paraphraser.natasha.paraphrase(text, tree_temperature=2)) | ||
# желает знать сидит фазан где каждый охотник | ||
# каждый охотник желает знать где фазан сидит | ||
# знать где фазан сидит каждый охотник желает | ||
``` | ||
|
||
You can provide your own w2v model to replace words with similar ones: | ||
```python | ||
import compress_fasttext | ||
small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load( | ||
'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_100K_20K_pq_100.bin' | ||
) | ||
random.seed(42) | ||
for i in range(3): | ||
print(dependency_paraphraser.natasha.paraphrase(text, w2v=small_model, p_rep=0.8, min_sim=0.55)) | ||
# стремится каждый охотник знать рябчик где усаживается | ||
# каждый охотник хочет узнать фазан где просиживает | ||
# каждый охотник хочет узнать фазан где восседает | ||
``` | ||
|
||
Alternatively, you can expand and use the w2v model from Natasha (aka `navec`): | ||
```python | ||
navec_model = dependency_paraphraser.natasha.emb.as_gensim | ||
random.seed(42) | ||
for i in range(3): | ||
print(dependency_paraphraser.natasha.paraphrase(text, w2v=navec_model, p_rep=0.5, min_sim=0.55)) | ||
# желает каждый охотник помнить фазан где лежит | ||
# каждый охотник желает знать фазан где сидит | ||
# каждый охотник оставляет понять где фазан лежит | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from . import projection, utils, synonyms |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import os | ||
import pickle | ||
|
||
|
||
from natasha import ( | ||
Segmenter, | ||
MorphVocab, | ||
NewsEmbedding, | ||
NewsMorphTagger, | ||
NewsSyntaxParser, | ||
Doc, | ||
) | ||
|
||
from dependency_paraphraser import projection, synonyms | ||
|
||
|
||
# Shared Natasha pipeline components: created once when the module is imported
# and reused by every paraphrase() call.
segmenter = Segmenter()
morph_vocab = MorphVocab()

# A single embedding object is handed to both the morphology tagger and the syntax parser.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)

# Embeddings used for synonym replacement when paraphrase() gets no explicit w2v;
# stays None until use_news_embeddings() is called.
gensim_emb = None

# The pairwise ordering model is stored in the `models` directory next to this module.
# NOTE: pickle.load can execute arbitrary code, so this path must only ever point
# at the trusted model bundled with the package.
_PROJECTOR_PATH = os.path.join(os.path.dirname(__file__), 'models', 'natasha_projector.pkl')
with open(_PROJECTOR_PATH, 'rb') as f:
    projector = pickle.load(f)
|
||
|
||
def use_news_embeddings():
    """
    Make the navec embeddings the default model for synonym replacement.

    Stores the gensim-format view of the module-level `emb` in the global
    `gensim_emb`, which paraphrase() falls back to when no `w2v` is given.
    """
    global gensim_emb
    converted = emb.as_gensim
    gensim_emb = converted
|
||
|
||
def paraphrase(text, tree_temperature=0.5, w2v=None, min_sim=0.5, p_rep=0.5):
    """
    Paraphrase a text sentence by sentence.

    The text is segmented, morphologically tagged and syntactically parsed with
    the module-level Natasha components. Every sentence is then re-projected
    from its dependency tree into a token sequence, and, if an embedding model
    is available, tokens are passed through synonym replacement.

    :param text: the text to paraphrase
    :param tree_temperature: temperature passed to the tree projection
    :param w2v: embedding model for synonym replacement; when None, the
        module-level `gensim_emb` is used (which may itself be None)
    :param min_sim: minimal similarity passed to the synonym replacement
    :param p_rep: replacement probability passed to the synonym replacement
    :return: the paraphrased sentences joined with single spaces
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    embeddings = gensim_emb if w2v is None else w2v

    paraphrased = []
    for sentence in doc.sents:
        ordered = projection.make_tree_projection(
            sentence, model=projector, temperature=tree_temperature,
        )
        if not embeddings:
            # No usable embedding model: only the word order changes.
            words = [tok.text for tok in ordered]
        else:
            words = synonyms.replace_synonyms(
                ordered, w2v=embeddings, morph_vocab=morph_vocab, min_sim=min_sim, p_rep=p_rep,
            )
        paraphrased.append(' '.join(words))
    return ' '.join(paraphrased)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import random | ||
from functools import cmp_to_key | ||
|
||
|
||
from dependency_paraphraser.utils import anneal, nat2ch, conll2ch | ||
|
||
|
||
def sent2xy(sent, sentence_format='natasha'):
    """
    Turn a parsed sentence into pairwise ordering examples.

    For every head together with its direct children (or, for the virtual
    root -1, the children alone), each ordered pair of distinct tokens
    produces one example.

    :param sent: a parsed sentence; with sentence_format='natasha' its tokens
        are read from `sent.tokens`, otherwise `sent` itself is indexed
    :param sentence_format: 'natasha' to read `rel`/`pos` token attributes,
        anything else to read `deprel`/`upos`
    :return: a tuple (pairs_x, pairs_y, pairs_ids) of parallel lists:
        feature dicts with keys like 'l_deprel_...' and 'r_pos_...',
        labels (1 if the left index is smaller than the right one, else 0),
        and the (left_index, right_index) pairs
    """
    is_natasha = sentence_format == 'natasha'
    if is_natasha:
        tree = nat2ch(sent)
        tokens = sent.tokens
    else:
        tree = conll2ch(sent)
        tokens = sent

    def describe(idx, head_idx):
        # The head of the current group is marked as 'parent' instead of its own relation.
        tok = tokens[idx]
        if idx == head_idx:
            dep = 'parent'
        else:
            dep = tok.rel if is_natasha else tok.deprel
        return {
            'deprel_{}'.format(dep): 1,
            'pos_{}'.format(tok.pos if is_natasha else tok.upos): 1,
        }

    pairs_x, pairs_y, pairs_ids = [], [], []
    for parent_id, children_ids in tree.items():
        if parent_id == -1:
            group = children_ids
        else:
            group = [parent_id] + children_ids
        for l_id in group:
            for r_id in group:
                if l_id != r_id:
                    features = {}
                    # Right-hand features are inserted first, then left-hand ones.
                    for side, idx in (('r', r_id), ('l', l_id)):
                        for key, value in describe(idx, parent_id).items():
                            features['{}_{}'.format(side, key)] = value
                    pairs_x.append(features)
                    pairs_y.append(int(l_id < r_id))
                    pairs_ids.append((l_id, r_id))
    return pairs_x, pairs_y, pairs_ids
|
||
|
||
def project_tree_randomly(id_to_children, pair_to_proba=None, root_id=-1, temperature=1, shuffle=True):
    """
    Flatten a dependency tree into a list of token indices.

    At every node the node itself and its direct children are put in some
    order, and each child is then expanded recursively in place.

    :param id_to_children: mapping from a token index to the list of its children;
        the key -1 holds the children of the virtual root
    :param pair_to_proba: optional mapping (left, right) -> probability used by a
        randomized comparator; when truthy it takes precedence over `shuffle`
    :param root_id: index of the subtree root; -1 stands for the virtual root,
        which is not included in the result
    :param temperature: temperature applied to the pairwise probabilities
    :param shuffle: without `pair_to_proba`, True means a uniformly random order
        and False means ascending index order
    :return: the list of token indices of the subtree in the chosen order
    """
    is_virtual_root = root_id == -1
    if not is_virtual_root and root_id not in id_to_children:
        # A leaf projects onto itself.
        return [root_id]

    # Work on a copy so that the caller's tree is never reordered.
    children = id_to_children[root_id][:]
    order = children if is_virtual_root else [root_id] + children

    def compare(left, right):
        forward = pair_to_proba[(left, right)]
        backward = pair_to_proba[(right, left)]
        # Average the two directed estimates of "left precedes right".
        p_left_first = anneal((forward + 1 - backward) / 2, t=temperature)
        return random.random() - p_left_first

    if pair_to_proba:
        random.shuffle(order)
        order.sort(key=cmp_to_key(compare))
    elif shuffle:
        random.shuffle(order)
    else:
        order.sort()

    projected = []
    for node in order:
        if node == root_id:
            projected.append(node)
            continue
        projected += project_tree_randomly(
            id_to_children,
            root_id=node,
            pair_to_proba=pair_to_proba,
            temperature=temperature,
            shuffle=shuffle,
        )
    return projected
|
||
|
||
def make_tree_projection(sent, model, sentence_format='natasha', temperature=1):
    """
    Reorder the tokens of a parsed sentence by a randomized tree projection.

    Pairwise ordering probabilities are predicted by `model` for all token
    pairs produced by sent2xy and then used to sample a new order of every
    head with its children.

    :param sent: a parsed sentence in the format named by `sentence_format`
    :param model: a classifier exposing `predict_proba` over feature dicts;
        column 1 of its output is read as the pair probability
    :param sentence_format: 'natasha' or anything else for CoNLL-style tokens
    :param temperature: temperature passed to the tree projection
    :return: the list of the sentence's tokens in the new order
    """
    px, py, pids = sent2xy(sent, sentence_format=sentence_format)
    if px:
        preds = model.predict_proba(px)[:, 1]
        pair2proba = dict(zip(pids, preds))
    else:
        # A sentence with fewer than two tokens yields no pairs. There is nothing
        # to score, and estimators commonly reject an empty batch, so skip the model.
        pair2proba = {}
    ch = nat2ch(sent) if sentence_format == 'natasha' else conll2ch(sent)
    tokens = sent.tokens if sentence_format == 'natasha' else sent
    order = project_tree_randomly(ch, pair_to_proba=pair2proba, temperature=temperature)
    return [tokens[t] for t in order]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import random | ||
|
||
|
||
# Grammatical features that are copied from the original token onto its
# replacement, keyed by the part-of-speech tag of the original token.
INFLECTABLES = {
    'NOUN': [
        'Number',
        'Case',
    ],
    # 'Degree' is not transferred for adjectives (left as an open question).
    'ADJ': [
        'Number',
        'Case',
        'Gender',
    ],
    'VERB': [
        'Number',
        'VerbForm',
        'Person',
        'Tense',
        'Mood',
        'Gender',
    ],
}
# Features used for every other part of speech; 'Gender' is left out here.
DEFAULT_INFLECTABLES = [
    'Number',
    'Case',
    'VerbForm',
    'Person',
    'Tense',
    'Mood',
]
|
||
|
||
def morph_synonyms(token, w2v, morph_vocab, initial_k=30, k=10, threshold=0.0):
    """
    Find embedding neighbours of a token, inflected like the token itself.

    The token is lemmatized in place, the nearest neighbours of its lemma are
    requested from `w2v`, and each neighbour is accepted if it has a parse with
    the same part of speech that can be inflected to the token's features.

    :param token: a token with `lemmatize`, `lemma`, `pos` and `feats`
    :param w2v: embedding model supporting `in` and `most_similar(word, topn=...)`
    :param morph_vocab: morphological vocabulary used for lemmatizing and parsing
    :param initial_k: how many neighbours to request from `w2v`
    :param k: maximal number of neighbours to return
    :param threshold: similarity below which the scan over neighbours stops
    :return: a list of (neighbour, inflected_neighbour, similarity) tuples
    """
    token.lemmatize(morph_vocab)
    lemma = token.lemma
    found = []
    if lemma not in w2v:
        return found

    for candidate, similarity in w2v.most_similar(lemma, topn=initial_k):
        if len(found) >= k or similarity < threshold:
            break
        for parse in morph_vocab.parse(candidate):
            # Skip parses that are the same lexeme or another part of speech.
            if parse.normal == lemma or parse.pos != token.pos:
                continue
            feature_names = INFLECTABLES.get(token.pos, DEFAULT_INFLECTABLES)
            target_feats = {token.feats[name] for name in feature_names if name in token.feats}
            inflected = parse.inflect(target_feats)
            if not inflected:
                continue
            # The first parse that inflects decides the candidate's fate.
            if inflected.word != lemma:
                found.append((candidate, inflected.word, similarity))
            break
    return found
|
||
|
||
def replace_synonyms(tokens, w2v, morph_vocab, min_sim=0.6, p_rep=0.5):
    """
    Randomly replace tokens with similar words.

    Each token is kept as is when a uniform random draw exceeds `p_rep`;
    otherwise one of its synonym candidates is picked at random, and the
    original text is used when there are no candidates.

    :param tokens: an iterable of tokens with a `text` attribute
    :param w2v: embedding model passed to morph_synonyms
    :param morph_vocab: morphological vocabulary passed to morph_synonyms
    :param min_sim: similarity threshold passed to morph_synonyms
    :param p_rep: upper bound of the random draw for attempting a replacement
    :return: a list of words, one per input token
    """
    words = []
    for tok in tokens:
        candidates = []
        if not random.random() > p_rep:
            candidates = morph_synonyms(tok, w2v, morph_vocab, threshold=min_sim)
        # Element 1 of a candidate is the inflected replacement word.
        words.append(random.choice(candidates)[1] if candidates else tok.text)
    return words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import math | ||
|
||
from collections import defaultdict | ||
|
||
|
||
def conll2ch(sentence):
    """
    Build a parent -> children index map from CoNLL-style tokens.

    Token `id` and `head` values are 1-based, so both are shifted to 0-based
    indices; a head of 0 therefore becomes the virtual root -1.

    :param sentence: an iterable of tokens with `id` and `head` attributes
    :return: a defaultdict mapping a parent index to the list of child indices
    """
    children_of = defaultdict(list)
    for tok in sentence:
        parent_idx = int(tok.head) - 1
        child_idx = int(tok.id) - 1
        children_of[parent_idx].append(child_idx)
    return children_of
|
||
|
||
def nat2ch(sentence):
    """
    Build a parent -> children index map from a Natasha-style sentence.

    Token ids are translated into positions within `sentence.tokens`; a head
    id that does not belong to the sentence is mapped to the virtual root -1.

    :param sentence: an object whose `tokens` have `id` and `head_id` attributes
    :return: a defaultdict mapping a parent position to the list of child positions
    """
    position = {}
    for idx, tok in enumerate(sentence.tokens):
        position[tok.id] = idx

    children_of = defaultdict(list)
    for tok in sentence.tokens:
        parent_idx = position.get(tok.head_id, -1)
        children_of[parent_idx].append(position[tok.id])
    return children_of
|
||
def anneal(p, t=1):
    """
    Modify a probability by applying temperature to it.

    The probability is converted to a logit, the logit is divided by `t`,
    and the result is converted back with a sigmoid.

    :param p: the probability to modify
    :param t: the temperature; 1 returns `p` unchanged, math.inf returns 0.5,
        and 0 returns the hard decision int(p > 0.5)
    :return: the annealed probability
    """
    if t == 1:
        return p
    if t == math.inf:
        return 0.5
    if t == 0:
        return int(p > 0.5)
    # Saturate the logit at the boundaries instead of failing on log(0) or 1 / 0.
    if p <= 0:
        logit = -math.inf
    elif p >= 1:
        logit = math.inf
    else:
        logit = math.log(p / (1 - p))
    z = logit / t
    # Numerically stable sigmoid: exp() only ever gets a non-positive argument,
    # so extreme logits at small temperatures cannot overflow.
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    e = math.exp(z)
    return e / (1 + e)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import setuptools | ||
|
||
# The README doubles as the long description of the distribution.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# All keyword arguments of setuptools.setup(), collected in one place.
_SETUP_KWARGS = {
    "name": "dependency-paraphraser",
    "version": "0.0.1",
    "author": "David Dale",
    "author_email": "[email protected]",
    "description": "A sentence paraphraser based on dependency syntax and word embeddings",
    "long_description": long_description,
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/avidale/dependency-paraphraser",
    "packages": setuptools.find_packages(),
    "license": "MIT",
    "classifiers": [
        "Development Status :: 3 - Alpha",
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # No mandatory dependencies are declared; natasha is an optional extra.
    "install_requires": [],
    "extras_require": {
        'natasha': ['natasha'],
    },
    # Include pickled models found in any package.
    "package_data": {
        '': ['*.pkl'],
    },
    "include_package_data": True,
}

setuptools.setup(**_SETUP_KWARGS)