Commit

first commit
avidale committed May 16, 2020
1 parent a69ea06 commit 7ae1a80
Showing 9 changed files with 313 additions and 1 deletion.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include dependency_paraphraser/models/natasha_projector.pkl
45 changes: 44 additions & 1 deletion README.md
@@ -1,2 +1,45 @@
# dependency-paraphraser
A sentence paraphraser based on dependency parsing and word embedding similarity.

How the paraphraser works:
1. The sentence is parsed into a dependency tree (for Russian, with the Natasha library).
2. The tree is projected back into a linear word order: sibling subtrees are reordered at random, guided by a small pre-trained model that scores which of two tokens should come first. The `tree_temperature` parameter controls how far the new order may drift from the model's preference.
3. Optionally, words are replaced with morphologically inflected synonyms taken from a w2v model. The `p_rep` parameter is the probability of attempting a replacement, and `min_sim` is the minimal embedding similarity of a candidate.

The basic usage (for the Russian language) relies on the Natasha library:

```python
import dependency_paraphraser.natasha
import random
random.seed(42)
text = 'каждый охотник желает знать где сидит фазан'
for i in range(3):
    print(dependency_paraphraser.natasha.paraphrase(text, tree_temperature=2))
# желает знать сидит фазан где каждый охотник
# каждый охотник желает знать где фазан сидит
# знать где фазан сидит каждый охотник желает
```

You can provide your own w2v model to replace words with similar ones:
```python
import compress_fasttext
small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_100K_20K_pq_100.bin'
)
random.seed(42)
for i in range(3):
    print(dependency_paraphraser.natasha.paraphrase(text, w2v=small_model, p_rep=0.8, min_sim=0.55))
# стремится каждый охотник знать рябчик где усаживается
# каждый охотник хочет узнать фазан где просиживает
# каждый охотник хочет узнать фазан где восседает
```

Alternatively, you can convert the w2v model bundled with Natasha (aka `navec`) to gensim format and use it:
```python
navec_model = dependency_paraphraser.natasha.emb.as_gensim
random.seed(42)
for i in range(3):
    print(dependency_paraphraser.natasha.paraphrase(text, w2v=navec_model, p_rep=0.5, min_sim=0.55))
# желает каждый охотник помнить фазан где лежит
# каждый охотник желает знать фазан где сидит
# каждый охотник оставляет понять где фазан лежит
```
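
You can also make the navec model the default once, via `use_news_embeddings()` (defined in `dependency_paraphraser/natasha.py` below), instead of passing `w2v` explicitly; a minimal sketch:

```python
import dependency_paraphraser.natasha

dependency_paraphraser.natasha.use_news_embeddings()  # converts navec to gensim format and sets it as the default w2v
print(dependency_paraphraser.natasha.paraphrase(text, p_rep=0.5, min_sim=0.55))
```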
1 change: 1 addition & 0 deletions dependency_paraphraser/__init__.py
@@ -0,0 +1 @@
from . import projection, utils, synonyms
Binary file dependency_paraphraser/models/natasha_projector.pkl not shown.
59 changes: 59 additions & 0 deletions dependency_paraphraser/natasha.py
@@ -0,0 +1,59 @@
import os
import pickle


from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    Doc,
)

from dependency_paraphraser import projection, synonyms


segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)

gensim_emb = None


with open(os.path.join(os.path.dirname(__file__), 'models', 'natasha_projector.pkl'), 'rb') as f:
    projector = pickle.load(f)


def use_news_embeddings():
    """ Convert navec embeddings to gensim format to use for synonym replacement """
    global gensim_emb
    gensim_emb = emb.as_gensim


def paraphrase(text, tree_temperature=0.5, w2v=None, min_sim=0.5, p_rep=0.5):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    if w2v is None:
        w2v = gensim_emb

    results = []

    for sent in doc.sents:
        toks = projection.make_tree_projection(
            sent, model=projector, temperature=tree_temperature,
        )
        if w2v:
            words = synonyms.replace_synonyms(
                toks, w2v=w2v, morph_vocab=morph_vocab, min_sim=min_sim, p_rep=p_rep,
            )
        else:
            words = [token.text for token in toks]
        results.append(' '.join(words))
    return ' '.join(results)
96 changes: 96 additions & 0 deletions dependency_paraphraser/projection.py
@@ -0,0 +1,96 @@
import random
from functools import cmp_to_key


from dependency_paraphraser.utils import anneal, nat2ch, conll2ch


def sent2xy(sent, sentence_format='natasha'):
    pairs_x = []
    pairs_y = []
    pairs_ids = []
    tree = nat2ch(sent) if sentence_format == 'natasha' else conll2ch(sent)
    tokens = sent.tokens if sentence_format == 'natasha' else sent

    def tok2rel(tok):
        if sentence_format == 'natasha':
            return tok.rel
        return tok.deprel

    def tok2pos(tok):
        if sentence_format == 'natasha':
            return tok.pos
        return tok.upos

    def tok2features(tok, idx):
        dep = 'parent' if idx == parent_id else tok2rel(tok)
        return {
            'deprel_{}'.format(dep): 1,
            'pos_{}'.format(tok2pos(tok)): 1,
        }

    for parent_id, children_ids in tree.items():
        ids = children_ids if parent_id == -1 else [parent_id] + children_ids
        for l_id in ids:
            for r_id in ids:
                if l_id == r_id:
                    continue
                pairs_x.append({
                    '{}_{}'.format(lr, k): v
                    for lr, idx in [('r', r_id), ('l', l_id)]
                    for k, v in tok2features(tokens[idx], idx).items()
                })
                pairs_y.append(int(l_id < r_id))
                pairs_ids.append((l_id, r_id))
    return pairs_x, pairs_y, pairs_ids


def project_tree_randomly(id_to_children, pair_to_proba=None, root_id=-1, temperature=1, shuffle=True):
    """
    Project a tree into a sequence, optionally with a random order of children.
    Return the list of token indices.
    """
    if root_id not in id_to_children and root_id != -1:
        return [root_id]
    children_ids = id_to_children[root_id][:]

    ids = children_ids if root_id == -1 else [root_id] + children_ids

    def comparator(l, r):
        raw_proba = (pair_to_proba[(l, r)] + 1 - pair_to_proba[(r, l)]) / 2
        new_proba = anneal(raw_proba, t=temperature)
        return random.random() - new_proba

    if pair_to_proba:
        random.shuffle(ids)
        ids = sorted(ids, key=cmp_to_key(comparator))
    elif shuffle:
        random.shuffle(ids)
    else:
        ids = sorted(ids)

    result = []
    for tok_id in ids:
        if tok_id == root_id:
            result.append(tok_id)
        else:
            result.extend(project_tree_randomly(
                id_to_children,
                root_id=tok_id,
                pair_to_proba=pair_to_proba,
                temperature=temperature,
                shuffle=shuffle,
            ))
    return result


def make_tree_projection(sent, model, sentence_format='natasha', temperature=1):
    px, py, pids = sent2xy(sent, sentence_format=sentence_format)
    preds = model.predict_proba(px)[:, 1]
    pair2proba = {pair: proba for pair, proba in zip(pids, preds)}
    ch = nat2ch(sent) if sentence_format == 'natasha' else conll2ch(sent)
    tokens = sent.tokens if sentence_format == 'natasha' else sent
    tokens = [
        tokens[t] for t in project_tree_randomly(ch, pair_to_proba=pair2proba, temperature=temperature)
    ]
    return tokens
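
The projector itself ships only as a pickled model (`models/natasha_projector.pkl`); this commit contains no training code. For orientation, a hypothetical sketch of how such a projector could be fitted: `sent2xy` yields feature dicts and binary order labels per token pair, so any estimator exposing `predict_proba` over those dicts fits the interface. The scikit-learn pipeline and the `parsed_sents` corpus below are assumptions, not the author's actual setup.

```python
# Hypothetical training sketch (not the actual training code of this package).
# Assumes scikit-learn and `parsed_sents`: an iterable of Natasha sentences
# that already have morphology and syntax parsed.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from dependency_paraphraser.projection import sent2xy

X, y = [], []
for sent in parsed_sents:
    px, py, _ = sent2xy(sent)
    X.extend(px)  # feature dicts for ordered token pairs
    y.extend(py)  # 1 if the left token precedes the right one in the source

# DictVectorizer turns the feature dicts into vectors; the classifier then
# estimates P(left token should precede right token), which is what
# make_tree_projection consumes via predict_proba.
projector = make_pipeline(DictVectorizer(), LogisticRegression(max_iter=1000))
projector.fit(X, y)
```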
52 changes: 52 additions & 0 deletions dependency_paraphraser/synonyms.py
@@ -0,0 +1,52 @@
import random


INFLECTABLES = {
    'NOUN': ['Number', 'Case'],
    'ADJ': ['Number', 'Case', 'Gender'],  # 'Degree' ??
    'VERB': ['Number', 'VerbForm', 'Person', 'Tense', 'Mood', 'Gender'],
}
DEFAULT_INFLECTABLES = ['Number', 'Case', 'VerbForm', 'Person', 'Tense', 'Mood']  # 'Gender',


def morph_synonyms(token, w2v, morph_vocab, initial_k=30, k=10, threshold=0.0):
    token.lemmatize(morph_vocab)
    text = token.lemma
    neighbours = []
    if text not in w2v:
        return neighbours
    pairs = w2v.most_similar(text, topn=initial_k)
    for pair in pairs:
        if len(neighbours) >= k:
            break
        if pair[1] < threshold:
            break
        parses = morph_vocab.parse(pair[0])
        for parse in parses:
            if parse.normal == text:
                continue
            if parse.pos == token.pos:
                inflectables = INFLECTABLES.get(token.pos, DEFAULT_INFLECTABLES)
                word = parse.inflect({token.feats[k] for k in inflectables if k in token.feats})
                if not word:
                    continue
                if word.word == text:
                    break
                neighbours.append((pair[0], word.word, pair[1]))
                break
    return neighbours


def replace_synonyms(tokens, w2v, morph_vocab, min_sim=0.6, p_rep=0.5):
    result = []
    for token in tokens:
        if random.random() > p_rep:
            result.append(token.text)
            continue
        neighbors = morph_synonyms(token, w2v, morph_vocab, threshold=min_sim)

        if neighbors:
            result.append(random.choice(neighbors)[1])
        else:
            result.append(token.text)
    return result
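
A small, illustrative way to inspect `morph_synonyms` in isolation (the model URL and the Natasha objects are the ones used elsewhere in this package; the exact candidates will vary):

```python
# Illustrative sketch: list candidate replacements for a single token.
import compress_fasttext
from natasha import Doc

from dependency_paraphraser import natasha as dpn
from dependency_paraphraser.synonyms import morph_synonyms

small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_100K_20K_pq_100.bin'
)

doc = Doc('каждый охотник желает знать где сидит фазан')
doc.segment(dpn.segmenter)
doc.tag_morph(dpn.morph_tagger)

token = doc.sents[0].tokens[1]  # the noun 'охотник'
# Each candidate is a (neighbour word, inflected surface form, similarity) tuple.
print(morph_synonyms(token, w2v=small_model, morph_vocab=dpn.morph_vocab, threshold=0.55))
```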
28 changes: 28 additions & 0 deletions dependency_paraphraser/utils.py
@@ -0,0 +1,28 @@
import math

from collections import defaultdict


def conll2ch(sentence):
    id2children = defaultdict(list)
    for tok in sentence:
        id2children[int(tok.head) - 1].append(int(tok.id) - 1)
    return id2children


def nat2ch(sentence):
    id2idx = {token.id: i for i, token in enumerate(sentence.tokens)}
    id2children = defaultdict(list)
    for tok in sentence.tokens:
        id2children[id2idx.get(tok.head_id, -1)].append(id2idx[tok.id])
    return id2children


def anneal(p, t=1):
    """ Modify a probability by applying temperature to it. """
    if t == 1:
        return p
    if t == math.inf:
        return 0.5
    if t == 0:
        return int(p > 0.5)
    return 1 / (1 + math.exp(-(math.log(p / (1 - p)) / t)))
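
The `anneal` helper reshapes an ordering probability on the logit scale: `t > 1` pulls it towards 0.5 (more random word order), `t < 1` sharpens it towards 0 or 1 (closer to the projector's preferred order). A quick check of the cases handled above, as a minimal sketch:

```python
import math

from dependency_paraphraser.utils import anneal

p = 0.8
print(anneal(p, t=1))            # 0.8   - unchanged
print(anneal(p, t=math.inf))     # 0.5   - order becomes fully random
print(anneal(p, t=0))            # 1     - always keep the preferred order
print(round(anneal(p, t=2), 3))  # 0.667 - softened towards 0.5
```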
32 changes: 32 additions & 0 deletions setup.py
@@ -0,0 +1,32 @@
import setuptools

with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()

setuptools.setup(
name="dependency-paraphraser",
version="0.0.1",
author="David Dale",
author_email="[email protected]",
description="A sentence paraphraser based on dependency syntax and word embeddings",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/avidale/dependency-paraphraser",
packages=setuptools.find_packages(),
license="MIT",
classifiers=[
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
install_requires=[
],
extras_require={
'natasha': ['natasha'],
},
package_data={
'': ['*.pkl']
},
include_package_data=True,
)
