-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
254 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,254 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from akin import MinHash, LSH" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"outputs": [], | ||
"source": [ | ||
"content = [\n", | ||
" 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',\n", | ||
" 'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',\n", | ||
" 'A helium atom has about four times as much mass as a hydrogen atom, so the composition changes '\n", | ||
" 'when described as the proportion of mass contributed by different atoms.',\n", | ||
" 'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',\n", | ||
" 'A helium atom has about four times as much mass as a hydrogen atom and the composition changes '\n", | ||
" 'when described as a proportion of mass contributed by different atoms.',\n", | ||
" 'Theoretical models indicate that if Jupiter had much more mass than it does at present, it '\n", | ||
" 'would shrink.',\n", | ||
" 'This process causes Jupiter to shrink by about 2 cm each year.',\n", | ||
" 'Jupiter is mostly composed of hydrogen with a quarter of its mass being helium',\n", | ||
" 'The Great Red Spot is large enough to accommodate Earth within its boundaries.'\n", | ||
"]" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"outputs": [], | ||
"source": [ | ||
"# Labels for each text in content.\n", | ||
"labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"outputs": [], | ||
"source": [ | ||
"# Create MinHash object.\n", | ||
"minhash = MinHash(content, n_gram=9, permutations=100, hash_bits=64, seed=3)" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"outputs": [], | ||
"source": [ | ||
"# Create LSH model.\n", | ||
"lsh = LSH(minhash, labels, no_of_bands=50)" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": "[8, 4]" | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Query to find near duplicates for text 1.\n", | ||
"lsh.query(1, min_jaccard=0.5)" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"outputs": [], | ||
"source": [ | ||
"# Generate minhash signature and add new texts to LSH model.\n", | ||
"new_text = [\n", | ||
" 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',\n", | ||
" 'Jupiter moving out of the inner Solar System would have allowed the formation of '\n", | ||
" 'inner planets.'\n", | ||
"]" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"outputs": [], | ||
"source": [ | ||
"new_labels = ['doc1', 'doc2']" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"outputs": [], | ||
"source": [ | ||
"new_minhash = MinHash(new_text, n_gram=9, permutations=100, hash_bits=64, seed=3)\n", | ||
"lsh.update(new_minhash, new_labels)" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": "[1, 2, 3, 4, 5, 6, 7, 8, 9, 'doc1', 'doc2']" | ||
}, | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Check contents of documents.\n", | ||
"lsh.contains()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": "[1, 2, 3, 4, 6, 7, 8, 9, 'doc1', 'doc2']" | ||
}, | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Remove text and label from model.\n", | ||
"lsh.remove(5)\n", | ||
"lsh.contains()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": "{1: ['doc1', 4],\n 2: ['doc2'],\n 3: [],\n 4: [1, 'doc1'],\n 6: [],\n 7: [],\n 8: [],\n 9: [],\n 'doc1': [1, 4],\n 'doc2': [2]}" | ||
}, | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Return adjacency list for all similar texts.\n", | ||
"adjacency_list = lsh.adjacency_list(min_jaccard=0.55)\n", | ||
"adjacency_list" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n" | ||
} | ||
} | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |