Skip to content

Commit

Permalink
Added usage example
Browse files Browse the repository at this point in the history
  • Loading branch information
justinbt1 committed Apr 24, 2022
1 parent 24962ff commit 7dcf216
Showing 1 changed file with 254 additions and 0 deletions.
254 changes: 254 additions & 0 deletions example/usage_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from akin import MinHash, LSH"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"content = [\n",
" 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',\n",
" 'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',\n",
" 'A helium atom has about four times as much mass as a hydrogen atom, so the composition changes '\n",
" 'when described as the proportion of mass contributed by different atoms.',\n",
" 'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',\n",
" 'A helium atom has about four times as much mass as a hydrogen atom and the composition changes '\n",
" 'when described as a proportion of mass contributed by different atoms.',\n",
" 'Theoretical models indicate that if Jupiter had much more mass than it does at present, it '\n",
" 'would shrink.',\n",
" 'This process causes Jupiter to shrink by about 2 cm each year.',\n",
" 'Jupiter is mostly composed of hydrogen with a quarter of its mass being helium',\n",
" 'The Great Red Spot is large enough to accommodate Earth within its boundaries.'\n",
"]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# Labels for each text in content.\n",
"labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# Create MinHash object.\n",
"minhash = MinHash(content, n_gram=9, permutations=100, hash_bits=64, seed=3)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"# Create LSH model.\n",
"lsh = LSH(minhash, labels, no_of_bands=50)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "[8, 4]"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Query to find near duplicates for text 1.\n",
"lsh.query(1, min_jaccard=0.5)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"# Generate minhash signature and add new texts to LSH model.\n",
"new_text = [\n",
" 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',\n",
" 'Jupiter moving out of the inner Solar System would have allowed the formation of '\n",
" 'inner planets.'\n",
"]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"new_labels = ['doc1', 'doc2']"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"new_minhash = MinHash(new_text, n_gram=9, permutations=100, hash_bits=64, seed=3)\n",
"lsh.update(new_minhash, new_labels)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "[1, 2, 3, 4, 5, 6, 7, 8, 9, 'doc1', 'doc2']"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check contents of documents.\n",
"lsh.contains()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": "[1, 2, 3, 4, 6, 7, 8, 9, 'doc1', 'doc2']"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove text and label from model.\n",
"lsh.remove(5)\n",
"lsh.contains()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "{1: ['doc1', 4],\n 2: ['doc2'],\n 3: [],\n 4: [1, 'doc1'],\n 6: [],\n 7: [],\n 8: [],\n 9: [],\n 'doc1': [1, 4],\n 'doc2': [2]}"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Return adjacency list for all similar texts.\n",
"adjacency_list = lsh.adjacency_list(min_jaccard=0.55)\n",
"adjacency_list"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit 7dcf216

Please sign in to comment.