diff --git a/example/usage_example.ipynb b/example/usage_example.ipynb index ec9c8a0..d60989d 100644 --- a/example/usage_example.ipynb +++ b/example/usage_example.ipynb @@ -1,15 +1,30 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "### Akin - Example Usage" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, "outputs": [], "source": [ "from akin import MinHash, LSH" - ] + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } }, { "cell_type": "code", @@ -43,8 +58,7 @@ "execution_count": 3, "outputs": [], "source": [ - "# Labels for each text in content.\n", - "labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]" + "content_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]" ], "metadata": { "collapsed": false, @@ -53,12 +67,23 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Create MinHash object:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 4, "outputs": [], "source": [ - "# Create MinHash object.\n", "minhash = MinHash(content, n_gram=9, permutations=100, hash_bits=64, seed=3)" ], "metadata": { @@ -68,13 +93,24 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Create LSH object:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 5, "outputs": [], "source": [ - "# Create LSH model.\n", - "lsh = LSH(minhash, labels, no_of_bands=50)" + "lsh = LSH(minhash, content_labels, no_of_bands=50)" ], "metadata": { "collapsed": false, @@ -83,6 +119,18 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Query to find near duplicates for text 1:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 6, @@ -97,7 +145,6 @@ } ], "source": [ - "# Query to find near duplicates for text 1.\n", "lsh.query(1, min_jaccard=0.5)" ], "metadata": { @@ -107,12 +154,23 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Generate minhash signature and add new texts to LSH model:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "outputs": [], "source": [ - "# Generate minhash signature and add new texts to LSH model.\n", "new_text = [\n", " 'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',\n", " 'Jupiter moving out of the inner Solar System would have allowed the formation of '\n", @@ -155,6 +213,18 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Check contents of documents:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 10, @@ -169,7 +239,6 @@ } ], "source": [ - "# Check contents of documents.\n", "lsh.contains()" ], "metadata": { @@ -179,6 +248,18 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Remove text and label from model:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 11, @@ -204,6 +285,18 @@ } } }, + { + "cell_type": "markdown", + "source": [ + "**Return adjacency list for all similar texts:**" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, { "cell_type": "code", "execution_count": 12, @@ -218,7 +311,6 @@ } ], "source": [ - "# Return adjacency list for all similar texts.\n", "adjacency_list = lsh.adjacency_list(min_jaccard=0.55)\n", "adjacency_list" ],