diff --git a/04_cohort_three/live_code/2_ds_search_sort.ipynb b/04_cohort_three/live_code/2_ds_search_sort.ipynb new file mode 100644 index 0000000..cae19e8 --- /dev/null +++ b/04_cohort_three/live_code/2_ds_search_sort.ipynb @@ -0,0 +1,1327 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8a3c8ac8", + "metadata": {}, + "outputs": [], + "source": [ + "s = set()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "948403a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f38bbbfa", + "metadata": {}, + "outputs": [], + "source": [ + "s.add(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "07cc4985", + "metadata": {}, + "outputs": [], + "source": [ + "s.remove(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f1122e94", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "set()" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "94138692", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "5 in s" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8dc7d244", + "metadata": {}, + "outputs": [], + "source": [ + "d = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "71acbf90", + "metadata": {}, + "outputs": [], + "source": [ + "d['key1'] = 'answer1'\n", + "d['key2'] = 'answer2'" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "51c85a3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'key1': 'answer3', 
'key2': 'answer2'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7c07bdd3", + "metadata": {}, + "outputs": [], + "source": [ + "d['key1'] = 'answer3'" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a4138516", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'answer3'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d.pop('key1')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7de107c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'key2': 'answer2'}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b9ce91a2", + "metadata": {}, + "outputs": [], + "source": [ + "l = [1, 2, 3, 4, 5]\n", + "it = iter(l)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "3a028b2b", + "metadata": {}, + "outputs": [ + { + "ename": "StopIteration", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[24], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mit\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mStopIteration\u001b[0m: " + ] + } + ], + "source": [ + "next(it)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "487a7ee1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "de600a75", + "metadata": {}, + "outputs": [], + "source": [ + "# Implement a set ADT, using a list data structure\n", + 
"# get size, insert a value (without introducing duplicates),\n", + "# remove a specified value, check membership\n", + "\n", + "\n", + "def get_size(l):\n", + " return len(l)\n", + "\n", + "def insert_a_value(l, value):\n", + " if value not in l:\n", + " l.append(value)\n", + "\n", + "def remove_a_value(l, value):\n", + " l.remove(value)\n", + "\n", + "def check_membership(l, value):\n", + " return value in l" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "17fbba03", + "metadata": {}, + "outputs": [], + "source": [ + "l = []\n", + "insert_a_value(l, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8821da94", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[5]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d2d4604c", + "metadata": {}, + "outputs": [], + "source": [ + "insert_a_value(l, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "54ae3aa9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[5]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a217cdbb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "check_membership(l, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "6c748850", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "check_membership(l, 6)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "fa7552d0", + "metadata": {}, + "outputs": [], + "source": [ + 
"remove_a_value(l, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "c0529168", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1842396f", + "metadata": {}, + "outputs": [], + "source": [ + "a = [1, 2, 3]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "8083e7cc", + "metadata": {}, + "outputs": [], + "source": [ + "b = a" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "293073a8", + "metadata": {}, + "outputs": [], + "source": [ + "a.append(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c62ad511", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f74d5e8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "78b81a3b", + "metadata": {}, + "outputs": [], + "source": [ + "l = []\n", + "# back of the list\n", + "def push(l, v):\n", + " l.append(v)\n", + "\n", + "def pop(l):\n", + " l.pop()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "a618d278", + "metadata": {}, + "outputs": [], + "source": [ + "l = []\n", + "# front of the list\n", + "def push(l, v):\n", + " l.insert(0, v)\n", + " \n", + "def pop(l):\n", + " l.pop(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d05830f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "1ec06651", + "metadata": {}, + "outputs": [], + "source": [ + "push(l, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": 
"8ffe967b", + "metadata": {}, + "outputs": [], + "source": [ + "push(l, 6)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "4f056fff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[6, 5]" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "1b638d94", + "metadata": {}, + "outputs": [], + "source": [ + "pop(l)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "b18bb37b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[5]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b077e424", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "7e585832", + "metadata": {}, + "outputs": [], + "source": [ + "l = []\n", + "# Queue\n", + "# Add to the back of the queue (enqueue)\n", + "# Remove from the front of the queue (dequeue)\n", + "def enqueue(l, v): # O(1)\n", + " l.append(v)\n", + "\n", + "def dequeue(l): # O(n)\n", + " return l.pop(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "e9fc6f82", + "metadata": {}, + "outputs": [], + "source": [ + "l = []\n", + "# Queue\n", + "# Add to the front of the queue (enqueue)\n", + "# Remove from the back of the queue (dequeue)\n", + "def enqueue(l, v): #O(n)\n", + " l.insert(0, v)\n", + "\n", + "def dequeue(l): #O(1)\n", + " return l.pop()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "8341ce64", + "metadata": {}, + "outputs": [], + "source": [ + "enqueue(l, 1)\n", + "enqueue(l, 2)\n", + "enqueue(l, 3)\n", + "enqueue(l, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "ad7e57a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"4" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dequeue(l)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b92f4669", + "metadata": {}, + "outputs": [], + "source": [ + "video = load_video('video.mp4')\n", + "model.is_bike(video[50])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61766ef6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db9e416d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7205c57", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44122816", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8837c9a6-8a69-4356-be21-9298c4317339", + "metadata": {}, + "source": [ + "# Array-Based Data Structures, Searching, and Sorting\n", + "\n", + "## Outline\n", + "\n", + "- Array Based Data Structures\n", + "\n", + " - Stack, queue, Python List\n", + "\n", + "- Searching\n", + "\n", + " - Linear, Binary Search\n", + "\n", + "- Sorting\n", + "\n", + " - Selection, Insertion Sort\n", + "\n", + "- Hash map, hash table (Python dictionary), hash functions\n", + "\n", + "# Array Based Data Structures\n", + "\n", + "## Abstract Data Types Versus Data Structure\n", + "\n", + "- Some concepts are generally useful and transcend any programming\n", + " language\n", + "\n", + "- An **abstract data type** (ADT) defines some kind of data and\n", + " operations that can be performed on it\n", + "\n", + " - Abstract because there is no mention of *how* data is stored or\n", + " *how* the operations work\n", + "\n", + " - Concerned about “what”\n", + "\n", + "- A **data structure** is a concrete method of storing data (and\n", + " therefore its operations).\n", + "\n", + " - For 
instance, Python List is a data structure because it has a\n", + " specific implementation.\n", + "\n", + "- ADTs form a common vocabulary for computer scientists to discuss\n", + " problems. It allows us to focus on the design and worry about\n", + " implementation later.\n", + "\n", + "## Important ADTs\n", + "\n", + "- Set\n", + "\n", + " - Data: a collection of unique elements\n", + "\n", + " - Operations: get size, insert a value (without introducing\n", + " duplicates), remove a specified value, check membership\n", + "\n", + "- List\n", + "\n", + " - Data: an ordered sequence of elements\n", + "\n", + " - Operations: access element by index, insert a value at a given\n", + " index, remove a value at a given index\n", + "\n", + "## Important ADTs [1]\n", + "\n", + "- Map\n", + "\n", + " - Data: a collection of key-value pairs, where each key is unique\n", + " and associated with a single value\n", + "\n", + " - Operations: look-up a value for a given key, insert a new\n", + " key-value pair, remove a key-value pair, update the value\n", + " associated with a given key\n", + "\n", + "- Iterable\n", + "\n", + " - Data: a collection of values (may or may not be unique)\n", + "\n", + " - Operations: iterate through the elements of the collection one\n", + " at a time.\n", + "\n", + "## Relation between ADTs and Data Structures\n", + "\n", + "- A Python `list` is not an ADT. 
But it is a natural implementation of\n", + " the List ADT.\n", + "\n", + " - The designers of Python implemented `list` operations\n", + "\n", + "- A single ADT can be implemented by many data structures\n", + "\n", + " - You could implement List ADT using a Python `dict`\n", + "\n", + " - We can store the list `[\"DS\", 4, \"Life\"]` like this:\n", + " `{0: \"DS\", 1: 4, 2: \"Life\"}`\n", + "\n", + "- A data structure can implement many ADTs\n", + "\n", + " - Practice: how can you implement a set with a Python `list`?\n", + "\n", + "## Python Lists\n", + "\n", + "- Each element has an address in memory. The addresses are ordered by\n", + " index number and adjacent to each other.\n", + "\n", + "- Run time for `append` method\n", + "\n", + " - A new address is created and placed at the end of the list\n", + "\n", + " - $O(1)$ time because it doesn’t matter how long the list is\n", + "\n", + "- Run time for `insert` method\n", + "\n", + " - The worst case occurs when you insert at the beginning of the\n", + " list because each element in the list has to be shifted down by\n", + " 1.\n", + "\n", + " - $O(n)$ time\n", + "\n", + "- Run time for `delete` method\n", + "\n", + " - If you remove the first element, all other elements must be\n", + " shifted up by one.\n", + "\n", + " - $O(n)$ time\n", + "\n", + "## Stack\n", + "\n", + "- A stack contains zero or more items\n", + "\n", + " - Items are added at the top of the stack, called *pushing*\n", + "\n", + " - Items are removed from the top of the stack, called *popping*\n", + "\n", + "- The first item added to the stack is the last item removed\n", + "\n", + " - We call this “last-in-first-out” (LIFO) behavior\n", + "\n", + "- 2 minutes: is it faster to use the front or back of a Python list to\n", + " implement a stack? 
What is the Big-O for stack operations under each\n", + " choice?\n", + "\n", + "## Queue\n", + "\n", + "- A queue contains zero or more items\n", + "\n", + " - Items are added at the rear of the queue, called *enqueuing*\n", + "\n", + " - Items are removed from the front of the queue, called *dequeuing*\n", + "\n", + "- Items come out of the queue in the order they were added\n", + "\n", + " - We call this “first-in-first-out” (FIFO) behavior\n", + "\n", + "- 2 minutes: is it faster to use the front or back of a Python list to\n", + " implement a queue? What is the Big-O for queue operations under each\n", + " choice?\n", + "\n", + "# Searching\n", + "\n", + "## Motivating Example\n", + "\n", + "- You want to develop an ML method to search through a video to figure\n", + " out when a bike is stolen.\n", + "\n", + "- You could start from the beginning of your video feed and run your\n", + " ML method on each frame until the bike is not in the frame.\n", + "\n", + " - This would take $O(n)$, probably a long time since you’re using\n", + " ML\n", + "\n", + "- What if we started halfway through? If the bike was there, then\n", + " break the remaining video in half and check again. If the bike\n", + " wasn’t there, then break the previous part of the video in half and\n", + " check again.\n", + "\n", + " - This is *binary search*\n", + "\n", + "## Binary Versus Linear Search\n", + "\n", + "- How many steps does binary searching through 100 numbers take?\n", + " 10,000?\n", + "\n", + " - We can generalize this as $O(\\log n)$\n", + "\n", + "- What is the big-O of linear searching through 100 numbers? 
10,000?\n", + "\n", + " - $O(n)$\n", + "\n", + "- Notice binary search requires the list to be sorted in advance.\n", + "\n", + " - We implicitly assumed this in the bike theft example (time is\n", + " “sorted”)\n", + "\n", + "# Sorting\n", + "\n", + "## Selection Sort\n", + "\n", + "- Suppose you want to sort prices of all fruits at a supermarket from\n", + " lowest to highest\n", + "\n", + "- You go through the list, find the item with the lowest price then\n", + " place it on top, then find the second lowest price and place it\n", + " second, etc.\n", + "\n", + " - You will end up with a sorted list!\n", + "\n", + "- To find the lowest price, you need to traverse the entire list. You\n", + " must do this $n$ times until there are no more items.\n", + "\n", + " - This takes $O(n^2)$ time\n", + "\n", + "## Insertion Sort\n", + "\n", + "- Compare the current item to its predecessor. If the item is smaller\n", + " than its predecessor, compare it to the items before. Move the\n", + " greater items one position up to make space for the swapped item.\n", + "\n", + "- You need to traverse the list once for each item in the list, so the\n", + " Big-O is $O(n^2)$.\n", + "\n", + "![](./images/insertion_sort.png)\n", + "\n", + "## Live Coding:\n", + "\n", + "The *h-index* is defined by Wikipedia as the maximum value of $h$ such\n", + "that the given researcher has published at least $h$ papers that have\n", + "each been cited at least $h$ times.\n", + "\n", + "You are given a list of integers representing a researcher's publications. Each index is their\n", + "$i$th publication and the value at the $i$th index is its number of\n", + "citations. 
Calculate the h-index of that researcher.\n", + "\n", + "### Example\n", + "\n", + "[1] From https://www.teach.cs.toronto.edu/~csc148h/winter/notes/" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "fd88bcec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# INPUT\n", + "lst = [2,2,5,6]\n", + "\n", + "def h_index(l):\n", + " count = 0\n", + " for i in reversed(range(1, len(l)+1)):\n", + " for citation in l:\n", + " if citation >= i:\n", + " count+=1\n", + " if count == i:\n", + " return i\n", + " count = 0\n", + " return 0\n", + "\n", + "# OUTPUT\n", + "# 2\n", + "# O(n^2)\n", + "h_index(lst)" + ] + }, + { + "cell_type": "markdown", + "id": "7c3265c3-d522-4c73-8f61-8ac240af5395", + "metadata": {}, + "source": [ + "# Hash map, hash table (Python dictionary), hash functions\n", + "\n", + "## Motivating Example\n", + "\n", + "- Recall from the first lecture that searching in a Python set took\n", + " (basically) 0 seconds\n", + "\n", + " - How was this achieved?\n", + "\n", + " - Binary search only has $O(\\text{log}n)$ time, so there must be\n", + " something else\n", + "\n", + "- To achieve $O(1)$ time, we need something that immediately knows the\n", + " where/what the item is.\n", + "\n", + " - This is the purpose of *hash functions*\n", + "\n", + "## Hash Functions\n", + "\n", + "- A hash function is a function where you enter a string and it\n", + " returns an integer\n", + "\n", + " - Python objects have hash" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "92fbc341", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-5240100812393730627" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hash(\"DS 4 Life\")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "0c95c190", + "metadata": {}, 
+ "outputs": [], + "source": [ + "l = [[], [], [], [], [], [], [], [], [], []]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "50d05b14", + "metadata": {}, + "outputs": [], + "source": [ + "def my_hash_function(s):\n", + " return hash(s)%10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52cbeb8e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ae9468fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_hash_function(\"eggs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "5c624de3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_hash_function(\"tomato\")" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "3cfee8b9", + "metadata": {}, + "outputs": [], + "source": [ + "l[my_hash_function(\"bananas\")].append(1.44)\n", + "l[my_hash_function(\"eggs\")].append((2.50, 'eggs'))\n", + "l[my_hash_function(\"tomato\")].append((3.50, 'tomato'))" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "721a3188", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[], [], [(2.5, 'eggs'), (3.5, 'tomato')], [1.44], [], [], [], [], [], []]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "985b79c3", + "metadata": {}, + "outputs": [], + "source": [ + "l[my_hash_function(\"eggs\")] = 3.50\n", + "l[my_hash_function(\"bananas\")] = 1.44\n", + "# l[] = 3.00" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "28f73f63", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "1.44" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l[my_hash_function(\"bananas\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ddb272da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.5" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l[my_hash_function(\"eggs\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d544ac9f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b4e456", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5e90a472-56d3-43ec-ad80-c348ccf52c6d", + "metadata": {}, + "source": [ + "- There are two requirements for a hash function\n", + "\n", + " - It needs to be consistent. For instance, if you enter “UofT” and\n", + " get “1827”, then every time you enter “UofT” you should get\n", + " “1827”\n", + "\n", + " - It maps different words to different numbers. Each string has a\n", + " unique hash.\n", + "\n", + "## Using Hash Functions: Example\n", + "\n", + "- Suppose you have a grocery store catalog with prices and barcodes.\n", + " When you scan an item at checkout, you want it to instantly return\n", + " the price.\n", + "\n", + "- You can put each barcode into a hash function.\n", + "\n", + " - Let’s say barcode “1234” *hashes* to “1” and “9876” hashes to “2”\n", + " \n", + "\n", + " - We store the price of item “1234” at address “1”. 
Store the\n", + " price of item “9876” at address “2”\n", + "\n", + " - We say “1” is the *hash value* of barcode “1234”\n", + "\n", + "- If there are 8 items sold at the store, then the hash function will\n", + " only return integers from 1 to 8\n", + "\n", + " - The size of the hash table is often referred to as its number of\n", + " bins or slots.\n", + "\n", + " - Thus, the hash function depends on the array\n", + "\n", + "- This implementation is called a *hash table*\n", + "\n", + " - The hash table is basically a list of lists, and the hash\n", + " function maps an object to its location in the outer list.\n", + "\n", + "## Python’s Hash Tables: `dict`\n", + "\n", + "- You will likely never implement a hash table yourself; most\n", + " languages have an implementation of hash tables.\n", + "\n", + " - In Python, this is the `dict` class\n", + "\n", + "- Dictionaries have keys and values (barcodes and prices)\n", + "\n", + "- Dictionaries have really good performance. Searching for, inserting, or deleting\n", + " an item are all $O(1)$ in the average case.\n", + "\n", + " - Average case assumes you have a “good” hash function that avoids\n", + " *collisions*. 
You can read more about collisions in the\n", + " textbooks.\n", + "\n", + " - The worst case of Python dictionaries for search, insert, and\n", + " delete is $O(n)$.\n", + "\n", + "- Recall Python dictionaries don’t allow duplicate keys; that is\n", + " because hashes must be unique!\n", + "\n", + "## Python `set`\n", + "\n", + "- Recall during the first lecture, we showcased that Python’s set\n", + " search was much faster than list search\n", + "\n", + "- This is because Python’s set uses a hash table to store its\n", + " values\n", + "\n", + "# Recommended Problems and References\n", + "\n", + "## Recommended Problems\n", + "\n", + "- Bhargava: Chapter 5\n", + "\n", + " - 5.1 to 5.4\n", + "\n", + " - Read pages 79 to 86 on the use cases of hash functions\n", + "\n", + "- Additional\n", + "\n", + " - Give examples of 2 situations to use a queue and 2 situations to\n", + " use a stack\n", + "\n", + " - In Python, code a `stack` class with `is_empty`, `push`, and\n", + " `pop` methods using the end of a Python list as the top of the\n", + " stack. Bonus: Compare the run time of using the start of the\n", + " list versus the end of the list as the top of the stack using\n", + " the `timeit` library!\n", + "\n", + " - In Python, code a `binary_search` function.\n", + "\n", + " - In Python, code a `hash_table` that can hash 4 values.\n", + "\n", + "## References\n", + "\n", + "- Bhargava, A. Y. (2016). *Grokking algorithms: An illustrated guide\n", + " for programmers and other curious people.* Manning. Chapter 5.\n", + "\n", + "- Cormen, T. H. (Ed.). (2009). *Introduction to algorithms* (3rd ed.).\n", + " MIT Press. Chapters 2, 10, 11.\n", + "\n", + "- Horton, D., & Liu, D. (2023, November 19). 
*CSC148 Lecture Notes*.\n", + " https://www.teach.cs.toronto.edu/~csc148h/winter/notes/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bbf25f6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/04_cohort_three/live_code/Lecture_1_Review.ipynb b/04_cohort_three/live_code/Lecture_1_Review.ipynb new file mode 100644 index 0000000..9dc2d21 --- /dev/null +++ b/04_cohort_three/live_code/Lecture_1_Review.ipynb @@ -0,0 +1,556 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0a369d6d", + "metadata": { + "id": "0a369d6d" + }, + "outputs": [], + "source": [ + "# Agenda items\n", + "# - Start recording\n", + "# - Review of yesterday's material" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "271e942f-dd86-462f-868a-645c2ba541a3", + "metadata": { + "id": "271e942f-dd86-462f-868a-645c2ba541a3" + }, + "outputs": [], + "source": [ + "import timeit\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "070f8a79-9dbd-4767-acdc-76d9460eb3a9", + "metadata": { + "id": "070f8a79-9dbd-4767-acdc-76d9460eb3a9" + }, + "outputs": [], + "source": [ + "small_list = list(range(10)) # 10\n", + "medium_list = list(range(10**2)) # 100\n", + "big_list = list(range(10**4)) # 10000\n", + "\n", + "random.shuffle(small_list)\n", + "random.shuffle(medium_list)\n", + "random.shuffle(big_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "076ecea7-16db-4e09-b29b-65782a3ac88a", + "metadata": { + "id": 
"076ecea7-16db-4e09-b29b-65782a3ac88a", + "outputId": "a30fa6de-4264-4347-ed03-a497d32a39fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to run on small list: 0.000077\n", + "Time to run on medium list: 0.000077 which is 1.00 times the small list\n", + "Time to run on big list: 0.000077 which is 1.0050 times the medium list, and 1.0050 times the small list\n" + ] + } + ], + "source": [ + "def get_first_element(lst):\n", + " return lst[0]\n", + "\n", + "# O(1)\n", + "\n", + "a = timeit.timeit(\"get_first_element(small_list)\", number=1000, globals=globals())\n", + "b = timeit.timeit(\"get_first_element(medium_list)\", number=1000, globals=globals())\n", + "c = timeit.timeit(\"get_first_element(big_list)\", number=1000, globals=globals())\n", + "print(\"Time to run on small list: {0:.6f}\".format(a))\n", + "print(\"Time to run on medium list: {0:.6f} which is {1:.2f} times the small list\".format(b, b/a))\n", + "print(\"Time to run on big list: {0:.6f} which is {1:.4f} times the medium list, and {1:.4f} times the small list\".format(c, c/b, c/a))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9e9be818-5183-41fb-ae19-e3c51fe325a9", + "metadata": { + "id": "9e9be818-5183-41fb-ae19-e3c51fe325a9", + "outputId": "3f1ca000-76e9-48bc-ad3f-ca8fe1c95489" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to run on small list: 0.000970\n", + "Time to run on medium list: 0.001394 which is 1.44 times the small list\n", + "Time to run on big list: 0.001110 which is 0.7964 times the medium list, and 0.7964 times the small list\n" + ] + } + ], + "source": [ + "def add_first_10_numbers(lst):\n", + " total = lst[0]\n", + " total = total + lst[1]\n", + " total = total + lst[2]\n", + " total = total + lst[3]\n", + " total = total + lst[4]\n", + " total = total + lst[5]\n", + " total = total + lst[6]\n", + " total = total + lst[7]\n", + " total = total + lst[8]\n", + " total = 
total + lst[9]\n", + " return total\n", + "\n", + "#O(1)\n", + "\n", + "a = timeit.timeit(\"add_first_10_numbers(small_list)\", number=1000, globals=globals())\n", + "b = timeit.timeit(\"add_first_10_numbers(medium_list)\", number=1000, globals=globals())\n", + "c = timeit.timeit(\"add_first_10_numbers(big_list)\", number=1000, globals=globals())\n", + "print(\"Time to run on small list: {0:.6f}\".format(a))\n", + "print(\"Time to run on medium list: {0:.6f} which is {1:.2f} times the small list\".format(b, b/a))\n", + "print(\"Time to run on big list: {0:.6f} which is {1:.4f} times the medium list, and {1:.4f} times the small list\".format(c, c/b, c/a))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "75e09cc5-3da4-4021-85bb-8141643547cb", + "metadata": { + "id": "75e09cc5-3da4-4021-85bb-8141643547cb", + "outputId": "284cd437-fc41-4806-cf24-d030c00c6702" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to run on small list: 0.000373\n", + "Time to run on medium list: 0.004840 which is 12.98 times the small list\n", + "Time to run on big list: 0.40833889699933934 which is 84.37484505461018 times the medium list, and 1095.1534021611 times the small list\n" + ] + } + ], + "source": [ + "def get_sum(lst):\n", + " total = 0\n", + " for item in lst:\n", + " total = total + item\n", + " return total\n", + "\n", + "#O(n)\n", + "\n", + "a = timeit.timeit(\"get_sum(small_list)\", number=1000, globals=globals())\n", + "b = timeit.timeit(\"get_sum(medium_list)\", number=1000, globals=globals())\n", + "c = timeit.timeit(\"get_sum(big_list)\", number=1000, globals=globals())\n", + "print(\"Time to run on small list: {0:.6f}\".format(a))\n", + "print(\"Time to run on medium list: {0:.6f} which is {1:.2f} times the small list\".format(b, b/a))\n", + "print(\"Time to run on big list: {} which is {} times the medium list, and {} times the small list\".format(c, c/b, c/a))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 12, + "id": "78648b8b-a76a-41de-893b-df5745e54542", + "metadata": { + "id": "78648b8b-a76a-41de-893b-df5745e54542", + "outputId": "5656f6f2-3e08-4d87-c8af-d36befde30e9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to run on small list: 0.001023\n", + "Time to run on medium list: 0.005936 which is 5.80 times the small list\n", + "Time to run on big list: 0.6699908189993948 which is 112.85975703108316 times the medium list, and 654.8257101188873 times the small list\n" + ] + } + ], + "source": [ + "def search(arr, target):\n", + " for i in range(len(arr)):\n", + " if arr[i] == target:\n", + " return i\n", + " return -1\n", + "\n", + "# O(n)\n", + "\n", + "a = timeit.timeit(\"search(small_list, random.random() * len(small_list))\", number=1000, globals=globals())\n", + "b = timeit.timeit(\"search(medium_list, random.random() * len(medium_list))\", number=1000, globals=globals())\n", + "c = timeit.timeit(\"search(big_list, random.random() * len(big_list))\", number=1000, globals=globals())\n", + "print(\"Time to run on small list: {0:.6f}\".format(a))\n", + "print(\"Time to run on medium list: {0:.6f} which is {1:.2f} times the small list\".format(b, b/a))\n", + "print(\"Time to run on big list: {} which is {} times the medium list, and {} times the small list\".format(c, c/b, c/a))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "efb2b577-5beb-46b9-bf39-4827cd0ad4fc", + "metadata": { + "id": "efb2b577-5beb-46b9-bf39-4827cd0ad4fc", + "outputId": "b47ab4c0-909f-44fb-8198-315f7642d116" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to run on small list: 0.000059\n", + "Time to run on medium list: 0.006227 which is 105.53 times the small list\n", + "Time to run on big list: 46.575065183999996 which is 7480.078076005227 times the medium list, and 789394.5048546676 times the small list\n" + ] + } + ], + "source": [ + "def bubble_sort(arr):\n", + 
" n = len(arr)\n", + " for i in range(n):\n", + " for j in range(0, n-i-1):\n", + " if arr[j] > arr[j+1]:\n", + " arr[j], arr[j+1] = arr[j+1], arr[j]\n", + " return arr\n", + "\n", + "# O(n^2)\n", + "\n", + "a = timeit.timeit(\"bubble_sort(small_list)\", number=10, globals=globals())\n", + "b = timeit.timeit(\"bubble_sort(medium_list)\", number=10, globals=globals())\n", + "c = timeit.timeit(\"bubble_sort(big_list)\", number=10, globals=globals())\n", + "print(\"Time to run on small list: {0:.6f}\".format(a))\n", + "print(\"Time to run on medium list: {0:.6f} which is {1:.2f} times the small list\".format(b, b/a))\n", + "print(\"Time to run on big list: {} which is {} times the medium list, and {} times the small list\".format(c, c/b, c/a))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "998b5ee5-30bc-44e0-b10a-778ee80b23ef", + "metadata": { + "id": "998b5ee5-30bc-44e0-b10a-778ee80b23ef", + "outputId": "df3646cc-9c39-45db-a53c-c215a27db9a8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to run on small list: 0.000094\n", + "Time to run on medium list: 0.000148 which is 1.58 times the small list\n", + "Time to run on big list: 0.0009163860000001023 which is 6.189288126997257 times the medium list, and 9.754390821019161 times the small list\n" + ] + } + ], + "source": [ + "def binary_search(arr, target):\n", + " low, high = 0, len(arr) - 1\n", + " while low <= high:\n", + " mid = (low + high) // 2\n", + " if arr[mid] == target:\n", + " return mid\n", + " elif arr[mid] < target:\n", + " low = mid + 1\n", + " else:\n", + " high = mid - 1\n", + " return -1\n", + "\n", + "# O(log n)\n", + "\n", + "small_list = list(range(10))\n", + "medium_list = list(range(100))\n", + "big_list = list(range(10000))\n", + "\n", + "\n", + "a = timeit.timeit(\"binary_search(small_list, random.random() * len(small_list))\", number=100, globals=globals())\n", + "b = timeit.timeit(\"binary_search(medium_list, random.random() * 
len(medium_list))\", number=100, globals=globals())\n", + "c = timeit.timeit(\"binary_search(big_list, random.random() * len(big_list))\", number=100, globals=globals())\n", + "print(\"Time to run on small list: {0:.6f}\".format(a))\n", + "print(\"Time to run on medium list: {0:.6f} which is {1:.2f} times the small list\".format(b, b/a))\n", + "print(\"Time to run on big list: {} which is {} times the medium list, and {} times the small list\".format(c, c/b, c/a))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43dcdc0c-d2f0-445e-a1c2-d07d4daec3fb", + "metadata": { + "id": "43dcdc0c-d2f0-445e-a1c2-d07d4daec3fb" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23c31601-55a6-4726-8872-3e4d5fe15467", + "metadata": { + "id": "23c31601-55a6-4726-8872-3e4d5fe15467" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58404016-d75b-4a99-b1fb-69a64b4ee031", + "metadata": { + "id": "58404016-d75b-4a99-b1fb-69a64b4ee031" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f00e26-baa4-411a-a0ff-5278a67f9118", + "metadata": { + "id": "d6f00e26-baa4-411a-a0ff-5278a67f9118" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23ed9f6d", + "metadata": { + "id": "23ed9f6d" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6926d08", + "metadata": { + "id": "e6926d08" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ed592cd", + "metadata": { + "id": "1ed592cd" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3155958", + "metadata": { + "id": "f3155958" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "023d9e5a", + "metadata": 
{ + "id": "023d9e5a" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a254e84", + "metadata": { + "id": "7a254e84" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "297036cf", + "metadata": { + "id": "297036cf" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05efedaa", + "metadata": { + "id": "05efedaa" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a71a945b", + "metadata": { + "id": "a71a945b" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65e5fb7d", + "metadata": { + "id": "65e5fb7d" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3f1ac4f", + "metadata": { + "id": "d3f1ac4f" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e541e5a", + "metadata": { + "id": "4e541e5a" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a9f8ad", + "metadata": { + "id": "d5a9f8ad" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3508b806", + "metadata": { + "id": "3508b806" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83aa3ece", + "metadata": { + "id": "83aa3ece" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "308fb0aa", + "metadata": { + "id": "308fb0aa" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e85d8c27-caa4-4f27-8cc9-97fdeeed943d", + "metadata": { + "id": "e85d8c27-caa4-4f27-8cc9-97fdeeed943d" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"06ce3a3a-3502-4b3b-b872-ae3682c8e5f9", + "metadata": { + "id": "06ce3a3a-3502-4b3b-b872-ae3682c8e5f9" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce9cdf93-5739-4a9d-9975-41c6f35469dd", + "metadata": { + "id": "ce9cdf93-5739-4a9d-9975-41c6f35469dd" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ba87069-2fad-48c2-9cb6-9aca643e843b", + "metadata": { + "id": "9ba87069-2fad-48c2-9cb6-9aca643e843b" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d149d260", + "metadata": { + "id": "d149d260" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}