deepfunding · andrew-octopus · Jan 2, 2025 · Jan 2, 2025 · Jan 3, 2025
diff --git a/play/exploring_alternativces.ipynb b/play/exploring_alternativces.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Timing Tests for Basic Functions\n",
+    "by: Octopus\n",
+    "\n",
+    "Experiments to see whether alternate versions of specific functions offer any performance improvement."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import random\n",
+    "import math\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Function to Generate Random Examples of Logits and Samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_samples(num_logits, num_samples):\n",
+    "    \"\"\"\n",
+    "    Function to generate random samples of logits and samples. \n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Generate random log contributions for items\n",
+    "    logits = [random.uniform(-3, 3) for _ in range(num_logits)]  # Random values between -3 and 3\n",
+    "\n",
+    "    # Generate samples (a, b, c) triples\n",
+    "    samples = []\n",
+    "    for _ in range(num_samples):\n",
+    "        a, b = random.sample(range(num_logits), 2)  # Pick two different indices\n",
+    "        c = random.uniform(-3,3)  # Give reviewers opinions\n",
+    "        samples.append((a, b, c))\n",
+    "\n",
+    "    return logits, samples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Different Versions of the Cost Functions\n",
+    "\n",
+    "We compare two different versions of the cost function. The second one unpacks the tuples into numpy arrays. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def original_cost_function(logits, samples):\n",
+    "    # The original version, using one comprehension with appeal to python math.\n",
+    "    return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples)\n",
+    "\n",
+    "   \n",
+    "def alternate_cost_function(logits, samples):\n",
+    "    # Gather the per-sample values with three comprehensions so that the\n",
+    "    # residuals logits[b] - logits[a] - c can be computed with numpy operations.\n",
+    "    logits_a = np.array([logits[a] for a, _, _ in samples])\n",
+    "    logits_b = np.array([logits[b] for _, b, _ in samples])\n",
+    "    c_elements = np.array([c for _, _, c in samples])\n",
+    "    # Square each residual BEFORE summing (sum of squares, as in original_cost_function).\n",
+    "    return np.sum(np.square(logits_b - logits_a - c_elements))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Comparing the two versions of the cost function\n",
+    "\n",
+    "Below are helper functions to compare the performance (time) and accuracy of the two versions of the function. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Performance profiler: times one call of each cost function on the same random data\n",
+    "def performance_profiler(num_items, num_samples):\n",
+    "    # Generate sample logits and samples\n",
+    "    logits, samples = generate_samples(num_items, num_samples)\n",
+    "\n",
+    "    # Profile original cost function (perf_counter is the clock meant for short durations)\n",
+    "    start_time = time.perf_counter()\n",
+    "    original_cost = original_cost_function(logits, samples)\n",
+    "    original_duration = time.perf_counter() - start_time\n",
+    "\n",
+    "    # Profile the numpy version, which is defined above as alternate_cost_function\n",
+    "    start_time = time.perf_counter()\n",
+    "    modified_cost = alternate_cost_function(logits, samples)\n",
+    "    modified_duration = time.perf_counter() - start_time\n",
+    "\n",
+    "    # Print results\n",
+    "    print(f\"Original Cost: {original_cost}, Time: {original_duration:.6f} seconds\")\n",
+    "    print(f\"Modified Cost: {modified_cost}, Time: {modified_duration:.6f} seconds\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Accuracy test function: checks that both cost functions agree on the same random data\n",
+    "def accuracy_test(num_items, num_samples):\n",
+    "    # Generate sample logits and samples\n",
+    "    logits, samples = generate_samples(num_items, num_samples)\n",
+    "\n",
+    "    # Calculate costs using both functions (the numpy version is alternate_cost_function)\n",
+    "    original_cost = original_cost_function(logits, samples)\n",
+    "    modified_cost = alternate_cost_function(logits, samples)\n",
+    "\n",
+    "    # Compare with a tolerance: the two versions add floats in different orders\n",
+    "    if math.isclose(original_cost, modified_cost, rel_tol=1e-9):\n",
+    "        print(\"Accuracy Test Passed: Both functions give the same result.\")\n",
+    "    else:\n",
+    "        print(\"Accuracy Test Failed: Results differ.\")\n",
+    "        print(f\"Original Cost: {original_cost}, Modified Cost: {modified_cost}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[6], line 38\u001b[0m\n\u001b[0;32m     35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m     36\u001b[0m     \u001b[38;5;66;03m# Define a list of (num_items, num_samples) tuples for different input sizes\u001b[39;00m\n\u001b[0;32m     37\u001b[0m     input_sizes \u001b[38;5;241m=\u001b[39m [(\u001b[38;5;241m1000\u001b[39m, \u001b[38;5;241m10\u001b[39m), (\u001b[38;5;241m5000\u001b[39m, \u001b[38;5;241m50\u001b[39m), (\u001b[38;5;241m100000\u001b[39m, \u001b[38;5;241m1000\u001b[39m), (\u001b[38;5;241m2000000\u001b[39m, \u001b[38;5;241m2000\u001b[39m), (\u001b[38;5;241m500000000\u001b[39m, \u001b[38;5;241m5000\u001b[39m)]\n\u001b[1;32m---> 38\u001b[0m     \u001b[43mperformance_profiler_multiple_sizes\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_sizes\u001b[49m\u001b[43m)\u001b[49m\n",
+      "Cell \u001b[1;32mIn[6], line 9\u001b[0m, in \u001b[0;36mperformance_profiler_multiple_sizes\u001b[1;34m(sizes)\u001b[0m\n\u001b[0;32m      5\u001b[0m modified_times \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m*\u001b[39m num_sizes  \u001b[38;5;66;03m# Preallocate list for modified times\u001b[39;00m\n\u001b[0;32m      7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, (num_items, num_samples) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(sizes):\n\u001b[0;32m      8\u001b[0m     \u001b[38;5;66;03m# Generate sample logits and samples\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m     logits, samples \u001b[38;5;241m=\u001b[39m \u001b[43mgenerate_samples\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnum_items\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_samples\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     11\u001b[0m     \u001b[38;5;66;03m# Profile original cost function\u001b[39;00m\n\u001b[0;32m     12\u001b[0m     start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n",
+      "Cell \u001b[1;32mIn[2], line 7\u001b[0m, in \u001b[0;36mgenerate_samples\u001b[1;34m(num_logits, num_samples)\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \u001b[38;5;124;03mFunction to generate random samples of logits and samples. \u001b[39;00m\n\u001b[0;32m      4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m      6\u001b[0m \u001b[38;5;66;03m# Generate random log contributions for items\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m logits \u001b[38;5;241m=\u001b[39m [random\u001b[38;5;241m.\u001b[39muniform(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m3\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_logits)]  \u001b[38;5;66;03m# Random values between 0 and 3\u001b[39;00m\n\u001b[0;32m      9\u001b[0m \u001b[38;5;66;03m# Generate samples (a, b, c) triples\u001b[39;00m\n\u001b[0;32m     10\u001b[0m samples \u001b[38;5;241m=\u001b[39m []\n",
+      "Cell \u001b[1;32mIn[2], line 7\u001b[0m, in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \u001b[38;5;124;03mFunction to generate random samples of logits and samples. \u001b[39;00m\n\u001b[0;32m      4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m      6\u001b[0m \u001b[38;5;66;03m# Generate random log contributions for items\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m logits \u001b[38;5;241m=\u001b[39m [\u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muniform\u001b[49m(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m3\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_logits)]  \u001b[38;5;66;03m# Random values between 0 and 3\u001b[39;00m\n\u001b[0;32m      9\u001b[0m \u001b[38;5;66;03m# Generate samples (a, b, c) triples\u001b[39;00m\n\u001b[0;32m     10\u001b[0m samples \u001b[38;5;241m=\u001b[39m []\n",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# Performance profiler for multiple input sizes; plots one timing per size for each function\n",
+    "def performance_profiler_multiple_sizes(sizes):\n",
+    "    num_sizes = len(sizes)\n",
+    "    original_times = [0] * num_sizes  # Preallocate list for original times\n",
+    "    modified_times = [0] * num_sizes  # Preallocate list for modified times\n",
+    "\n",
+    "    for index, (num_items, num_samples) in enumerate(sizes):\n",
+    "        # Generate sample logits and samples\n",
+    "        logits, samples = generate_samples(num_items, num_samples)\n",
+    "\n",
+    "        # Profile original cost function (perf_counter is the clock meant for short durations)\n",
+    "        start_time = time.perf_counter()\n",
+    "        original_cost_function(logits, samples)\n",
+    "        original_duration = time.perf_counter() - start_time\n",
+    "        original_times[index] = original_duration  # Store time in preallocated list\n",
+    "\n",
+    "        # Profile the numpy version, which is defined above as alternate_cost_function\n",
+    "        start_time = time.perf_counter()\n",
+    "        alternate_cost_function(logits, samples)\n",
+    "        modified_duration = time.perf_counter() - start_time\n",
+    "        modified_times[index] = modified_duration  # Store time in preallocated list\n",
+    "\n",
+    "    # Plotting the results\n",
+    "    plt.figure(figsize=(10, 6))\n",
+    "    plt.plot([size[0] for size in sizes], original_times, label='Original Cost Function', marker='o')\n",
+    "    plt.plot([size[0] for size in sizes], modified_times, label='Modified Cost Function', marker='x')\n",
+    "    plt.title('Performance Comparison of Cost Functions')\n",
+    "    plt.xlabel('Number of Items')\n",
+    "    plt.ylabel('Time (seconds)')\n",
+    "    plt.legend()\n",
+    "    plt.grid()\n",
+    "    plt.show()\n",
+    "\n",
+    "# Example usage\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Define a list of (num_items, num_samples) tuples for different input sizes\n",
+    "    input_sizes = [(1000, 10), (5000, 50), (10_000, 1000), (20_000, 2000)]\n",
+    "    performance_profiler_multiple_sizes(input_sizes)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/scoring.py b/scoring.py
@@ -1,53 +1,65 @@
-import random
-import math
-import numpy as np
-from scipy.optimize import minimize
-
-# Cost function
-#
-# The expected input is:
-#
-# 1. A proposed full list of log(contributions) for all items. For example, if
-#    logits = [0, 0, 1, 3], this means that items 0 and 1 provided the same
-#    value, item 2 provided e (~2.718) times more, and item 3 provided e^3 times
-#    more than item 0 or 1.
-# 2. A list of (a, b, c) triples from jurors, where a and b are indices, and
-#    c is the log of the juror's opinion of how much more value item b provided
-#    than item a. If item a is more valuable, then c should be negative.
-
-def cost_function(logits, samples):
-    return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples)
-
-# Optimization to find best vector of weights on the input logits. For example,
-# if the input logits contains three lists and the output is [0.5, 0.5, 0], then
-# this means that the optimum is to take a 50/50 average of the first two lists.
-# 
-# Inputs are (i) the logits themselves, and (ii) the juror samples, in the same
-# format as in cost_function
-
-def find_optimal_weights(logits_lists, samples):
-
-    def split_cost(weights):
-        combined_logits = [
-            sum(w * L[i] for w, L in zip(weights, logits_lists))
-            for i in range(len(logits_lists[0]))
-        ]
-        return cost_function(combined_logits, samples)
-
-    # Initial guess: equal weights
-    initial_weights = [1 / len(logits_lists)] * len(logits_lists)
-
-    # Constraint: weights must sum to 1
-    constraints = ({'type': 'eq', 'fun': lambda w: sum(w) - 1})
-
-    # Bounds: weights must be between 0 and 1
-    bounds = [(0, 1)] * len(logits_lists)
-
-    # Minimize the split cost
-    result = minimize(
-        split_cost,
-        initial_weights,
-        bounds=bounds,
-        constraints=constraints
-    )
-    return result.x
+import random
+import math
+import numpy as np
+from scipy.optimize import minimize
+
+# Cost function
+#
+# The expected input is:
+#
+# 1. A proposed full list of log(contributions) for all items. For example, if
+#    logits = [0, 0, 1, 3], this means that items 0 and 1 provided the same
+#    value, item 2 provided e (~2.718) times more, and item 3 provided e^3 times
+#    more than item 0 or 1.
+# 2. A list of (a, b, c) triples from jurors, where a and b are indices, and
+#    c is the log of the juror's opinion of how much more value item b provided
+#    than item a. If item a is more valuable, then c should be negative.
+# Returns the sum over all samples of (logits[b] - logits[a] - c) ** 2.
+def cost_function(logits, samples):
+    return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples)
+
+# Alternate version of cost function
+# Returns the same value as cost_function (sum of squared residuals), using numpy.
+def alternate_cost_function(logits, samples):
+    # Gather the per-sample values with three comprehensions so that the
+    # residuals logits[b] - logits[a] - c can be computed with numpy operations.
+    logits_a = np.array([logits[a] for a, _, _ in samples])
+    logits_b = np.array([logits[b] for _, b, _ in samples])
+    c_elements = np.array([c for _, _, c in samples])
+    # Square each residual BEFORE summing; squaring the sum lets residuals cancel.
+    return np.sum(np.square(logits_b - logits_a - c_elements))
+
+
+# Optimization to find best vector of weights on the input logits. For example,
+# if the input logits contains three lists and the output is [0.5, 0.5, 0], then
+# this means that the optimum is to take a 50/50 average of the first two lists.
+#
+# Inputs are (i) logits_lists, the candidate lists of logits to be combined, and
+# (ii) the juror samples, in the same format as in cost_function.
+
+def find_optimal_weights(logits_lists, samples):
+    # split_cost scores the weighted mix: entry i is sum_k weights[k] * logits_lists[k][i].
+    def split_cost(weights):
+        combined_logits = [
+            sum(w * L[i] for w, L in zip(weights, logits_lists))
+            for i in range(len(logits_lists[0]))
+        ]
+        return cost_function(combined_logits, samples)
+
+    # Initial guess: equal weights
+    initial_weights = [1 / len(logits_lists)] * len(logits_lists)
+
+    # Constraint: weights must sum to 1
+    constraints = ({'type': 'eq', 'fun': lambda w: sum(w) - 1})
+
+    # Bounds: weights must be between 0 and 1
+    bounds = [(0, 1)] * len(logits_lists)
+
+    # Minimize the split cost. NOTE: result.success is not checked before returning result.x.
+    result = minimize(
+        split_cost,
+        initial_weights,
+        bounds=bounds,
+        constraints=constraints
+    )
+    return result.x