Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alternate cost functions january 2 2025 #5

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions play/exploring_alternativces.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Timing Tests for Basic Functions\n",
"by: Octopus\n",
"\n",
"Experiments to see whether alternate versions of specific functions offer any performance improvement. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import random\n",
"import math\n",
"import time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Function to Generate Random Examples of Logits and Samples"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def generate_samples(num_logits, num_samples):\n",
" \"\"\"\n",
" Function to generate random samples of logits and samples. \n",
" \"\"\"\n",
"\n",
" # Generate random log contributions for items\n",
" logits = [random.uniform(-3, 3) for _ in range(num_logits)] # Random values between -3 and 3\n",
"\n",
" # Generate samples (a, b, c) triples\n",
" samples = []\n",
" for _ in range(num_samples):\n",
" a, b = random.sample(range(num_logits), 2) # Pick two different indices\n",
" c = random.uniform(-3,3) # Give reviewers opinions\n",
" samples.append((a, b, c))\n",
"\n",
" return logits, samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Different Versions of the Cost Functions\n",
"\n",
"We compare two different versions of the cost function. The second one unpacks the tuples into numpy arrays. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def original_cost_function(logits, samples):\n",
" # The original version, using one comprehension with appeal to python math.\n",
" return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples)\n",
"\n",
" \n",
"def alternate_cost_function(logits,samples):\n",
" # A potential modification, which pays the cost of three comprehensions\n",
" # in exchange for being able to use numpy operations \n",
" logits_a = np.array([logits[a] for a, _, _ in samples])\n",
" logits_b = np.array([logits[b] for _,b,_ in samples])\n",
" c_elements = np.array([c for _,_,c in samples])\n",
" modified_cost = np.square(np.sum(logits_b - logits_a - c_elements))\n",
" return modified_cost"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comparing the two versions of the cost function\n",
"\n",
"Below are helper functions to compare the performance (time) and accuracy of the two versions of the function. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Performance profiler\n",
"def performance_profiler(num_items, num_samples):\n",
" # Generate sample logits and samples\n",
" logits, samples = generate_samples(num_items, num_samples)\n",
"\n",
" # Profile original cost function\n",
" start_time = time.time()\n",
" original_cost = original_cost_function(logits, samples)\n",
" original_duration = time.time() - start_time\n",
"\n",
" # Profile modified cost function\n",
" start_time = time.time()\n",
" modified_cost = modified_cost_function(logits, samples)\n",
" modified_duration = time.time() - start_time\n",
"\n",
" # Print results\n",
" print(f\"Original Cost: {original_cost}, Time: {original_duration:.6f} seconds\")\n",
" print(f\"Modified Cost: {modified_cost}, Time: {modified_duration:.6f} seconds\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Accuracy test function\n",
"def accuracy_test(num_items, num_samples):\n",
" # Generate sample logits and samples\n",
" logits, samples = generate_samples(num_items, num_samples)\n",
"\n",
" # Calculate costs using both functions\n",
" original_cost = original_cost_function(logits, samples)\n",
" modified_cost = modified_cost_function(logits, samples)\n",
"\n",
" # Check if the results are the same\n",
" if original_cost == modified_cost:\n",
" print(\"Accuracy Test Passed: Both functions give the same result.\")\n",
" else:\n",
" print(\"Accuracy Test Failed: Results differ.\")\n",
" print(f\"Original Cost: {original_cost}, Modified Cost: {modified_cost}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[6], line 38\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# Define a list of (num_items, num_samples) tuples for different input sizes\u001b[39;00m\n\u001b[0;32m 37\u001b[0m input_sizes \u001b[38;5;241m=\u001b[39m [(\u001b[38;5;241m1000\u001b[39m, \u001b[38;5;241m10\u001b[39m), (\u001b[38;5;241m5000\u001b[39m, \u001b[38;5;241m50\u001b[39m), (\u001b[38;5;241m100000\u001b[39m, \u001b[38;5;241m1000\u001b[39m), (\u001b[38;5;241m2000000\u001b[39m, \u001b[38;5;241m2000\u001b[39m), (\u001b[38;5;241m500000000\u001b[39m, \u001b[38;5;241m5000\u001b[39m)]\n\u001b[1;32m---> 38\u001b[0m \u001b[43mperformance_profiler_multiple_sizes\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_sizes\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[1;32mIn[6], line 9\u001b[0m, in \u001b[0;36mperformance_profiler_multiple_sizes\u001b[1;34m(sizes)\u001b[0m\n\u001b[0;32m 5\u001b[0m modified_times \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m*\u001b[39m num_sizes \u001b[38;5;66;03m# Preallocate list for modified times\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, (num_items, num_samples) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(sizes):\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Generate sample logits and samples\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m logits, samples \u001b[38;5;241m=\u001b[39m \u001b[43mgenerate_samples\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnum_items\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_samples\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;66;03m# Profile original cost function\u001b[39;00m\n\u001b[0;32m 12\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n",
"Cell \u001b[1;32mIn[2], line 7\u001b[0m, in \u001b[0;36mgenerate_samples\u001b[1;34m(num_logits, num_samples)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03mFunction to generate random samples of logits and samples. \u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# Generate random log contributions for items\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m logits \u001b[38;5;241m=\u001b[39m [random\u001b[38;5;241m.\u001b[39muniform(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m3\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_logits)] \u001b[38;5;66;03m# Random values between 0 and 3\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# Generate samples (a, b, c) triples\u001b[39;00m\n\u001b[0;32m 10\u001b[0m samples \u001b[38;5;241m=\u001b[39m []\n",
"Cell \u001b[1;32mIn[2], line 7\u001b[0m, in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03mFunction to generate random samples of logits and samples. \u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# Generate random log contributions for items\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m logits \u001b[38;5;241m=\u001b[39m [\u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muniform\u001b[49m(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m3\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_logits)] \u001b[38;5;66;03m# Random values between 0 and 3\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# Generate samples (a, b, c) triples\u001b[39;00m\n\u001b[0;32m 10\u001b[0m samples \u001b[38;5;241m=\u001b[39m []\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"\n",
"# Performance profiler for multiple input sizes\n",
"def performance_profiler_multiple_sizes(sizes):\n",
" num_sizes = len(sizes)\n",
" original_times = [0] * num_sizes # Preallocate list for original times\n",
" modified_times = [0] * num_sizes # Preallocate list for modified times\n",
"\n",
" for index, (num_items, num_samples) in enumerate(sizes):\n",
" # Generate sample logits and samples\n",
" logits, samples = generate_samples(num_items, num_samples)\n",
"\n",
" # Profile original cost function\n",
" start_time = time.time()\n",
" original_cost_function(logits, samples)\n",
" original_duration = time.time() - start_time\n",
" original_times[index] = original_duration # Store time in preallocated list\n",
"\n",
" # Profile modified cost function\n",
" start_time = time.time()\n",
" modified_cost_function(logits, samples)\n",
" modified_duration = time.time() - start_time\n",
" modified_times[index] = modified_duration # Store time in preallocated list\n",
"\n",
" # Plotting the results\n",
" plt.figure(figsize=(10, 6))\n",
" plt.plot([size[0] for size in sizes], original_times, label='Original Cost Function', marker='o')\n",
" plt.plot([size[0] for size in sizes], modified_times, label='Modified Cost Function', marker='x')\n",
" plt.title('Performance Comparison of Cost Functions')\n",
" plt.xlabel('Number of Items')\n",
" plt.ylabel('Time (seconds)')\n",
" plt.legend()\n",
" plt.grid()\n",
" plt.show()\n",
"\n",
"# Example usage\n",
"if __name__ == \"__main__\":\n",
" # Define a list of (num_items, num_samples) tuples for different input sizes\n",
" input_sizes = [(1000, 10), (5000, 50), (10_000, 1000), (20_000, 2000)]\n",
" performance_profiler_multiple_sizes(input_sizes)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
118 changes: 65 additions & 53 deletions scoring.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,65 @@
import random
import math
import numpy as np
from scipy.optimize import minimize

# Cost function
#
# The expected input is:
#
# 1. A proposed full list of log(contributions) for all items. For example, if
# logits = [0, 0, 1, 3], this means that items 0 and 1 provided the same
# value, item 2 provided e (~2.718) times more, and item 3 provided e^3 times
# more than item 0 or 1.
# 2. A list of (a, b, c) triples from jurors, where a and b are indices, and
# c is the log of the juror's opinion of how much more value item b provided
# than item a. If item a is more valuable, then c should be negative.

def cost_function(logits, samples):
    """Return the total squared error of ``logits`` against juror ``samples``.

    Each sample is an ``(a, b, c)`` triple. Its error is the gap between the
    proposed log-ratio ``logits[b] - logits[a]`` and the juror's opinion
    ``c``; the squared errors of all samples are added together.
    """
    squared_errors = (
        (logits[second] - logits[first] - opinion) ** 2
        for first, second, opinion in samples
    )
    return sum(squared_errors)

# Optimization to find best vector of weights on the input logits. For example,
# if the input logits contains three lists and the output is [0.5, 0.5, 0], then
# this means that the optimum is to take a 50/50 average of the first two lists.
#
# Inputs are (i) the logits themselves, and (ii) the juror samples, in the same
# format as in cost_function

def find_optimal_weights(logits_lists, samples):
    """Find the weighting of the logit lists that minimises the sample cost.

    Args:
        logits_lists: the candidate logit lists. Every list is indexed over
            the length of the first one.
        samples: juror ``(a, b, c)`` triples, in the format ``cost_function``
            expects.

    Returns:
        The ``x`` attribute of the SciPy optimisation result: one weight per
        input list, each bounded to ``[0, 1]`` and constrained to sum to 1.
        The optimiser's success flag is not inspected.
    """
    list_count = len(logits_lists)

    def blended_cost(weights):
        # Blend the candidate lists item by item, then score the blend.
        blended = []
        for item in range(len(logits_lists[0])):
            blended.append(
                sum(w * L[item] for w, L in zip(weights, logits_lists))
            )
        return cost_function(blended, samples)

    def weights_sum_minus_one(weights):
        # Equality constraint: zero exactly when the weights sum to 1.
        return sum(weights) - 1

    return minimize(
        blended_cost,
        [1 / list_count] * list_count,  # initial guess: equal weights
        bounds=[(0, 1)] * list_count,
        constraints={'type': 'eq', 'fun': weights_sum_minus_one},
    ).x
import random
import math
import numpy as np
from scipy.optimize import minimize

# Cost function
#
# The expected input is:
#
# 1. A proposed full list of log(contributions) for all items. For example, if
# logits = [0, 0, 1, 3], this means that items 0 and 1 provided the same
# value, item 2 provided e (~2.718) times more, and item 3 provided e^3 times
# more than item 0 or 1.
# 2. A list of (a, b, c) triples from jurors, where a and b are indices, and
# c is the log of the juror's opinion of how much more value item b provided
# than item a. If item a is more valuable, then c should be negative.

def cost_function(logits, samples):
    """Sum the squared disagreement between ``logits`` and the juror samples.

    For every ``(a, b, c)`` triple the disagreement is
    ``logits[b] - logits[a] - c``: the proposed log-ratio of item ``b`` over
    item ``a`` minus the juror's opinion ``c`` of that log-ratio.
    """
    def squared_residual(sample):
        a, b, c = sample
        return (logits[b] - logits[a] - c) ** 2

    return sum(map(squared_residual, samples))

# Alternate version of the cost function, built on NumPy.
# Intended to be faster when the number of samples is large.
def alternate_cost_function(logits, samples):
    """Compute the same cost as ``cost_function`` using NumPy operations.

    Args:
        logits: proposed log-contributions, indexable by item index.
        samples: iterable of ``(a, b, c)`` triples, where ``a`` and ``b`` are
            item indices and ``c`` is the juror's log-ratio opinion.

    Returns:
        The sum over all samples of ``(logits[b] - logits[a] - c) ** 2`` as a
        NumPy scalar (``0.0`` when ``samples`` is empty).
    """
    # Pays the cost of three comprehensions (one per column of the triples)
    # in exchange for doing the arithmetic with NumPy array operations.
    logits_a = np.array([logits[a] for a, _, _ in samples])
    logits_b = np.array([logits[b] for _, b, _ in samples])
    c_elements = np.array([c for _, _, c in samples])
    residuals = logits_b - logits_a - c_elements
    # Square each residual BEFORE summing. Squaring the summed residuals
    # would let positive and negative errors cancel and would not match
    # cost_function.
    return np.sum(np.square(residuals))


# Optimization to find best vector of weights on the input logits. For example,
# if the input logits contains three lists and the output is [0.5, 0.5, 0], then
# this means that the optimum is to take a 50/50 average of the first two lists.
#
# Inputs are (i) the logits themselves, and (ii) the juror samples, in the same
# format as in cost_function

def find_optimal_weights(logits_lists, samples):
    """Optimise the mixing weights applied to several candidate logit lists.

    The weighted combination of the lists is scored with ``cost_function``
    against the juror samples, and SciPy's ``minimize`` searches for the
    weights giving the lowest score.

    Args:
        logits_lists: the candidate logit lists; each is indexed over the
            length of the first list.
        samples: juror ``(a, b, c)`` triples, same format as for
            ``cost_function``.

    Returns:
        ``result.x`` from the optimiser: one weight per list. For example,
        ``[0.5, 0.5, 0]`` means a 50/50 average of the first two lists.
        Whether the optimiser reported success is not checked.
    """
    n_lists = len(logits_lists)

    def split_cost(weights):
        n_items = len(logits_lists[0])
        combined_logits = [
            sum(weight * candidate[i]
                for weight, candidate in zip(weights, logits_lists))
            for i in range(n_items)
        ]
        return cost_function(combined_logits, samples)

    # Start from equal weights.
    equal_share = 1 / n_lists
    starting_point = [equal_share] * n_lists

    # Weights must sum to 1 ...
    sum_to_one = {'type': 'eq', 'fun': lambda w: sum(w) - 1}
    # ... and each must stay between 0 and 1.
    unit_interval = [(0, 1)] * n_lists

    solution = minimize(
        split_cost,
        starting_point,
        bounds=unit_interval,
        constraints=sum_to_one,
    )
    return solution.x