From c4ff471c3df82dfbf5ef69b8ea7194b99ad50365 Mon Sep 17 00:00:00 2001 From: andrew-octopus <144281779+andrew-octopus@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:46:31 -0500 Subject: [PATCH 1/3] Added alternative version of scoring function. --- scoring.py | 118 +++++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/scoring.py b/scoring.py index 884e018..37eca8f 100644 --- a/scoring.py +++ b/scoring.py @@ -1,53 +1,65 @@ -import random -import math -import numpy as np -from scipy.optimize import minimize - -# Cost function -# -# The expected input is: -# -# 1. A proposed full list of log(contributions) for all items. For example, if -# logits = [0, 0, 1, 3], this means that items 0 and 1 provided the same -# value, item 2 provided e (~2.718) times more, and item 3 provided e^3 times -# more than item 0 or 1. -# 2. A list of (a, b, c) triples from jurors, where a and b are indices, and -# c is the log of the juror's opinion of how much more value item b provided -# than item a. If item a is more valuable, then c should be negative. - -def cost_function(logits, samples): - return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples) - -# Optimization to find best vector of weights on the input logits. For example, -# if the input logits contains three lists and the output is [0.5, 0.5, 0], then -# this means that the optimum is to take a 50/50 average of the first two lists. 
-# -# Inputs are (i) the logits themselves, and (ii) the juror samples, in the same -# format as in cost_function - -def find_optimal_weights(logits_lists, samples): - - def split_cost(weights): - combined_logits = [ - sum(w * L[i] for w, L in zip(weights, logits_lists)) - for i in range(len(logits_lists[0])) - ] - return cost_function(combined_logits, samples) - - # Initial guess: equal weights - initial_weights = [1 / len(logits_lists)] * len(logits_lists) - - # Constraint: weights must sum to 1 - constraints = ({'type': 'eq', 'fun': lambda w: sum(w) - 1}) - - # Bounds: weights must be between 0 and 1 - bounds = [(0, 1)] * len(logits_lists) - - # Minimize the split cost - result = minimize( - split_cost, - initial_weights, - bounds=bounds, - constraints=constraints - ) - return result.x +import random +import math +import numpy as np +from scipy.optimize import minimize + +# Cost function +# +# The expected input is: +# +# 1. A proposed full list of log(contributions) for all items. For example, if +# logits = [0, 0, 1, 3], this means that items 0 and 1 provided the same +# value, item 2 provided e (~2.718) times more, and item 3 provided e^3 times +# more than item 0 or 1. +# 2. A list of (a, b, c) triples from jurors, where a and b are indices, and +# c is the log of the juror's opinion of how much more value item b provided +# than item a. If item a is more valuable, then c should be negative. + +def cost_function(logits, samples): + return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples) + +# Alternate version of cost function +# Should be much faster when number of logits or samples is large. 
def alternate_cost_function(logits, samples):
    """Vectorized equivalent of cost_function.

    Pays the cost of three list comprehensions (to unpack the sample
    triples into arrays) in exchange for doing the arithmetic with numpy
    array operations, which should be much faster when the number of
    samples is large.

    BUG FIX: the original computed np.square(np.sum(...)) — the square of
    the SUM of residuals — instead of the sum of SQUARED residuals, so it
    did not agree with cost_function (symmetric residuals cancelled to
    zero, for example). Square each residual first, then sum.

    Args:
        logits: full list of log(contributions) for all items.
        samples: list of (a, b, c) juror triples; a and b are item
            indices, c is the juror's log-opinion of value(b)/value(a).

    Returns:
        The sum of squared residuals, identical (up to float rounding) to
        cost_function(logits, samples).
    """
    logits_a = np.array([logits[a] for a, _, _ in samples])
    logits_b = np.array([logits[b] for _, b, _ in samples])
    c_elements = np.array([c for _, _, c in samples])
    # Sum of squared residuals: matches cost_function exactly.
    return np.sum(np.square(logits_b - logits_a - c_elements))


# Optimization to find best vector of weights on the input logits. For example,
# if the input logits contains three lists and the output is [0.5, 0.5, 0], then
# this means that the optimum is to take a 50/50 average of the first two lists.
#
# Inputs are (i) the logits themselves, and (ii) the juror samples, in the same
# format as in cost_function

def find_optimal_weights(logits_lists, samples):

    def split_cost(weights):
        # Weighted element-wise combination of the candidate logit lists.
        combined_logits = [
            sum(w * L[i] for w, L in zip(weights, logits_lists))
            for i in range(len(logits_lists[0]))
        ]
        return cost_function(combined_logits, samples)

    # Initial guess: equal weights
    initial_weights = [1 / len(logits_lists)] * len(logits_lists)

    # Constraint: weights must sum to 1
    constraints = ({'type': 'eq', 'fun': lambda w: sum(w) - 1})

    # Bounds: weights must be between 0 and 1
    bounds = [(0, 1)] * len(logits_lists)

    # Minimize the split cost
    result = minimize(
        split_cost,
        initial_weights,
        bounds=bounds,
        constraints=constraints
    )
    return result.x
def generate_samples(num_logits, num_samples):
    """Generate a random (logits, samples) problem for benchmarking.

    Returns:
        logits: list of num_logits floats drawn uniformly from [-3, 3].
        samples: list of num_samples (a, b, c) triples where a != b are
            distinct item indices and c is a simulated juror log-opinion
            drawn uniformly from [-3, 3].
    """
    # FIX: the original comment claimed "Random values between 0 and 3",
    # but the code draws from [-3, 3]; the code is taken as authoritative.
    logits = [random.uniform(-3, 3) for _ in range(num_logits)]

    # Generate samples as (a, b, c) triples.
    samples = []
    for _ in range(num_samples):
        a, b = random.sample(range(num_logits), 2)  # two distinct indices
        c = random.uniform(-3, 3)  # simulated juror opinion (log-ratio)
        samples.append((a, b, c))

    return logits, samples
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def original_cost_function(logits, samples):\n", + " # The original version, using one comprehension with appeal to python math.\n", + " return sum((logits[b] - logits[a] - c) ** 2 for a, b, c in samples)\n", + "\n", + " \n", + "def alternate_cost_function(logits,samples):\n", + " # A potential modification, which pays the cost of three comprehensions\n", + " # in exchange for being able to use numpy operations \n", + " logits_a = np.array([logits[a] for a, _, _ in samples])\n", + " logits_b = np.array([logits[b] for _,b,_ in samples])\n", + " c_elements = np.array([c for _,_,c in samples])\n", + " modified_cost = np.square(np.sum(logits_b - logits_a - c_elements))\n", + " return modified_cost" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing the two versions of the cost function\n", + "\n", + "Below are helper functions to compare the performance (time) and accuracy of the two versions of the function. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Performance profiler\n", + "def performance_profiler(num_items, num_samples):\n", + " # Generate sample logits and samples\n", + " logits, samples = generate_samples(num_items, num_samples)\n", + "\n", + " # Profile original cost function\n", + " start_time = time.time()\n", + " original_cost = original_cost_function(logits, samples)\n", + " original_duration = time.time() - start_time\n", + "\n", + " # Profile modified cost function\n", + " start_time = time.time()\n", + " modified_cost = modified_cost_function(logits, samples)\n", + " modified_duration = time.time() - start_time\n", + "\n", + " # Print results\n", + " print(f\"Original Cost: {original_cost}, Time: {original_duration:.6f} seconds\")\n", + " print(f\"Modified Cost: {modified_cost}, Time: {modified_duration:.6f} seconds\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Accuracy test function\n", + "def accuracy_test(num_items, num_samples):\n", + " # Generate sample logits and samples\n", + " logits, samples = generate_samples(num_items, num_samples)\n", + "\n", + " # Calculate costs using both functions\n", + " original_cost = original_cost_function(logits, samples)\n", + " modified_cost = modified_cost_function(logits, samples)\n", + "\n", + " # Check if the results are the same\n", + " if original_cost == modified_cost:\n", + " print(\"Accuracy Test Passed: Both functions give the same result.\")\n", + " else:\n", + " print(\"Accuracy Test Failed: Results differ.\")\n", + " print(f\"Original Cost: {original_cost}, Modified Cost: {modified_cost}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 38\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# Define a list of (num_items, num_samples) tuples for different input sizes\u001b[39;00m\n\u001b[0;32m 37\u001b[0m input_sizes \u001b[38;5;241m=\u001b[39m [(\u001b[38;5;241m1000\u001b[39m, \u001b[38;5;241m10\u001b[39m), (\u001b[38;5;241m5000\u001b[39m, \u001b[38;5;241m50\u001b[39m), (\u001b[38;5;241m100000\u001b[39m, \u001b[38;5;241m1000\u001b[39m), (\u001b[38;5;241m2000000\u001b[39m, \u001b[38;5;241m2000\u001b[39m), (\u001b[38;5;241m500000000\u001b[39m, \u001b[38;5;241m5000\u001b[39m)]\n\u001b[1;32m---> 38\u001b[0m \u001b[43mperformance_profiler_multiple_sizes\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_sizes\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[1;32mIn[6], line 9\u001b[0m, in \u001b[0;36mperformance_profiler_multiple_sizes\u001b[1;34m(sizes)\u001b[0m\n\u001b[0;32m 5\u001b[0m modified_times \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m*\u001b[39m num_sizes \u001b[38;5;66;03m# Preallocate list for modified times\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, (num_items, num_samples) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(sizes):\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Generate sample logits and samples\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m logits, samples \u001b[38;5;241m=\u001b[39m \u001b[43mgenerate_samples\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnum_items\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_samples\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 
11\u001b[0m \u001b[38;5;66;03m# Profile original cost function\u001b[39;00m\n\u001b[0;32m 12\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n", + "Cell \u001b[1;32mIn[2], line 7\u001b[0m, in \u001b[0;36mgenerate_samples\u001b[1;34m(num_logits, num_samples)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03mFunction to generate random samples of logits and samples. \u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# Generate random log contributions for items\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m logits \u001b[38;5;241m=\u001b[39m [random\u001b[38;5;241m.\u001b[39muniform(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m3\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_logits)] \u001b[38;5;66;03m# Random values between 0 and 3\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# Generate samples (a, b, c) triples\u001b[39;00m\n\u001b[0;32m 10\u001b[0m samples \u001b[38;5;241m=\u001b[39m []\n", + "Cell \u001b[1;32mIn[2], line 7\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03mFunction to generate random samples of logits and samples. 
# Performance profiler for multiple input sizes, with a comparison plot.
def performance_profiler_multiple_sizes(sizes):
    """Time both cost functions at each (num_items, num_samples) size and
    plot duration against the number of items.

    BUG FIX: the original called modified_cost_function, which is never
    defined in the notebook — the committed KeyboardInterrupt traceback
    shows the cell never ran to completion, so the NameError was never
    reached; the vectorized function is named alternate_cost_function.
    """
    num_sizes = len(sizes)
    original_times = [0.0] * num_sizes  # Preallocate list for original times
    modified_times = [0.0] * num_sizes  # Preallocate list for modified times

    for index, (num_items, num_samples) in enumerate(sizes):
        # Generate sample logits and samples
        logits, samples = generate_samples(num_items, num_samples)

        # Profile original cost function (perf_counter: monotonic clock).
        start_time = time.perf_counter()
        original_cost_function(logits, samples)
        original_times[index] = time.perf_counter() - start_time

        # Profile modified (vectorized) cost function.
        start_time = time.perf_counter()
        alternate_cost_function(logits, samples)
        modified_times[index] = time.perf_counter() - start_time

    # Plotting the results
    num_items_axis = [size[0] for size in sizes]
    plt.figure(figsize=(10, 6))
    plt.plot(num_items_axis, original_times, label='Original Cost Function', marker='o')
    plt.plot(num_items_axis, modified_times, label='Modified Cost Function', marker='x')
    plt.title('Performance Comparison of Cost Functions')
    plt.xlabel('Number of Items')
    plt.ylabel('Time (seconds)')
    plt.legend()
    plt.grid()
    plt.show()


# Example usage
if __name__ == "__main__":
    # Define a list of (num_items, num_samples) tuples for different input sizes
    input_sizes = [(1000, 10), (5000, 50), (10_000, 1000), (20_000, 2000)]
    performance_profiler_multiple_sizes(input_sizes)