 import os
 import time
 import traceback
+import random
 from datetime import datetime, timedelta
 from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union, cast, overload

@@ -769,7 +770,14 @@ async def _commit_spend_updates_to_db( # noqa: PLR0915
                             proxy_logging_obj=proxy_logging_obj,
                         )
                     # Optionally, sleep for a bit before retrying
-                    await asyncio.sleep(2**i)  # Exponential backoff
+                    await asyncio.sleep(
+                        # Sleep a random amount to avoid retrying and deadlocking again: when two transactions
+                        # deadlock they are cancelled at essentially the same time, so if they wait the same
+                        # amount they will also retry at the same time and are likely to deadlock again.
+                        # Sleeping a random amount makes them retry at slightly different times, lowering the
+                        # chance of repeated deadlocks and therefore of exceeding the retry limit.
+                        random.uniform(2**i, 2 ** (i + 1))
+                    )
                 except Exception as e:
                     _raise_failed_update_spend_exception(
                         e=e, start_time=start_time, proxy_logging_obj=proxy_logging_obj
                     )
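
For reference, here is a minimal, self-contained sketch of the retry pattern this hunk moves to (exponential backoff with jitter). The `do_commit` callable, the caught exception type, and the retry count are placeholders, not the actual proxy code:

import asyncio
import random

async def retry_with_jitter(do_commit, n_retry_times: int = 3) -> None:
    # Placeholder sketch: retry a DB commit, backing off a random amount in
    # [2**i, 2**(i + 1)) so concurrent workers that deadlocked together do not
    # all wake up and collide again at the same instant.
    for i in range(n_retry_times + 1):
        try:
            await do_commit()
            return
        except Exception:  # stand-in for the DB connection/deadlock error types
            if i >= n_retry_times:
                raise
            await asyncio.sleep(random.uniform(2**i, 2 ** (i + 1)))
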
@@ -849,8 +857,27 @@ async def _update_daily_spend(
         try:
             for i in range(n_retry_times + 1):
                 try:
+                    # Sort the transactions to minimize the probability of deadlocks by reducing the chance of
+                    # concurrent transactions locking the same rows/ranges in different orders.
                     transactions_to_process = dict(
-                        list(daily_spend_transactions.items())[:BATCH_SIZE]
+                        sorted(
+                            daily_spend_transactions.items(),
+                            # Normally we would sort by the index to avoid deadlocks, but since we have sprinkled
+                            # indexes on our schema like we're discount Salt Bae, we just sort by all fields that
+                            # have an index, in an ad-hoc (but hopefully sensible) order. The actual ordering
+                            # matters less than ensuring that all concurrent transactions sort in the same order.
+                            # We could in theory use the dict key, as it contains basically the same fields, but
+                            # this is more robust to future changes in the key format.
+                            # If _update_daily_spend ever gets the ability to write to multiple tables at once,
+                            # the sorting should sort by the table first.
+                            key=lambda x: (
+                                x[1]["date"],
+                                x[1].get(entity_id_field),
+                                x[1]["api_key"],
+                                x[1]["model"],
+                                x[1]["custom_llm_provider"],
+                            ),
+                        )[:BATCH_SIZE]
                     )

                     if len(transactions_to_process) == 0:
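
A minimal sketch of the ordering idea, using made-up transaction data; `entity_id_field` and `BATCH_SIZE` are stand-ins for the real module-level values:

# Hypothetical illustration: every worker sorts its pending transactions by the same
# tuple of indexed fields, so rows end up being locked in a consistent order across workers.
entity_id_field = "user_id"  # assumption; the real value depends on the entity type
BATCH_SIZE = 100  # assumption; stand-in for the real constant

daily_spend_transactions = {
    "k2": {"date": "2024-06-02", "user_id": "u1", "api_key": "b", "model": "gpt-4o", "custom_llm_provider": "openai"},
    "k1": {"date": "2024-06-01", "user_id": "u2", "api_key": "a", "model": "gpt-4o-mini", "custom_llm_provider": "openai"},
}

transactions_to_process = dict(
    sorted(
        daily_spend_transactions.items(),
        key=lambda x: (
            x[1]["date"],
            x[1].get(entity_id_field),
            x[1]["api_key"],
            x[1]["model"],
            x[1]["custom_llm_provider"],
        ),
    )[:BATCH_SIZE]
)
# Every worker holding these two transactions now processes "k1" before "k2".
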
@@ -893,7 +920,8 @@ async def _update_daily_spend(
                                 "model_group": transaction.get("model_group"),
                                 "mcp_namespaced_tool_name": transaction.get(
                                     "mcp_namespaced_tool_name"
-                                ) or "",
+                                )
+                                or "",
                                 "custom_llm_provider": transaction.get(
                                     "custom_llm_provider"
                                 ),
@@ -909,13 +937,13 @@ async def _update_daily_spend(

                         # Add cache-related fields if they exist
                         if "cache_read_input_tokens" in transaction:
-                            common_data["cache_read_input_tokens"] = (
-                                transaction.get("cache_read_input_tokens", 0)
-                            )
+                            common_data[
+                                "cache_read_input_tokens"
+                            ] = transaction.get("cache_read_input_tokens", 0)
                         if "cache_creation_input_tokens" in transaction:
-                            common_data["cache_creation_input_tokens"] = (
-                                transaction.get("cache_creation_input_tokens", 0)
-                            )
+                            common_data[
+                                "cache_creation_input_tokens"
+                            ] = transaction.get("cache_creation_input_tokens", 0)

                         # Create update data structure
                         update_data = {
@@ -976,7 +1004,14 @@ async def _update_daily_spend(
                             start_time=start_time,
                             proxy_logging_obj=proxy_logging_obj,
                         )
-                    await asyncio.sleep(2**i)
+                    await asyncio.sleep(
+                        # Sleep a random amount to avoid retrying and deadlocking again: when two transactions
+                        # deadlock they are cancelled at essentially the same time, so if they wait the same
+                        # amount they will also retry at the same time and are likely to deadlock again.
+                        # Sleeping a random amount makes them retry at slightly different times, lowering the
+                        # chance of repeated deadlocks and therefore of exceeding the retry limit.
+                        random.uniform(2**i, 2 ** (i + 1))
+                    )

         except Exception as e:
             if "transactions_to_process" in locals():
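
As a rough, standalone illustration of why the jitter helps (not part of the change): with a fixed backoff, two workers that deadlocked together wake up at exactly the same instants on every retry, while randomized delays make their schedules drift apart. The helper name and numbers below are invented for the demo:

import random

def retry_schedule(jitter: bool, attempts: int = 4, seed: int = 0) -> list[float]:
    # Hypothetical helper: cumulative wake-up times for one worker's retries.
    rng = random.Random(seed)
    t, times = 0.0, []
    for i in range(attempts):
        t += rng.uniform(2**i, 2 ** (i + 1)) if jitter else float(2**i)
        times.append(round(t, 2))
    return times

# Fixed backoff: both workers share the exact same schedule, so they keep colliding.
print(retry_schedule(jitter=False), retry_schedule(jitter=False))
# Jittered backoff (different seeds stand in for independent workers): schedules diverge.
print(retry_schedule(jitter=True, seed=1), retry_schedule(jitter=True, seed=2))
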