2626
2727import attr
2828
29+ from twisted .internet import defer
2930from twisted .internet .error import ConnectError
3031from twisted .names import client , dns
3132from twisted .names .error import DNSNameError , DNSNotImplementedError , DomainError
@@ -145,7 +146,37 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
145146
146147 try :
147148 answers , _ , _ = await make_deferred_yieldable (
148- self ._dns_client .lookupService (service_name )
149+ self ._dns_client .lookupService (
150+ service_name ,
151+ # This is a sequence of ints that represent the "number of seconds
152+ # after which to reissue the query. When the last timeout expires,
153+ # the query is considered failed." The default value in Twisted is
154+ # `timeout=(1, 3, 11, 45)` (60s total) which is an "arbitrary"
155+ # exponential backoff sequence and is too long (see below).
156+ #
157+ # We want the total timeout to be below the overarching HTTP request
158+ # timeout (60s for federation requests) that spurred on this lookup.
159+ # This way, we can see the underlying DNS failure and move on
160+ # instead of the user ending up with a generic HTTP request timeout.
161+ #
162+ # Since these DNS queries are done over UDP (unreliable transport),
163+ # by it's nature, it's bound to occasionally fail (dropped packets,
164+ # etc). We want a list that starts small and re-issues DNS queries
165+ # multiple times until we get a response or timeout.
166+ timeout = (
167+ 1 , # Quick retry for packet loss/scenarios
168+ 3 , # Still reasonable for slow responders
169+ 3 , # ...
170+ 3 , # Already catching 99.9% of successful queries at 10s
171+ # Final attempt for extreme edge cases.
172+ #
173+ # TODO: In the future, we could consider removing this extra
174+ # time if we don't see complaints. For comparison, The Windows
175+ # DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see
176+ # https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts
177+ 5 ,
178+ ),
179+ )
149180 )
150181 except DNSNameError :
151182 # TODO: cache this. We can get the SOA out of the exception, and use
@@ -165,6 +196,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
165196 return list (cache_entry )
166197 else :
167198 raise e
199+ except defer .TimeoutError as e :
200+ raise defer .TimeoutError (
201+ f"Could not resolve DNS for SRV record { service_name !r} due to timeout (50s total)"
202+ ) from e
168203
169204 if (
170205 len (answers ) == 1
0 commit comments