Skip to content

Commit aaacc87

Browse files
committed
srv_resolver: add 15s timeout to DNS lookups
1 parent 631eed9 commit aaacc87

File tree

3 files changed

+45
-3
lines changed

3 files changed

+45
-3
lines changed

changelog.d/19026.misc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Reduces the SRV DNS record lookup timeout to 15 seconds.
2+
This fixes issues when DNS lookups hang, as the default timeout of 60 seconds matches the timeout for the federation request itself,
3+
thus we see the HTTP request timeout and not the actual DNS error.

synapse/http/federation/srv_resolver.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import attr
2828

29+
from twisted.internet import defer
2930
from twisted.internet.error import ConnectError
3031
from twisted.names import client, dns
3132
from twisted.names.error import DNSNameError, DNSNotImplementedError, DomainError
@@ -145,7 +146,37 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
145146

146147
try:
147148
answers, _, _ = await make_deferred_yieldable(
148-
self._dns_client.lookupService(service_name)
149+
self._dns_client.lookupService(
150+
service_name,
151+
# This is a sequence of ints that represent the "number of seconds
152+
# after which to reissue the query. When the last timeout expires,
153+
# the query is considered failed." The default value in Twisted is
154+
# `timeout=(1, 3, 11, 45)` (60s total) which is an "arbitrary"
155+
# exponential backoff sequence and is too long (see below).
156+
#
157+
# We want the total timeout to be below the overarching HTTP request
158+
# timeout (60s for federation requests) that spurred on this lookup.
159+
# This way, we can see the underlying DNS failure and move on
160+
# instead of the user ending up with a generic HTTP request timeout.
161+
#
162+
# Since these DNS queries are done over UDP (unreliable transport),
163+
# by it's nature, it's bound to occasionally fail (dropped packets,
164+
# etc). We want a list that starts small and re-issues DNS queries
165+
# multiple times until we get a response or timeout.
166+
timeout=(
167+
1, # Quick retry for packet loss/scenarios
168+
3, # Still reasonable for slow responders
169+
3, # ...
170+
3, # Already catching 99.9% of successful queries at 10s
171+
# Final attempt for extreme edge cases.
172+
#
173+
# TODO: In the future, we could consider removing this extra
174+
# time if we don't see complaints. For comparison, The Windows
175+
# DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see
176+
# https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts
177+
5,
178+
),
179+
)
149180
)
150181
except DNSNameError:
151182
# TODO: cache this. We can get the SOA out of the exception, and use
@@ -165,6 +196,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
165196
return list(cache_entry)
166197
else:
167198
raise e
199+
except defer.TimeoutError as e:
200+
raise defer.TimeoutError(
201+
f"Could not resolve DNS for SRV record {service_name!r} due to timeout (50s total)"
202+
) from e
168203

169204
if (
170205
len(answers) == 1

tests/http/federation/test_srv_resolver.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ def do_lookup() -> Generator["Deferred[object]", object, List[Server]]:
6868
test_d = do_lookup()
6969
self.assertNoResult(test_d)
7070

71-
dns_client_mock.lookupService.assert_called_once_with(service_name)
71+
dns_client_mock.lookupService.assert_called_once_with(
72+
service_name, timeout=(1, 3, 3, 3, 5)
73+
)
7274

7375
result_deferred.callback(([answer_srv], None, None))
7476

@@ -98,7 +100,9 @@ def test_from_cache_expired_and_dns_fail(
98100
servers: List[Server]
99101
servers = yield defer.ensureDeferred(resolver.resolve_service(service_name)) # type: ignore[assignment]
100102

101-
dns_client_mock.lookupService.assert_called_once_with(service_name)
103+
dns_client_mock.lookupService.assert_called_once_with(
104+
service_name, timeout=(1, 3, 3, 3, 5)
105+
)
102106

103107
self.assertEqual(len(servers), 1)
104108
self.assertEqual(servers, cache[service_name])

0 commit comments

Comments
 (0)