Skip to content

Commit

Permalink
[internal-dns] remove lookup_ipv6 in favor of lookup_socket_v6 (#6320)
Browse files Browse the repository at this point in the history
`lookup_ipv6` is both buggy and easy to misuse:
* it sends an AAAA query for a domain which should have a SRV record
- this works only because
#4051 means the SRV
record is incorrectly returned, along with the actually-desired AAAA for
the SRV's target in Additionals
* it looks up an IPv6 address from a SRV record *but ignores the port*.
in places `lookup_ipv6` was used, it was paired consistently with the
hardcoded port NEXUS_INTERNAL_PORT and matched what should be in the
resolved SRV record. if we for example wanted to move Nexus' port (or
start a test Nexus on an atypical port), the authoritative port number
in the SRV response would be ignored for the hardcoded port.

lets just use the port that we told DNS we're at!

we may still want a bare IPv6 address for a service if we're going to
test network reachability, for example, but we're not doing that with
this function today. this all is distinct from helpers like
`lookup_all_ipv6`.

if we need a service's IPv6 address to use with an alternate port to
access a different API, we *probably* should have a distinct SRV record
for that lookup to use instead? i've found three instances of this:
* wicket assumes the techport proxy is on the same IP as Nexus' API, but that isn't necessarily true
* we assume the CRDB admin service listens on the same IP as CRDB itself, but that doesn't have to be true
* we look up addresses for MGS via `ServiceName::Dendrite`, but there's a
`ServiceName::ManagementGatewayService`, so either that's a typo or can be made to have its own SRV records

there are some uses of `lookup_all_ipv6` that make a lot of sense still,
where we're discovering the rack's network and _really_ do not care
about the port that Dendrite happens to be on.
  • Loading branch information
iximeow authored Aug 16, 2024
1 parent 66ac7b3 commit 8ae0833
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 58 deletions.
73 changes: 30 additions & 43 deletions internal-dns/src/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,27 +160,6 @@ impl Resolver {
self.resolver.clear_cache();
}

/// Looks up a single [`Ipv6Addr`] based on the SRV name.
/// Returns an error if the record does not exist.
// TODO: There are lots of ways this API can expand: Caching,
// actually respecting TTL, looking up ports, etc.
//
// For now, however, it serves as a very simple "get everyone using DNS"
// API that can be improved upon later.
pub async fn lookup_ipv6(
&self,
srv: crate::ServiceName,
) -> Result<Ipv6Addr, ResolveError> {
let name = srv.srv_name();
debug!(self.log, "lookup_ipv6 srv"; "dns_name" => &name);
let response = self.resolver.ipv6_lookup(&name).await?;
let address = response
.iter()
.next()
.ok_or_else(|| ResolveError::NotFound(srv))?;
Ok(address.0)
}

/// Returns the targets of the SRV records for a DNS name
///
/// The returned values are generally other DNS names that themselves would
Expand Down Expand Up @@ -235,6 +214,12 @@ impl Resolver {
// TODO-robustness: any callers of this should probably be using
// all the targets for a given SRV and not just the first one
// we get, see [`Resolver::lookup_all_socket_v6`].
//
// TODO: There are lots of ways this API can expand: Caching,
// actually respecting TTL, looking up ports, etc.
//
// For now, however, it serves as a very simple "get everyone using DNS"
// API that can be improved upon later.
pub async fn lookup_socket_v6(
&self,
service: crate::ServiceName,
Expand Down Expand Up @@ -549,11 +534,11 @@ mod test {
dns_server.update(&dns_config).await.unwrap();

let resolver = dns_server.resolver().unwrap();
let found_ip = resolver
.lookup_ipv6(ServiceName::Cockroach)
let found_addr = resolver
.lookup_socket_v6(ServiceName::Cockroach)
.await
.expect("Should have been able to look up IP address");
assert_eq!(found_ip, ip,);
assert_eq!(found_addr.ip(), &ip,);

dns_server.cleanup_successful();
logctx.cleanup_successful();
Expand Down Expand Up @@ -631,11 +616,13 @@ mod test {

// Look up Cockroach
let resolver = dns_server.resolver().unwrap();
let ip = resolver
.lookup_ipv6(ServiceName::Cockroach)
let resolved_addr = resolver
.lookup_socket_v6(ServiceName::Cockroach)
.await
.expect("Should have been able to look up IP address");
assert!(cockroach_addrs.iter().any(|addr| addr.ip() == &ip));
assert!(cockroach_addrs
.iter()
.any(|addr| addr.ip() == resolved_addr.ip()));

// Look up all the Cockroach addresses.
let mut ips =
Expand All @@ -649,18 +636,18 @@ mod test {
);

// Look up Clickhouse
let ip = resolver
.lookup_ipv6(ServiceName::Clickhouse)
let addr = resolver
.lookup_socket_v6(ServiceName::Clickhouse)
.await
.expect("Should have been able to look up IP address");
assert_eq!(&ip, clickhouse_addr.ip());
assert_eq!(addr.ip(), clickhouse_addr.ip());

// Look up Backend Service
let ip = resolver
.lookup_ipv6(srv_backend)
let addr = resolver
.lookup_socket_v6(srv_backend)
.await
.expect("Should have been able to look up IP address");
assert_eq!(&ip, crucible_addr.ip());
assert_eq!(addr.ip(), crucible_addr.ip());

// If we deploy a new generation that removes all records, then we don't
// find anything any more.
Expand All @@ -671,7 +658,7 @@ mod test {
// If we remove the records for all services, we won't find them any
// more. (e.g., there's no hidden caching going on)
let error = resolver
.lookup_ipv6(ServiceName::Cockroach)
.lookup_socket_v6(ServiceName::Cockroach)
.await
.expect_err("unexpectedly found records");
assert_matches!(
Expand Down Expand Up @@ -708,11 +695,11 @@ mod test {
dns_builder.service_backend_zone(srv_crdb, &zone, 12345).unwrap();
let dns_config = dns_builder.build_full_config_for_initial_generation();
dns_server.update(&dns_config).await.unwrap();
let found_ip = resolver
.lookup_ipv6(ServiceName::Cockroach)
let found_addr = resolver
.lookup_socket_v6(ServiceName::Cockroach)
.await
.expect("Should have been able to look up IP address");
assert_eq!(found_ip, ip1);
assert_eq!(found_addr.ip(), &ip1);

// If we insert the same record with a new address, it should be
// updated.
Expand All @@ -726,11 +713,11 @@ mod test {
dns_builder.build_full_config_for_initial_generation();
dns_config.generation += 1;
dns_server.update(&dns_config).await.unwrap();
let found_ip = resolver
.lookup_ipv6(ServiceName::Cockroach)
let found_addr = resolver
.lookup_socket_v6(ServiceName::Cockroach)
.await
.expect("Should have been able to look up IP address");
assert_eq!(found_ip, ip2);
assert_eq!(found_addr.ip(), &ip2);

dns_server.cleanup_successful();
logctx.cleanup_successful();
Expand Down Expand Up @@ -861,11 +848,11 @@ mod test {
dns_server.update(&dns_config).await.unwrap();

// Confirm that we can access this record manually.
let found_ip = resolver
.lookup_ipv6(ServiceName::Nexus)
let found_addr = resolver
.lookup_socket_v6(ServiceName::Nexus)
.await
.expect("Should have been able to look up IP address");
assert_eq!(found_ip, ip);
assert_eq!(found_addr.ip(), &ip);

// Confirm that the progenitor client can access this record too.
let value = client.test_endpoint().await.unwrap();
Expand Down
8 changes: 2 additions & 6 deletions oximeter/collector/src/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ use internal_dns::resolver::Resolver;
use internal_dns::ServiceName;
use nexus_client::types::IdSortMode;
use omicron_common::address::CLICKHOUSE_PORT;
use omicron_common::address::NEXUS_INTERNAL_PORT;
use omicron_common::backoff;
use omicron_common::backoff::BackoffError;
use oximeter::types::ProducerResults;
Expand Down Expand Up @@ -816,7 +815,7 @@ async fn refresh_producer_list(agent: OximeterAgent, resolver: Resolver) {
async fn resolve_nexus_with_backoff(
log: &Logger,
resolver: &Resolver,
) -> SocketAddr {
) -> SocketAddrV6 {
let log_failure = |error, delay| {
warn!(
log,
Expand All @@ -827,12 +826,9 @@ async fn resolve_nexus_with_backoff(
};
let do_lookup = || async {
resolver
.lookup_ipv6(ServiceName::Nexus)
.lookup_socket_v6(ServiceName::Nexus)
.await
.map_err(|e| BackoffError::transient(e.to_string()))
.map(|ip| {
SocketAddr::V6(SocketAddrV6::new(ip, NEXUS_INTERNAL_PORT, 0, 0))
})
};
backoff::retry_notify(
backoff::retry_policy_internal_service(),
Expand Down
17 changes: 8 additions & 9 deletions oximeter/collector/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use dropshot::HttpServerStarter;
use internal_dns::resolver::ResolveError;
use internal_dns::resolver::Resolver;
use internal_dns::ServiceName;
use omicron_common::address::NEXUS_INTERNAL_PORT;
use omicron_common::api::internal::nexus::ProducerEndpoint;
use omicron_common::backoff;
use omicron_common::FileKv;
Expand Down Expand Up @@ -251,14 +250,14 @@ impl Oximeter {
let nexus_address = if let Some(address) = config.nexus_address {
address
} else {
SocketAddr::V6(SocketAddrV6::new(
resolver.lookup_ipv6(ServiceName::Nexus).await.map_err(
|e| backoff::BackoffError::transient(e.to_string()),
)?,
NEXUS_INTERNAL_PORT,
0,
0,
))
SocketAddr::V6(
resolver
.lookup_socket_v6(ServiceName::Nexus)
.await
.map_err(|e| {
backoff::BackoffError::transient(e.to_string())
})?,
)
};
let client = nexus_client::Client::new(
&format!("http://{nexus_address}"),
Expand Down

0 comments on commit 8ae0833

Please sign in to comment.