Skip to content

Commit

Permalink
[PLINT-356] Maintain a long lived connection (#17919)
Browse files Browse the repository at this point in the history
* long lived connection

* Update connection logic and change can_connect to gauge

* remove unused field

* fix metadata and add changelog
  • Loading branch information
sarah-witt authored Jun 25, 2024
1 parent 55bd98b commit 6f5ea08
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 44 deletions.
1 change: 1 addition & 0 deletions esxi/changelog.d/17919.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Maintain a long lived connection to ESXi host.
82 changes: 47 additions & 35 deletions esxi/datadog_checks/esxi/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,47 @@ def __init__(self, name, init_config, instances):
else:
self.proxy_host = parsed_proxy.hostname
self.proxy_port = parsed_proxy.port
self.conn = None
self.content = None
self.check_initializations.append(self.initiate_api_connection)

def initiate_api_connection(self):
    """Open a connection to the configured ESXi host and store it on the check.

    Builds a TLS client context from the ``ssl_verify``, ``ssl_capath`` and
    ``ssl_cafile`` settings, connects with ``connect.SmartConnect`` (through
    the configured proxy when ``proxy_host`` is set), and stores the result
    in ``self.conn`` / ``self.content``.

    Emits the ``host.can_connect`` gauge: 1 on success, 0 on any failure.

    Raises:
        Exception: whatever the connection attempt raised, or a plain
            ``Exception`` when the endpoint's ``apiType`` is not
            ``"HostAgent"`` (i.e. it is not an ESXi host). The error is
            logged and re-raised unchanged.
    """
    try:
        context = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_CLIENT)
        # check_hostname must be disabled before verify_mode can be CERT_NONE,
        # so keep this assignment order.
        context.check_hostname = bool(self.ssl_verify)
        context.verify_mode = ssl.CERT_REQUIRED if self.ssl_verify else ssl.CERT_NONE

        # CA source precedence: explicit directory, then explicit file, then system defaults.
        if self.ssl_capath:
            context.load_verify_locations(cafile=None, capath=self.ssl_capath, cadata=None)
        elif self.ssl_cafile:
            context.load_verify_locations(cafile=self.ssl_cafile, capath=None, cadata=None)
        else:
            context.load_default_certs(ssl.Purpose.SERVER_AUTH)

        # SmartConnect takes no proxy argument here, so the proxy is injected by
        # temporarily replacing socket.create_connection. That replacement is
        # process-wide, so it must be undone even when the connection fails.
        create_connection_method = socket.create_connection
        try:
            if self.proxy_host:
                socket.create_connection = lambda address, timeout, source_address, **kwargs: create_connection(
                    address, timeout, source_address, self.proxy_host, self.proxy_port
                )

            connection = connect.SmartConnect(
                host=self.host, user=self.username, pwd=self.password, sslContext=context
            )
        finally:
            socket.create_connection = create_connection_method

        self.conn = connection
        self.content = connection.content

        if self.content.about.apiType != "HostAgent":
            raise Exception(
                f"{self.host} is not an ESXi host; please set the `host` config option to an ESXi host "
                "or use the vSphere integration to collect data from the vCenter",
            )

        self.log.info("Connected to ESXi host %s: %s", self.host, self.content.about.fullName)
        self.gauge("host.can_connect", 1, tags=self.tags)
    except Exception as e:
        self.log.exception("Cannot connect to ESXi host %s: %s", self.host, str(e))
        self.gauge("host.can_connect", 0, tags=self.tags)
        raise

def _validate_excluded_host_tags(self, excluded_host_tags):
valid_excluded_host_tags = []
Expand Down Expand Up @@ -354,44 +395,15 @@ def set_version_metadata(self):

def check(self, _):
try:
context = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_CLIENT)
context.check_hostname = True if self.ssl_verify else False
context.verify_mode = ssl.CERT_REQUIRED if self.ssl_verify else ssl.CERT_NONE

if self.ssl_capath:
context.load_verify_locations(cafile=None, capath=self.ssl_capath, cadata=None)
elif self.ssl_cafile:
context.load_verify_locations(cafile=self.ssl_cafile, capath=None, cadata=None)
else:
context.load_default_certs(ssl.Purpose.SERVER_AUTH)

create_connection_method = socket.create_connection
if self.proxy_host:
socket.create_connection = lambda address, timeout, source_address, **kwargs: create_connection(
address, timeout, source_address, self.proxy_host, self.proxy_port
)

connection = connect.SmartConnect(host=self.host, user=self.username, pwd=self.password, sslContext=context)
socket.create_connection = create_connection_method

self.conn = connection
self.content = connection.content

if self.content.about.apiType != "HostAgent":
raise Exception(
f"{self.host} is not an ESXi host; please set the `host` config option to an ESXi host "
"or use the vSphere integration to collect data from the vCenter",
)

self.log.info("Connected to ESXi host %s: %s", self.host, self.content.about.fullName)
self.count("host.can_connect", 1, tags=self.tags)
self.set_version_metadata()
self.gauge("host.can_connect", 1, tags=self.tags)

except Exception as e:
self.log.exception("Cannot connect to ESXi host %s: %s", self.host, str(e))
self.count("host.can_connect", 0, tags=self.tags)
raise
self.conn = None
self.content = None
self.log.debug("Failed to get version metadata; attempting to reconnect to the ESXi host: %s", str(e))
self.initiate_api_connection()

self.set_version_metadata()
resources = self.get_resources()
resource_map = {
obj_content.obj: {prop.name: prop.val for prop in obj_content.propSet}
Expand Down
1 change: 1 addition & 0 deletions esxi/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ esxi.disk.write.avg,gauge,,kibibyte,second,Average number of kilobytes written t
esxi.hbr.hbrNetRx.avg,gauge,,kibibyte,second,Kilobytes per second of outgoing host-based replication network traffic (for this virtual machine or host).,0,esxi,hbr net rx,
esxi.hbr.hbrNetTx.avg,gauge,,kibibyte,second,Average amount of data transmitted per second,0,esxi,hbr net tx,
esxi.hbr.hbrNumVms.avg,gauge,,,,Number of powered-on virtual machines running on this host that currently have host-based replication protection enabled.,0,esxi,hbr num vms,
esxi.host.can_connect,gauge,,,,Whether the check can connect to the ESXi host or not.,0,esxi,host connect,
esxi.host.count,gauge,,,,Timeseries with value 1 for each ESXi Host. Make 'sum by {X}' queries to count all the Hosts with the tag X.,0,esxi,host count,
esxi.mem.active.avg,gauge,,kibibyte,,"Amount of memory that is actively used, as estimated by VMkernel based on recently touched memory pages",-1,esxi,mem active avg,
esxi.mem.activewrite.avg,gauge,,kibibyte,,Estimate for the amount of memory actively being written to by the virtual machine,-1,esxi,mem activewrite avg,
Expand Down
4 changes: 2 additions & 2 deletions esxi/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
def test_esxi_metric_up(vcsim_instance, dd_run_check, aggregator):
check = EsxiCheck('esxi', {}, [vcsim_instance])
dd_run_check(check)
aggregator.assert_metric('esxi.host.can_connect', 1, count=1, tags=["esxi_url:127.0.0.1:8989"])
aggregator.assert_metric('esxi.host.can_connect', 1, count=2, tags=["esxi_url:127.0.0.1:8989"])


def test_esxi_perf_metrics(vcsim_instance, dd_run_check, aggregator, caplog):
Expand Down Expand Up @@ -73,7 +73,7 @@ def test_esxi_perf_metrics(vcsim_instance, dd_run_check, aggregator, caplog):

aggregator.assert_metric("esxi.host.count")
aggregator.assert_metric("esxi.vm.count")
aggregator.assert_metric("esxi.host.can_connect", 1, count=1, tags=base_tags)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2, tags=base_tags)

aggregator.assert_all_metrics_covered()

Expand Down
34 changes: 27 additions & 7 deletions esxi/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_esxi_metric_up(instance, dd_run_check, aggregator, caplog):
check = EsxiCheck('esxi', {}, [instance])
caplog.set_level(logging.DEBUG)
dd_run_check(check)
aggregator.assert_metric('esxi.host.can_connect', 1, count=1, tags=["esxi_url:localhost"])
aggregator.assert_metric('esxi.host.can_connect', 1, count=2, tags=["esxi_url:localhost"])
assert "Connected to ESXi host localhost: VMware ESXi 6.5.0 build-123456789" in caplog.text


Expand All @@ -45,7 +45,7 @@ def test_none_properties_data(vcsim_instance, dd_run_check, aggregator, service_
assert "No resources found; halting check execution" in caplog.text

base_tags = ["esxi_url:127.0.0.1:8989"]
aggregator.assert_metric("esxi.host.can_connect", 1, count=1, tags=base_tags)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2, tags=base_tags)
aggregator.assert_all_metrics_covered()


Expand All @@ -67,7 +67,7 @@ def test_esxi_no_properties(vcsim_instance, dd_run_check, aggregator, service_in
assert "No resources found; halting check execution" in caplog.text

base_tags = ["esxi_url:127.0.0.1:8989"]
aggregator.assert_metric("esxi.host.can_connect", 1, count=1, tags=base_tags)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2, tags=base_tags)
aggregator.assert_all_metrics_covered()


Expand Down Expand Up @@ -129,7 +129,7 @@ def test_esxi_perf_metrics(vcsim_instance, dd_run_check, aggregator, caplog):
base_tags = ["esxi_url:127.0.0.1:8989"]
aggregator.assert_metric("esxi.cpu.usage.avg", value=0.26, tags=base_tags, hostname="localhost.localdomain")
aggregator.assert_metric("esxi.mem.granted.avg", value=80, tags=base_tags, hostname="localhost.localdomain")
aggregator.assert_metric("esxi.host.can_connect", 1, count=1, tags=base_tags)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2, tags=base_tags)

assert "Skipping metric net.droppedRx.sum for localhost.localdomain, because the value "
"returned by the host is negative (i.e. the metric is not yet available). values: [-1]" in caplog.text
Expand Down Expand Up @@ -513,7 +513,7 @@ def test_report_instance_metrics_invalid_metric_name_still_collect_metrics(aggre
base_tags = ["esxi_url:127.0.0.1:8989"]
aggregator.assert_metric("esxi.cpu.usage.avg", value=0.26, tags=base_tags, hostname="localhost.localdomain")
aggregator.assert_metric("esxi.mem.granted.avg", value=80, tags=base_tags, hostname="localhost.localdomain")
aggregator.assert_metric("esxi.host.can_connect", 1, count=1, tags=base_tags)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2, tags=base_tags)


@pytest.mark.usefixtures("service_instance")
Expand Down Expand Up @@ -1206,7 +1206,7 @@ def test_use_configured_hostname(vcsim_instance, dd_run_check, aggregator, datad
base_tags = ["esxi_url:127.0.0.1:8989"]
aggregator.assert_metric("esxi.cpu.usage.avg", value=0.26, tags=base_tags, hostname="127.0.0.1:8989")
aggregator.assert_metric("esxi.mem.granted.avg", value=80, tags=base_tags, hostname="127.0.0.1:8989")
aggregator.assert_metric("esxi.host.can_connect", 1, count=1, tags=base_tags)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2, tags=base_tags)

datadog_agent.assert_external_tags(
'127.0.0.1:8989',
Expand Down Expand Up @@ -1237,7 +1237,7 @@ def test_use_socks_proxy(vcsim_instance, dd_run_check, caplog, aggregator):
check = EsxiCheck('esxi', {}, [instance])
dd_run_check(check)
assert "Proxy scheme socks5 not supported; ignoring" not in caplog.text
aggregator.assert_metric("esxi.host.can_connect", 1, count=1)
aggregator.assert_metric("esxi.host.can_connect", 1, count=2)


def test_use_socks_proxy_mocked(vcsim_instance, dd_run_check, caplog, aggregator):
Expand All @@ -1251,3 +1251,23 @@ def test_use_socks_proxy_mocked(vcsim_instance, dd_run_check, caplog, aggregator
assert "Proxy scheme socks5 not supported; ignoring" not in caplog.text
assert socks_connect.call_count == 1
aggregator.assert_metric("esxi.host.can_connect", 0, count=1)


@pytest.mark.usefixtures("service_instance")
def test_cant_get_version(vcsim_instance, dd_run_check, caplog, service_instance, aggregator):
    """A failed version lookup on an initialized check triggers a reconnect attempt.

    After one successful run, the service instance's ``about`` data is
    removed. The next run must raise, log the reconnect debug message, and
    report ``esxi.host.can_connect`` as 0 exactly once.
    """
    esxi_check = EsxiCheck('esxi', {}, [vcsim_instance])

    # First run initializes the check; drop its metrics so only the second run is asserted on.
    dd_run_check(esxi_check)
    aggregator.reset()

    caplog.set_level(logging.DEBUG)
    # With no `about` data, the version lookup fails on the next run.
    service_instance.content.about = None

    with pytest.raises(Exception):
        dd_run_check(esxi_check)

    expected_log = (
        "Failed to get version metadata; attempting to reconnect to the ESXi host: "
        "'NoneType' object has no attribute 'version'"
    )
    assert expected_log in caplog.text

    aggregator.assert_metric('esxi.host.can_connect', 0, count=1)
    aggregator.assert_all_metrics_covered()

0 comments on commit 6f5ea08

Please sign in to comment.