Port agent OOM kill fix to 7.48 #15988

Merged
merged 2 commits on Oct 11, 2023
6 changes: 6 additions & 0 deletions kafka_consumer/CHANGELOG.md
@@ -2,6 +2,12 @@

## Unreleased

## 4.1.3 / 2023-10-11

***Fixed***:

* Add ability to cache offsets and close admin client ([#15988](https://github.com/DataDog/integrations-core/pull/15988))

## 4.1.2 / 2023-09-04 / Agent 7.48.0

***Fixed***:
18 changes: 18 additions & 0 deletions kafka_consumer/assets/configuration/spec.yaml
@@ -104,6 +104,24 @@ files:
value:
type: boolean
example: false
- name: consumer_queued_max_messages_kbytes
description: |
The consumer caches messages very aggressively in the background (it is tuned for very high throughput).
To reduce memory usage, tune down queued.max.messages.kbytes, the maximum cache size per partition.
The integration check overrides the Kafka default (1GB per client) down to 1MB
to limit memory consumption and avoid potential out-of-memory (OOM) kills.
value:
type: integer
example: 1024
- name: close_admin_client
description: |
Release the AdminClient at the end of each check run so it can be garbage collected. Originally, the same
AdminClient was kept alive for the entire life of the check. Deallocating it after each run frees any
memory held by the client, at the cost of a slower check run (a new client must reconnect each time).
Set this option to false to favor performance over memory usage.
value:
type: boolean
example: true
- name: monitor_all_broker_highwatermarks
description: |
Setting monitor_all_broker_highwatermarks to `true` tells the check to
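Taken together, the two options bound rdkafka's per-partition prefetch buffer and let the check drop its admin client between runs. As a rough illustration, here is a minimal sketch (placeholder broker and group values) of how `queued.max.messages.kbytes` is passed through to a confluent-kafka consumer, mirroring the `__create_consumer` change further down:

```python
from confluent_kafka import Consumer

# Placeholder broker and consumer group; the check derives these from
# kafka_connect_str and the monitored consumer groups.
consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",
        "group.id": "example-consumer-group",
        "enable.auto.commit": False,  # avoid committing offsets to the broker on close
        # Cap rdkafka's per-partition prefetch cache at 1MB (value is in KB)
        # instead of letting it buffer aggressively for throughput.
        "queued.max.messages.kbytes": 1024,
    }
)
```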
@@ -2,4 +2,4 @@
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

__version__ = "4.1.2"
__version__ = "4.1.3"
16 changes: 11 additions & 5 deletions kafka_consumer/datadog_checks/kafka_consumer/client.py
@@ -14,6 +14,7 @@ def __init__(self, config, log) -> None:
self.config = config
self.log = log
self._kafka_client = None
self.topic_partition_cache = {}

@property
def kafka_client(self):
@@ -34,6 +35,7 @@ def __create_consumer(self, consumer_group):
"bootstrap.servers": self.config._kafka_connect_str,
"group.id": consumer_group,
"enable.auto.commit": False, # To avoid offset commit to broker during close
"queued.max.messages.kbytes": self.config._consumer_queued_max_messages_kbytes,
}
config.update(self.__get_authentication_config())

@@ -152,6 +154,9 @@ def get_highwater_offsets(self, consumer_offsets):
return highwater_offsets

def get_partitions_for_topic(self, topic):
if partitions := self.topic_partition_cache.get(topic):
return partitions

try:
cluster_metadata = self.kafka_client.list_topics(topic, timeout=self.config._request_timeout)
except KafkaException as e:
@@ -160,6 +165,7 @@ def get_partitions_for_topic(self, topic):
else:
topic_metadata = cluster_metadata.topics[topic]
partitions = list(topic_metadata.partitions.keys())
self.topic_partition_cache[topic] = partitions
return partitions

def request_metadata_update(self):
@@ -244,6 +250,9 @@ def _get_consumer_groups(self):
def _list_consumer_group_offsets(self, cg_tp):
return self.kafka_client.list_consumer_group_offsets([cg_tp])

def close_admin_client(self):
self._kafka_client = None

def _get_consumer_offset_futures(self, consumer_groups):
futures = []

@@ -271,11 +280,8 @@ def _get_consumer_offset_futures(self, consumer_groups):
# If partitions are not defined
else:
# get all the partitions for this topic
-partitions = (
-    self.kafka_client.list_topics(topic=topic, timeout=self.config._request_timeout)
-    .topics[topic]
-    .partitions
-)
+partitions = self.get_partitions_for_topic(topic)

topic_partitions = [TopicPartition(topic, partition) for partition in partitions]

futures.append(
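For context, `close_admin_client` only works because the `kafka_client` property (collapsed in the hunk above) creates the client lazily, so clearing the reference forces the next run to reconnect. A minimal sketch of that pattern as this diff implies it, assuming a bare `AdminClient` config (the real check also merges in authentication settings):

```python
from confluent_kafka.admin import AdminClient

class KafkaClient:
    def __init__(self, config, log) -> None:
        self.config = config
        self.log = log
        self._kafka_client = None        # built lazily on first access
        self.topic_partition_cache = {}  # topic -> list of partition ids

    @property
    def kafka_client(self):
        # Lazy (re)creation: after close_admin_client() clears the
        # reference, the next access reconnects with a fresh client.
        if self._kafka_client is None:
            self._kafka_client = AdminClient(
                {"bootstrap.servers": self.config._kafka_connect_str}
            )
        return self._kafka_client

    def close_admin_client(self):
        # Drop the reference so the old client and its buffers become
        # garbage-collectable between check runs.
        self._kafka_client = None
```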
4 changes: 4 additions & 0 deletions kafka_consumer/datadog_checks/kafka_consumer/config.py
@@ -27,6 +27,10 @@ def __init__(self, init_config, instance, log) -> None:
if self._consumer_groups_regex
else ""
)
# Optimization to avoid OOM kill:
# https://github.com/confluentinc/confluent-kafka-python/issues/759
self._consumer_queued_max_messages_kbytes = instance.get('consumer_queued_max_messages_kbytes', 1024)
self._close_admin_client = instance.get('close_admin_client', True)

self._kafka_connect_str = instance.get('kafka_connect_str')
self._kafka_version = instance.get('kafka_client_api_version')
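These two `instance.get(...)` calls are the whole wiring: a value from the instance YAML wins, otherwise the memory-friendly defaults apply. A small illustration with a hypothetical instance dict:

```python
# Hypothetical instance as parsed from kafka_consumer.d/conf.yaml
instance = {
    "kafka_connect_str": "localhost:9092",
    "consumer_queued_max_messages_kbytes": 512,  # user override
    # close_admin_client deliberately omitted
}

# Mirrors the lookups in config.py above
kbytes = instance.get("consumer_queued_max_messages_kbytes", 1024)  # -> 512
close_client = instance.get("close_admin_client", True)             # -> True (default)
```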
@@ -12,6 +12,14 @@ def shared_kafka_timeout():
return 5


def instance_close_admin_client():
return True


def instance_consumer_queued_max_messages_kbytes():
return 1024


def instance_disable_generic_tags():
return False

@@ -45,8 +45,10 @@ class InstanceConfig(BaseModel):
arbitrary_types_allowed=True,
frozen=True,
)
close_admin_client: Optional[bool] = None
consumer_groups: Optional[MappingProxyType[str, Any]] = None
consumer_groups_regex: Optional[MappingProxyType[str, Any]] = None
consumer_queued_max_messages_kbytes: Optional[int] = None
disable_generic_tags: Optional[bool] = None
empty_default_hostname: Optional[bool] = None
kafka_client_api_version: Optional[str] = None
@@ -104,6 +104,22 @@ instances:
#
# monitor_unlisted_consumer_groups: false

## @param consumer_queued_max_messages_kbytes - integer - optional - default: 1024
## The consumer caches messages very aggressively in the background (it is tuned for very high throughput).
## To reduce memory usage, tune down queued.max.messages.kbytes, the maximum cache size per partition.
## The integration check overrides the Kafka default (1GB per client) down to 1MB
## to limit memory consumption and avoid potential out-of-memory (OOM) kills.
#
# consumer_queued_max_messages_kbytes: 1024

## @param close_admin_client - boolean - optional - default: true
## Release the AdminClient at the end of each check run so it can be garbage collected. Originally, the same
## AdminClient was kept alive for the entire life of the check. Deallocating it after each run frees any
## memory held by the client, at the cost of a slower check run (a new client must reconnect each time).
## Set this option to false to favor performance over memory usage.
#
# close_admin_client: true

## @param monitor_all_broker_highwatermarks - boolean - optional - default: false
## Setting monitor_all_broker_highwatermarks to `true` tells the check to
## discover and fetch the broker highwater mark offsets for all kafka topics in
@@ -45,6 +45,8 @@ def check(self, _):
except Exception:
self.log.exception("There was a problem collecting the highwater mark offsets.")
# Unlike consumer offsets, fail immediately because we can't calculate consumer lag w/o highwater_offsets
if self.config._close_admin_client:
self.client.close_admin_client()
raise

total_contexts = len(consumer_offsets) + len(highwater_offsets)
@@ -67,6 +69,8 @@ def check(self, _):
self.report_consumer_offsets_and_lag(
consumer_offsets, highwater_offsets, self._context_limit - len(highwater_offsets)
)
if self.config._close_admin_client:
self.client.close_admin_client()

def report_highwater_offsets(self, highwater_offsets, contexts_limit):
"""Report the broker highwater offsets."""
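The client is released on both visible paths: right before the re-raise when highwater collection fails, and again after a successful run. Conceptually this is close-at-end-of-run; a simplified sketch (hypothetical condensed flow, not the literal check body) expressing it as try/finally:

```python
def check(self, _):
    consumer_offsets = self.client.get_consumer_offsets()
    try:
        highwater_offsets = self.client.get_highwater_offsets(consumer_offsets)
        self.report_highwater_offsets(highwater_offsets, self._context_limit)
        self.report_consumer_offsets_and_lag(
            consumer_offsets,
            highwater_offsets,
            self._context_limit - len(highwater_offsets),
        )
    finally:
        # Release the AdminClient so its memory can be reclaimed between
        # runs; skipped when close_admin_client is set to false.
        if self.config._close_admin_client:
            self.client.close_admin_client()
```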
2 changes: 1 addition & 1 deletion requirements-agent-release.txt
@@ -79,7 +79,7 @@ datadog-impala==2.0.0
datadog-istio==5.1.0
datadog-jboss-wildfly==2.1.1
datadog-journald==1.1.1
-datadog-kafka-consumer==4.1.2
+datadog-kafka-consumer==4.1.3
datadog-kafka==2.14.1
datadog-kong==3.0.0
datadog-kube-apiserver-metrics==4.0.0