Add schema collection to Postgres integration (#15484) (#15866)

DataDog · Sep 19, 2023 · d25a65e · d25a65e
1 parent 454fbde
commit d25a65e
Show file tree

Hide file tree

Showing 14 changed files with 640 additions and 18 deletions.
diff --git a/postgres/CHANGELOG.md b/postgres/CHANGELOG.md
@@ -6,6 +6,10 @@
 
 * Attempt to connect to the database and fail fast before trying to establish a connection pool ([#15839](https://github.com/DataDog/integrations-core/pull/15839))
 
+***Added***:
+
+* Add schema collection to Postgres integration (#15484) ([#15866](https://github.com/DataDog/integrations-core/pull/15866))
+
 ***Fixed***:
 
 * Revert psycopg3 upgrade ([#15859](https://github.com/DataDog/integrations-core/pull/15859))

diff --git a/postgres/assets/configuration/spec.yaml b/postgres/assets/configuration/spec.yaml
@@ -502,6 +502,40 @@ files:
             type: number
             example: 600
 
+    - name: collect_schemas
+      description: |
+        Enable collection of database schemas. To collect schemas from all user databases,
+        enable `database_autodiscovery`. To collect the schema of a single database, set
+        `dbname` to that database.
+        Relation metrics must be enabled for schema collection.
+      options:
+        - name: enabled
+          description: |
+            Enable collection of database schemas. Requires `dbm: true` and relation metrics to be enabled.
+          value:
+            type: boolean
+            example: false
+        - name: max_tables
+          description: |
+            Maximum number of tables the Agent collects from the instance.
+          value:
+            type: number
+            example: 1000
+            display_default: 1000
+        - name: max_columns
+          description: |
+            Maximum number of columns the Agent collects per table.
+          value:
+            type: number
+            example: 50
+            display_default: 50
+        - name: collection_interval
+          description: |
+            The database schema collection interval (in seconds).
+          value:
+            type: number
+            example: 600
+
     - name: aws
       description: |
         This block defines the configuration for AWS RDS and Aurora instances. 

diff --git a/postgres/datadog_checks/postgres/config.py b/postgres/datadog_checks/postgres/config.py
@@ -98,6 +98,12 @@ def __init__(self, instance):
         self.pg_stat_activity_view = instance.get('pg_stat_activity_view', 'pg_stat_activity')
         self.statement_samples_config = instance.get('query_samples', instance.get('statement_samples', {})) or {}
         self.settings_metadata_config = instance.get('collect_settings', {}) or {}
+        self.schemas_metadata_config = instance.get('collect_schemas', {"enabled": False})
+        if not self.relations and self.schemas_metadata_config['enabled']:
+            raise ConfigurationError(
+                'In order to collect schemas on this database, you must enable relation metrics collection.'
+            )
+
         self.resources_metadata_config = instance.get('collect_resources', {}) or {}
         self.statement_activity_config = instance.get('query_activity', {}) or {}
         self.statement_metrics_config = instance.get('query_metrics', {}) or {}

diff --git a/postgres/datadog_checks/postgres/config_models/instance.py b/postgres/datadog_checks/postgres/config_models/instance.py
@@ -38,6 +38,17 @@ class Azure(BaseModel):
     fully_qualified_domain_name: Optional[str] = None
 
 
+class CollectSchemas(BaseModel):  # options of the `collect_schemas` instance setting
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        frozen=True,
+    )
+    collection_interval: Optional[float] = None  # schema collection interval, in seconds
+    enabled: Optional[bool] = None  # whether schema collection is turned on
+    max_columns: Optional[float] = None  # maximum number of columns collected per table
+    max_tables: Optional[float] = None  # maximum number of tables collected from the instance
+
 class CollectSettings(BaseModel):
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
@@ -163,6 +174,7 @@ class InstanceConfig(BaseModel):
     collect_database_size_metrics: Optional[bool] = None
     collect_default_database: Optional[bool] = None
     collect_function_metrics: Optional[bool] = None
+    collect_schemas: Optional[CollectSchemas] = None
     collect_settings: Optional[CollectSettings] = None
     collect_wal_metrics: Optional[bool] = None
     custom_queries: Optional[tuple[MappingProxyType[str, Any], ...]] = None

diff --git a/postgres/datadog_checks/postgres/data/conf.yaml.example b/postgres/datadog_checks/postgres/data/conf.yaml.example
@@ -404,6 +404,33 @@ instances:
         #
         # collection_interval: 600
 
+    ## Enable collection of database schemas. To collect schemas from all user databases,
+    ## enable `database_autodiscovery`. To collect the schema of a single database, set
+    ## `dbname` to that database.
+    ## Relation metrics must be enabled for schema collection.
+    #
+    # collect_schemas:
+
+        ## @param enabled - boolean - optional - default: false
+        ## Enable collection of database schemas. Requires `dbm: true` and relation metrics to be enabled.
+        #
+        # enabled: false
+
+        ## @param max_tables - number - optional - default: 1000
+        ## Maximum number of tables the Agent collects from the instance.
+        #
+        # max_tables: 1000
+
+        ## @param max_columns - number - optional - default: 50
+        ## Maximum number of columns the Agent collects per table.
+        #
+        # max_columns: 50
+
+        ## @param collection_interval - number - optional - default: 600
+        ## The database schema collection interval (in seconds).
+        #
+        # collection_interval: 600
+
     ## This block defines the configuration for AWS RDS and Aurora instances. 
     ##
     ## Complete this section if you have installed the Datadog AWS Integration 

diff --git a/postgres/datadog_checks/postgres/explain_parameterized_queries.py b/postgres/datadog_checks/postgres/explain_parameterized_queries.py
@@ -3,6 +3,7 @@
 # Licensed under a 3-clause BSD style license (see LICENSE)
 
 import logging
+import re
 
 import psycopg2
 
@@ -169,3 +170,11 @@ def _execute_query_and_fetch_rows(self, dbname, query):
             with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
                 cursor.execute(query)
                 return cursor.fetchall()
+
+    def _is_parameterized_query(self, statement: str) -> bool:
+        # Return True if the statement contains a positional placeholder: `$` followed by digits.
+        # A `$` directly preceded by a single quote (as in '$1') is treated as text, not a parameter.
+        # e.g. SELECT * FROM products WHERE id = $1; -- $1 is a parameter
+        # e.g. SELECT * FROM products WHERE id = '$1'; -- '$1' is not a parameter
+        parameterized_query_pattern = r"(?<!')\$(?!'\$')[\d]+(?!')"
+        return re.search(parameterized_query_pattern, statement) is not None