diff --git a/README.md b/README.md index 7bb676d10..efbcef0d5 100644 --- a/README.md +++ b/README.md @@ -478,3 +478,14 @@ Open a PR with these changes, and it will be reviewed and merged as per [the sta ## Running ad hoc jobs using a different image There may be a job (ClowdJobInvocation) which requires using a special image that is different from the one used by the parent application, i.e. host-inventory. Clowder out-of-the-box does not allow it. [Running a Special Job](docs/running_special_job.md) describes how to accomplish it. + +## Logging System Profile fields + +Use the environment variable `SP_FIELDS_TO_LOG` to log the System Profile fields of a host. +These fields are logged when adding, updating or deleting a host from inventory. It is very helpful when debugging hosts in Kibana. + +Below is an example on how to use to use the environment variable: + +```bash +SP_FIELDS_TO_LOG = "cpu_model,disk_devices" +``` diff --git a/app/config.py b/app/config.py index d0c180d56..7e5f726a2 100644 --- a/app/config.py +++ b/app/config.py @@ -320,6 +320,8 @@ def __init__(self, runtime_environment): self.mq_db_batch_max_messages = int(os.getenv("MQ_DB_BATCH_MAX_MESSAGES", "1")) self.mq_db_batch_max_seconds = float(os.getenv("MQ_DB_BATCH_MAX_SECONDS", "0.5")) + self.sp_fields_to_log = os.getenv("SP_FIELDS_TO_LOG", "").split(",") + if self._runtime_environment == RuntimeEnvironment.PENDO_JOB: self.pendo_sync_active = os.environ.get("PENDO_SYNC_ACTIVE", "false").lower() == "true" self.pendo_endpoint = os.environ.get("PENDO_ENDPOINT", "https://app.pendo.io/api/v1") diff --git a/app/instrumentation.py b/app/instrumentation.py index 43628fb67..cd267d2ba 100644 --- a/app/instrumentation.py +++ b/app/instrumentation.py @@ -85,8 +85,12 @@ def get_control_rule(): # delete host -def log_host_delete_succeeded(logger, host_id, control_rule): - logger.info("Deleted host: %s", host_id, extra={"access_rule": control_rule}) +def log_host_delete_succeeded(logger, host_id, control_rule, sp_fields_to_log): + logger.info( + "Deleted host: %s", + host_id, + extra={"access_rule": control_rule, "system_profile": json.dumps(sp_fields_to_log)}, + ) def log_host_delete_failed(logger, host_id, control_rule): @@ -216,7 +220,7 @@ def log_get_sparse_system_profile_succeeded(logger, data): # add host -def log_add_host_attempt(logger, input_host): +def log_add_host_attempt(logger, input_host, sp_fields_to_log): logger.info( "Attempting to add host", extra={ @@ -228,13 +232,14 @@ def log_add_host_attempt(logger, input_host): "reporter": input_host.reporter, "stale_timestamp": input_host.stale_timestamp.isoformat(), "tags": json.dumps(input_host.tags), + "system_profile": json.dumps(sp_fields_to_log), }, "access_rule": get_control_rule(), }, ) -def log_add_update_host_succeeded(logger, add_result, output_host): +def log_add_update_host_succeeded(logger, add_result, sp_fields_to_log, output_host): metrics.add_host_success.labels(add_result.name, output_host.get("reporter", "null")).inc() # created vs updated # log all the incoming host data except facts and system_profile b/c they can be quite large logger.info( @@ -242,24 +247,34 @@ def log_add_update_host_succeeded(logger, add_result, output_host): add_result.name, extra={ "host": {i: output_host[i] for i in output_host if i not in ("facts", "system_profile")}, + "system_profile": json.dumps(sp_fields_to_log), "access_rule": get_control_rule(), }, ) -def log_add_host_failure(logger, message, host_data): - logger.exception(f"Error adding host: {message} ", extra={"host": host_data}) +def log_add_host_failure(logger, message, host_data, sp_fields_to_log): + logger.exception( + f"Error adding host: {message}", extra={"host": host_data, "system_profile": json.dumps(sp_fields_to_log)} + ) metrics.add_host_failure.labels("InventoryException", host_data.get("reporter", "null")).inc() # update system profile -def log_update_system_profile_success(logger, host_data): +def log_update_system_profile_success(logger, host_data, sp_fields_to_log): metrics.update_system_profile_success.inc() - logger.info("System profile updated for host ID: %s", host_data.get("id")) + logger.info( + "System profile updated for host ID: %s", + host_data.get("id"), + extra={"system_profile": json.dumps(sp_fields_to_log)}, + ) -def log_update_system_profile_failure(logger, host_data): - logger.exception("Error updating system profile for host ", extra={"host": host_data}) +def log_update_system_profile_failure(logger, host_data, sp_fields_to_log): + logger.exception( + "Error updating system profile for host", + extra={"host": host_data, "system_profile": json.dumps(sp_fields_to_log)}, + ) metrics.update_system_profile_failure.labels("InventoryException").inc() @@ -300,7 +315,7 @@ def rbac_group_permission_denied(logger, group_ids, required_permission): def log_db_access_failure(logger, message, host_data): - logger.error("Failure to access database ", f"{message}") + logger.error("Failure to access database %s", message) metrics.db_communication_error.labels("OperationalError", host_data.get("insights_id", message)).inc() diff --git a/app/queue/host_mq.py b/app/queue/host_mq.py index 0f255ca1a..67f157c99 100644 --- a/app/queue/host_mq.py +++ b/app/queue/host_mq.py @@ -55,6 +55,7 @@ from lib.db import session_guard from lib.feature_flags import FLAG_INVENTORY_USE_CACHED_INSIGHTS_CLIENT_SYSTEM from lib.feature_flags import get_flag_value +from utils.system_profile_log import extract_host_dict_sp_to_log logger = get_logger(__name__) @@ -220,24 +221,29 @@ def update_system_profile(host_data, platform_metadata, notification_event_produ if operation_args is None: operation_args = {} + sp_fields_to_log = extract_host_dict_sp_to_log(host_data) + try: input_host = deserialize_host(host_data, schema=LimitedHostSchema) input_host.id = host_data.get("id") identity = create_mock_identity_with_org_id(input_host.org_id) output_host, update_result = host_repository.update_system_profile(input_host, identity) - success_logger = partial(log_update_system_profile_success, logger) + success_logger = partial(log_update_system_profile_success, logger, sp_fields_to_log) return output_host, update_result, identity, success_logger except ValidationException: metrics.update_system_profile_failure.labels("ValidationException").inc() raise except InventoryException: - log_update_system_profile_failure(logger, host_data) + log_update_system_profile_failure(logger, host_data, sp_fields_to_log) raise except OperationalError as oe: log_db_access_failure(logger, f"Could not access DB {str(oe)}", host_data) raise oe except Exception: - logger.exception("Error while updating host system profile", extra={"host": host_data}) + logger.exception( + "Error while updating host system profile", + extra={"host": host_data, "system_profile": sp_fields_to_log}, + ) metrics.update_system_profile_failure.labels("Exception").inc() raise @@ -245,6 +251,8 @@ def update_system_profile(host_data, platform_metadata, notification_event_produ def add_host(host_data, platform_metadata, notification_event_producer, operation_args=None): if operation_args is None: operation_args = {} + + sp_fields_to_log = extract_host_dict_sp_to_log(host_data) try: identity = _get_identity(host_data, platform_metadata) # basic-auth does not need owner_id @@ -252,22 +260,23 @@ def add_host(host_data, platform_metadata, notification_event_producer, operatio host_data = _set_owner(host_data, identity) input_host = deserialize_host(host_data) - log_add_host_attempt(logger, input_host) + log_add_host_attempt(logger, input_host, sp_fields_to_log) host_row, add_result = host_repository.add_host(input_host, identity, operation_args=operation_args) - success_logger = partial(log_add_update_host_succeeded, logger, add_result) + success_logger = partial(log_add_update_host_succeeded, logger, add_result, sp_fields_to_log) + # raise InventoryException return host_row, add_result, identity, success_logger except ValidationException: metrics.add_host_failure.labels("ValidationException", host_data.get("reporter", "null")).inc() raise except InventoryException as ie: - log_add_host_failure(logger, str(ie.detail), host_data) + log_add_host_failure(logger, str(ie.detail), host_data, sp_fields_to_log) raise except OperationalError as oe: log_db_access_failure(logger, f"Could not access DB {str(oe)}", host_data) raise oe except Exception: - logger.exception("Error while adding host", extra={"host": host_data}) + logger.exception("Error while adding host", extra={"host": host_data, "system_profile": sp_fields_to_log}) metrics.add_host_failure.labels("Exception", host_data.get("reporter", "null")).inc() raise diff --git a/lib/host_delete.py b/lib/host_delete.py index 586264641..7387fb813 100644 --- a/lib/host_delete.py +++ b/lib/host_delete.py @@ -18,6 +18,7 @@ from lib.host_kafka import kafka_available from lib.metrics import delete_host_count from lib.metrics import delete_host_processing_time +from utils.system_profile_log import extract_host_model_sp_to_log __all__ = ("delete_hosts",) logger = get_logger(__name__) @@ -72,6 +73,7 @@ def delete_hosts( def _delete_host(session, host, identity, control_rule) -> OperationResult: + sp_fields_to_log = extract_host_model_sp_to_log(host) assoc_delete_query = session.query(HostGroupAssoc).filter(HostGroupAssoc.host_id == host.id) host_delete_query = session.query(Host).filter(Host.id == host.id) assoc_delete_query.delete(synchronize_session="fetch") @@ -82,7 +84,7 @@ def _delete_host(session, host, identity, control_rule) -> OperationResult: None, None, EventType.delete, - partial(log_host_delete_succeeded, logger, host.id, control_rule), + partial(log_host_delete_succeeded, logger, host.id, control_rule, sp_fields_to_log), ) diff --git a/logconfig.yaml b/logconfig.yaml index f7e186bbc..30834447e 100644 --- a/logconfig.yaml +++ b/logconfig.yaml @@ -34,7 +34,7 @@ loggers: handlers: - logstash - cloudwatch - propagate: false + propagate: true urllib3: level: WARNING handlers: diff --git a/tests/test_api_hosts_delete.py b/tests/test_api_hosts_delete.py index b4d7efe32..d93638b9b 100644 --- a/tests/test_api_hosts_delete.py +++ b/tests/test_api_hosts_delete.py @@ -1,3 +1,4 @@ +import logging from unittest import mock from unittest.mock import patch @@ -587,6 +588,32 @@ def test_postgres_delete_filtered_hosts_nomatch( assert response_data["results"][0]["id"] == not_deleted_host_id +def test_log_create_delete( + event_datetime_mock, + event_producer_mock, + notification_event_producer_mock, + db_create_host, + db_get_host, + api_delete_host, + caplog, +): + caplog.at_level(logging.INFO) + host = db_create_host() + + response_status, _ = api_delete_host(host.id) + + assert_response_status(response_status, expected_status=200) + + assert_delete_event_is_valid(event_producer=event_producer_mock, host=host, timestamp=event_datetime_mock) + assert_delete_notification_is_valid( + notification_event_producer=notification_event_producer_mock, + host=host, + ) + + assert not db_get_host(host.id) + assert caplog.records[0].system_profile == "{}" + + class DeleteHostsMock: @classmethod def create_mock(cls, hosts_ids_to_delete): diff --git a/tests/test_host_mq_service.py b/tests/test_host_mq_service.py index f8589b055..4cb7235fb 100644 --- a/tests/test_host_mq_service.py +++ b/tests/test_host_mq_service.py @@ -1,4 +1,5 @@ import json +import logging from copy import deepcopy from datetime import datetime from datetime import timedelta @@ -1989,3 +1990,53 @@ def test_batch_mq_graceful_rollback(mocker, flask_app): # Since batch size is 3 and we're sending 5 messages,the first batch (3 messages) will get dropped, # but the second batch (2 messages) should have events produced. assert write_batch_patch.call_count == 1 + + +@pytest.mark.parametrize("identity", (SYSTEM_IDENTITY,)) +def test_add_host_logs(identity, mocker, flask_app, caplog): + caplog.at_level(logging.INFO) + + expected_insights_id = generate_uuid() + host = minimal_host(account=identity["account_number"], insights_id=expected_insights_id) + + mock_notification_event_producer = mocker.Mock() + + message = wrap_message(host.data(), "add_host", get_platform_metadata(identity)) + result = handle_message(json.dumps(message), mock_notification_event_producer) + + assert result.event_type == EventType.created + assert result.host_row.canonical_facts["insights_id"] == expected_insights_id + assert caplog.records[0].input_host["system_profile"] == "{}" + mock_notification_event_producer.write_event.assert_not_called() + + +@pytest.mark.parametrize("id_type", ("id", "insights_id", "fqdn")) +def test_log_update_system_profile(mq_create_or_update_host, db_get_host, id_type, caplog): + caplog.at_level(logging.INFO) + expected_ids = {"insights_id": generate_uuid(), "fqdn": "foo.test.redhat.com"} + input_host = base_host(**expected_ids, system_profile={"owner_id": OWNER_ID, "number_of_cpus": 1}) + first_host_from_event = mq_create_or_update_host(input_host) + first_host_from_db = db_get_host(first_host_from_event.id) + expected_ids["id"] = str(first_host_from_db.id) + + assert str(first_host_from_db.canonical_facts["insights_id"]) == expected_ids["insights_id"] + assert first_host_from_db.system_profile_facts.get("number_of_cpus") == 1 + + input_host = base_host( + **{id_type: expected_ids[id_type]}, system_profile={"number_of_cpus": 4, "number_of_sockets": 8} + ) + input_host.stale_timestamp = None + input_host.reporter = None + second_host_from_event = mq_create_or_update_host(input_host, message_operation=update_system_profile) + second_host_from_db = db_get_host(second_host_from_event.id) + + # The second host should have the same ID and insights ID, + # and the system profile should have updated with the new values. + assert str(second_host_from_db.id) == first_host_from_event.id + assert str(second_host_from_db.canonical_facts["insights_id"]) == expected_ids["insights_id"] + assert second_host_from_db.system_profile_facts == { + "owner_id": OWNER_ID, + "number_of_cpus": 4, + "number_of_sockets": 8, + } + assert caplog.records[0].input_host["system_profile"] == "{}" diff --git a/utils/system_profile_log.py b/utils/system_profile_log.py new file mode 100644 index 000000000..3611a91f7 --- /dev/null +++ b/utils/system_profile_log.py @@ -0,0 +1,17 @@ +from app.common import inventory_config +from app.models import Host + + +def extract_sp_to_log(sp_data: dict) -> dict: + return {k: sp_data[k] for k in inventory_config().sp_fields_to_log if k in sp_data} + + +def extract_host_model_sp_to_log(host: Host) -> dict: + return extract_sp_to_log(host.system_profile_facts) + + +def extract_host_dict_sp_to_log(host_data: dict) -> dict: + if "system_profile" in host_data: + return extract_sp_to_log(host_data["system_profile"]) + else: + return {}