diff --git a/cerberus/kubernetes/client.py b/cerberus/kubernetes/client.py index 8d21804..0ba3f2e 100644 --- a/cerberus/kubernetes/client.py +++ b/cerberus/kubernetes/client.py @@ -261,14 +261,21 @@ def namespace_sleep_tracker(namespace, pods_tracker): # Monitor the status of the pods in the specified namespace # and set the status to true or false -def monitor_namespace(namespace): +def monitor_namespace(namespace, ignore_pattern=None): notready_pods = set() + match = False notready_containers = defaultdict(list) all_pod_info_list = get_all_pod_info(namespace) if all_pod_info_list is not None and len(all_pod_info_list) > 0: for all_pod_info in all_pod_info_list: for pod_info in all_pod_info.items: pod = pod_info.metadata.name + if ignore_pattern: + for pattern in ignore_pattern: + if re.match(pattern, pod): + match = True + if match: + continue pod_status = pod_info.status pod_status_phase = pod_status.phase if pod_status_phase != "Running" and pod_status_phase != "Succeeded": @@ -295,8 +302,8 @@ def monitor_namespace(namespace): return status, notready_pods, notready_containers -def process_namespace(iteration, namespace, failed_pods_components, failed_pod_containers): - watch_component_status, failed_component_pods, failed_containers = monitor_namespace(namespace) +def process_namespace(iteration, namespace, failed_pods_components, failed_pod_containers, ignore_pattern): + watch_component_status, failed_component_pods, failed_containers = monitor_namespace(namespace, ignore_pattern) logging.info("Iteration %s: %s: %s" % (iteration, namespace, watch_component_status)) if not watch_component_status: failed_pods_components[namespace] = failed_component_pods diff --git a/config/config.yaml b/config/config.yaml index f99423b..705bd86 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -19,6 +19,7 @@ cerberus: - openshift-kube-scheduler - openshift-ingress - openshift-sdn # When enabled, it will check for the cluster sdn and monitor that namespace + 
watch_namespaces_ignore_pattern: [^installer*] # List of regex patterns; pods whose names match any of them (matched from the start of the pod name) are ignored in the namespaces specified under watch_namespaces cerberus_publish_status: True # When enabled, cerberus starts a light weight http server and publishes the status inspect_components: False # Enable it only when OpenShift client is supported to run # When enabled, cerberus collects logs, events and metrics of failed components diff --git a/config/kubernetes_config.yaml b/config/kubernetes_config.yaml index ea61f36..87b4e2d 100644 --- a/config/kubernetes_config.yaml +++ b/config/kubernetes_config.yaml @@ -9,6 +9,7 @@ cerberus: label: node-role.kubernetes.io/master watch_namespaces: # List of namespaces to be monitored - kube-system + watch_namespaces_ignore_pattern: [] # List of regex patterns; pods whose names match any of them (matched from the start of the pod name) are ignored in the namespaces specified under watch_namespaces cerberus_publish_status: True # When enabled, cerberus starts a light weight http server and publishes the status inspect_components: False # Enable it only when OpenShift client is supported to run # When enabled, cerberus collects logs, events and metrics of failed components diff --git a/docs/config.md b/docs/config.md index 2531914..280f435 100644 --- a/docs/config.md +++ b/docs/config.md @@ -37,6 +37,7 @@ cerberus: - openshift-kube-scheduler - openshift-ingress - openshift-sdn # When enabled, it will check for the cluster sdn and monitor that namespace + watch_namespaces_ignore_pattern: [] # List of regex patterns; pods whose names match any of them (matched from the start of the pod name) are ignored in the namespaces specified under watch_namespaces cerberus_publish_status: True # When enabled, cerberus starts a light weight http server and publishes the status inspect_components: False # Enable it only when OpenShift client is supported to run # When enabled, cerberus collects logs, events and metrics of failed components diff --git a/start_cerberus.py b/start_cerberus.py index e16e279..5c5088d 100644 --- a/start_cerberus.py +++ b/start_cerberus.py @@ -79,6 +79,7 @@ def main(cfg): 
watch_nodes = config["cerberus"].get("watch_nodes", False) watch_cluster_operators = config["cerberus"].get("watch_cluster_operators", False) watch_namespaces = config["cerberus"].get("watch_namespaces", []) + watch_namespaces_ignore_pattern = config["cerberus"].get("watch_namespaces_ignore_pattern", []) watch_terminating_namespaces = config["cerberus"].get("watch_terminating_namespaces", True) watch_url_routes = config["cerberus"].get("watch_url_routes", []) watch_master_schedulable = config["cerberus"].get("watch_master_schedulable", {}) @@ -288,6 +289,7 @@ def main(cfg): watch_namespaces, repeat(failed_pods_components), repeat(failed_pod_containers), + repeat(watch_namespaces_ignore_pattern), ), )