Improvements for MG collections #11161

Merged
2 changes: 2 additions & 0 deletions conf/README.md
@@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
* `save_mem_report` - If True, test run memory report CSV file will be saved in `RUN["log_dir"]/stats_log_dir_<run_id>`
directory along with <test name>.peak_rss_table, <test name>.peak_vms_table reports. The option may be enforced by
exporting env variable: export SAVE_MEM_REPORT=true
* `max_mg_fail_attempts` - Maximum number of attempts to run MG (must-gather) commands, to prevent
spending more time on MG collections that keep timing out.

#### ENV_DATA

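To illustrate the `max_mg_fail_attempts` option documented above, here is a minimal sketch of how a failure cap can gate repeated must-gather runs. The counter and helper names are illustrative only; in ocs-ci the real bookkeeping lives in `ocs_ci.ocs.utils` (e.g. `utils.mg_fail_count`, used further down in this PR).

```python
# Illustrative sketch only - not the ocs-ci implementation.
MAX_MG_FAIL_ATTEMPTS = 3  # mirrors REPORTING["max_mg_fail_attempts"]

mg_fail_count = 0  # incremented after every failed / timed-out MG run


def record_mg_failure():
    """Remember that a must-gather attempt failed or timed out."""
    global mg_fail_count
    mg_fail_count += 1


def should_skip_mg(skip_after_max_fail=True):
    """Return True once the configured failure budget is exhausted."""
    return skip_after_max_fail and mg_fail_count >= MAX_MG_FAIL_ATTEMPTS
```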
18 changes: 15 additions & 3 deletions ocs_ci/deployment/deployment.py
@@ -231,7 +231,11 @@ def do_deploy_ocp(self, log_cli_level):
config.RUN["is_ocp_deployment_failed"] = True
logger.error(e)
if config.REPORTING["gather_on_deploy_failure"]:
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs(
"deployment",
ocs=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
raise

def do_deploy_submariner(self):
@@ -381,8 +385,16 @@ def do_deploy_ocs(self):
if config.REPORTING["gather_on_deploy_failure"]:
# Let's do the collections separately to guard against one
# of them failing
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocp=False)
collect_ocs_logs(
"deployment",
ocs=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
collect_ocs_logs(
"deployment",
ocp=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
raise
config.reset_ctx()
# Run ocs_install_verification here only in case of multicluster.
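A hedged sketch of the "collect separately" idea from the hunk above: each flavour of logs gets its own call (and its own timeout), so a failing OCP must-gather cannot prevent the OCS must-gather from running. The try/except guard here is only illustrative; in the deployment code the two calls are simply issued one after another as shown in the diff.

```python
# Sketch only: run OCP and OCS collections as independent calls so that a
# failure or timeout in one does not block the other. The collect_ocs_logs
# passed in stands in for ocs_ci.ocs.utils.collect_ocs_logs.
import logging

logger = logging.getLogger(__name__)

MUST_GATHER_TIMEOUT = 3600  # same value as ocs_ci.ocs.defaults.MUST_GATHER_TIMEOUT


def gather_deployment_logs(collect_ocs_logs):
    for kwargs in ({"ocs": False}, {"ocp": False}):  # OCP-only, then OCS-only
        try:
            collect_ocs_logs("deployment", timeout=MUST_GATHER_TIMEOUT, **kwargs)
        except Exception:
            # Keep going - losing one set of logs should not cost us the other.
            logger.exception("Log collection failed for %s", kwargs)
```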
1 change: 1 addition & 0 deletions ocs_ci/framework/conf/default_config.yaml
@@ -141,6 +141,7 @@ REPORTING:
gather_on_deploy_failure: true
collect_logs_on_success_run: False
rp_client_log_level: "ERROR"
max_mg_fail_attempts: 3

# This is the default information about environment.
ENV_DATA:
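For completeness, the new default can be read back the same way the REPORTING section is consumed elsewhere in this PR (see the `ocsci_config.REPORTING.get(...)` call in the ocscilib.py hunk below); the import path shown is the standard ocs-ci framework config object.

```python
from ocs_ci.framework import config

# Fall back to 3 if a custom config does not override the new option.
max_mg_fail_attempts = config.REPORTING.get("max_mg_fail_attempts", 3)
```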
34 changes: 27 additions & 7 deletions ocs_ci/framework/pytest_customization/ocscilib.py
@@ -20,6 +20,7 @@
ClusterNameNotProvidedError,
ClusterPathNotProvidedError,
)
from ocs_ci.ocs import defaults
from ocs_ci.ocs.constants import (
CLUSTER_NAME_MAX_CHARACTERS,
CLUSTER_NAME_MIN_CHARACTERS,
@@ -31,7 +32,7 @@
)
from ocs_ci.ocs.cluster import check_clusters
from ocs_ci.ocs.resources.ocs import get_version_info
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
from ocs_ci.ocs import utils
from ocs_ci.utility.utils import (
dump_config_to_file,
get_ceph_version,
@@ -729,34 +730,53 @@ def pytest_runtest_makereport(item, call):
and ocsci_config.RUN.get("cli_params").get("collect-logs")
and not ocsci_config.RUN.get("cli_params").get("deploy")
):
item_markers = {marker.name for marker in item.iter_markers()}
test_case_name = item.name
# TODO: We should avoid paths and rely on markers issue:
# https://github.com/red-hat-storage/ocs-ci/issues/10526
ocp_logs_collection = (
True
if any(
x in item.location[0]
for x in [
"ecosystem",
"e2e/performance",
"tests/functional/z_cluster",
]
)
else False
)
ocp_markers_to_collect = {
"performance",
"purple_squad",
}
if ocp_markers_to_collect & item_markers:
ocp_logs_collection = True
ocs_logs_collection = (
False
if any(x in item.location[0] for x in ["_ui", "must_gather"])
else True
)
mcg_logs_collection = (
True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
mcg_markers_to_collect = {
"mcg",
"purple_squad",
}
# For every MG failure we extend the timeout of the next attempt by 20 minutes
Contributor:
This would be up to 2 hours of waiting (20 min + 40 min + 60 min) if the default max_mg_fail_attempts is used.
Isn't that too much time?

Member Author:
We sometimes see that even 60 minutes is really not enough, so I am giving must-gather a chance with more time to collect logs, so we can analyze whether MG gives us anything. If it fails 3 times, nothing more is attempted, which is still better than before: in one example run, more than 24 hours were spent on collections that always timed out and we did not get any logs. I was thinking that this increased time might help us get some logs from MG and identify why it is taking longer than it used to. We can reduce the allowed number of failed MG attempts to 2 if needed.

Contributor:
As long as we don't try to collect again in any later test case, except for once before teardown, this should be fine.

Member Author:
If we reach the maximum number of attempts, we do not collect again and skip the whole collection.

adjusted_timeout = utils.mg_fail_count * 1200
timeout = ocsci_config.REPORTING.get(
"must_gather_timeout", defaults.MUST_GATHER_TIMEOUT + adjusted_timeout
)
log.info(f"Adjusted timeout for MG is {timeout} seconds")
mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
try:
if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
collect_ocs_logs(
utils.collect_ocs_logs(
dir_name=test_case_name,
ocp=ocp_logs_collection,
ocs=ocs_logs_collection,
mcg=mcg_logs_collection,
silent=True,
output_file=True,
skip_after_max_fail=True,
timeout=timeout,
)
except Exception:
log.exception("Failed to collect OCS logs")
@@ -770,7 +790,7 @@ def pytest_runtest_makereport(item, call):
metrics = item.get_closest_marker("gather_metrics_on_fail").args
try:
threading_lock = call.getfixturevalue("threading_lock")
collect_prometheus_metrics(
utils.collect_prometheus_metrics(
metrics,
f"{item.name}-{call.when}",
call.start,
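A worked example of the timeout escalation introduced in the hunk above, on the default path where REPORTING has no explicit `must_gather_timeout`: the base `MUST_GATHER_TIMEOUT` of 3600 s grows by 1200 s (20 minutes) for every must-gather failure already recorded in `utils.mg_fail_count`.

```python
# Assumes the defaults from this PR: MUST_GATHER_TIMEOUT = 3600 and a
# 20-minute (1200 s) extension per previously failed MG attempt.
MUST_GATHER_TIMEOUT = 3600
ESCALATION_STEP = 1200

for mg_fail_count in range(3):  # default max_mg_fail_attempts is 3
    adjusted_timeout = mg_fail_count * ESCALATION_STEP
    timeout = MUST_GATHER_TIMEOUT + adjusted_timeout
    print(f"after {mg_fail_count} recorded failure(s): {timeout} s ({timeout // 60} min)")

# after 0 recorded failure(s): 3600 s (60 min)
# after 1 recorded failure(s): 4800 s (80 min)
# after 2 recorded failure(s): 6000 s (100 min)
```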
1 change: 1 addition & 0 deletions ocs_ci/ocs/defaults.py
@@ -168,6 +168,7 @@
# Must-gather:
MUST_GATHER_UPSTREAM_IMAGE = "quay.io/ocs-dev/ocs-must-gather"
MUST_GATHER_UPSTREAM_TAG = "latest"
MUST_GATHER_TIMEOUT = 3600

# CrushDeviceClass
CRUSH_DEVICE_CLASS = "ssd"
4 changes: 4 additions & 0 deletions ocs_ci/ocs/ocp.py
@@ -151,6 +151,7 @@ def exec_oc_cmd(
silent=False,
cluster_config=None,
skip_tls_verify=False,
output_file=None,
**kwargs,
):
"""
@@ -171,6 +172,8 @@
cluster_config (MultiClusterConfig): cluster_config will be used only in the context of multicluster
executions
skip_tls_verify (bool): Adding '--insecure-skip-tls-verify' to oc command
output_file (str): path to a file where stdout and stderr of the command will be written - applies only
when silent mode is True

Returns:
dict: Dictionary represents a returned yaml file.
@@ -217,6 +220,7 @@
threading_lock=self.threading_lock,
silent=silent,
cluster_config=cluster_config,
output_file=output_file,
**kwargs,
)

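A hedged usage sketch for the new `output_file` parameter of `exec_oc_cmd`: as the docstring notes, it only takes effect together with `silent=True`. The `OCP` constructor arguments, the `get pods` command, and the `out_yaml_format` flag below are illustrative assumptions rather than part of this diff.

```python
# Illustrative only: quiet oc invocation whose stdout/stderr land in a file.
from ocs_ci.ocs.ocp import OCP

pod_ocp = OCP(kind="pod", namespace="openshift-storage")  # placeholder values
pod_ocp.exec_oc_cmd(
    "get pods",
    out_yaml_format=False,            # assumption: plain text output wanted here
    silent=True,                      # output_file applies only with silent=True
    output_file="/tmp/get_pods.log",
)
```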