Improvements for MG collections #11161

Merged
2 changes: 2 additions & 0 deletions conf/README.md
@@ -183,6 +183,8 @@ Reporting related config. (Do not store secret data in the repository!).
* `save_mem_report` - If True, test run memory report CSV file will be saved in `RUN["log_dir"]/stats_log_dir_<run_id>`
directory along with <test name>.peak_rss_table, <test name>.peak_vms_table reports. The option may be enforced by
exporting env variable: export SAVE_MEM_REPORT=true
* `max_mg_fail_attempts` - Maximum number of attempts to run MG (must-gather) commands, to prevent
spending more time on MG collections that keep timing out.

#### ENV_DATA

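To illustrate the `max_mg_fail_attempts` option documented above, here is a minimal sketch of how a failure cap can gate repeated must-gather runs. The counter and helper names are illustrative only; in ocs-ci the real bookkeeping lives in `ocs_ci.ocs.utils` (e.g. `utils.mg_fail_count`, used further down in this PR).

```python
# Illustrative sketch only - not the ocs-ci implementation.
MAX_MG_FAIL_ATTEMPTS = 3  # mirrors REPORTING["max_mg_fail_attempts"]

mg_fail_count = 0  # incremented after every failed / timed-out MG run


def record_mg_failure():
    """Remember that a must-gather attempt failed or timed out."""
    global mg_fail_count
    mg_fail_count += 1


def should_skip_mg(skip_after_max_fail=True):
    """Return True once the configured failure budget is exhausted."""
    return skip_after_max_fail and mg_fail_count >= MAX_MG_FAIL_ATTEMPTS
```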
18 changes: 15 additions & 3 deletions ocs_ci/deployment/deployment.py
@@ -231,7 +231,11 @@ def do_deploy_ocp(self, log_cli_level):
config.RUN["is_ocp_deployment_failed"] = True
logger.error(e)
if config.REPORTING["gather_on_deploy_failure"]:
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs(
"deployment",
ocs=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
raise

def do_deploy_submariner(self):
@@ -381,8 +385,16 @@ def do_deploy_ocs(self):
if config.REPORTING["gather_on_deploy_failure"]:
# Let's do the collections separately to guard against one
# of them failing
collect_ocs_logs("deployment", ocs=False)
collect_ocs_logs("deployment", ocp=False)
collect_ocs_logs(
"deployment",
ocs=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
collect_ocs_logs(
"deployment",
ocp=False,
timeout=defaults.MUST_GATHER_TIMEOUT,
)
raise
config.reset_ctx()
# Run ocs_install_verification here only in case of multicluster.
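A hedged sketch of the "collect separately" idea from the hunk above: each flavour of logs gets its own call (and its own timeout), so a failing OCP must-gather cannot prevent the OCS must-gather from running. The try/except guard here is only illustrative; in the deployment code the two calls are simply issued one after another as shown in the diff.

```python
# Sketch only: run OCP and OCS collections as independent calls so that a
# failure or timeout in one does not block the other. The collect_ocs_logs
# passed in stands in for ocs_ci.ocs.utils.collect_ocs_logs.
import logging

logger = logging.getLogger(__name__)

MUST_GATHER_TIMEOUT = 3600  # same value as ocs_ci.ocs.defaults.MUST_GATHER_TIMEOUT


def gather_deployment_logs(collect_ocs_logs):
    for kwargs in ({"ocs": False}, {"ocp": False}):  # OCP-only, then OCS-only
        try:
            collect_ocs_logs("deployment", timeout=MUST_GATHER_TIMEOUT, **kwargs)
        except Exception:
            # Keep going - losing one set of logs should not cost us the other.
            logger.exception("Log collection failed for %s", kwargs)
```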
1 change: 1 addition & 0 deletions ocs_ci/framework/conf/default_config.yaml
@@ -141,6 +141,7 @@ REPORTING:
gather_on_deploy_failure: true
collect_logs_on_success_run: False
rp_client_log_level: "ERROR"
max_mg_fail_attempts: 3

# This is the default information about environment.
ENV_DATA:
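For completeness, the new default can be read back the same way the REPORTING section is consumed elsewhere in this PR (see the `ocsci_config.REPORTING.get(...)` call in the ocscilib.py hunk below); the import path shown is the standard ocs-ci framework config object.

```python
from ocs_ci.framework import config

# Fall back to 3 if a custom config does not override the new option.
max_mg_fail_attempts = config.REPORTING.get("max_mg_fail_attempts", 3)
```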
34 changes: 27 additions & 7 deletions ocs_ci/framework/pytest_customization/ocscilib.py
@@ -20,6 +20,7 @@
ClusterNameNotProvidedError,
ClusterPathNotProvidedError,
)
from ocs_ci.ocs import defaults
from ocs_ci.ocs.constants import (
CLUSTER_NAME_MAX_CHARACTERS,
CLUSTER_NAME_MIN_CHARACTERS,
@@ -31,7 +32,7 @@
)
from ocs_ci.ocs.cluster import check_clusters
from ocs_ci.ocs.resources.ocs import get_version_info
from ocs_ci.ocs.utils import collect_ocs_logs, collect_prometheus_metrics
from ocs_ci.ocs import utils
from ocs_ci.utility.utils import (
dump_config_to_file,
get_ceph_version,
@@ -729,34 +730,53 @@ def pytest_runtest_makereport(item, call):
and ocsci_config.RUN.get("cli_params").get("collect-logs")
and not ocsci_config.RUN.get("cli_params").get("deploy")
):
item_markers = {marker.name for marker in item.iter_markers()}
test_case_name = item.name
# TODO: We should avoid paths and rely on markers issue:
# https://github.com/red-hat-storage/ocs-ci/issues/10526
ocp_logs_collection = (
True
if any(
x in item.location[0]
for x in [
"ecosystem",
"e2e/performance",
"tests/functional/z_cluster",
]
)
else False
)
ocp_markers_to_collect = {
"performance",
"purple_squad",
}
if ocp_markers_to_collect & item_markers:
ocp_logs_collection = True
ocs_logs_collection = (
False
if any(x in item.location[0] for x in ["_ui", "must_gather"])
else True
)
mcg_logs_collection = (
True if any(x in item.location[0] for x in ["mcg", "ecosystem"]) else False
mcg_markers_to_collect = {
"mcg",
"purple_squad",
}
# For every MG failure we extend the timeout of the next attempt by 20 minutes
Contributor:
This would be up to 2 hours of waiting (20 min + 40 min + 60 min) if the default max_mg_fail_attempts is used.
Isn't that too much time?

Member Author:
We sometimes see that even 60 minutes is really not enough, so I am giving must-gather a chance with more time to collect logs, so we can analyze whether MG gives us anything. If it fails 3 times, nothing more is attempted, which is still better than before: in one example run, more than 24 hours were spent on collections that always timed out and we did not get any logs. I was thinking that this increased time might help us get some logs from MG and identify why it is taking longer than it used to. We can reduce the allowed number of failed MG attempts to 2 if needed.

Contributor:
As long as we don't try to collect again in any later test case, except for once before teardown, this should be fine.

Member Author:
If we reach the maximum number of attempts, we do not collect again and skip the whole collection.

adjusted_timeout = utils.mg_fail_count * 1200
timeout = ocsci_config.REPORTING.get(
"must_gather_timeout", defaults.MUST_GATHER_TIMEOUT + adjusted_timeout
)
log.info(f"Adjusted timeout for MG is {timeout} seconds")
mcg_logs_collection = bool(mcg_markers_to_collect & item_markers)
try:
if not ocsci_config.RUN.get("is_ocp_deployment_failed"):
collect_ocs_logs(
utils.collect_ocs_logs(
dir_name=test_case_name,
ocp=ocp_logs_collection,
ocs=ocs_logs_collection,
mcg=mcg_logs_collection,
silent=True,
output_file=True,
skip_after_max_fail=True,
timeout=timeout,
)
except Exception:
log.exception("Failed to collect OCS logs")
@@ -770,7 +790,7 @@ def pytest_runtest_makereport(item, call):
metrics = item.get_closest_marker("gather_metrics_on_fail").args
try:
threading_lock = call.getfixturevalue("threading_lock")
collect_prometheus_metrics(
utils.collect_prometheus_metrics(
metrics,
f"{item.name}-{call.when}",
call.start,
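A worked example of the timeout escalation introduced in the hunk above, on the default path where REPORTING has no explicit `must_gather_timeout`: the base `MUST_GATHER_TIMEOUT` of 3600 s grows by 1200 s (20 minutes) for every must-gather failure already recorded in `utils.mg_fail_count`.

```python
# Assumes the defaults from this PR: MUST_GATHER_TIMEOUT = 3600 and a
# 20-minute (1200 s) extension per previously failed MG attempt.
MUST_GATHER_TIMEOUT = 3600
ESCALATION_STEP = 1200

for mg_fail_count in range(3):  # default max_mg_fail_attempts is 3
    adjusted_timeout = mg_fail_count * ESCALATION_STEP
    timeout = MUST_GATHER_TIMEOUT + adjusted_timeout
    print(f"after {mg_fail_count} recorded failure(s): {timeout} s ({timeout // 60} min)")

# after 0 recorded failure(s): 3600 s (60 min)
# after 1 recorded failure(s): 4800 s (80 min)
# after 2 recorded failure(s): 6000 s (100 min)
```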
1 change: 1 addition & 0 deletions ocs_ci/ocs/defaults.py
@@ -168,6 +168,7 @@
# Must-gather:
MUST_GATHER_UPSTREAM_IMAGE = "quay.io/ocs-dev/ocs-must-gather"
MUST_GATHER_UPSTREAM_TAG = "latest"
MUST_GATHER_TIMEOUT = 3600

# CrushDeviceClass
CRUSH_DEVICE_CLASS = "ssd"
4 changes: 4 additions & 0 deletions ocs_ci/ocs/ocp.py
@@ -151,6 +151,7 @@ def exec_oc_cmd(
silent=False,
cluster_config=None,
skip_tls_verify=False,
output_file=None,
**kwargs,
):
"""
@@ -171,6 +172,8 @@
cluster_config (MultiClusterConfig): cluster_config will be used only in the context of multicluster
executions
skip_tls_verify (bool): Adding '--insecure-skip-tls-verify' to oc command
output_file (str): path to a file where stdout and stderr of the command will be written - applies only
when silent mode is True

Returns:
dict: Dictionary represents a returned yaml file.
@@ -217,6 +220,7 @@
threading_lock=self.threading_lock,
silent=silent,
cluster_config=cluster_config,
output_file=output_file,
**kwargs,
)

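A hedged usage sketch for the new `output_file` parameter of `exec_oc_cmd`: as the docstring notes, it only takes effect together with `silent=True`. The `OCP` constructor arguments, the `get pods` command, and the `out_yaml_format` flag below are illustrative assumptions rather than part of this diff.

```python
# Illustrative only: quiet oc invocation whose stdout/stderr land in a file.
from ocs_ci.ocs.ocp import OCP

pod_ocp = OCP(kind="pod", namespace="openshift-storage")  # placeholder values
pod_ocp.exec_oc_cmd(
    "get pods",
    out_yaml_format=False,            # assumption: plain text output wanted here
    silent=True,                      # output_file applies only with silent=True
    output_file="/tmp/get_pods.log",
)
```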