diff --git a/gprofiler/containers_client.py b/gprofiler/containers_client.py index 0492972ad..1876eba10 100644 --- a/gprofiler/containers_client.py +++ b/gprofiler/containers_client.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import time from typing import Dict, List, Optional, Set from granulate_utils.containers.client import ContainersClient @@ -25,6 +26,8 @@ logger = get_logger_adapter(__name__) +NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS = 3 + class ContainerNamesClient: def __init__(self) -> None: @@ -73,9 +76,13 @@ def get_container_name(self, pid: int) -> str: def _safely_get_process_container_name(self, pid: int) -> Optional[str]: try: try: - container_id = get_process_container_id(Process(pid)) + process = Process(pid) + container_id = get_process_container_id(process) if container_id is None: return None + # If the container is newly created, we wait a bit to make sure the container is available + if time.time() - process.create_time() <= NEWLY_CREATED_CONTAINER_AGE_IN_SECONDS: + time.sleep(2) except NoSuchProcess: return None return self._get_container_name(container_id) diff --git a/gprofiler/profilers/java.py b/gprofiler/profilers/java.py index a5737541f..c722fc7ce 100644 --- a/gprofiler/profilers/java.py +++ b/gprofiler/profilers/java.py @@ -1229,6 +1229,7 @@ def _check_async_profiler_loaded(self, process: Process) -> bool: def _profile_process(self, process: Process, duration: int, spawned: bool) -> ProfileData: comm = process_comm(process) exe = process_exe(process) + container_name = self._profiler_state.get_container_name(process.pid) java_version_output: Optional[str] = get_java_version_logged(process, self._profiler_state.stop_event) if self._enabled_proc_events_java: @@ -1258,7 +1259,6 @@ def _profile_process(self, process: Process, duration: int, spawned: bool) -> Pr self._profiled_pids.add(process.pid) logger.info(f"Profiling{' spawned' if spawned else ''} process {process.pid} with async-profiler") - container_name = self._profiler_state.get_container_name(process.pid) app_metadata = self._metadata.get_metadata(process) appid = application_identifiers.get_java_app_id(process, self._collect_spark_app_name) diff --git a/gprofiler/profilers/php.py b/gprofiler/profilers/php.py index bab63f266..882592f5c 100644 --- a/gprofiler/profilers/php.py +++ b/gprofiler/profilers/php.py @@ -210,10 +210,11 @@ def extract_metadata_section(re_expr: Pattern, metadata_line: str) -> str: if profiler_state.processes_to_profile is not None: if pid not in [process.pid for process in profiler_state.processes_to_profile]: continue + container_name = profiler_state.get_container_name(pid) # TODO: appid & app metadata for php! appid = None app_metadata = None - profiles[pid] = ProfileData(results[pid], appid, app_metadata, profiler_state.get_container_name(pid)) + profiles[pid] = ProfileData(results[pid], appid, app_metadata, container_name) return profiles diff --git a/gprofiler/profilers/python_ebpf.py b/gprofiler/profilers/python_ebpf.py index e2b44bc9e..8564cf07f 100644 --- a/gprofiler/profilers/python_ebpf.py +++ b/gprofiler/profilers/python_ebpf.py @@ -262,9 +262,9 @@ def snapshot(self) -> ProcessToProfileData: if self._profiler_state.processes_to_profile is not None: if process not in self._profiler_state.processes_to_profile: continue + container_name = self._profiler_state.get_container_name(pid) appid = application_identifiers.get_python_app_id(process) app_metadata = self._metadata.get_metadata(process) - container_name = self._profiler_state.get_container_name(pid) except NoSuchProcess: appid = None app_metadata = None diff --git a/gprofiler/utils/__init__.py b/gprofiler/utils/__init__.py index 2fad553bd..97346c4b1 100644 --- a/gprofiler/utils/__init__.py +++ b/gprofiler/utils/__init__.py @@ -90,45 +90,7 @@ def is_root() -> bool: return os.geteuid() == 0 -libc: Optional[ctypes.CDLL] = None - - -def prctl(*argv: Any) -> int: - global libc - if libc is None: - libc = ctypes.CDLL("libc.so.6", use_errno=True) - return cast(int, libc.prctl(*argv)) - - -PR_SET_PDEATHSIG = 1 - - -def set_child_termination_on_parent_death() -> int: - ret = prctl(PR_SET_PDEATHSIG, signal.SIGTERM) - if ret != 0: - errno = ctypes.get_errno() - logger.warning( - f"Failed to set parent-death signal on child process. errno: {errno}, strerror: {os.strerror(errno)}" - ) - return ret - - -def wrap_callbacks(callbacks: List[Callable]) -> Callable: - # Expects array of callback. - # Returns one callback that call each one of them, and returns the retval of last callback - def wrapper() -> Any: - ret = None - for cb in callbacks: - ret = cb() - - return ret - - return wrapper - - -def start_process( - cmd: Union[str, List[str]], via_staticx: bool = False, term_on_parent_death: bool = True, **kwargs: Any -) -> Popen: +def start_process(cmd: Union[str, List[str]], via_staticx: bool = False, **kwargs: Any) -> Popen: if isinstance(cmd, str): cmd = [cmd] @@ -150,19 +112,12 @@ def start_process( env = env if env is not None else os.environ.copy() env.update({"LD_LIBRARY_PATH": ""}) - if is_windows(): - cur_preexec_fn = None # preexec_fn is not supported on Windows platforms. subprocess.py reports this. - else: - cur_preexec_fn = kwargs.pop("preexec_fn", os.setpgrp) - if term_on_parent_death: - cur_preexec_fn = wrap_callbacks([set_child_termination_on_parent_death, cur_preexec_fn]) - popen = Popen( cmd, stdout=kwargs.pop("stdout", subprocess.PIPE), stderr=kwargs.pop("stderr", subprocess.PIPE), stdin=subprocess.PIPE, - preexec_fn=cur_preexec_fn, + start_new_session=is_linux(), # TODO: change to "process_group" after upgrade to Python 3.11+ env=env, **kwargs, ) diff --git a/tests/test_java.py b/tests/test_java.py index 6b2f7b8d0..125db51dc 100644 --- a/tests/test_java.py +++ b/tests/test_java.py @@ -118,7 +118,7 @@ def read_ap_version(self: AsyncProfiledProcess) -> str: return version -def xtest_async_profiler_already_running( +def test_async_profiler_already_running( application_pid: int, profiler_state: ProfilerState, assert_collapsed: AssertInCollapsed, @@ -163,7 +163,7 @@ def xtest_async_profiler_already_running( @pytest.mark.parametrize("in_container", [True]) -def xtest_java_async_profiler_cpu_mode( +def test_java_async_profiler_cpu_mode( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -186,7 +186,7 @@ def xtest_java_async_profiler_cpu_mode( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag", ["musl"]) -def xtest_java_async_profiler_musl_and_cpu( +def test_java_async_profiler_musl_and_cpu( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -215,7 +215,7 @@ def test_java_safemode_parameters(profiler_state: ProfilerState) -> None: assert "Java version checks are mandatory in --java-safemode" in str(excinfo.value) -def xtest_java_safemode_version_check( +def test_java_safemode_version_check( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -226,6 +226,7 @@ def xtest_java_safemode_version_check( monkeypatch.setitem(JavaProfiler.MINIMAL_SUPPORTED_VERSIONS, 8, (Version("8.999"), 0)) with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -236,7 +237,7 @@ def xtest_java_safemode_version_check( assert log_record_extra(log_record)["jvm_version"] == repr(jvm_version) -def xtest_java_safemode_build_number_check( +def test_java_safemode_build_number_check( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -245,6 +246,7 @@ def xtest_java_safemode_build_number_check( profiler_state: ProfilerState, ) -> None: with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = profiler._select_processes_to_profile()[0] jvm_version_str = cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) jvm_version = parse_jvm_version(jvm_version_str) @@ -261,10 +263,10 @@ def xtest_java_safemode_build_number_check( [ (False, (), False), # default (False, ("-XX:ErrorFile=/tmp/my_custom_error_file.log",), False), # custom error file - # (True, (), False), # containerized (other params are ignored) + (True, (), False), # containerized (other params are ignored) ], ) -def xtest_hotspot_error_file( +def test_hotspot_error_file( application_pid: int, monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, @@ -297,7 +299,7 @@ def start_async_profiler_and_crash(self: AsyncProfiledProcess, *args: Any, **kwa assert profiler._safemode_disable_reason is not None -def xtest_disable_java_profiling( +def test_disable_java_profiling( application_pid: int, monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, @@ -315,7 +317,7 @@ def xtest_disable_java_profiling( assert "Java profiling has been disabled, skipping profiling of all java process" in caplog.text -def xtest_already_loaded_async_profiler_profiling_failure( +def test_already_loaded_async_profiler_profiling_failure( monkeypatch: MonkeyPatch, caplog: LogCaptureFixture, application_pid: int, @@ -339,7 +341,7 @@ def xtest_already_loaded_async_profiler_profiling_failure( # test only once; and don't test in container - as it will go down once we kill the Java app. @pytest.mark.parametrize("in_container", [False]) @pytest.mark.parametrize("check_app_exited", [False]) # we're killing it, the exit check will raise. -def xtest_async_profiler_output_written_upon_jvm_exit( +def test_async_profiler_output_written_upon_jvm_exit( tmp_path_world_accessible: Path, application_pid: int, assert_collapsed: AssertInCollapsed, @@ -367,7 +369,7 @@ def delayed_kill() -> None: # test only once @pytest.mark.parametrize("in_container", [False]) -def xtest_async_profiler_stops_after_given_timeout( +def test_async_profiler_stops_after_given_timeout( tmp_path_world_accessible: Path, application_pid: int, assert_collapsed: AssertInCollapsed, @@ -400,7 +402,7 @@ def xtest_async_profiler_stops_after_given_timeout( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag,search_for", [("j9", "OpenJ9"), ("zing", "Zing")]) -def xtest_sanity_other_jvms( +def test_sanity_other_jvms( application_pid: int, assert_collapsed: AssertInCollapsed, search_for: str, @@ -417,6 +419,7 @@ def xtest_sanity_other_jvms( frequency=99, java_async_profiler_mode="cpu", ) as profiler: + profiler._profiler_state.get_container_name(application_pid) process = psutil.Process(application_pid) assert search_for in cast_away_optional(get_java_version(process, profiler._profiler_state.stop_event)) process_collapsed = snapshot_pid_collapsed(profiler, application_pid) @@ -425,7 +428,7 @@ def xtest_sanity_other_jvms( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag,search_for", [("eclipse-temurin-latest", "Temurin")]) -def xtest_sanity_latest_jvms( +def test_sanity_latest_jvms( application_pid: int, assert_collapsed: AssertInCollapsed, search_for: str, @@ -438,6 +441,7 @@ def xtest_sanity_latest_jvms( """ with make_java_profiler(profiler_state) as profiler: + profiler._profiler_state.get_container_name(application_pid) # sanity check that this is the correct JVM we're targeting assert search_for in cast_away_optional( get_java_version(psutil.Process(application_pid), profiler._profiler_state.stop_event) @@ -462,7 +466,7 @@ def simulate_libjvm_delete(application_pid: int) -> None: # test only once. in a container, so that we don't mess up the environment :) @pytest.mark.parametrize("in_container", [True]) -def xtest_java_deleted_libjvm( +def test_java_deleted_libjvm( application_pid: int, application_docker_container: Container, assert_collapsed: AssertInCollapsed, @@ -501,7 +505,7 @@ def _filter_record(r: LogRecord) -> bool: pytest.param("ro", [docker.types.Mount(target="/tmpfs", source="", type="tmpfs", read_only=True)], id="ro"), ], ) -def xtest_java_noexec_or_ro_dirs( +def test_java_noexec_or_ro_dirs( tmp_path_world_accessible: Path, # will be used by AP for logs & outputs application_pid: int, extra_application_docker_mounts: List[docker.types.Mount], @@ -569,7 +573,7 @@ def xtest_java_noexec_or_ro_dirs( @pytest.mark.parametrize("in_container", [True]) -def xtest_java_symlinks_in_paths( +def test_java_symlinks_in_paths( application_pid: int, application_docker_container: Container, assert_collapsed: AssertInCollapsed, @@ -616,7 +620,7 @@ def xtest_java_symlinks_in_paths( @pytest.mark.parametrize("in_container", [True]) # only in container is enough -def xtest_java_appid_and_metadata_before_process_exits( +def test_java_appid_and_metadata_before_process_exits( application_pid: int, assert_collapsed: AssertInCollapsed, monkeypatch: MonkeyPatch, @@ -657,7 +661,7 @@ def start_async_profiler_and_interrupt(self: AsyncProfiledProcess, *args: Any, * @pytest.mark.parametrize("in_container", [True]) # only in container is enough -def xtest_java_attach_socket_missing( +def test_java_attach_socket_missing( application_pid: int, profiler_state: ProfilerState, ) -> None: @@ -680,7 +684,7 @@ def xtest_java_attach_socket_missing( # we know what messages to expect when in container, not on the host Java @pytest.mark.parametrize("in_container", [True]) -def xtest_java_jattach_async_profiler_log_output( +def test_java_jattach_async_profiler_log_output( application_pid: int, caplog: LogCaptureFixture, profiler_state: ProfilerState, @@ -817,7 +821,7 @@ def test_non_java_basename_version( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("insert_dso_name", [False, True]) -def xtest_dso_name_in_ap_profile( +def test_dso_name_in_ap_profile( application_pid: int, insert_dso_name: bool, profiler_state: ProfilerState, @@ -836,7 +840,7 @@ def xtest_dso_name_in_ap_profile( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("insert_dso_name", [False, True]) @pytest.mark.parametrize("libc_pattern", [r"(^|;)\(/.*/libc-.*\.so\)($|;)"]) -def xtest_handling_missing_symbol_in_profile( +def test_handling_missing_symbol_in_profile( application_pid: int, insert_dso_name: bool, libc_pattern: str, @@ -852,7 +856,7 @@ def xtest_handling_missing_symbol_in_profile( @pytest.mark.parametrize("in_container", [True]) -def xtest_meminfo_logged( +def test_meminfo_logged( application_pid: int, caplog: LogCaptureFixture, profiler_state: ProfilerState, @@ -869,7 +873,7 @@ def xtest_meminfo_logged( # test that java frames include no semicolon but use a pipe '|' character instead, as implemented by AP @pytest.mark.parametrize("in_container", [True]) -def xtest_java_frames_include_no_semicolons( +def test_java_frames_include_no_semicolons( application_pid: int, profiler_state: ProfilerState, ) -> None: @@ -896,7 +900,7 @@ def xtest_java_frames_include_no_semicolons( # test that async profiler doesn't print anything to applications stdout, stderr streams @pytest.mark.parametrize("in_container", [True]) -def xtest_no_stray_output_in_stdout_stderr( +def test_no_stray_output_in_stdout_stderr( application_pid: int, application_docker_container: Container, monkeypatch: MonkeyPatch, @@ -1094,7 +1098,7 @@ def flush_output_and_stop_async_profiler(self: AsyncProfiledProcess, *args: Any, ), ], ) -def xtest_collect_default_jvm_flags( +def test_collect_default_jvm_flags( profiler_state: ProfilerState, tmp_path: Path, application_pid: int, @@ -1178,7 +1182,7 @@ def xtest_collect_default_jvm_flags( ), ], ) -def xtest_collect_cmdline_and_env_jvm_flags( +def test_collect_cmdline_and_env_jvm_flags( docker_client: DockerClient, application_docker_image: Image, assert_collapsed: AssertInCollapsed, @@ -1218,7 +1222,7 @@ def xtest_collect_cmdline_and_env_jvm_flags( @pytest.mark.parametrize("java_cli_flags", ["-XX:MinHeapFreeRatio=5 -XX:MaxHeapFreeRatio=95"]) @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("expected_flags", [[]]) -def xtest_collect_flags_unsupported_filtered_out( +def test_collect_flags_unsupported_filtered_out( docker_client: DockerClient, application_docker_image: Image, assert_collapsed: AssertInCollapsed, @@ -1246,10 +1250,9 @@ def xtest_collect_flags_unsupported_filtered_out( f"exec java {java_cli_flags} -jar Fibonacci.jar", ], ) as container: - assert ( - profiler._metadata.get_jvm_flags_serialized(psutil.Process(container.attrs["State"]["Pid"])) - == expected_flags - ) + pid = container.attrs["State"]["Pid"] + profiler._profiler_state.get_container_name(pid) + assert profiler._metadata.get_jvm_flags_serialized(psutil.Process(pid)) == expected_flags log_record = next(filter(lambda r: r.message == "Missing requested flags:", caplog.records)) # use slicing to remove the leading -XX: instead of removeprefix as it's not available in python 3.8 assert ( @@ -1260,7 +1263,7 @@ def xtest_collect_flags_unsupported_filtered_out( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("expected_flags", [[]]) -def xtest_collect_none_jvm_flags( +def test_collect_none_jvm_flags( profiler_state: ProfilerState, tmp_path: Path, application_pid: int, @@ -1272,7 +1275,7 @@ def xtest_collect_none_jvm_flags( @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("include_mmm", [True, False]) -def xtest_including_method_modifiers( +def test_including_method_modifiers( application_pid: int, profiler_state: ProfilerState, include_mmm: bool, @@ -1287,7 +1290,7 @@ def xtest_including_method_modifiers( @pytest.mark.parametrize("java_line_numbers", ["none", "line-of-function"]) @pytest.mark.parametrize("in_container", [True]) -def xtest_including_line_numbers( +def test_including_line_numbers( application_pid: int, profiler_state: ProfilerState, java_line_numbers: str, diff --git a/tests/test_profiling_mode.py b/tests/test_profiling_mode.py index a237286d1..6ced99a30 100644 --- a/tests/test_profiling_mode.py +++ b/tests/test_profiling_mode.py @@ -56,7 +56,7 @@ def test_sanity( ("java", "ap", True, "java.lang.String[]"), ], ) -def xtest_allocation_being_profiled( +def test_allocation_being_profiled( application_docker_container: Container, docker_client: DockerClient, gprofiler_docker_image: Image, diff --git a/tests/test_python.py b/tests/test_python.py index c4e8943c4..92cd099c1 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -39,7 +39,7 @@ def runtime() -> str: @pytest.mark.parametrize("in_container", [True]) @pytest.mark.parametrize("application_image_tag", ["libpython"]) -def xtest_python_select_by_libpython( +def test_python_select_by_libpython( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_state: ProfilerState, @@ -83,7 +83,7 @@ def xtest_python_select_by_libpython( ], ) @pytest.mark.parametrize("profiler_type", ["py-spy", "pyperf"]) -def xtest_python_matrix( +def test_python_matrix( application_pid: int, assert_collapsed: AssertInCollapsed, profiler_type: str, diff --git a/tests/test_sanity.py b/tests/test_sanity.py index 738458f49..7df1238f0 100644 --- a/tests/test_sanity.py +++ b/tests/test_sanity.py @@ -50,7 +50,7 @@ @pytest.mark.parametrize("runtime", ["java"]) -def xtest_java_from_host( +def test_java_from_host( tmp_path_world_accessible: Path, application_pid: int, assert_app_id: Callable, @@ -68,7 +68,7 @@ def xtest_java_from_host( @pytest.mark.parametrize("runtime", ["python"]) -def xtest_pyspy( +def test_pyspy( application_pid: int, assert_collapsed: AssertInCollapsed, assert_app_id: Callable, @@ -108,7 +108,7 @@ def test_phpspy( @pytest.mark.parametrize("runtime", ["ruby"]) -def xtest_rbspy( +def test_rbspy( application_pid: int, assert_collapsed: AssertInCollapsed, gprofiler_docker_image: Image, @@ -120,7 +120,7 @@ def xtest_rbspy( @pytest.mark.parametrize("runtime", ["dotnet"]) -def xtest_dotnet_trace( +def test_dotnet_trace( application_pid: int, assert_collapsed: AssertInCollapsed, gprofiler_docker_image: Image, diff --git a/tests/utils.py b/tests/utils.py index dc22f136e..73aa8c5cf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -141,6 +141,7 @@ def run_privileged_container( def _no_errors(logs: str) -> None: # example line: [2021-06-12 10:13:57,528] ERROR: gprofiler: ruby profiling failed assert "] ERROR: " not in logs, f"found ERRORs in gProfiler logs!: {logs}" + assert "Could not acquire gProfiler's lock" not in logs, f"found lock error in gProfiler logs!: {logs}" def run_gprofiler_in_container(docker_client: DockerClient, image: Image, command: List[str], **kwargs: Any) -> None: @@ -205,7 +206,15 @@ def assert_ldd_version_container(container: Container, version: str) -> None: def snapshot_pid_profile(profiler: ProfilerInterface, pid: int) -> ProfileData: - return profiler.snapshot()[pid] + last_snapshot = None + + def has_profile() -> bool: + nonlocal last_snapshot + last_snapshot = profiler.snapshot() + return pid in last_snapshot + + wait_event(timeout=5, stop_event=Event(), condition=has_profile, interval=1) + return last_snapshot[pid] # type: ignore def snapshot_pid_collapsed(profiler: ProfilerInterface, pid: int) -> StackToSampleCount: