From c1da797a4361adf7dd0b26eeec60dd9b01e4169f Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Mon, 29 Jul 2024 11:12:21 +0200 Subject: [PATCH] topo: handle offline cpus Some CPUs can become offline for various reasons. Do not crash when generating a DOT diagram from a sosreport with offline CPUs. Adjust topology and interrupt counters parsing to take offline CPUs into account. Display offline CPUs in gray in the diagram. Reported-by: Abhiram R N Signed-off-by: Robin Jarry --- sosviz/collect/irq.py | 14 +++++++++++--- sosviz/collect/topo.py | 27 ++++++++++++++------------- sosviz/output/dot.py | 10 ++++++++++ 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/sosviz/collect/irq.py b/sosviz/collect/irq.py index 9779e91..64c7b4f 100644 --- a/sosviz/collect/irq.py +++ b/sosviz/collect/irq.py @@ -18,12 +18,20 @@ def parse_report(path: pathlib.Path, data: D): if not f.is_file(): return + cpu_ids = [] + for cpu in path.glob("sys/devices/system/cpu/cpu[0-9]*"): + cpu_ids.append(int(re.match(r"cpu(\d+)", cpu.name).group(1))) + counters_len = max(*cpu_ids) + 1 + for match in INTERRUPT_RE.finditer(f.read_text()): - irq, counters, desc = match.groups() + irq = match.group(1) + counters = [0] * counters_len + for i, c in enumerate(match.group(2).split()): + counters[cpu_ids[i]] = int(c) irqs[irq] = D( irq=irq, - desc=re.sub(r"\s+", " ", desc.strip()), - counters=[int(c) for c in counters.split()], + desc=re.sub(r"\s+", " ", match.group(3).strip()), + counters=counters, ) try: irqs[irq].requested_affinity = parse_cpu_set( diff --git a/sosviz/collect/topo.py b/sosviz/collect/topo.py index c8252c0..ccb651a 100644 --- a/sosviz/collect/topo.py +++ b/sosviz/collect/topo.py @@ -9,7 +9,7 @@ def parse_report(path: pathlib.Path, data: D): - nodes = list(path.glob("sys/devices/system/node/node*")) + nodes = list(path.glob("sys/devices/system/node/node[0-9]*")) for node in nodes: match = re.match(r"^node(\d+)$", node.name) if not match: @@ -38,25 +38,26 @@ def parse_report(path: pathlib.Path, data: D): size = int(match.group(1)) * 1024 numa.setdefault("hugepages", D())[size] = int(huge.read_text()) - for cpu in path.glob("sys/devices/system/cpu/cpu*/topology"): + offline_cpus = set() + for cpu in path.glob("sys/devices/system/cpu/cpu[0-9]*"): + if not (cpu / f"node{numa_id}").is_dir(): + continue + cpu_id = int(re.match(r"cpu(\d+)", cpu.name).group(1)) + online = cpu / "online" + if online.is_file() and online.read_text().strip() == "0": + offline_cpus.add(cpu_id) + continue + topo = cpu / "topology" try: - package_id = int((cpu / "physical_package_id").read_text()) - if package_id != numa_id: - continue - except FileNotFoundError: - cpu_id = int(re.match(r"cpu(\d+)", cpu.parent.name).group(1)) - if cpu_id not in cpus: - continue - try: - threads = parse_cpu_set((cpu / "thread_siblings_list").read_text()) + threads = parse_cpu_set((topo / "thread_siblings_list").read_text()) except FileNotFoundError: try: - threads = parse_cpu_set((cpu / "core_cpus_list").read_text()) + threads = parse_cpu_set((topo / "core_cpus_list").read_text()) except FileNotFoundError: # hyperthreading disabled continue - cpus.update(threads) for t in threads: siblings[t] = threads - {t} numa.cpus = cpus + numa.offline_cpus = offline_cpus numa.thread_siblings = siblings diff --git a/sosviz/output/dot.py b/sosviz/output/dot.py index dbb110f..7d26b5c 100644 --- a/sosviz/output/dot.py +++ b/sosviz/output/dot.py @@ -659,6 +659,16 @@ def phy_numa(self, numa: D): tooltip=self.irq_counters_tooltip(housekeeping_cpus), color="blue", ) + if numa.offline_cpus: + self.node( + f"phy_cpus_offline_{numa.id}", + [ + 'Offline', + f'CPUs {bit_list(numa.offline_cpus)}', + ], + tooltip=self.irq_counters_tooltip(numa.offline_cpus), + color="gray", + ) labels = [f"memory {human_readable(numa.total_memory, 1024)}"] for size, num in numa.get("hugepages", {}).items():