Skip to content

Commit

Permalink
topo: handle offline cpus
Browse files Browse the repository at this point in the history
Some CPUs can go offline for various reasons. Do not crash when
generating a DOT diagram from a sosreport with offline CPUs. Adjust
topology and interrupt counter parsing to take offline CPUs into
account.

Display offline CPUs in gray in the diagram.

Reported-by: Abhiram R N <[email protected]>
Signed-off-by: Robin Jarry <[email protected]>
  • Loading branch information
rjarry committed Jul 29, 2024
1 parent 1582700 commit c1da797
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 16 deletions.
14 changes: 11 additions & 3 deletions sosviz/collect/irq.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,20 @@ def parse_report(path: pathlib.Path, data: D):
if not f.is_file():
return

cpu_ids = []
for cpu in path.glob("sys/devices/system/cpu/cpu[0-9]*"):
cpu_ids.append(int(re.match(r"cpu(\d+)", cpu.name).group(1)))
counters_len = max(*cpu_ids) + 1

for match in INTERRUPT_RE.finditer(f.read_text()):
irq, counters, desc = match.groups()
irq = match.group(1)
counters = [0] * counters_len
for i, c in enumerate(match.group(2).split()):
counters[cpu_ids[i]] = int(c)
irqs[irq] = D(
irq=irq,
desc=re.sub(r"\s+", " ", desc.strip()),
counters=[int(c) for c in counters.split()],
desc=re.sub(r"\s+", " ", match.group(3).strip()),
counters=counters,
)
try:
irqs[irq].requested_affinity = parse_cpu_set(
Expand Down
27 changes: 14 additions & 13 deletions sosviz/collect/topo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


def parse_report(path: pathlib.Path, data: D):
nodes = list(path.glob("sys/devices/system/node/node*"))
nodes = list(path.glob("sys/devices/system/node/node[0-9]*"))
for node in nodes:
match = re.match(r"^node(\d+)$", node.name)
if not match:
Expand Down Expand Up @@ -38,25 +38,26 @@ def parse_report(path: pathlib.Path, data: D):
size = int(match.group(1)) * 1024
numa.setdefault("hugepages", D())[size] = int(huge.read_text())

for cpu in path.glob("sys/devices/system/cpu/cpu*/topology"):
offline_cpus = set()
for cpu in path.glob("sys/devices/system/cpu/cpu[0-9]*"):
if not (cpu / f"node{numa_id}").is_dir():
continue
cpu_id = int(re.match(r"cpu(\d+)", cpu.name).group(1))
online = cpu / "online"
if online.is_file() and online.read_text().strip() == "0":
offline_cpus.add(cpu_id)
continue
topo = cpu / "topology"
try:
package_id = int((cpu / "physical_package_id").read_text())
if package_id != numa_id:
continue
except FileNotFoundError:
cpu_id = int(re.match(r"cpu(\d+)", cpu.parent.name).group(1))
if cpu_id not in cpus:
continue
try:
threads = parse_cpu_set((cpu / "thread_siblings_list").read_text())
threads = parse_cpu_set((topo / "thread_siblings_list").read_text())
except FileNotFoundError:
try:
threads = parse_cpu_set((cpu / "core_cpus_list").read_text())
threads = parse_cpu_set((topo / "core_cpus_list").read_text())
except FileNotFoundError:
# hyperthreading disabled
continue
cpus.update(threads)
for t in threads:
siblings[t] = threads - {t}
numa.cpus = cpus
numa.offline_cpus = offline_cpus
numa.thread_siblings = siblings
10 changes: 10 additions & 0 deletions sosviz/output/dot.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,16 @@ def phy_numa(self, numa: D):
tooltip=self.irq_counters_tooltip(housekeeping_cpus),
color="blue",
)
if numa.offline_cpus:
self.node(
f"phy_cpus_offline_{numa.id}",
[
'<font color="gray"><b>Offline</b></font>',
f'<font color="gray">CPUs {bit_list(numa.offline_cpus)}</font>',
],
tooltip=self.irq_counters_tooltip(numa.offline_cpus),
color="gray",
)

labels = [f"<b>memory {human_readable(numa.total_memory, 1024)}</b>"]
for size, num in numa.get("hugepages", {}).items():
Expand Down

0 comments on commit c1da797

Please sign in to comment.