Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mellanox_firmware] Add Mellanox firmware plugin #3407

Merged
merged 1 commit into from Dec 13, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions sos/report/plugins/mellanox_firmware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright (C) 2023 Nvidia Corporation, Alin Serdean <[email protected]>

# This file is part of the sos project: https://github.com/sosreport/sos
#
# This copyrighted material is made available to anyone wishing to use,
# modify, copy, or redistribute it subject to the terms and conditions of
# version 2 of the GNU General Public License.
#
# See the LICENSE file in the source distribution for further information.

from sos.report.plugins import Plugin, IndependentPlugin
import os
import time


class MellanoxFirmware(Plugin, IndependentPlugin):
    """Collect Nvidia (Mellanox) firmware tool output.

    ``setup()`` queues the per-device firmware queries, using the mft
    tools when ``flint`` is available and the mstflint tools otherwise.
    ``collect()`` runs the mst/mlxcables cable commands; those modify
    system state and therefore only run when the ``allow_system_changes``
    option is set.
    """

    short_desc = 'Nvidia(Mellanox) firmware tools output'

    plugin_name = "mellanox_firmware"
    profiles = ('hardware', 'system')
    packages = ('mst', 'mstflint')

    # Vendor string searched for in the lspci output by check_enabled().
    MLNX_STRING = "Mellanox Technologies"

    def check_enabled(self):
        """
        Checks if this plugin should be executed at all.
        We will only enable the plugin if there is a
        Mellanox Technologies network adapter

        Returns True only when the lspci query succeeds and its output
        contains MLNX_STRING.
        """
        lspci = self.exec_cmd("lspci -D -d 15b3::0200")
        return lspci['status'] == 0 and self.MLNX_STRING in lspci['output']

    def collect(self):
        """Run the mst/mlxcables cable commands and save their output.

        Unlike most plugins, these commands are not queued with
        add_cmd_output() in setup(): they change system state (the mst
        service is started and stopped), so they are executed here with
        collect_cmd_output(), which writes each result to the plugin
        directory just in time.

        Nothing is collected when the ``allow_system_changes`` option is
        unset, when ``flint --version`` fails (mft package not installed),
        or when ``mst start`` fails.
        """
        if not self.get_option('allow_system_changes'):
            # Each fragment ends with a space: adjacent string literals are
            # concatenated verbatim, without any separator.
            self._log_info("Skipping mst/mlx cable commands as system changes "
                           "would be made. Use --allow-system-changes to "
                           "enable this collection.")
            return

        # Run only if mft package is installed.
        # flint is available from the mft package.
        if self.exec_cmd('flint --version')['status'] != 0:
            return

        if self.collect_cmd_output('mst start')['status'] != 0:
            return

        try:
            self.collect_cmd_output('mst cable add')
            self.collect_cmd_output("mst status -v", timeout=10)
            self.collect_cmd_output("mlxcables", timeout=10)

            try:
                mst_nodes = os.listdir("/dev/mst")
            except OSError as err:
                # A missing or unreadable /dev/mst must not abort the
                # collection before the mst service is stopped again.
                self._log_info(f"Unable to list /dev/mst: {err}")
                mst_nodes = []

            # Only the cable nodes are queried; sorted for a stable order.
            for mlxcable in sorted(n for n in mst_nodes if 'cable' in n):
                self.collect_cmd_output(f"mlxcables -d {mlxcable} --DDM",
                                        timeout=10)
                self.collect_cmd_output(f"mlxcables -d {mlxcable} --dump",
                                        timeout=10)
        finally:
            # 'mst start' succeeded above, so always undo it.
            self.collect_cmd_output("mst stop", changes=True)

    def setup(self):
        """Queue the firmware query commands for every Mellanox device.

        The devices are discovered with lspci; the command set depends on
        whether the mft package (``flint``) or only the mstflint package
        is available.
        """
        # Get all devices which have the vendor Mellanox Technologies
        # Will return a string of the following format:
        # 0000:08:00.0 Ethernet controller: Mellanox Technologies MT2892
        # Family
        device_list = self.collect_cmd_output('lspci -D -d 15b3::0200')
        if device_list['status'] != 0:
            # bail out if there are no Mellanox PCI devices; this check is
            # needed in addition to check_enabled() for the case where the
            # user enables the plugin manually
            return

        # Should return 0000:08:00.0
        # from the following string
        # 0000:08:00.0 Ethernet controller: Mellanox Technologies MT2892
        # Family
        # Only the first 8 characters ("0000:08:") are kept and "00.0" is
        # appended, so all entries sharing that prefix collapse into one
        # device once de-duplicated.  Blank lines are skipped so they do
        # not yield a bogus "00.0" device.
        devices = sorted({
            line[0:8] + '00.0'
            for line in device_list["output"].splitlines()
            if line.strip()
        })
        if not devices:
            return

        # Mft package is present if OFED is installed
        # mstflint package is part of the distro and can be installed.
        # Each entry is (text before the device, text after the device);
        # the commands do not support position independent arguments.
        # mft package is installed if flint command is available
        if self.exec_cmd('flint --version')['status'] != 0:
            # mstflint package commands
            commands = [
                ("mstconfig -d ", " -e q"),
                ("mstflint -d ", " dc"),
                ("mstflint -d ", " q"),
                ("mstreg -d ", " --reg_name ROCE_ACCL --get"),
                ("mstlink -d ", ""),
            ]
        else:
            # mft package commands
            commands = [
                ("mlxdump -d ", " pcie_uc --all"),
                ("mstconfig -d ", " -e q"),
                ("flint -d ", " dc"),
                ("flint -d ", " q"),
                ("mlxreg -d ", " --reg_name ROCE_ACCL --get"),
                ("mlxlink -d ", ""),
                ("fwtrace -d ", " -i all --tracer_mode FIFO"),
            ]

        for device in devices:
            for prefix, suffix in commands:
                self.add_cmd_output(f"{prefix} {device} {suffix}",
                                    timeout=30)

            # Dump the output of the mstdump command three times
            # waiting for one second. This output is useful to check
            # if certain registers changed
            # NOTE(review): add_cmd_output() appears to only queue the
            # command for later collection, in which case this sleep delays
            # setup() rather than spacing the dumps apart - confirm whether
            # the dumps should be taken from collect() instead.
            for _ in range(3):
                self.add_cmd_output(f"mstdump {device}")
                time.sleep(1)

# vim: set et ts=4 sw=4 :