Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

For #209 - Parse new rocm-smi output (main) #214

Merged
merged 1 commit into from
Nov 27, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 139 additions & 33 deletions src/amd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ impl gpu::GPU for AmdGPU {
// accelerators present.

fn amd_present() -> bool {
return Path::new("/sys/module/amdgpu").exists();
Path::new("/sys/module/amdgpu").exists()
}

// We only have one machine with AMD GPUs at UiO and rocm-smi is unable to show eg how much memory
Expand Down Expand Up @@ -106,35 +106,20 @@ fn get_amd_utilization(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, Str
// I've not been able to combine the two invocations of rocm-smi yet; we have to run the command
// twice. Not a happy situation.

match command::safe_command(AMD_CONCISE_COMMAND, AMD_CONCISE_ARGS, TIMEOUT_SECONDS) {
Ok(concise_raw_text) => {
match command::safe_command(
AMD_SHOWPIDGPUS_COMMAND,
AMD_SHOWPIDGPUS_ARGS,
TIMEOUT_SECONDS,
) {
Ok(showpidgpus_raw_text) => Ok(extract_amd_information(
&concise_raw_text,
&showpidgpus_raw_text,
user_by_pid,
)?),
Err(e) => Err(format!("{:?}", e)),
}
}
Err(CmdError::CouldNotStart(_)) => Ok(vec![]),
Err(e) => Err(format!("{:?}", e)),
}
Ok(extract_amd_information(
&get_raw_per_device_info()?,
&get_raw_per_pid_info()?,
user_by_pid,
))
}

// Put it all together from the command output.

fn extract_amd_information(
concise_raw_text: &str,
showpidgpus_raw_text: &str,
per_device_info: &[(f64, f64)],
per_pid_info: &[(usize, Vec<usize>)],
user_by_pid: &UserTable,
) -> Result<Vec<gpu::Process>, String> {
let per_device_info = parse_concise_command(concise_raw_text)?; // device -> (gpu%, mem%)
let per_pid_info = parse_showpidgpus_command(showpidgpus_raw_text)?; // pid -> [device, ...]
) -> Vec<gpu::Process> {
let mut num_processes_per_device = vec![0; per_device_info.len()];
per_pid_info.iter().for_each(|(_, devs)| {
devs.iter()
Expand Down Expand Up @@ -170,7 +155,41 @@ fn extract_amd_information(
fst
}
});
Ok(processes)
processes
}

// Return a dense map from device index starting at zero to gpu and gpumem utilization; empty if
// rocm-smi is not present; or an error if rocm-smi failed.
//
// Unfortunately the output format of rocm-smi is not stable. So first we try to parse the CSV
// form, subsequently we try the old format.

fn get_raw_per_device_info() -> Result<Vec<(f64, f64)>, String> {
match command::safe_command(
"rocm-smi",
&["--showuse", "--showmemuse", "--csv"],
TIMEOUT_SECONDS,
) {
Ok(text) => {
if let Ok(info) = parse_csv_concise_command(&text) {
return Ok(info);
}
// Otherwise fall through to second attempt below
}
Err(CmdError::CouldNotStart(_)) => {
return Ok(vec![]);
}
Err(_) => {
// Fall through, we're going to assume this is some problem with the command line
// switches, the next attempt will hopefully surface any problems with the cards
// themselves.
}
}
match command::safe_command("rocm-smi", &[], TIMEOUT_SECONDS) {
Ok(text) => parse_text_concise_command(&text),
Err(CmdError::CouldNotStart(_)) => Ok(vec![]),
Err(e) => Err(format!("{:?}", e)),
}
}

#[cfg(test)]
Expand Down Expand Up @@ -207,16 +226,82 @@ PID 28154 is using 1 DRM device(s):
let users = map! {
28156 => ("bob", 1001usize)
};
let zs = extract_amd_information(concise, pidgpu, &users).expect("Test: AMD information");
let zs = extract_amd_information(
&parse_text_concise_command(concise).expect("Test: AMD text concise information"),
&parse_showpidgpus_command(pidgpu).expect("Test: AMD pid gpu information"),
&users);
assert!(zs.eq(&vec![
proc! { Some(0), 28154, "_zombie_28154", gpu::ZOMBIE_UID, 99.0/2.0, 57.0/2.0 },
proc! { Some(0), 28156, "bob", 1001, 99.0/2.0, 57.0/2.0 },
proc! { Some(1), 28156, "bob", 1001, 63.0, 5.0 },
]));
}

const AMD_CONCISE_COMMAND: &str = "rocm-smi";
const AMD_CONCISE_ARGS: &[&str] = &[];
// The format here is line-oriented:
//
// There should initially be at least one line with at least three fields which should
// start with these strings in order (with this capitalization, sigh):
// device
// GPU use
// GPU Memory
//
// Subsequently there should be lines starting with "cardN,", these are are information for that
// card.
//
// All other lines are junk.

fn parse_csv_concise_command(raw_text: &str) -> Result<Vec<(f64, f64)>, String> {
let lines = raw_text.lines().collect::<Vec<&str>>();
let mut mappings = vec![];
let mut found_device = false;
for l in lines {
if l.starts_with("device") {
if found_device {
return Err("Inconsistent output".to_string());
}
let fields = l.split(',').collect::<Vec<&str>>();
if fields.len() >= 3
&& fields[1].starts_with("GPU use")
&& fields[2].starts_with("GPU Memory")
{
found_device = true;
}
} else if let Some(rest) = l.strip_prefix("card") {
let fields = rest.split(',').collect::<Vec<&str>>();
let mut dev = None;
let mut gpu = None;
let mut gpumem = None;
if fields.len() >= 3 {
dev = match fields[0].parse::<usize>() {
Ok(n) => Some(n),
_ => None,
};
gpu = match fields[1].parse::<f64>() {
Ok(n) => Some(n),
_ => None,
};
gpumem = match fields[2].parse::<f64>() {
Ok(n) => Some(n),
_ => None,
};
}
match (dev, gpu, gpumem) {
(Some(dev), Some(gpu), Some(gpumem)) => {
if mappings.len() < dev + 1 {
mappings.resize(dev + 1, (0.0, 0.0))
}
mappings[dev] = (gpu, gpumem);
}
_ => {}
}
}
}
if found_device && mappings.len() > 0 {
Ok(mappings)
} else {
Err("Inconsistent output".to_string())
}
}

// Return a vector of AMD GPU utilization indexed by device number: (gpu%, mem%)
//
Expand All @@ -225,7 +310,7 @@ const AMD_CONCISE_ARGS: &[&str] = &[];
//
// The mem% is instantaneous memory utilization as expected.

fn parse_concise_command(raw_text: &str) -> Result<Vec<(f64, f64)>, String> {
fn parse_text_concise_command(raw_text: &str) -> Result<Vec<(f64, f64)>, String> {
let block = find_block(raw_text, "= Concise Info =");
if block.len() > 1 {
let hdr = block[0].split_whitespace().collect::<Vec<&str>>();
Expand Down Expand Up @@ -263,8 +348,8 @@ fn parse_concise_command(raw_text: &str) -> Result<Vec<(f64, f64)>, String> {
}

#[test]
fn test_parse_concise_command() {
let xs = parse_concise_command(
fn test_parse_text_concise_command() {
let xs = parse_text_concise_command(
"
================================= Concise Info =================================
GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
Expand All @@ -277,8 +362,29 @@ GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
assert!(xs.eq(&vec![(99.0, 57.0), (63.0, 5.0)]));
}

const AMD_SHOWPIDGPUS_COMMAND: &str = "rocm-smi";
const AMD_SHOWPIDGPUS_ARGS: &[&str] = &["--showpidgpus"];
#[test]
fn test_parse_csv_concise_command() {
let xs = parse_csv_concise_command(
"
device,GPU use (%),GPU Memory Allocated (VRAM%),Memory Activity
card0,99,57,N/A
card1,63,5,N/A
",
)
.unwrap();
assert!(xs.eq(&vec![(99.0, 57.0), (63.0, 5.0)]));
}

// Return a sparse map from pid to devices used by the pid; empty if rocm-smi is not present; or an
// error if rocm-smi failed.

fn get_raw_per_pid_info() -> Result<Vec<(usize, Vec<usize>)>, String> {
match command::safe_command("rocm-smi", &["--showpidgpus"], TIMEOUT_SECONDS) {
Ok(showpidgpus_raw_text) => parse_showpidgpus_command(&showpidgpus_raw_text),
Err(CmdError::CouldNotStart(_)) => Ok(vec![]),
Err(e) => Err(format!("{:?}", e)),
}
}

// Return a vector of (PID, DEVICES) where DEVICES is a vector of the devices used by the PID. The
// PID is a string, the devices are numbers. See test cases below for the various forms
Expand Down
Loading