
Commit 1a3b18b

Authored by jakki-amd, with co-authors Bipradip Chowdhury, Rony Leppänen, and Anders Smedegaard Pedersen
Add Apple system metrics support (#3377)
* Add Apple system metrics support
* Fix ModelServerTest.testMetricManager for other HW vendors
* Add GPUUtilization as expected metric

Co-authored-by: Bipradip Chowdhury <[email protected]>
Co-authored-by: Rony Leppänen <[email protected]>
Co-authored-by: Anders Smedegaard Pedersen <[email protected]>
1 parent 9bcbd22 commit 1a3b18b

5 files changed (+36, -16 lines)

frontend/server/src/main/java/org/pytorch/serve/device/utils/AppleUtil.java (+6, -10)

@@ -5,10 +5,9 @@
 import com.google.gson.JsonObject;
 import com.google.gson.JsonParser;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
 import org.pytorch.serve.device.Accelerator;
 import org.pytorch.serve.device.AcceleratorVendor;
 import org.pytorch.serve.device.interfaces.IAcceleratorUtility;
@@ -75,15 +74,12 @@ public List<JsonObject> extractAccelerators(JsonElement rootObject) {
                         .getAsJsonObject() // Gets the outer object
                         .get("SPDisplaysDataType") // Gets the "SPDisplaysDataType" element
                         .getAsJsonArray();
+
         JsonObject gpuObject = displaysArray.get(0).getAsJsonObject();
-        int number_of_cores = Integer.parseInt(gpuObject.get("sppci_cores").getAsString());
-
-        // add the object `number_of_cores` times to maintain the exsisitng
-        // functionality
-        accelerators =
-                IntStream.range(0, number_of_cores)
-                        .mapToObj(i -> gpuObject)
-                        .collect(Collectors.toList());
+
+        // Create list with only a single accelerator object as
+        // M1, M2, M3 Macs have only single integrated GPU
+        accelerators = Collections.singletonList(gpuObject);
 
         return accelerators;
     }
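The new extractAccelerators body boils down to: parse the JSON that `system_profiler SPDisplaysDataType -json` emits, take the first display entry, and report it as the single integrated GPU. A rough standalone sketch of that flow in Python, for illustration only (the helper name get_apple_gpu and the subprocess call are assumptions, not part of this PR):

# Illustrative sketch only (not from this PR): mimic AppleUtil.extractAccelerators
# by reading macOS display/GPU info and keeping a single accelerator entry,
# since Apple Silicon Macs expose one integrated GPU.
import json
import subprocess

def get_apple_gpu():
    result = subprocess.run(
        ["system_profiler", "SPDisplaysDataType", "-json"],
        capture_output=True,
        text=True,
        check=True,
    )
    displays = json.loads(result.stdout).get("SPDisplaysDataType", [])
    # Keep only the first entry, mirroring Collections.singletonList(gpuObject).
    return displays[0] if displays else None

gpu = get_apple_gpu()
if gpu is not None:
    print(gpu.get("sppci_model"), gpu.get("sppci_cores"))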

frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java (+11, -2)

@@ -1372,8 +1372,17 @@ public void testMetricManager() throws JsonParseException, InterruptedException
             Assert.assertTrue(++count < 5);
         }
 
-        // 7 system-level metrics + 3 gpu-specific metrics
-        Assert.assertEquals(metrics.size(), 7 + 3 * configManager.getNumberOfGpu());
+        // Determine if the device is Apple or not
+        String vendor = System.getProperty("os.name");
+        if (vendor != null) {
+            if (vendor.startsWith("Mac")) {
+                // 7 system-level metrics + 2 gpu-specific metrics (per GPU) for Apple devices
+                Assert.assertEquals(metrics.size(), 7 + 2 * configManager.getNumberOfGpu());
+            } else {
+                // 7 system-level metrics + 3 gpu-specific metrics (per GPU) for non-Apple devices
+                Assert.assertEquals(metrics.size(), 7 + 3 * configManager.getNumberOfGpu());
+            }
+        }
 
         for (Metric metric : metrics) {
            String metricName = metric.getMetricName();
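The assertion now branches on the platform because Apple devices emit two per-GPU metrics (GPUUtilization is unavailable through MPS, per the system_metrics.py change below), while other vendors emit three. A hypothetical Python sketch of the same counting rule, purely for illustration (expected_metric_count and the platform.system() check are not part of the test):

# Illustrative sketch only: compute the metric count the test expects.
import platform

def expected_metric_count(num_gpus: int) -> int:
    # 7 system-level metrics are always reported.
    # Apple GPUs contribute 2 metrics each; other vendors also report
    # GPUUtilization, i.e. 3 metrics each.
    per_gpu = 2 if platform.system() == "Darwin" else 3
    return 7 + per_gpu * num_gpus

print(expected_metric_count(1))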

frontend/server/src/test/java/org/pytorch/serve/device/utils/AppleUtilTest.java (+3, -3)

@@ -76,7 +76,7 @@ public void testExtractAcceleratorId() {
     public void testExtractAccelerators() {
         List<JsonObject> accelerators = appleUtil.extractAccelerators(sampleOutputJson);
 
-        assertEquals(accelerators.size(), 7);
+        assertEquals(accelerators.size(), 1);
         assertEquals(accelerators.get(0).get("sppci_model").getAsString(), "Apple M1");
     }
 
@@ -88,7 +88,7 @@ public void testSmiOutputToUpdatedAccelerators() {
         ArrayList<Accelerator> updatedAccelerators =
                 appleUtil.smiOutputToUpdatedAccelerators(sampleOutputJson.toString(), parsedGpuIds);
 
-        assertEquals(updatedAccelerators.size(), 7);
+        assertEquals(updatedAccelerators.size(), 1);
         Accelerator accelerator = updatedAccelerators.get(0);
         assertEquals(accelerator.getAcceleratorModel(), "Apple M1");
         assertEquals(accelerator.getVendor(), AcceleratorVendor.APPLE);
@@ -112,7 +112,7 @@ public String[] getUtilizationSmiCommand() {
         ArrayList<Accelerator> availableAccelerators =
                 spyAppleUtil.getAvailableAccelerators(availableAcceleratorIds);
 
-        assertEquals(availableAccelerators.size(), 7);
+        assertEquals(availableAccelerators.size(), 1);
         Accelerator accelerator = availableAccelerators.get(0);
         assertEquals(accelerator.getAcceleratorModel(), "Apple M1");
         assertEquals(accelerator.getVendor(), AcceleratorVendor.APPLE);

frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java (+2, -1)

@@ -118,7 +118,8 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
         String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
         if (arch.equals("aarch64")) {
             if (mac_arm64_cpu_only.equals("True")) {
-                Assert.assertEquals(configManager.getNumberOfGpu(), 0);
+                // Mac M1 returns 1 accelerator device
+                Assert.assertEquals(configManager.getNumberOfGpu(), 1);
             } else {
                 Assert.assertTrue(configManager.getNumberOfGpu() > 0);
             }

ts/metrics/system_metrics.py (+14)

@@ -88,6 +88,20 @@ def collect_gpu_metrics(num_of_gpus):
                amdsmi.amdsmi_shut_down()
            except amdsmi.AmdSmiException as e:
                logging.error("Could not shut down AMD-SMI library.")
+    elif torch.backends.mps.is_available():
+        try:
+            total_memory = torch.mps.driver_allocated_memory()
+            mem_used = torch.mps.current_allocated_memory()
+            gpu_mem_utilization = (
+                (mem_used / total_memory * 100) if total_memory > 0 else 0
+            )
+            # Currently there is no way to calculate GPU utilization with MPS.
+            gpu_utilization = None
+        except Exception as e:
+            logging.error(f"Could not capture MPS memory metrics")
+            mem_used = 0
+            gpu_mem_utilization = 0
+            gpu_utilization = None
 
     dimension_gpu = [
        Dimension("Level", "Host"),
