Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extend to 2/4 GPUs #116

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 52 additions & 16 deletions sensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def bytes_to_human(num):
num /= 1000.0
return "%.2f %s" % (num, 'YB')


class ISMError(Exception):
"""General exception."""

Expand Down Expand Up @@ -83,7 +84,8 @@ def __init__(self):
NvGPUTemp()]

for sensor in self.sensor_instances:
self.settings['sensors'][sensor.name] = (sensor.desc, sensor.cmd)
self.settings['sensors'][sensor.name] = (
sensor.desc, sensor.cmd)

self._last_net_usage = [0, 0] # (up, down)
self._fetcher = None
Expand Down Expand Up @@ -286,7 +288,8 @@ def get_results(self):
res[sensor] = value

else: # custom sensor
res[sensor] = BaseSensor.script_exec(self.settings["sensors"][sensor][1])
res[sensor] = BaseSensor.script_exec(
self.settings["sensors"][sensor][1])

return res

Expand Down Expand Up @@ -345,14 +348,33 @@ class NvGPUSensor(BaseSensor):

def get_value(self, sensor):
if sensor == 'nvgpu':
return "{:02.0f}%".format(self._fetch_gpu())

def _fetch_gpu(self, percpu=False):
result = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv'])
perc = result.splitlines()[1]
perc = perc[:-2]
return int(perc)

perc = self._fetch_gpu_util()
perc1, perc2 = self._fetch_gpu_mem()
if len(perc) == 1:
return "{:02.0f}%".format(int(perc[0][:-2]))
elif len(perc) == 2:
return "{:02.0f}({:02.0f})% {:02.0f}({:02.0f})%".format(int(int(perc1[0][:-4])*100/int(perc2[0][:-4])), int(perc[0][:-2]), int(int(perc1[1][:-4])*100/int(perc2[1][:-4])), int(perc[1][:-2]))
elif len(perc) == 4:
return "{:02.0f}% {:02.0f}% {:02.0f}% {:02.0f}%".format(int(perc[0][:-2]), int(perc[1][:-2]), int(perc[2][:-2]), int(perc[3][:-2]))

def _fetch_gpu_util(self, percpu=False):
    """Query nvidia-smi for the utilization of every GPU.

    :param percpu: not used by this method.
    :returns: the output lines of ``nvidia-smi`` after the first one,
        one raw entry per line, exactly as printed (no parsing or unit
        stripping is done here).
    """
    command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv']
    output_lines = subprocess.check_output(command).splitlines()
    # The first line is dropped (presumably the CSV header row); every
    # remaining line is handed back untouched.
    return output_lines[1:]

def _fetch_gpu_mem(self, percpu=False):
    """Return the used and total memory of every GPU as printed by nvidia-smi.

    Both fields are requested in a single ``nvidia-smi`` invocation, so the
    two readings belong to the same sample and only one process is spawned
    per poll instead of two.

    :param percpu: not used by this method; kept for compatibility.
    :returns: a ``(used, total)`` tuple of two lists with one entry per
        output line. Each entry is the raw field text as printed by
        nvidia-smi; no unit stripping or numeric conversion is done here.
    :raises subprocess.CalledProcessError: if nvidia-smi exits non-zero.
    :raises OSError: if nvidia-smi cannot be executed.
    """
    output = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used,memory.total',
         '--format=csv'])

    used, total = [], []
    # Skip the first line (the CSV header); each remaining line holds the
    # two requested fields for one GPU, separated by a comma.
    for row in output.splitlines()[1:]:
        used_field, _sep, total_field = row.partition(b',')
        used.append(used_field.strip())
        total.append(total_field.strip())
    return used, total

class NvGPUTemp(BaseSensor):
"""Return GPU temperature expressed in Celsius
Expand All @@ -362,12 +384,22 @@ class NvGPUTemp(BaseSensor):

def get_value(self, sensor):
# degrees symbol is unicode U+00B0
return "{}\u00B0C".format(self._fetch_gputemp())
perc = self._fetch_gputemp()
# feel free to customize the following code for your multi-gpu machine
if len(perc) == 1:
return "{}\u00B0C".format(int(perc[0]))
elif len(perc) == 2:
return "{}\u00B0C {}\u00B0C".format(int(perc[0]), int(perc[1]))
elif len(perc) == 4:
return "{}\u00B0C {}\u00B0C {}\u00B0C {}\u00B0C".format(int(perc[0]), int(perc[1]), int(perc[2]), int(perc[3]))

def _fetch_gputemp(self):
result = subprocess.check_output(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv'])
perc = result.splitlines()[1]
return int(perc)
result = subprocess.check_output(
['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv'])
# perc = result.splitlines()[1]
# return int(perc)
perc = result.splitlines()[1:]
return perc


class CPUSensor(BaseSensor):
Expand Down Expand Up @@ -476,6 +508,7 @@ def _fetch_net(self):
current[1] /= mgr.get_interval()
return '↓ {:>9s}/s ↑ {:>9s}/s'.format(bytes_to_human(current[0]), bytes_to_human(current[1]))


class NetCompSensor(BaseSensor):
name = 'netcomp'
desc = _('Network activity in Compact form.')
Expand All @@ -500,6 +533,7 @@ def _fetch_net(self):
current[1] /= mgr.get_interval()
return '⇵ {:>9s}/s'.format(bytes_to_human(current[0] + current[1]))


class TotalNetSensor(BaseSensor):
name = 'totalnet'
desc = _('Total Network activity.')
Expand All @@ -519,6 +553,7 @@ def _fetch_net(self):
current[1] /= mgr.get_interval()
return ' Σ {:>9s}'.format(bytes_to_human(current[0] + current[1]))


class BatSensor(BaseSensor):
name = 'bat\d*'
desc = _('Battery capacity.')
Expand All @@ -528,7 +563,8 @@ def check(self, sensor):
if self.bat.match(sensor):
bat_id = int(sensor[3:]) if len(sensor) > 3 else 0
if not os.path.exists("/sys/class/power_supply/BAT{}".format(bat_id)):
raise ISMError(_("Invalid number returned for the Battery sensor."))
raise ISMError(
_("Invalid number returned for the Battery sensor."))

return True

Expand Down Expand Up @@ -703,7 +739,7 @@ def _fetch_cputemp(self):

# if that fails try various hwmon files

cat = lambda file: open(file, 'r').read().strip()
def cat(file):
    """Return the contents of *file* with surrounding whitespace removed.

    The file is opened in text mode and closed before returning, so no
    file handle is left open after the call.

    :param file: path of the file to read.
    :returns: the stripped text content of the file.
    :raises OSError: if the file cannot be opened or read.
    """
    with open(file, 'r') as handle:
        return handle.read().strip()
ret = None

zone = "/sys/class/thermal/thermal_zone0/"
Expand Down