Skip to content

Commit

Permalink
Performance measurement fix: avoid including MemCopyHtoD time in the measured GPU iteration time
Browse files Browse the repository at this point in the history
  • Loading branch information
DEKHTIARJonathan committed Apr 7, 2022
1 parent 6128d0b commit cf30938
Showing 1 changed file with 53 additions and 19 deletions.
72 changes: 53 additions & 19 deletions tftrt/examples/benchmark_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,15 @@ def _config_gpu_memory(self, gpu_mem_cap):
tf.config.set_memory_growth(gpu, True)
except AttributeError:
tf.config.experimental.set_memory_growth(gpu, True)

else:
try:
set_virtual_device_configuration = tf.config.set_virtual_device_configuration
device_config = tf.config.LogicalDeviceConfiguration(memory_limit=gpu_mem_cap)
except AttributeError:
set_virtual_device_configuration = tf.config.experimental.set_virtual_device_configuration
device_config = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=gpu_mem_cap)

set_virtual_device_configuration(gpu, [device_config])
except RuntimeError as e:
print('Can not set GPU memory config', e)
Expand Down Expand Up @@ -378,18 +378,40 @@ def infer_batch(x):
)

iter_times = []
memcopy_times = []

def log_step(step_idx, display_every, iter_time):
def log_step(step_idx, display_every, iter_time, memcpyHtoD_time):
if step_idx % display_every == 0:
print(
f" step {step_idx:04d}, iter_time(ms)={iter_time:.3f}"
f" step {step_idx:04d}, iter_time(ms)={iter_time:.3f}, memcpyHtoD_time(ms)={memcpyHtoD_time:.3f}"
)

dataset = timed_dataset(
dataset, activate=self._args.debug_performance
)

@force_gpu_resync
@tf.function()
def force_data_on_gpu(data, device="/gpu:0"):
    """Pass every tensor in `data` through `tf.identity` on `device`.

    This forces the host-to-device copy to happen inside this call, so
    the copy cost can be timed separately from inference.

    Args:
        data: a single tensor, a list/tuple of tensors, or a dict of
            tensors (structure inferred from the isinstance checks).
        device: TF device string to place the copies on.

    Returns:
        The same structure with every leaf replaced by its on-device
        identity; list/tuple inputs come back as a list.
    """
    with tf.device(device):
        if isinstance(data, (list, tuple)):
            return [tf.identity(item) for item in data]
        if isinstance(data, dict):
            return {key: tf.identity(value) for key, value in data.items()}
        return tf.identity(data)

for step_idx, data_batch in enumerate(dataset):

start_time = time.time()
data_batch = force_data_on_gpu(data_batch)
memcopy_times.append(time.time() - start_time)

x, y = self.preprocess_model_inputs(data_batch)

start_time = time.time()
Expand All @@ -398,11 +420,14 @@ def log_step(step_idx, display_every, iter_time):

if not self._args.debug_performance:
log_step(
step_idx + 1, self._args.display_every,
np.mean(iter_times[-self._args.display_every:]) * 1000
step_idx + 1,
display_every=self._args.display_every,
iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
)
else:
print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s")
print(f"{'MemCopyHtoD Iteration Time':18s}: {iter_times[-1]:.4f}s")

if not self._args.use_synthetic_data:
data_aggregator.aggregate_data(y_pred, y)
Expand All @@ -411,15 +436,15 @@ def log_step(step_idx, display_every, iter_time):
step_idx + 1 >= self._args.num_iterations):
break

if (not self._args.debug_performance and
step_idx % self._args.display_every !=
0): # avoids double printing
if (
not self._args.debug_performance and
step_idx % self._args.display_every != 0
): # avoids double printing
log_step(
step_idx + 1,
display_every=1, # force print
iter_time=(
np.mean(iter_times[-self._args.display_every:]) * 1000
)
iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
)

with timed_section("Metric Computation"):
Expand All @@ -440,18 +465,27 @@ def log_step(step_idx, display_every, iter_time):
# Skipping last batch. Might have different batch_size
run_times = np.array(iter_times)
run_times = run_times[self._args.num_warmup_iterations:-1]
mem_times = np.array(memcopy_times)
mem_times = mem_times[self._args.num_warmup_iterations:-1]

metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
metrics['Throughput (samples/sec)'] = np.mean(
self._args.batch_size / run_times
)
metrics['99th_percentile (ms)'] = np.percentile(
run_times, q=99, interpolation='lower'
) * 1000
metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000

def timing_metrics(time_arr, log_prefix):
    """Summarize per-iteration durations (seconds) into millisecond stats.

    Args:
        time_arr: array-like of per-iteration durations, in seconds.
        log_prefix: label prepended to every metric key, e.g.
            "GPU Latency" or "MemCopyHtoD Time".

    Returns:
        dict mapping "<log_prefix> <stat> (ms)" to the statistic value
        in milliseconds.
    """
    time_arr = np.asarray(time_arr)
    data = dict()
    # BUG FIX: the original body computed every statistic from the
    # enclosing-scope `run_times` instead of the `time_arr` argument,
    # so the MemCopyHtoD metrics silently duplicated the GPU metrics.
    try:
        p99 = np.percentile(time_arr, q=99, method='lower')
    except TypeError:
        # NumPy < 1.22 only accepts the (since removed) `interpolation` kwarg.
        p99 = np.percentile(time_arr, q=99, interpolation='lower')
    data[f"{log_prefix} 99th_percentile (ms)"] = p99 * 1000
    data[f"{log_prefix} Mean (ms)"] = np.mean(time_arr) * 1000
    data[f"{log_prefix} Median (ms)"] = np.median(time_arr) * 1000
    data[f"{log_prefix} Min (ms)"] = np.min(time_arr) * 1000
    data[f"{log_prefix} Max (ms)"] = np.max(time_arr) * 1000
    return data

metrics.update(timing_metrics(run_times, "GPU Latency"))
metrics.update(timing_metrics(mem_times, "MemCopyHtoD Time"))

self._export_runtime_metrics_to_json(metrics)

Expand Down

0 comments on commit cf30938

Please sign in to comment.