# This file contains utility functions for distributed training.
import portalocker
import torch
import torch.nn as nn
import torch.optim as optim

from train.train import train_distributed


def LOG(text, file_path):
    """Append a line of text to file_path, holding an exclusive file lock so
    that several worker processes can share the same log file safely."""
    with open(file_path, "a+") as file:
        try:
            # Acquire an exclusive lock before writing
            portalocker.lock(file, portalocker.LOCK_EX)
            # Write the message followed by a newline
            file.write(text + '\n')
        finally:
            # Release the lock
            portalocker.unlock(file)


def LOG_AND_PRINT(text, file_path):
    """Print the text to stdout and also append it to the log file."""
    print(text)
    LOG(text, file_path)


# Each worker process runs its own copy of this function and pushes the
# gradients it computes onto a shared queue for the parent process to collect.
def train_model(model, train_loader, queue, epoch, learning_rate, device):
    # print("Number of CPUs being used", multiprocessing.cpu_count())
    # Loss and optimizer for this worker's copy of the model
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    print("training")
    gradients = train_distributed(
        model, device, train_loader, optimizer, criterion, epoch
    )
    # Hand the computed gradients back to the parent process
    queue.put(gradients)
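

# Illustrative sketch (not part of the original module) of how a parent process
# might drive train_model: spawn one worker per data shard, collect each
# worker's gradients from the shared queue, then average them with
# average_model_gradients (defined below). The spawn context, the per-worker
# DataLoaders, and the hyperparameters are assumptions made for illustration.
#
#     import torch.multiprocessing as mp
#
#     def parallel_training_round(model, loaders, epoch, learning_rate, device):
#         ctx = mp.get_context("spawn")
#         queue = ctx.Queue()
#         workers = [
#             ctx.Process(
#                 target=train_model,
#                 args=(model, loader, queue, epoch, learning_rate, device),
#             )
#             for loader in loaders
#         ]
#         for worker in workers:
#             worker.start()
#         # Drain the queue before joining so workers are not blocked on a full queue
#         gradient_list = [queue.get() for _ in workers]
#         for worker in workers:
#             worker.join()
#         return average_model_gradients(gradient_list)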


def get_model_parameters(model):
    """Extract parameters from a single model."""
    parameters = {
        name: param.clone().detach() for name, param in model.named_parameters()
    }
    return parameters


def average_model_parameters(model_parameters_list):
    """Average the parameters of models in a list."""
    avg_parameters = {}
    for key in model_parameters_list[0].keys():
        # Stack the same parameter from each model and then take the mean
        avg_parameters[key] = torch.stack(
            [params[key] for params in model_parameters_list if key in params]
        ).mean(dim=0)
    return avg_parameters


def apply_model_parameters(model, model_parameters):
    """Apply model parameters to a given model."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in model_parameters:
                param.copy_(model_parameters[name])


def average_model_gradients(gradient_list):
    """Average the gradients of models in a list."""
    avg_gradients = {}
    for key in gradient_list[0].keys():
        # Stack the same gradient from each model and then take the mean
        avg_gradients[key] = torch.stack(
            [grads[key] for grads in gradient_list if key in grads]
        ).mean(dim=0)
    return avg_gradients


def apply_averaged_parameters_and_gradients(model, avg_parameters, avg_gradients):
    """Apply averaged parameters and gradients to a model."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in avg_parameters:
                param.copy_(avg_parameters[name])
            # Gradients are only copied if the parameter already has a .grad buffer
            if param.grad is not None and name in avg_gradients:
                param.grad.copy_(avg_gradients[name])
    return model


def apply_averaged_parameters(model, avg_parameters):
    """Apply averaged parameters to a model (gradients are left untouched)."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name in avg_parameters:
                param.copy_(avg_parameters[name])
    return model
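

# Minimal usage sketch (not in the original file): average the parameters of
# two small models and apply the result to a third. nn.Linear and the layer
# sizes are arbitrary stand-ins chosen only to keep the example self-contained.
if __name__ == "__main__":
    model_a = nn.Linear(4, 2)
    model_b = nn.Linear(4, 2)
    target = nn.Linear(4, 2)

    params_list = [get_model_parameters(model_a), get_model_parameters(model_b)]
    avg_params = average_model_parameters(params_list)
    target = apply_averaged_parameters(target, avg_params)

    # Each parameter of `target` should now be the element-wise mean of the
    # corresponding parameters of model_a and model_b.
    expected_weight = (model_a.weight + model_b.weight) / 2
    print(torch.allclose(target.weight, expected_weight))  # expected: True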