# multi_step.py
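"""Train the same CIFAR-10 classifier under either OneFlow or PyTorch
(selected by ``enable_oneflow``) and log train/eval metrics through a shared
writer, so the two backends can be compared run-for-run."""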
import oneflow as flow
import torch
import torchmetrics
import torchvision
import flowvision
from flowvision.datasets import CIFAR10

from utils import SeparateWriter
from config import cfgs
from generate_model import generate_model


def serious_train(writer: SeparateWriter, epochs: int, enable_oneflow: bool, model_name='ResNet50'):
    DEVICE = 'cuda' if flow.cuda.is_available() else 'cpu'
    BATCH_SIZE = cfgs[model_name]['BATCH_SIZE']
    IMAGE_SIZE = cfgs[model_name]['IMAGE_SIZE']
    NUM_CLASSES = cfgs[model_name]['NUM_CLASSES']  # CIFAR-10
    # OneFlow mirrors the torch API, so once these aliases are bound the rest
    # of the function is backend-agnostic.
    if enable_oneflow:
        import flowvision.transforms as transforms
        import oneflow.nn as nn
        from oneflow.optim import SGD
        from oneflow.utils.data import DataLoader
    else:
        import torchvision.transforms as transforms
        import torch.nn as nn
        from torch.optim import SGD
        from torch.utils.data import DataLoader
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        # Standard ImageNet channel statistics.
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    test_transform = transforms.Compose([
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    # The Dataset class can be shared between both backends; only the
    # transforms differ.
    train_dataset = CIFAR10(root='data', train=True, transform=train_transform, download=True)
    test_dataset = CIFAR10(root='data', train=False, transform=test_transform, download=True)
    train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
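    # Note: shuffle=True draws a fresh random order every run, so cross-backend
    # comparisons are only fair if the RNGs are pinned first. A minimal sketch
    # (an addition, not part of the original script) would be to call, before
    # building the loaders:
    #   flow.manual_seed(0)
    #   torch.manual_seed(0)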
    # Open question from the original author: how do we guarantee that both
    # runs start from identical initial weights? generate_model returns a
    # (torch_model, oneflow_model) pair, presumably for this purpose.
    if enable_oneflow:
        _, model = generate_model(model_name)
    else:
        model, _ = generate_model(model_name)
    model = model.to(DEVICE)
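    # One plausible way for generate_model to keep the two initializations in
    # sync (an assumption about its internals, which are not shown here) is to
    # build the torch model first and copy its parameters into the oneflow
    # model through numpy:
    #   for of_p, t_p in zip(of_model.parameters(), t_model.parameters()):
    #       of_p.data = flow.tensor(t_p.detach().cpu().numpy())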
    def evaluate(model, data_loader, steps, model_name):
        model.eval()
        num_classes, task, average = NUM_CLASSES, 'multiclass', 'macro'
        # Metrics run on CPU so that both backends feed them identical
        # torch tensors.
        metric_collection = torchmetrics.MetricCollection({
            'Accuracy': torchmetrics.Accuracy(task=task, num_classes=num_classes, average=average).to('cpu'),
            'Precision': torchmetrics.Precision(task=task, num_classes=num_classes, average=average).to('cpu'),
            'Recall': torchmetrics.Recall(task=task, num_classes=num_classes, average=average).to('cpu'),
            'AUROC': torchmetrics.AUROC(task=task, num_classes=num_classes, average=average).to('cpu'),
        })
        for batch, (images, labels) in enumerate(data_loader):
            images, labels = images.to(DEVICE), labels.cpu()
            # no_grad keeps the autograd graph from accumulating during
            # evaluation, which previously ran out of GPU memory.
            if enable_oneflow:
                with flow.no_grad():
                    preds = model(images)
            else:
                with torch.no_grad():
                    preds = model(images)
            # The flowvision Inception forward returns a tuple here; keep
            # only the main logits.
            if model_name == 'Inception' and enable_oneflow:
                preds = preds[0]
            if enable_oneflow:
                # torchmetrics only understands torch tensors, so bridge
                # oneflow tensors through numpy.
                preds = torch.from_numpy(preds.numpy())
                labels = torch.from_numpy(labels.numpy())
            preds = preds.softmax(dim=1).cpu()
            batch_metrics = metric_collection.forward(preds, labels)
            if batch % 20 == 0:
                for key, value in batch_metrics.items():
                    writer.write_log_single(f'eval/{key}(step)', value, steps)
        # Aggregate over the whole test set, then reset for the next call.
        val_metrics = metric_collection.compute()
        print(val_metrics)
        for key, value in val_metrics.items():
            writer.write_log_single(f'eval/{key}', value, steps)
        metric_collection.reset()
    def train_model(model, train_data_loader, test_data_loader, loss_func, optimizer, model_name):
        dataset_size = len(train_data_loader.dataset)
        steps = 0
        for epoch in range(epochs):
            model.train()
            for batch, (images, labels) in enumerate(train_data_loader):
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                preds = model(images)
                # In training mode Inception also returns auxiliary logits;
                # train on the main logits only.
                if model_name == 'Inception':
                    preds = preds[0]
                loss = loss_func(preds, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                steps += 1
                if batch % 20 == 0:
                    writer.write_log_single('train/loss', loss, steps)
                    writer.write_log_single('train/epoch', epoch, steps)
                    print(f'loss: {loss:>7f} [epoch: {epoch} {batch * BATCH_SIZE:>5d}/{dataset_size:>5d}]')
            # Evaluate on the held-out test set after every epoch.
            evaluate(model, test_data_loader, steps, model_name)
    optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)
    loss_func = nn.CrossEntropyLoss()
    train_model(model, train_data_loader, test_data_loader, loss_func, optimizer, model_name)
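

# Minimal usage sketch. The SeparateWriter constructor argument below is an
# assumption about utils.SeparateWriter (not shown in this file); adjust it
# to the real signature before running.
# if __name__ == '__main__':
#     writer = SeparateWriter('runs/resnet50')
#     serious_train(writer, epochs=10, enable_oneflow=True, model_name='ResNet50')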