util_adversarial_attack.py
"""
This module contains classes for adversarial attacks.
"""
import numpy as np
import torch
from torch import optim, nn
import torch.nn.functional as F

from util_MNIST import retrieveMNISTTestData
from util_model import SimpleNeuralNet, loadModel
from art.attacks import FastGradientMethod, ProjectedGradientDescent
from art.classifiers import PyTorchClassifier
from adversarial_attack_DRO import ProjetcedDRO

img_rows, img_cols = 28, 28

def wrapModel(model, loss_criterion):
    """
    Wrap a PyTorch model in the classifier wrapper provided by ART
    (Adversarial Robustness Toolbox) by IBM.
    """
    optimizer = optim.Adam(model.parameters())
    input_shape = (1, img_rows, img_cols)
    # (0, 1) specifies the clip values (the valid pixel range); the positional
    # signature used here corresponds to the 0.x releases of ART.
    return PyTorchClassifier((0, 1), model, loss_criterion, optimizer, input_shape, nb_classes=10)


class FGSM:
    """
    Class for the fast gradient sign method (FGSM).
    This class delegates the implementation of the attack to the ART library
    developed by IBM.
    """

    def __init__(self, model, loss_criterion, norm, batch_size=128):
        self.wrapped_pytorch_model = wrapModel(model, loss_criterion)
        self.norm = norm
        self.batch_size = batch_size
        self.attack = FastGradientMethod(
            self.wrapped_pytorch_model, batch_size=batch_size)

        # Use the GPU for computation if it is available.
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def generatePerturbation(self, data, budget, minimal=False):
        """
        Generate adversarial examples from a given batch of images.
        The input data should already have been loaded on an appropriate
        device.

        Arguments:
            data: a pair of a batch of images and a batch of labels, both
                given as torch tensors (e.g. as produced by a data loader).
            budget: the maximal size of the perturbation.
            minimal: whether to search for the minimal adversarial
                perturbation instead of always using the full budget. The
                search increases the perturbation in steps of budget / 50,
                up to the budget.
        """
        images, _ = data
        images_adv = self.attack.generate(
            x=images.cpu().numpy(), norm=self.norm, eps=budget,
            minimal=minimal, eps_step=budget / 50, eps_max=budget,
            batch_size=self.batch_size)
        images_adv = torch.from_numpy(images_adv)

        # The returned tensor should be loaded on an appropriate device.
        return images_adv.to(self.device)


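# A minimal usage sketch for the FGSM class (the checkpoint path and the
# attack budget mirror the __main__ block at the bottom of this module and
# are only illustrative):
#
#     model = SimpleNeuralNet()
#     loadModel(model, "./ERM_models/SimpleModel.pt")
#     model.to(torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
#     fgsm = FGSM(model, nn.CrossEntropyLoss(), norm=np.inf)
#     images, labels = next(iter(retrieveMNISTTestData(batch_size=128)))
#     images_adv = fgsm.generatePerturbation((images, labels), budget=0.1)

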
class FGSMNative:
    """
    Class for a manual implementation of FGSM, unlike the FGSM class above,
    which delegates to ART. For an unknown reason, this class performs
    differently in adversarial attacks from the FGSM class. FGSMNative
    outperforms FGSM only in some cases (and not in all cases), and the
    difference between the two classes is not significant.
    """

    def __init__(self, model, loss_criterion, norm=np.inf, batch_size=128):
        self.model = model
        self.loss_criterion = loss_criterion
        self.norm = norm
        self.batch_size = batch_size

        # Use the GPU for computation if it is available.
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def generatePerturbation(self, data, budget, minimal=False):
        """
        Generate adversarial examples from a given batch of images.
        The input data should already have been loaded on an appropriate
        device.
        Unlike the FGSM class, the minimal perturbation is computed by a
        manual search: the perturbation grows in 50 increments of
        budget / 50, and it stops growing for an image once that image is
        misclassified.

        Arguments:
            data: a pair of a batch of images and a batch of labels, both
                given as torch tensors (e.g. as produced by a data loader).
            budget: the maximal size of the perturbation.
            minimal: whether to compute the minimal adversarial perturbation
                instead of always using the full budget.
        """
        images, labels = data
        images_adv = images.clone().detach().to(self.device)
        # We never need to compute a gradient with respect to images_adv.
        images_adv.requires_grad_(False)

        images.requires_grad_(True)
        output = self.model(images)
        loss = self.loss_criterion(output, labels)
        loss.backward()
        images.requires_grad_(False)

        if self.norm == np.inf:
            direction = images.grad.data.sign()
        elif self.norm == 2:
            # For the L2 norm, the attack direction is the gradient
            # normalized to unit L2 norm.
            flattened_grad = images.grad.data.view(-1, img_rows * img_cols)
            direction = F.normalize(
                flattened_grad, p=2, dim=1).view(images.size())
        else:
            raise ValueError("The norm is not valid.")

        if minimal:
            iterations = 50
            incremental_size = budget / iterations
            # Keep the perturbations on the same device as the images.
            minimal_perturbations = torch.zeros_like(images_adv)
            for i in range(iterations):
                outputs = self.model(
                    (images_adv + minimal_perturbations).clamp(0, 1))
                _, predicted = torch.max(outputs.data, 1)
                for j in range(labels.size()[0]):
                    # If the current adversarial examples are correctly
                    # classified, increase the size of the perturbations.
                    if predicted[j] == labels[j]:
                        minimal_perturbations[j].add_(
                            incremental_size * direction[j])
            images_adv.add_(minimal_perturbations)
        else:
            images_adv.add_(budget * direction)
        images_adv.clamp_(0, 1)

        # The returned tensor is already on an appropriate device.
        return images_adv


class PGD:
    """
    Class for adversarial attacks based on projected gradient descent (PGD).
    The implementation of PGD in ART projects onto the feasible region after
    each iteration. However, it does not use random restarts; this is the
    difference between the PGD implemented in ART and the one described by
    Madry et al.
    This adversarial attack subsumes the iterative FGSM.
    """

    def __init__(self, model, loss_criterion, norm=np.inf, batch_size=128):
        self.wrapped_pytorch_model = wrapModel(model, loss_criterion)
        self.norm = norm
        self.batch_size = batch_size
        self.attack = ProjectedGradientDescent(
            self.wrapped_pytorch_model, norm=norm, random_init=False, batch_size=batch_size)

        # Use the GPU for computation if it is available.
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def generatePerturbation(self, data, budget, max_iter=15):
        """
        Generate adversarial examples from a given batch of images.

        Arguments:
            data: a pair of a batch of images and a batch of labels, both
                given as torch tensors.
            budget: the maximal size of the perturbation.
            max_iter: the number of PGD iterations.
        """
        images, _ = data
        # eps_step is not allowed to be larger than the budget, according to
        # the documentation of ART.
        eps_step = budget / 5
        images_adv = self.attack.generate(
            x=images.cpu().numpy(), norm=self.norm, eps=budget,
            eps_step=eps_step, max_iter=max_iter, batch_size=self.batch_size)
        images_adv = torch.from_numpy(images_adv)

        # The returned tensor should be loaded on an appropriate device.
        return images_adv.to(self.device)


class DistributionalPGD:
    """
    Class for a PGD-based distributional adversarial attack (as opposed to
    pointwise adversarial attacks such as FGSM and PGD).
    By default, the 2-Wasserstein distance is used as the distributional
    distance, and the L2 norm is used as the underlying pointwise distance.
    """

    def __init__(self, model, loss_criterion):
        self.model = model
        self.loss_criterion = loss_criterion
        self.training_module = ProjetcedDRO(model, loss_criterion)

    def generatePerturbation(self, data, budget, max_iter=15):
        """
        Generate adversarial examples from a given batch of images.

        Arguments:
            data: a pair of a batch of images and a batch of labels, both
                given as torch tensors.
            budget: the perturbation budget passed to the DRO attack.
            max_iter: the number of attack steps.
        """
        images_adv, _ = self.training_module.attack(
            budget, data, steps=max_iter)

        # The output is already on an appropriate device (i.e. the GPU if it
        # is available).
        return images_adv


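# A minimal usage sketch for the DistributionalPGD class, assuming that
# ProjetcedDRO.attack accepts the same (images, labels) pairs as the classes
# above (the checkpoint path, budget, and iteration count are illustrative):
#
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     model = SimpleNeuralNet()
#     loadModel(model, "./ERM_models/SimpleModel.pt")
#     model.to(device)
#     dist_pgd = DistributionalPGD(model, nn.CrossEntropyLoss())
#     images, labels = next(iter(retrieveMNISTTestData(batch_size=128)))
#     images, labels = images.to(device), labels.to(device)
#     images_adv = dist_pgd.generatePerturbation((images, labels), budget=0.1, max_iter=15)

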
if __name__ == "__main__":
    # Load a simple neural network.
    model = SimpleNeuralNet()
    loadModel(model, "./ERM_models/SimpleModel.pt")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Load the neural network on the GPU if it is available.
    print("The neural network is now loaded on {}.".format(device))

    # Create an object for PGD.
    criterion = nn.CrossEntropyLoss()
    batch_size = 128
    pgd = PGD(model, criterion, batch_size=batch_size)
    pytorch_model = pgd.wrapped_pytorch_model

    # Read the MNIST test dataset.
    test_loader = retrieveMNISTTestData(batch_size=1024)

    # Craft adversarial examples with PGD.
    epsilon = 0.1  # Maximum perturbation
    total, correct = 0, 0
    for i, data in enumerate(test_loader):
        images, labels = data
        images, labels = images.to(device), labels.to(device)

        # images_adv is already loaded on the GPU by generatePerturbation.
        images_adv = pgd.generatePerturbation(data, epsilon)

        with torch.no_grad():
            outputs = model(images_adv)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        acc = (predicted == labels).sum().item() / labels.size(0)
        print("Iteration: {}; test accuracy on adversarial samples: {}".format(i + 1, acc))
    print("Overall accuracy on adversarial examples: {}.".format(correct / total))