-
Notifications
You must be signed in to change notification settings - Fork 12
/
clip_loss.py
53 lines (40 loc) · 1.75 KB
/
clip_loss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import torch
import clip
from clip_utils import background_dict, category_dict
# maximize similarity
class SimMaxLoss(torch.nn.Module):
    """Weighted negative-log loss that decreases as the input grows.

    The loss is ``-(log(x + margin) * weights).mean()`` where ``x`` is first
    clamped to ``[0.0001, 0.9999]``.

    Args:
        margin: Constant added to the clamped input before the logarithm.
    """

    def __init__(self, margin=0):
        super().__init__()
        self.margin = margin

    def forward(self, x, weights):
        """Return the mean of the weighted negative log of ``x``.

        Args:
            x: Tensor of scores (presumably similarities in [0, 1] -- confirm
                against callers). Values outside ``[0.0001, 0.9999]`` are
                clamped to that range.
            weights: Factor multiplied element-wise with the log term.

        Returns:
            The mean of ``-(log(x + margin) * weights)`` over all elements.
        """
        # Keep the argument of log away from 0 (and bounded above).
        bounded = torch.clamp(x, min=0.0001, max=0.9999)
        weighted_log = torch.log(bounded + self.margin) * weights
        return -weighted_log.mean()
# minimize similarity
class SimMinLoss(torch.nn.Module):
    """Weighted negative-log loss that decreases as the input shrinks.

    The loss is ``-(log(1 - x + margin) * weights).mean()`` where ``x`` is
    first clamped to ``[0.0001, 0.9999]``.

    Args:
        margin: Constant added to ``1 - x`` before the logarithm.
    """

    def __init__(self, margin=0):
        super().__init__()
        self.margin = margin

    def forward(self, x, weights):
        """Return the mean of the weighted negative log of ``1 - x``.

        Args:
            x: Tensor of scores (presumably similarities in [0, 1] -- confirm
                against callers). Values outside ``[0.0001, 0.9999]`` are
                clamped to that range.
            weights: Factor multiplied element-wise with the log term.

        Returns:
            The mean of ``-(log(1 - x + margin) * weights)`` over all elements.
        """
        # Clamping keeps 1 - x strictly positive so the log stays finite.
        bounded = torch.clamp(x, min=0.0001, max=0.9999)
        weighted_log = torch.log(1 - bounded + self.margin) * weights
        return -weighted_log.mean()
# suppress background activation
class BackgroundSuppressionLoss(torch.nn.Module):
    """Penalize image/background-text similarity above a threshold.

    Images and the background prompts registered for ``dname`` are embedded
    with the supplied CLIP model, L2-normalized and compared by dot product.
    Every image/prompt pair whose similarity exceeds ``threshold`` contributes
    ``-log(1 - similarity)`` to the loss; all other pairs contribute zero.

    Args:
        threshold: Similarity above which a pair is penalized.
        dname: Key into ``background_dict`` selecting the background prompts.
    """

    def __init__(self, threshold=0.26, dname='coco'):
        super().__init__()
        self.dname = dname
        # Background prompts for this dataset (presumably a list of strings,
        # since they are passed to clip.tokenize -- confirm in clip_utils).
        self.background = background_dict[dname]
        self.threshold = threshold
        print(f'Use CBSLoss! threshold: {threshold}')

    @staticmethod
    def _masked_log_penalty(similarity, threshold, eps):
        """Sum ``-log(1 - s)`` over entries of ``similarity`` above ``threshold``."""
        # Clamp 1 - s to at least eps so the log stays finite when a
        # similarity reaches (or, through rounding, slightly exceeds) 1.
        # Values with 1 - s >= eps are unaffected.
        complement = (1 - similarity).clamp(min=eps)
        # 1 where the pair is penalized, 0 elsewhere, in the input's dtype.
        mask = (similarity > threshold).to(similarity.dtype)
        return -(torch.log(complement) * mask).sum()

    def forward(self, clip_model, images, eps=0.0001):
        """Compute the background suppression loss for a batch of images.

        Args:
            clip_model: Object exposing ``encode_image`` and ``encode_text``.
            images: Image tensor accepted by ``clip_model.encode_image``.
            eps: Lower bound applied to ``1 - similarity`` before the log.

        Returns:
            Scalar tensor: the sum of ``-log(1 - similarity)`` over all
            image/prompt pairs whose similarity exceeds ``self.threshold``.
        """
        image_features = clip_model.encode_image(images)  # [N1, C]
        # Place the tokens on the same device as the images rather than
        # assuming CUDA, so the loss also runs on CPU-only hosts.
        tokens = clip.tokenize(self.background).to(images.device)
        text_features = clip_model.encode_text(tokens)  # [N2, C]

        # Normalize so the dot product below is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        logits_per_image = image_features @ text_features.t()  # [N1, N2]
        return self._masked_log_penalty(logits_per_image, self.threshold, eps)