-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathtest_stack.py
139 lines (122 loc) · 4.83 KB
/
test_stack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -------------------------------------------------------------------------------------
# Negative-Aware Attention Framework for Image-Text Matching implementation based on SCAN
# https://github.com/CrossmodalGroup/NAAF
# "Negative-Aware Attention Framework for Image-Text Matching"
# Kun Zhang, Zhendong Mao, Quan Wang, Yongdong Zhang
#
# Written by Kun Zhang, 2022
# -------------------------------------------------------------------------------------
from vocab import Vocabulary
import evaluation
import numpy as np
import os
def i2t(im_len, sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation)

    For each scored image, all captions are ordered by descending
    similarity and the best (smallest) 0-based position reached by any of
    captions ``5 * k .. 5 * k + 4`` is recorded as the rank of image ``k``.

    im_len: number of images; used as the number of rows to score when
        ``npts`` is not given
    sims: (N, 5N) matrix of similarity im-cap (any array-like)
    npts: number of leading images to score; defaults to ``im_len``
    return_ranks: when true, additionally return the per-image ranks and
        the index of the top-scoring caption for each image

    Returns ``(r1, r5, r10, medr, meanr)`` where ``r1``/``r5``/``r10`` are
    the percentage of images with rank below 1/5/10 and ``medr``/``meanr``
    are the 1-based median and mean rank.  With ``return_ranks`` the result
    is ``((r1, r5, r10, medr, meanr), (ranks, top1))``.
    """
    # Honour an explicit npts instead of unconditionally overwriting it.
    if npts is None:
        npts = im_len
    sims = np.asarray(sims)

    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    for index in range(npts):
        # Caption indices from most to least similar to this image.
        order = np.argsort(sims[index])[::-1]

        # Score: best position among this image's five captions.
        first_cap = 5 * index
        ranks[index] = min(
            np.where(order == cap)[0][0]
            for cap in range(first_cap, first_cap + 5)
        )
        top1[index] = order[0]

    # Compute metrics
    total = len(ranks)
    r1 = 100.0 * np.count_nonzero(ranks < 1) / total
    r5 = 100.0 * np.count_nonzero(ranks < 5) / total
    r10 = 100.0 * np.count_nonzero(ranks < 10) / total
    # Ranks are 0-based internally; reported median/mean are 1-based.
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    metrics = (r1, r5, r10, medr, meanr)
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics
def t2i(im_len, sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)

    For each scored caption, all images are ordered by descending
    similarity and the 0-based position of the caption's own image is
    recorded.  Captions ``5 * k .. 5 * k + 4`` are scored against image
    ``k``.

    im_len: number of images; used as the number of images whose captions
        are scored when ``npts`` is not given
    sims: (N, 5N) matrix of similarity im-cap (any array-like)
    npts: number of leading images whose captions are scored; defaults to
        ``im_len``
    return_ranks: when true, additionally return the per-caption ranks and
        the index of the top-scoring image for each caption

    Returns ``(r1, r5, r10, medr, meanr)`` where ``r1``/``r5``/``r10`` are
    the percentage of captions with rank below 1/5/10 and ``medr``/``meanr``
    are the 1-based median and mean rank.  With ``return_ranks`` the result
    is ``((r1, r5, r10, medr, meanr), (ranks, top1))``.
    """
    # Honour an explicit npts instead of unconditionally overwriting it.
    if npts is None:
        npts = im_len

    # --> (5N(caption), N(image)); asarray lets nested lists be transposed too.
    cap_sims = np.asarray(sims).T

    ranks = np.zeros(5 * npts)
    top1 = np.zeros(5 * npts)
    for index in range(npts):
        for cap in range(5 * index, 5 * index + 5):
            # Image indices from most to least similar to this caption.
            order = np.argsort(cap_sims[cap])[::-1]
            ranks[cap] = np.where(order == index)[0][0]
            top1[cap] = order[0]

    # Compute metrics
    total = len(ranks)
    r1 = 100.0 * np.count_nonzero(ranks < 1) / total
    r5 = 100.0 * np.count_nonzero(ranks < 5) / total
    r10 = 100.0 * np.count_nonzero(ranks < 10) / total
    # Ranks are 0-based internally; reported median/mean are 1-based.
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    metrics = (r1, r5, r10, medr, meanr)
    if return_ranks:
        return metrics, (ranks, top1)
    return metrics
if __name__ == '__main__':
    # Evaluate the ensemble of two runs: their similarity matrices are
    # combined and the combined scores are ranked with i2t / t2i.

    # Expose only CUDA device 0 to this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Arguments forwarded to evaluation.evalstack; they are empty
    # placeholders in this file.
    RUN_PATH1 = ""
    RUN_PATH2 = ""
    DATA_PATH = ""
    # False: score the whole test split once.  True: score five shards and
    # report their mean.
    isfold5 = False

    sims1 = evaluation.evalstack(
        RUN_PATH1, data_path=DATA_PATH, split="test", fold5=isfold5)
    sims2 = evaluation.evalstack(
        RUN_PATH2, data_path=DATA_PATH, split="test", fold5=isfold5)

    if not isfold5:
        # i2t / t2i only sort the scores, so the plain sum ranks exactly
        # like the average would.
        sims = (sims1 + sims2)
        im_len = len(sims)
        print('im length:', im_len)

        r = i2t(im_len, sims)
        ri = t2i(im_len, sims)

        # Mean of R@1/R@5/R@10 per direction, and their overall sum.
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # One row per shard, laid out as:
        #   [0:5]  i2t (r1, r5, r10, medr, meanr)
        #   [5:10] t2i (r1, r5, r10, medr, meanr)
        #   [10]   ar, [11] ari, [12] rsum
        results = []
        for i in range(5):
            sim_shard = (sims1[i] + sims2[i]) / 2
            im_len = len(sim_shard)
            print('im length:', im_len)

            r = i2t(im_len, sim_shard)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri = t2i(im_len, sim_shard)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        # Read each summary value from the column it was stored in: the
        # previous indices (10 * 6, 11, 12) printed ar * 6 as rsum, ari as
        # the i2t average and rsum as the t2i average.
        print("rsum: %.1f" % mean_metrics[12])
        print("Average i2t Recall: %.1f" % mean_metrics[10])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[11])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" %
              mean_metrics[5:10])