forked from fizx/libbow-osx
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kl-div.c
120 lines (110 loc) · 2.79 KB
/
kl-div.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* Calculate Kulback-Leibler Divergence between two word distributions */
#include <bow/libbow.h>
void
print_usage (const char *progname)
{
fprintf (stderr, "%s [-m] worddistfile1 worddistfile2\n", progname);
fprintf (stderr, " -m calculates `KL-Divergence to the mean'\n");
exit (-1);
}
int
main (int argc, char *argv[])
{
int doing_larry_loss = 0;
int doing_kl_div_to_mean = 0;
FILE *fp;
float prob;
static const int word_len = 1024;
char word[word_len];
int vocab_size;
float *dist[2];
int d;
double kldiv;
int wi;
float dist_sum;
if (argc == 4)
{
if (!strcmp (argv[1], "-m"))
doing_kl_div_to_mean = 1;
else if (!strcmp (argv[1], "-l"))
doing_larry_loss = 1;
else
print_usage (argv[0]);
}
else if (argc != 3)
print_usage (argv[0]);
/* Read each word distribution to get the vocabulary size */
for (d = 0; d < 2; d++)
{
fp = bow_fopen (argv[argc-2+d], "r");
while (fscanf (fp, "%f %s", &prob, word) == 2)
{
assert (strlen (word) < word_len);
wi = bow_word2int (word);
}
fclose (fp);
}
/* Insist that no more words be added to the vocabulary. */
bow_word2int_do_not_add = 1;
vocab_size = bow_num_words ();
bow_verbosify (bow_verbose, "Vocabulary size is %d\n", vocab_size);
/* Initialize the distribution array to zeros. */
for (d = 0; d < 2; d++)
{
dist[d] = (float*) bow_malloc (sizeof(float) * vocab_size);
for (wi = 0; wi < vocab_size; wi++)
dist[d][wi] = 0;
}
/* Read each word distribution to fill in the distribution array */
for (d = 0; d < 2; d++)
{
fp = bow_fopen (argv[argc-2+d], "r");
dist_sum = 0;
while (fscanf (fp, "%f %s", &prob, word) == 2)
{
assert (strlen (word) < word_len);
wi = bow_word2int (word);
assert (wi != -1);
assert (dist[d][wi] == 0);
dist[d][wi] = prob;
dist_sum += prob;
}
if (dist_sum < 0.98 || dist_sum > 1.02)
bow_error ("Distribution%d sum != 1.0, =%f\n", d, dist_sum);
fclose (fp);
}
#if 1
/* Calculate the value of Larry Wasserman's Loss function. Assume
that the first distribution is the correct one. */
kldiv = 0;
if (doing_larry_loss)
{
double diff;
for (wi = 0; wi < vocab_size; wi++)
{
diff = dist[1][wi] - dist[0][wi];
if (dist[0][wi])
kldiv += (diff * diff) / (dist[0][wi] * (1.0 - dist[0][wi]));
}
printf ("%g\n", kldiv);
exit (0);
}
#endif
/* Calculate the KL-Div */
kldiv = 0;
if (doing_kl_div_to_mean)
{
for (wi = 0; wi < vocab_size; wi++)
kldiv += dist[0][wi] * log (dist[0][wi]
/ ((dist[0][wi] + dist[1][wi])/2));
}
else
{
for (wi = 0; wi < vocab_size; wi++)
if (dist[0][wi])
kldiv += dist[0][wi] * log (dist[0][wi]
/ dist[1][wi]);
}
printf ("%g\n", kldiv);
exit (0);
}