-
Notifications
You must be signed in to change notification settings - Fork 584
/
generate-clusters.cpp
86 lines (66 loc) · 2.55 KB
/
generate-clusters.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*! \file generate-clusters.cpp
* \brief Generate clusters from a similarity matrix using the Metropolis-Hastings algorithm
* \author Georgi Gerganov
*/
#include "subbreak2.h"
/*! \brief Entry point: cluster the characters of an embedded sample text.
 *
 * Loads an n-gram frequency map from the file named by argv[1], builds and
 * normalizes a similarity map for a hard-coded plain text, and then runs an
 * endless propose/accept loop over cluster assignments. A mutated assignment
 * is accepted when frand() <= exp(pNew - pCur). After every accepted step the
 * pairwise agreement between the clustering and the real text is printed, and
 * Cipher::subbreak is invoked at most once per 100 accepted steps while that
 * agreement is above 0.94.
 *
 * \param argc number of command-line arguments
 * \param argv argv[1] is the path handed to Cipher::loadFreqMap
 * \return -1 when the n-gram argument is missing or the frequency map cannot
 *         be loaded; the sampling loop itself has no exit condition
 */
int main(int argc, char ** argv) {
    // The usage line is printed on every run, not only when arguments are missing.
    printf("Usage: %s n-gram.txt\n", argv[0]);
    if (argc < 2) {
        return -1;
    }

    //srand(time(0));

    Cipher::TFreqMap freqMap;
    if (Cipher::loadFreqMap(argv[1], freqMap) == false) {
        return -1;
    }

    std::string plain;
    plain = R"(
as far as services go, only two steady contributors of revenue
streams keep swelling without apple having to charge
subscription fees. one is the money paid by google parent alphabet
for searches made through apple products such as the safari browser and siri.
)";

    Cipher::TParameters params;

    TSimilarityMap ccMap;
    Cipher::generateSimilarityMap(params, plain, ccMap);

    TSimilarityMap logMap;
    TSimilarityMap logMapInv;
    Cipher::normalizeSimilarityMap(params, ccMap, logMap, logMapInv);

    Cipher::TResult result;
    Cipher::generateClustersInitialGuess(params, ccMap, result.clusters);

    auto pCur = Cipher::calcPClusters(params, ccMap, logMap, logMapInv, result.clusters, result.clMap);

    // The text never changes, so its length and the number of unordered
    // character pairs are computed once instead of on every accepted step.
    const int n = plain.size();
    const int nPairs = (n*(n - 1))/2;

    // Accepted steps since the last subbreak run. Starting at 99 lets the very
    // first accepted step qualify, provided the agreement threshold is met.
    int nAcceptedSinceSubbreak = 99;

    while (true) {
        auto clustersNew = result.clusters;
        Cipher::mutateClusters(params, clustersNew);

        // NOTE(review): result.clMap is passed here for the proposal as well; if
        // calcPClusters writes to it, it may reflect a rejected proposal - confirm.
        auto pNew = Cipher::calcPClusters(params, ccMap, logMap, logMapInv, clustersNew, result.clMap);
        //printf("pNew = %g, pCur = %g\n", pNew, pCur);

        auto u = frand();

        // Presumably pNew/pCur are log-probabilities, so the ratio pNew/pCur of
        // the plain form becomes exp(pNew - pCur) - TODO confirm.
        //auto alpha = pNew/pCur;
        auto alpha = std::exp((pNew - pCur));
        //printf("alpha = %g\n", alpha);

        if (u <= alpha) {
            result.clusters = clustersNew;
            pCur = pNew;

            // Count the pairs (j, i) on which the clustering agrees with the
            // text: same character and same cluster, or different character
            // and different cluster.
            int nMatch = 0;
            for (int j = 0; j < n - 1; ++j) {
                for (int i = j + 1; i < n; ++i) {
                    const bool sameChar    = plain[i] == plain[j];
                    const bool sameCluster = result.clusters[i] == result.clusters[j];
                    if (sameChar == sameCluster) {
                        ++nMatch;
                    }
                }
            }

            float pMatch = float(nMatch)/nPairs;

            printf("pCur = %g, pMatch = %g, alpha = %g\n", pCur, pMatch, alpha);

            if (++nAcceptedSinceSubbreak >= 100 && pMatch > 0.94) {
                nAcceptedSinceSubbreak = 0;
                params.nSubbreakIterations = 1000;
                Cipher::subbreak(params, freqMap, result);
            }
        }
    }

    return 0;
}