1
+ package zingg .common .core .block ;
2
+
3
+ import org .junit .jupiter .api .Assertions ;
4
+ import org .junit .jupiter .api .Test ;
5
+ import zingg .common .client .Arguments ;
6
+ import zingg .common .client .ArgumentsUtil ;
7
+ import zingg .common .client .IArguments ;
8
+ import zingg .common .client .ZFrame ;
9
+ import zingg .common .client .ZinggClientException ;
10
+ import zingg .common .client .util .DFObjectUtil ;
11
+ import zingg .common .core .block .dataUtility .CsvReader ;
12
+ import zingg .common .core .block .dataUtility .DataUtility ;
13
+ import zingg .common .core .block .model .Customer ;
14
+ import zingg .common .core .block .model .CustomerDupe ;
15
+ import zingg .common .core .util .BlockingTreeUtil ;
16
+ import zingg .common .core .util .HashUtil ;
17
+
18
+ import java .util .Iterator ;
19
+ import java .util .List ;
20
+ import java .util .Objects ;
21
+
22
+ import static java .lang .Math .max ;
23
+
24
+ public abstract class TestBlockingTreeUtil <S , D , R , C , T > {
25
+
26
+ protected String TEST_DATA_BASE_LOCATION ;
27
+ private int maxDepth = 1 , totalNodes = 0 ;
28
+ private static String TEST_FILE = "test.csv" ;
29
+ private static String CONFIG_FILE = "config.json" ;
30
+ private final DataUtility dataUtility ;
31
+
32
+ public TestBlockingTreeUtil () {
33
+ setTestDataBaseLocation ();
34
+ this .dataUtility = new DataUtility (new CsvReader ());
35
+ }
36
+
37
+ @ Test
38
+ public void testSameBlockingTreeWithoutVariance () throws Exception , ZinggClientException {
39
+ List <Customer > testCustomers = dataUtility .getCustomers (TEST_DATA_BASE_LOCATION + "/" + TEST_FILE );
40
+ //setting variance as false
41
+ List <CustomerDupe > testCustomerDupes = dataUtility .getCustomerDupes (TEST_DATA_BASE_LOCATION + "/" + TEST_FILE , false );
42
+ DFObjectUtil <S , D , R , C > dfObjectUtil = getDFObjectUtil ();
43
+
44
+ ZFrame <D , R , C > zFrameTest = dfObjectUtil .getDFFromObjectList (testCustomers , Customer .class );
45
+ ZFrame <D , R , C > zFramePositives = dfObjectUtil .getDFFromObjectList (testCustomerDupes , CustomerDupe .class );
46
+
47
+ testSameBlockingTree (zFrameTest , zFramePositives );
48
+ }
49
+
50
+ @ Test
51
+ public void testSameBlockingTreeWithVariance () throws Exception , ZinggClientException {
52
+ List <Customer > testCustomers = dataUtility .getCustomers (TEST_DATA_BASE_LOCATION + "/" + TEST_FILE );
53
+ //setting variance as true
54
+ List <CustomerDupe > testCustomerDupes = dataUtility .getCustomerDupes (TEST_DATA_BASE_LOCATION + "/" + TEST_FILE , true );
55
+ DFObjectUtil <S , D , R , C > dfObjectUtil = getDFObjectUtil ();
56
+
57
+ ZFrame <D , R , C > zFrameTest = dfObjectUtil .getDFFromObjectList (testCustomers , Customer .class );
58
+ ZFrame <D , R , C > zFramePositives = dfObjectUtil .getDFFromObjectList (testCustomerDupes , CustomerDupe .class );
59
+
60
+ testSameBlockingTree (zFrameTest , zFramePositives );
61
+ }
62
+
63
+ public void testSameBlockingTree (ZFrame <D , R , C > zFrameTest , ZFrame <D , R , C > zFramePositives ) throws Exception , ZinggClientException {
64
+ setTestDataBaseLocation ();
65
+ BlockingTreeUtil <S , D , R , C , T > blockingTreeUtil = getBlockingTreeUtil ();
66
+ HashUtil <S , D , R , C , T > hashUtil = getHashUtil ();
67
+
68
+
69
+ IArguments args = new ArgumentsUtil (Arguments .class ).createArgumentsFromJSON (
70
+ TEST_DATA_BASE_LOCATION + "/" + CONFIG_FILE ,
71
+ "" );
72
+ args .setBlockSize (8 );
73
+
74
+ long ts = System .currentTimeMillis ();
75
+ Tree <Canopy <R >> blockingTreeOptimized = blockingTreeUtil .createBlockingTree (zFrameTest , zFramePositives , 1 , -1 ,
76
+ args , hashUtil .getHashFunctionList (), HashUtility .CACHED );
77
+ System .out .println ("************ time taken to create optimized blocking tree ************ " + (System .currentTimeMillis () - ts ));
78
+
79
+ ts = System .currentTimeMillis ();
80
+ Tree <Canopy <R >> blockingTreeDefault = blockingTreeUtil .createBlockingTree (zFrameTest , zFramePositives , 1 , -1 ,
81
+ args , hashUtil .getHashFunctionList (), HashUtility .DEFAULT );
82
+ System .out .println ("************ time taken to create blocking tree ************ " + (System .currentTimeMillis () - ts ));
83
+
84
+ int depth = 1 ;
85
+ //assert both the trees are equal
86
+ Assertions .assertTrue (dfsSameTreeValidation (blockingTreeDefault , blockingTreeOptimized , depth ));
87
+
88
+ System .out .println ("-------- max depth of trees -------- " + maxDepth );
89
+ System .out .println ("-------- total nodes in a trees -------- " + totalNodes );
90
+ }
91
+
92
+
93
+ private boolean dfsSameTreeValidation (Tree <Canopy <R >> node1 , Tree <Canopy <R >> node2 , int depth ) {
94
+ totalNodes ++;
95
+ maxDepth = max (maxDepth , depth );
96
+
97
+ //if both the node1 and node2 are null, return true
98
+ if (node1 == null && node2 == null ){
99
+ return true ;
100
+ }
101
+ //if only one of node1 or node2 is null, return false
102
+ if (node1 == null || node2 == null ){
103
+ return false ;
104
+ }
105
+
106
+ if (!performValidationOnNode1AndNode2 (node1 , node2 )) {
107
+ return false ;
108
+ }
109
+
110
+ Iterator <Tree <Canopy <R >>> canopyIterator1 = node1 .getSubTrees ().iterator ();
111
+ Iterator <Tree <Canopy <R >>> canopyIterator2 = node2 .getSubTrees ().iterator ();
112
+
113
+ boolean isEqual = true ;
114
+
115
+ //recurse through sub-trees
116
+ while (canopyIterator1 .hasNext () && canopyIterator2 .hasNext ()) {
117
+ isEqual &= dfsSameTreeValidation (canopyIterator1 .next (), canopyIterator2 .next (), depth + 1 );
118
+ }
119
+
120
+ return isEqual ;
121
+ }
122
+
123
+
124
+ private boolean performValidationOnNode1AndNode2 (Tree <Canopy <R >> node1 , Tree <Canopy <R >> node2 ) {
125
+ boolean functionEqual = isNodeFunctionEqual (node1 .getHead (), node2 .getHead ());
126
+ boolean contextEqual = isNodeContextEqual (node1 .getHead (), node2 .getHead ());
127
+ boolean hashEqual = isNodeHashEqual (node1 .getHead (), node2 .getHead ());
128
+ boolean subtreeSizeEqual = isNodeSubTreesSizeEqual (node1 , node2 );
129
+
130
+ return functionEqual && contextEqual && hashEqual && subtreeSizeEqual ;
131
+ }
132
+ private boolean isNodeFunctionEqual (Canopy <R > node1Head , Canopy <R > node2Head ) {
133
+ if (node1Head .getFunction () == null && node2Head .getFunction () == null ) {
134
+ return true ;
135
+ } else if (node1Head .getFunction () == null || node2Head .getFunction () == null ) {
136
+ return false ;
137
+ } else {
138
+ return Objects .equals (node1Head .getFunction ().getName (), node2Head .getFunction ().getName ());
139
+ }
140
+ }
141
+
142
+ private boolean isNodeHashEqual (Canopy <R > node1Head , Canopy <R > node2Head ) {
143
+ return Objects .equals (node1Head .getHash (), node2Head .getHash ());
144
+ }
145
+
146
+ private boolean isNodeContextEqual (Canopy <R > node1Head , Canopy <R > node2Head ) {
147
+
148
+ if (node1Head .getContext () == null && node2Head .getContext () == null ) {
149
+ return true ;
150
+ } else if (node1Head .getContext () == null || node2Head .getContext () == null ) {
151
+ return false ;
152
+ } else {
153
+ return Objects .equals (node1Head .getContext ().getName (), node2Head .getContext ().getName ());
154
+ }
155
+ }
156
+
157
+ private boolean isNodeSubTreesSizeEqual (Tree <Canopy <R >> node1 , Tree <Canopy <R >> node2 ) {
158
+ return node1 .getSubTrees ().size () == node2 .getSubTrees ().size ();
159
+ }
160
+
161
+
162
+ protected abstract DFObjectUtil <S , D , R , C > getDFObjectUtil ();
163
+ protected abstract BlockingTreeUtil <S , D , R , C , T > getBlockingTreeUtil ();
164
+ protected abstract HashUtil <S , D , R , C , T > getHashUtil ();
165
+ protected abstract void setTestDataBaseLocation ();
166
+ }
0 commit comments