@@ -21,6 +21,7 @@ public abstract class Block<D,R,C,T> implements Serializable {
21
21
private static final long serialVersionUID = 1L ;
22
22
23
23
public static final Log LOG = LogFactory .getLog (Block .class );
24
+ private final BlockUtility <R > blockUtility ;
24
25
25
26
protected ZFrame <D ,R ,C > dupes ;
26
27
// Class[] types;
@@ -30,13 +31,14 @@ public abstract class Block<D,R,C,T> implements Serializable {
30
31
protected ListMap <HashFunction <D ,R ,C ,T >, String > childless ;
31
32
32
33
/*
 * No-argument constructor.
 *
 * In this diff the previously empty body is replaced by one that creates the
 * BlockUtility<R> helper. blockUtility is declared final, so every
 * constructor has to assign it. Unlike the (training, dupes) constructor,
 * this one does not assign training, dupes or childless.
 *
 * NOTE(review): Block implements Serializable and blockUtility is a
 * non-transient instance field -- confirm that BlockUtility is itself
 * Serializable, otherwise serializing a Block would fail at runtime.
 */
public Block () {
33
-
34
- }
34
+ blockUtility = new BlockUtility < R >();
35
+ }
35
36
36
37
public Block (ZFrame <D ,R ,C > training , ZFrame <D ,R ,C > dupes ) {
37
38
this .training = training ;
38
39
this .dupes = dupes ;
39
40
childless = new ListMap <HashFunction <D ,R ,C ,T >, String >();
41
+ blockUtility = new BlockUtility <R >();
40
42
// types = getSampleTypes();
41
43
/*
42
44
* for (Class type : types) { LOG.info("Type is " + type); }
@@ -121,11 +123,13 @@ public void estimateElimCount(Canopy<R> c, long elimCount) {
121
123
}
122
124
123
125
public Canopy <R >getBestNode (Tree <Canopy <R >> tree , Canopy <R >parent , Canopy <R >node ,
124
- List <FieldDefinition > fieldsOfInterest ) throws Exception {
126
+ List <FieldDefinition > fieldsOfInterest , int startIndexToIterate ) throws Exception {
125
127
long least = Long .MAX_VALUE ;
126
128
int maxElimination = 0 ;
129
+ int numberOfFields = fieldsOfInterest .size ();
127
130
Canopy <R >best = null ;
128
- for (FieldDefinition field : fieldsOfInterest ) {
131
+ for (int fieldsToExplore = 0 ; fieldsToExplore < numberOfFields ; startIndexToIterate = (startIndexToIterate + 1 ) % numberOfFields , fieldsToExplore ++) {
132
+ FieldDefinition field = fieldsOfInterest .get (startIndexToIterate );
129
133
if (LOG .isDebugEnabled ()){
130
134
LOG .debug ("Trying for " + field + " with data type " + field .getDataType () + " and real dt "
131
135
+ getFeatureFactory ().getDataTypeFromString (field .getDataType ()));
@@ -217,7 +221,7 @@ public void estimateElimCount(Canopy<R> c, long elimCount) {
217
221
* @throws ZinggClientException
218
222
*/
219
223
public Tree <Canopy <R >> getBlockingTree (Tree <Canopy <R >> tree , Canopy <R >parent ,
220
- Canopy <R >node , List <FieldDefinition > fieldsOfInterest ) throws Exception , ZinggClientException {
224
+ Canopy <R >node , List <FieldDefinition > fieldsOfInterest , int startIndexToIterate ) throws Exception , ZinggClientException {
221
225
if (LOG .isDebugEnabled ()) {
222
226
LOG .debug ("Tree so far " );
223
227
LOG .debug (tree );
@@ -228,8 +232,11 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
228
232
}
229
233
if (size > maxSize && node .getDupeN () != null && node .getDupeN ().size () > 0 ) {
230
234
LOG .debug ("Size is bigger " );
231
- Canopy <R >best = getBestNode (tree , parent , node , fieldsOfInterest );
235
+ Canopy <R >best = getBestNode (tree , parent , node , fieldsOfInterest , startIndexToIterate == - 1 ? 0 : startIndexToIterate );
232
236
if (best != null ) {
237
+ if (startIndexToIterate != -1 ) {
238
+ startIndexToIterate = blockUtility .getStartIndexToIterateOverFields (best , fieldsOfInterest );
239
+ }
233
240
if (LOG .isDebugEnabled ()) {
234
241
LOG .debug (" HashFunction is " + best + " and node is " + node );
235
242
}
@@ -255,7 +262,7 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
255
262
LOG .debug (" Finding for " + n );
256
263
}
257
264
258
- getBlockingTree (tree , node , n , fieldsOfInterest );
265
+ getBlockingTree (tree , node , n , fieldsOfInterest , startIndexToIterate );
259
266
}
260
267
}
261
268
else {
0 commit comments