Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 4ff5d01

Browse files
committed Nov 7, 2024 ·
changes
1 parent 01f871b commit 4ff5d01

File tree

5 files changed

+51
-9
lines changed

5 files changed

+51
-9
lines changed
 

‎common/core/src/main/java/zingg/common/core/block/Block.java

+14-7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public abstract class Block<D,R,C,T> implements Serializable {
2121
private static final long serialVersionUID = 1L;
2222

2323
public static final Log LOG = LogFactory.getLog(Block.class);
24+
private final BlockUtility<R> blockUtility;
2425

2526
protected ZFrame<D,R,C> dupes;
2627
// Class[] types;
@@ -30,13 +31,14 @@ public abstract class Block<D,R,C,T> implements Serializable {
3031
protected ListMap<HashFunction<D,R,C,T>, String> childless;
3132

3233
public Block() {
33-
34-
}
34+
blockUtility = new BlockUtility<R>();
35+
}
3536

3637
public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes) {
3738
this.training = training;
3839
this.dupes = dupes;
3940
childless = new ListMap<HashFunction<D,R,C,T>, String>();
41+
blockUtility = new BlockUtility<R>();
4042
// types = getSampleTypes();
4143
/*
4244
* for (Class type : types) { LOG.info("Type is " + type); }
@@ -121,11 +123,13 @@ public void estimateElimCount(Canopy<R> c, long elimCount) {
121123
}
122124

123125
public Canopy<R>getBestNode(Tree<Canopy<R>> tree, Canopy<R>parent, Canopy<R>node,
124-
List<FieldDefinition> fieldsOfInterest) throws Exception {
126+
List<FieldDefinition> fieldsOfInterest, int startIndexToIterate) throws Exception {
125127
long least = Long.MAX_VALUE;
126128
int maxElimination = 0;
129+
int numberOfFields = fieldsOfInterest.size();
127130
Canopy<R>best = null;
128-
for (FieldDefinition field : fieldsOfInterest) {
131+
for (int fieldsToExplore = 0; fieldsToExplore < numberOfFields; startIndexToIterate = (startIndexToIterate + 1) % numberOfFields, fieldsToExplore++) {
132+
FieldDefinition field = fieldsOfInterest.get(startIndexToIterate);
129133
if (LOG.isDebugEnabled()){
130134
LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt "
131135
+ getFeatureFactory().getDataTypeFromString(field.getDataType()));
@@ -217,7 +221,7 @@ public void estimateElimCount(Canopy<R> c, long elimCount) {
217221
* @throws ZinggClientException
218222
*/
219223
public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
220-
Canopy<R>node, List<FieldDefinition> fieldsOfInterest) throws Exception, ZinggClientException {
224+
Canopy<R>node, List<FieldDefinition> fieldsOfInterest, int startIndexToIterate) throws Exception, ZinggClientException {
221225
if (LOG.isDebugEnabled()) {
222226
LOG.debug("Tree so far ");
223227
LOG.debug(tree);
@@ -228,8 +232,11 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
228232
}
229233
if (size > maxSize && node.getDupeN() != null && node.getDupeN().size() > 0) {
230234
LOG.debug("Size is bigger ");
231-
Canopy<R>best = getBestNode(tree, parent, node, fieldsOfInterest);
235+
Canopy<R>best = getBestNode(tree, parent, node, fieldsOfInterest, startIndexToIterate == -1 ? 0 : startIndexToIterate);
232236
if (best != null) {
237+
if (startIndexToIterate != -1) {
238+
startIndexToIterate = blockUtility.getStartIndexToIterateOverFields(best, fieldsOfInterest);
239+
}
233240
if (LOG.isDebugEnabled()) {
234241
LOG.debug(" HashFunction is " + best + " and node is " + node);
235242
}
@@ -255,7 +262,7 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
255262
LOG.debug(" Finding for " + n);
256263
}
257264

258-
getBlockingTree(tree, node, n, fieldsOfInterest);
265+
getBlockingTree(tree, node, n, fieldsOfInterest, startIndexToIterate);
259266
}
260267
}
261268
else {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package zingg.common.core.block;
2+
3+
import zingg.common.client.FieldDefinition;
4+
5+
import java.util.List;
6+
7+
public class BlockUtility<R> {
8+
public int getStartIndexToIterateOverFields(Canopy<R>best, List<FieldDefinition> fieldsOfInterest) {
9+
FieldDefinition bestFieldDefinition = best.getContext();
10+
for (int idx = 0; idx < fieldsOfInterest.size(); idx++) {
11+
if (fieldsOfInterest.get(idx).equals(bestFieldDefinition)) {
12+
return (idx + 1) % fieldsOfInterest.size();
13+
}
14+
}
15+
return -1;
16+
}
17+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package zingg.common.core.block;
2+
3+
/**
 * Strategy selector for the order in which fields are visited while the
 * blocking tree is built. {@code StartIndexProvider} converts a constant of
 * this enum into the initial start index.
 */
public enum FieldIteratorType {

    /** Mapped by {@code StartIndexProvider} to the start index {@code -1}. */
    DEFAULT,

    /** Mapped by {@code StartIndexProvider} to the start index {@code 0}. */
    NEXT_FIELD
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package zingg.common.core.block;
2+
3+
public class StartIndexProvider {
4+
public static int getStartIndex(FieldIteratorType fieldIteratorType) {
5+
return fieldIteratorType.equals(FieldIteratorType.DEFAULT) ? -1 : 0;
6+
}
7+
}

‎common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java

+7-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import zingg.common.client.util.Util;
1717
import zingg.common.core.block.Block;
1818
import zingg.common.core.block.Canopy;
19+
import zingg.common.core.block.FieldIteratorType;
20+
import zingg.common.core.block.StartIndexProvider;
1921
import zingg.common.core.block.Tree;
2022
import zingg.common.core.hash.HashFunction;
2123

@@ -67,9 +69,12 @@ public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
6769
fd.add(def);
6870
}
6971
}
70-
72+
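// Field-iteration strategy handed to getBlockingTree:
73+
// DEFAULT: every getBestNode call scans the fields starting from the first one
74+
// NEXT_FIELD: each child's getBestNode call starts from the field after the one
75+
// chosen as best for its parent node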
7176
Tree<Canopy<R>> blockingTree = cblock.getBlockingTree(null, null, root,
72-
fd);
77+
fd, StartIndexProvider.getStartIndex(FieldIteratorType.NEXT_FIELD));
7378
if (LOG.isDebugEnabled()) {
7479
LOG.debug("The blocking tree is ");
7580
blockingTree.print(2);

0 commit comments

Comments
 (0)
Please sign in to comment.