Skip to content

Commit 143fcbc

Browse files
committedOct 30, 2024·
ftd changes
1 parent 54ee88e commit 143fcbc

File tree

15 files changed

+187
-90
lines changed

15 files changed

+187
-90
lines changed
 

‎assembly/dependency-reduced-pom.xml

+26-6
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,32 @@
6565
</plugins>
6666
</build>
6767
<dependencies>
68+
<dependency>
69+
<groupId>org.mockito</groupId>
70+
<artifactId>mockito-inline</artifactId>
71+
<version>5.2.0</version>
72+
<scope>test</scope>
73+
</dependency>
74+
<dependency>
75+
<groupId>org.mockito</groupId>
76+
<artifactId>mockito-core</artifactId>
77+
<version>5.2.0</version>
78+
<scope>test</scope>
79+
<exclusions>
80+
<exclusion>
81+
<artifactId>byte-buddy</artifactId>
82+
<groupId>net.bytebuddy</groupId>
83+
</exclusion>
84+
<exclusion>
85+
<artifactId>byte-buddy-agent</artifactId>
86+
<groupId>net.bytebuddy</groupId>
87+
</exclusion>
88+
<exclusion>
89+
<artifactId>objenesis</artifactId>
90+
<groupId>org.objenesis</groupId>
91+
</exclusion>
92+
</exclusions>
93+
</dependency>
6894
<dependency>
6995
<groupId>org.junit.jupiter</groupId>
7096
<artifactId>junit-jupiter-engine</artifactId>
@@ -113,12 +139,6 @@
113139
</exclusion>
114140
</exclusions>
115141
</dependency>
116-
<dependency>
117-
<groupId>org.mockito</groupId>
118-
<artifactId>mockito-all</artifactId>
119-
<version>1.8.4</version>
120-
<scope>test</scope>
121-
</dependency>
122142
<dependency>
123143
<groupId>org.hamcrest</groupId>
124144
<artifactId>hamcrest-all</artifactId>

‎common/core/src/main/java/zingg/common/core/block/Block.java

+6-8
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,12 @@ public abstract class Block<D,R,C,T> implements Serializable {
3030
ZFrame<D,R,C> training;
3131
protected ListMap<HashFunction<D,R,C,T>, String> childless;
3232

33-
public Block(HashUtility hashUtility) {
34-
HashFunctionUtilityFactory<D, R, C, T> hashFunctionUtilityFactory = new HashFunctionUtilityFactory<D, R, C, T>();
35-
this.hashFunctionUtility = hashFunctionUtilityFactory.getHashFunctionUtility(hashUtility);
33+
public Block() {
34+
this.hashFunctionUtility = HashFunctionUtilityFactory.getHashFunctionUtility(HashUtility.CACHED);
3635
}
3736

38-
public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes, HashUtility hashUtility) {
39-
HashFunctionUtilityFactory<D, R, C, T> hashFunctionUtilityFactory = new HashFunctionUtilityFactory<D, R, C, T>();
40-
this.hashFunctionUtility = hashFunctionUtilityFactory.getHashFunctionUtility(hashUtility);
37+
public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes) {
38+
this.hashFunctionUtility = HashFunctionUtilityFactory.getHashFunctionUtility(HashUtility.CACHED);
4139
this.training = training;
4240
this.dupes = dupes;
4341
childless = new ListMap<HashFunction<D,R,C,T>, String>();
@@ -48,8 +46,8 @@ public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes, HashUtility hashUtilit
4846
}
4947

5048
public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes,
51-
ListMap<T, HashFunction<D, R, C, T>> functionsMap, long maxSize, HashUtility hashUtility) {
52-
this(training, dupes, hashUtility);
49+
ListMap<T, HashFunction<D, R, C, T>> functionsMap, long maxSize) {
50+
this(training, dupes);
5351
this.functionsMap = functionsMap;
5452
// functionsMap.prettyPrint();
5553
this.maxSize = maxSize;
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
package zingg.common.core.block;
22

3-
public class HashFunctionUtilityFactory<D, R, C, T> {
3+
public class HashFunctionUtilityFactory {
44

5-
public IHashFunctionUtility<D, R, C, T> getHashFunctionUtility(HashUtility hashUtility) {
5+
public static IHashFunctionUtility getHashFunctionUtility(HashUtility hashUtility) {
66

77
if (HashUtility.DEFAULT.equals(hashUtility)) {
8-
return new DefaultHashFunctionUtility<D, R, C, T>();
8+
return new DefaultHashFunctionUtility();
99
}
10-
return new CacheBasedHashFunctionUtility<D, R, C, T>();
10+
return new CacheBasedHashFunctionUtility();
1111
}
1212
}

‎common/core/src/main/java/zingg/common/core/executor/Trainer.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import zingg.common.client.util.ColName;
99
import zingg.common.client.util.ColValues;
1010
import zingg.common.core.block.Canopy;
11-
import zingg.common.core.block.HashUtility;
1211
import zingg.common.core.block.Tree;
1312
import zingg.common.core.model.Model;
1413
import zingg.common.core.util.Analytics;
@@ -43,7 +42,7 @@ public void execute() throws ZinggClientException {
4342
ZFrame<D,R,C> testData = getStopWords().preprocessForStopWords(testDataOriginal);
4443

4544
Tree<Canopy<R>> blockingTree = getBlockingTreeUtil().createBlockingTreeFromSample(testData, positives, 0.5,
46-
-1, args, getHashUtil().getHashFunctionList(), HashUtility.CACHED);
45+
-1, args, getHashUtil().getHashFunctionList());
4746
if (blockingTree == null || blockingTree.getSubTrees() == null) {
4847
LOG.warn("Seems like no indexing rules have been learnt");
4948
}

‎common/core/src/main/java/zingg/common/core/executor/TrainingDataFinder.java

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
package zingg.common.core.executor;
22

3-
import java.util.Arrays;
4-
53
import org.apache.commons.logging.Log;
64
import org.apache.commons.logging.LogFactory;
75

@@ -13,7 +11,6 @@
1311
import zingg.common.client.util.ColName;
1412
import zingg.common.client.util.ColValues;
1513
import zingg.common.core.block.Canopy;
16-
import zingg.common.core.block.HashUtility;
1714
import zingg.common.core.block.Tree;
1815
import zingg.common.core.model.Model;
1916
import zingg.common.core.preprocess.StopWordsRemover;
@@ -88,7 +85,7 @@ public void execute() throws ZinggClientException {
8885

8986
ZFrame<D,R,C> sample = getStopWords().preprocessForStopWords(sampleOrginal);
9087

91-
Tree<Canopy<R>> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList(), HashUtility.CACHED);
88+
Tree<Canopy<R>> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList());
9289
//tree.print(2);
9390
ZFrame<D,R,C> blocked = getBlockingTreeUtil().getBlockHashes(sample, tree);
9491

‎common/core/src/main/java/zingg/common/core/util/BlockingTreeUtil.java

+5-6
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import zingg.common.client.util.Util;
1717
import zingg.common.core.block.Block;
1818
import zingg.common.core.block.Canopy;
19-
import zingg.common.core.block.HashUtility;
2019
import zingg.common.core.block.Tree;
2120
import zingg.common.core.hash.HashFunction;
2221

@@ -37,13 +36,13 @@ public void setPipeUtil(PipeUtilBase<S, D, R, C> pipeUtil) {
3736

3837

3938
public abstract Block<D,R,C,T> getBlock(ZFrame<D,R,C> sample, ZFrame<D,R,C> positives,
40-
ListMap<T, HashFunction<D,R,C,T>>hashFunctions, long blockSize, HashUtility hashUtility);
39+
ListMap<T, HashFunction<D,R,C,T>>hashFunctions, long blockSize);
4140

4241

4342
public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
4443
ZFrame<D,R,C> positives, double sampleFraction, long blockSize,
4544
IArguments args,
46-
ListMap<T, HashFunction<D,R,C,T>> hashFunctions, HashUtility hashUtility) throws Exception, ZinggClientException {
45+
ListMap<T, HashFunction<D,R,C,T>> hashFunctions) throws Exception, ZinggClientException {
4746
ZFrame<D,R,C> sample = testData.sample(false, sampleFraction);
4847
sample = sample.cache();
4948
long totalCount = sample.count();
@@ -55,7 +54,7 @@ public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
5554
LOG.info("Learning indexing rules for block size " + blockSize);
5655

5756
positives = positives.coalesce(1);
58-
Block<D,R,C,T> cblock = getBlock(sample, positives, hashFunctions, blockSize, hashUtility);
57+
Block<D,R,C,T> cblock = getBlock(sample, positives, hashFunctions, blockSize);
5958
Canopy<R> root = new Canopy<R>(sample.collectAsList(), positives.collectAsList());
6059

6160
List<FieldDefinition> fd = new ArrayList<FieldDefinition> ();
@@ -79,9 +78,9 @@ public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
7978

8079
public Tree<Canopy<R>> createBlockingTreeFromSample(ZFrame<D,R,C> testData,
8180
ZFrame<D,R,C> positives, double sampleFraction, long blockSize, IArguments args,
82-
ListMap hashFunctions, HashUtility hashUtility) throws Exception, ZinggClientException {
81+
ListMap hashFunctions) throws Exception, ZinggClientException {
8382
ZFrame<D,R,C> sample = testData.sample(false, sampleFraction);
84-
return createBlockingTree(sample, positives, sampleFraction, blockSize, args, hashFunctions, hashUtility);
83+
return createBlockingTree(sample, positives, sampleFraction, blockSize, args, hashFunctions);
8584
}
8685

8786
public void writeBlockingTree(Tree<Canopy<R>> blockingTree, IArguments args) throws Exception, ZinggClientException {

‎common/core/src/test/java/zingg/common/core/block/TestBlockBase.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public void testTree() throws Throwable {
4343
IArguments args = getArguments();
4444

4545
Tree<Canopy<R>> blockingTree = blockingTreeUtil.createBlockingTreeFromSample(zFrameEvent, zFrameEventCluster, 0.5, -1,
46-
args, hashUtil.getHashFunctionList(), HashUtility.CACHED);
46+
args, hashUtil.getHashFunctionList());
4747

4848
// primary deciding is unique year so identityInteger should have been picked
4949
Canopy<R> head = blockingTree.getHead();

‎common/core/src/test/java/zingg/common/core/block/TestBlockingTreeUtil.java

+79-13
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,34 @@
22

33
import org.junit.jupiter.api.Assertions;
44
import org.junit.jupiter.api.Test;
5+
import org.mockito.MockedStatic;
6+
import org.mockito.Mockito;
57
import zingg.common.client.Arguments;
68
import zingg.common.client.ArgumentsUtil;
9+
import zingg.common.client.FieldDefinition;
710
import zingg.common.client.IArguments;
11+
import zingg.common.client.MatchType;
812
import zingg.common.client.ZFrame;
913
import zingg.common.client.ZinggClientException;
1014
import zingg.common.client.util.DFObjectUtil;
11-
import zingg.common.core.block.data.CsvReader;
15+
import zingg.common.client.util.ListMap;
1216
import zingg.common.core.block.data.DataUtility;
1317
import zingg.common.core.block.model.Customer;
1418
import zingg.common.core.block.model.CustomerDupe;
19+
import zingg.common.core.hash.HashFunction;
1520
import zingg.common.core.util.BlockingTreeUtil;
21+
import zingg.common.core.util.CsvReader;
1622
import zingg.common.core.util.HashUtil;
23+
import zingg.common.core.util.Heuristics;
1724

25+
import java.util.ArrayList;
1826
import java.util.Iterator;
1927
import java.util.List;
2028
import java.util.Objects;
2129

2230
import static java.lang.Math.max;
2331

32+
2433
public abstract class TestBlockingTreeUtil<S, D, R, C, T> {
2534

2635
protected String TEST_DATA_BASE_LOCATION;
@@ -61,33 +70,63 @@ public void testSameBlockingTreeWithVariance() throws Exception, ZinggClientExce
6170
testSameBlockingTree(zFrameTest, zFramePositives);
6271
}
6372

73+
6474
public void testSameBlockingTree(ZFrame<D, R, C> zFrameTest, ZFrame<D, R, C> zFramePositives) throws Exception, ZinggClientException {
6575
setTestDataBaseLocation();
66-
BlockingTreeUtil<S, D, R, C, T> blockingTreeUtil = getBlockingTreeUtil();
6776
HashUtil<S, D, R, C, T> hashUtil = getHashUtil();
6877

69-
7078
IArguments args = new ArgumentsUtil(Arguments.class).createArgumentsFromJSON(
7179
TEST_DATA_BASE_LOCATION + "/" + CONFIG_FILE,
7280
"");
7381
args.setBlockSize(8);
7482

75-
long ts = System.currentTimeMillis();
76-
Tree<Canopy<R>> blockingTreeOptimized = blockingTreeUtil.createBlockingTree(zFrameTest, zFramePositives, 1, -1,
77-
args, hashUtil.getHashFunctionList(), HashUtility.CACHED);
78-
System.out.println("************ time taken to create optimized blocking tree ************ " + (System.currentTimeMillis() - ts));
79-
80-
ts = System.currentTimeMillis();
81-
Tree<Canopy<R>> blockingTreeDefault = blockingTreeUtil.createBlockingTree(zFrameTest, zFramePositives, 1, -1,
82-
args, hashUtil.getHashFunctionList(), HashUtility.DEFAULT);
83-
System.out.println("************ time taken to create blocking tree ************ " + (System.currentTimeMillis() - ts));
83+
Tree<Canopy<R>> blockingTreeOptimized = getBlockingTree(zFrameTest, zFramePositives, hashUtil, args, "cached");
84+
Tree<Canopy<R>> blockingTreeDefault = getBlockingTree(zFrameTest, zFramePositives, hashUtil, args, "default");
8485

8586
int depth = 1;
8687
//assert both the trees are equal
8788
Assertions.assertTrue(dfsSameTreeValidation(blockingTreeDefault, blockingTreeOptimized, depth));
8889

8990
System.out.println("-------- max depth of trees -------- " + maxDepth);
90-
System.out.println("-------- total nodes in a trees -------- " + totalNodes);
91+
System.out.println("-------- total nodes in a trees ---- " + totalNodes);
92+
}
93+
94+
95+
private Tree<Canopy<R>> getBlockingTree(ZFrame<D, R, C> zFrameTest, ZFrame<D, R, C> zFramePositives, HashUtil<S, D, R, C, T> hashUtil,
96+
IArguments args, String blockingTreeType) throws Exception, ZinggClientException {
97+
long ts = System.currentTimeMillis();
98+
Block<D, R, C, T> block;
99+
if ("cached".equals(blockingTreeType)) {
100+
block = getCachedBasedBlock(zFrameTest, zFramePositives, hashUtil, args);
101+
} else {
102+
block = getDefaultBlock(zFrameTest, zFramePositives, hashUtil, args);
103+
}
104+
Canopy<R> root = getCanopy(zFrameTest, zFramePositives, 1);
105+
Tree<Canopy<R>> blockingTree = block.getBlockingTree(null, null, root, getFieldDefinitions(args));
106+
System.out.println("************ time taken to create " + blockingTreeType + " blocking tree ************, " + (System.currentTimeMillis() - ts));
107+
return blockingTree;
108+
}
109+
110+
//Override with new CacheBasedHashFunctionUtility<D, R, C, T>()
111+
private Block<D, R, C, T> getCachedBasedBlock(ZFrame<D, R, C> zFrameTest, ZFrame<D, R, C> zFramePositives,
112+
HashUtil<S, D, R, C, T> hashUtil, IArguments arguments) throws Exception {
113+
try (MockedStatic<HashFunctionUtilityFactory> hashFunctionUtilityFactoryMock = Mockito.mockStatic(HashFunctionUtilityFactory.class)) {
114+
hashFunctionUtilityFactoryMock.when(() -> HashFunctionUtilityFactory.getHashFunctionUtility(Mockito.any(HashUtility.class)))
115+
.thenReturn(new CacheBasedHashFunctionUtility<D, R, C, T>());
116+
return getBlock(zFrameTest, 1, zFramePositives, -1,
117+
hashUtil.getHashFunctionList(), arguments);
118+
}
119+
}
120+
121+
//Override with new DefaultHashFunctionUtility<>()
122+
private Block<D, R, C, T> getDefaultBlock(ZFrame<D, R, C> zFrameTest, ZFrame<D, R, C> zFramePositives,
123+
HashUtil<S, D, R, C, T> hashUtil, IArguments arguments) throws Exception {
124+
try (MockedStatic<HashFunctionUtilityFactory> hashFunctionUtilityFactoryMock = Mockito.mockStatic(HashFunctionUtilityFactory.class)) {
125+
hashFunctionUtilityFactoryMock.when(() -> HashFunctionUtilityFactory.getHashFunctionUtility(Mockito.any(HashUtility.class)))
126+
.thenReturn(new DefaultHashFunctionUtility<D, R, C, T>());
127+
return getBlock(zFrameTest, 1, zFramePositives, -1,
128+
hashUtil.getHashFunctionList(), arguments);
129+
}
91130
}
92131

93132

@@ -159,9 +198,36 @@ private boolean isNodeSubTreesSizeEqual(Tree<Canopy<R>> node1, Tree<Canopy<R>> n
159198
return node1.getSubTrees().size() == node2.getSubTrees().size();
160199
}
161200

201+
private Block<D, R, C, T> getBlock(ZFrame<D, R, C> testData, double sampleFraction, ZFrame<D,R,C> positives,
202+
long blockSize, ListMap<T, HashFunction<D,R,C,T>> hashFunctions, IArguments args) {
203+
ZFrame<D,R,C> sample = testData.sample(false, sampleFraction);
204+
long totalCount = sample.count();
205+
if (blockSize == -1) blockSize = Heuristics.getMaxBlockSize(totalCount, args.getBlockSize());
206+
positives = positives.coalesce(1);
207+
Block<D,R,C,T> cblock = getBlock(sample, positives, hashFunctions, blockSize);
208+
return cblock;
209+
}
210+
211+
private Canopy<R> getCanopy(ZFrame<D,R,C> testData, ZFrame<D,R,C> positives, double sampleFraction) {
212+
ZFrame<D,R,C> sample = testData.sample(false, sampleFraction);
213+
return new Canopy<R>(sample.collectAsList(), positives.collectAsList());
214+
}
215+
216+
private List<FieldDefinition> getFieldDefinitions(IArguments arguments) {
217+
List<FieldDefinition> fieldDefinitions = new ArrayList<FieldDefinition>();
218+
219+
for (FieldDefinition def : arguments.getFieldDefinition()) {
220+
if (! (def.getMatchType() == null || def.getMatchType().contains(MatchType.DONT_USE))) {
221+
fieldDefinitions.add(def);
222+
}
223+
}
224+
return fieldDefinitions;
225+
}
162226

163227
protected abstract DFObjectUtil<S, D, R, C> getDFObjectUtil();
164228
protected abstract BlockingTreeUtil<S, D, R, C, T> getBlockingTreeUtil();
165229
protected abstract HashUtil<S, D, R, C, T> getHashUtil();
166230
protected abstract void setTestDataBaseLocation();
231+
protected abstract Block<D, R, C, T> getBlock(ZFrame<D,R,C> sample, ZFrame<D,R,C> positives,
232+
ListMap<T, HashFunction<D,R,C,T>>hashFunctions, long blockSize);
167233
}

‎common/core/src/test/java/zingg/common/core/block/data/CsvReader.java

-30
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package zingg.common.core.block.data;
2+
3+
import zingg.common.core.util.IFromCsv;
4+
import java.io.FileNotFoundException;
5+
import java.util.List;
6+
7+
public interface ICsvReader extends DataReader {
8+
List<? extends IFromCsv> getRecords(String file, boolean skipHeader) throws FileNotFoundException;
9+
}

‎common/core/src/test/java/zingg/common/core/util/CsvReader.java

+28-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
package zingg.common.core.util;
22

3+
import com.opencsv.CSVReader;
4+
import com.opencsv.CSVReaderBuilder;
5+
import com.opencsv.exceptions.CsvException;
6+
import zingg.common.core.block.data.ICsvReader;
7+
38
import java.io.File;
49
import java.io.FileNotFoundException;
10+
import java.io.FileReader;
11+
import java.io.IOException;
512
import java.util.ArrayList;
613
import java.util.List;
714
import java.util.Scanner;
815

9-
public class CsvReader {
16+
public class CsvReader implements ICsvReader {
1017
protected List<? extends IFromCsv> records;
1118
IFromCsv creator;
1219

@@ -15,7 +22,18 @@ public CsvReader(IFromCsv creator){
1522
this.creator = creator;
1623
}
1724

18-
public List<? extends IFromCsv> getRecords(String file, boolean skipHeader) throws FileNotFoundException{
25+
//default constructor
26+
public CsvReader() {
27+
28+
}
29+
30+
public List<String[]> readDataFromSource(String source) throws IOException, CsvException {
31+
CSVReader csvReader = getCSVReader(source);
32+
List<String[]> allData = csvReader.readAll();
33+
return allData;
34+
}
35+
36+
public List<? extends IFromCsv> getRecords(String file, boolean skipHeader) throws FileNotFoundException {
1937
int lineno = 0;
2038
try (Scanner scanner = new Scanner(new File(file))) {
2139
while (scanner.hasNextLine()) {
@@ -25,4 +43,12 @@ public List<? extends IFromCsv> getRecords(String file, boolean skipHeader) thro
2543
return records;
2644
}
2745

46+
private CSVReader getCSVReader(String source) throws IOException {
47+
FileReader filereader = new FileReader(source);
48+
CSVReader csvReader = new CSVReaderBuilder(filereader)
49+
.withSkipLines(1)
50+
.build();
51+
return csvReader;
52+
}
53+
2854
}

‎pom.xml

+12-6
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,18 @@
9393
</repositories>
9494

9595
<dependencies>
96+
<dependency>
97+
<groupId>org.mockito</groupId>
98+
<artifactId>mockito-inline</artifactId>
99+
<version>5.2.0</version>
100+
<scope>test</scope>
101+
</dependency>
102+
<dependency>
103+
<groupId>org.mockito</groupId>
104+
<artifactId>mockito-core</artifactId>
105+
<version>5.2.0</version>
106+
<scope>test</scope>
107+
</dependency>
96108
<dependency>
97109
<groupId>com.opencsv</groupId>
98110
<artifactId>opencsv</artifactId>
@@ -116,12 +128,6 @@
116128
<version>5.8.1</version>
117129
<scope>test</scope>
118130
</dependency>
119-
<dependency>
120-
<groupId>org.mockito</groupId>
121-
<artifactId>mockito-all</artifactId>
122-
<version>1.8.4</version>
123-
<scope>test</scope>
124-
</dependency>
125131
<dependency>
126132
<groupId>org.hamcrest</groupId>
127133
<artifactId>hamcrest-all</artifactId>

‎spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ public class SparkBlock extends Block<Dataset<Row>, Row, Column, DataType> {
1818
private static final long serialVersionUID = 1L;
1919

2020

21-
public SparkBlock(HashUtility hashUtility){
22-
super(hashUtility);
21+
public SparkBlock(){
22+
super();
2323
}
2424

2525

2626
public SparkBlock(ZFrame<Dataset<Row>, Row, Column> training, ZFrame<Dataset<Row>, Row, Column> dupes,
27-
ListMap<DataType, HashFunction<Dataset<Row>, Row, Column, DataType>> functionsMap, long maxSize, HashUtility hashUtility) {
28-
super(training, dupes, functionsMap, maxSize, hashUtility);
27+
ListMap<DataType, HashFunction<Dataset<Row>, Row, Column, DataType>> functionsMap, long maxSize) {
28+
super(training, dupes, functionsMap, maxSize);
2929
}
3030

3131
@Override

‎spark/core/src/main/java/zingg/spark/core/util/SparkBlockingTreeUtil.java

+2-4
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import org.apache.spark.sql.Encoders;
1212
import org.apache.spark.sql.Row;
1313
import org.apache.spark.sql.RowFactory;
14-
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
1514
import org.apache.spark.sql.types.DataType;
1615
import org.apache.spark.sql.types.DataTypes;
1716
import org.apache.spark.sql.types.StructField;
@@ -23,7 +22,6 @@
2322
import zingg.common.client.util.PipeUtilBase;
2423
import zingg.common.core.block.Block;
2524
import zingg.common.core.block.Canopy;
26-
import zingg.common.core.block.HashUtility;
2725
import zingg.common.core.block.Tree;
2826
import zingg.common.core.hash.HashFunction;
2927
import zingg.common.core.util.BlockingTreeUtil;
@@ -88,8 +86,8 @@ public Tree<Canopy<Row>> readBlockingTree(Arguments args) throws Exception, Zing
8886
public Block<Dataset<Row>, Row, Column, DataType> getBlock(ZFrame<Dataset<Row>, Row, Column> sample,
8987
ZFrame<Dataset<Row>, Row, Column> positives,
9088
ListMap<DataType, HashFunction<Dataset<Row>, Row, Column, DataType>> hashFunctions,
91-
long blockSize, HashUtility hashUtility) {
89+
long blockSize) {
9290
// TODO Auto-generated method stub
93-
return new SparkBlock(sample, positives, hashFunctions, blockSize, hashUtility);
91+
return new SparkBlock(sample, positives, hashFunctions, blockSize);
9492
}
9593
}

‎spark/core/src/test/java/zingg/common/core/block/TestSparkBlockingTreeUtil.java

+9
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@
77
import org.apache.spark.sql.types.DataType;
88
import org.junit.jupiter.api.extension.ExtendWith;
99
import zingg.TestSparkBase;
10+
import zingg.common.client.ZFrame;
1011
import zingg.common.client.util.DFObjectUtil;
1112
import zingg.common.client.util.IWithSession;
13+
import zingg.common.client.util.ListMap;
1214
import zingg.common.client.util.WithSession;
15+
import zingg.common.core.hash.HashFunction;
1316
import zingg.common.core.util.BlockingTreeUtil;
1417
import zingg.common.core.util.HashUtil;
1518
import zingg.spark.client.util.SparkDFObjectUtil;
1619
import zingg.spark.client.util.SparkPipeUtil;
20+
import zingg.spark.core.block.SparkBlock;
1721
import zingg.spark.core.util.SparkBlockingTreeUtil;
1822
import zingg.spark.core.util.SparkHashUtil;
1923

@@ -46,4 +50,9 @@ protected HashUtil<SparkSession, Dataset<Row>, Row, Column, DataType> getHashUti
4650
protected void setTestDataBaseLocation() {
4751
TEST_DATA_BASE_LOCATION = "/home/administrator/zingg/zinggOSS/examples/febrl";
4852
}
53+
54+
@Override
55+
protected Block<Dataset<Row>, Row, Column, DataType> getBlock(ZFrame<Dataset<Row>, Row, Column> sample, ZFrame<Dataset<Row>, Row, Column> positives, ListMap<DataType, HashFunction<Dataset<Row>, Row, Column, DataType>> hashFunctions, long blockSize) {
56+
return new SparkBlock(sample, positives, hashFunctions, blockSize);
57+
}
4958
}

0 commit comments

Comments
 (0)
Please sign in to comment.