Skip to content

Commit 00efe14

Browse files
authored
Merge pull request zinggAI#925 from sania-16/sania
Sim and Hash functions for Boolean Datatype
2 parents d01a005 + 6008c7d commit 00efe14

File tree

5 files changed

+49
-4
lines changed

5 files changed

+49
-4
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package zingg.common.core.feature;
2+
3+
import zingg.common.client.FieldDefinition;
4+
import zingg.common.client.MatchType;
5+
import zingg.common.core.similarity.function.CheckNullFunction;
6+
import zingg.common.core.similarity.function.SimilarityFunctionExact;
7+
8+
/**
 * Feature definition for fields whose values are {@link Boolean}.
 *
 * <p>Extends {@code BaseFeature<Boolean>} and, in {@link #init(FieldDefinition)},
 * registers similarity functions according to the match types declared on the
 * field definition.
 */
public class BooleanFeature extends BaseFeature<Boolean> {
9+
10+
private static final long serialVersionUID = 1L;
11+
12+
/** Creates an empty feature; configuration happens in {@link #init(FieldDefinition)}. */
public BooleanFeature() {
13+
}
14+
15+
/**
 * Stores the field definition and adds one similarity function for each
 * match type handled here that the field declares.
 *
 * <ul>
 *   <li>{@code MatchType.EXACT}: adds a {@code SimilarityFunctionExact<Boolean>}
 *       constructed with the string "BooleanSimilarityFunctionExact".</li>
 *   <li>{@code MatchType.NULL_OR_BLANK}: adds a {@code CheckNullFunction<Boolean>}
 *       constructed with the string "CheckNullFunctionBoolean".</li>
 * </ul>
 *
 * <p>The two checks are independent, so a field declaring both match types
 * gets both functions. No other match type is checked in this method.
 *
 * @param f the field definition to store and read match types from
 */
public void init(FieldDefinition f){
16+
setFieldDefinition(f);
17+
if (f.getMatchType().contains(MatchType.EXACT)) {
18+
addSimFunction(new SimilarityFunctionExact<Boolean>("BooleanSimilarityFunctionExact"));
19+
}
20+
if (f.getMatchType().contains(MatchType.NULL_OR_BLANK)) {
21+
addSimFunction(new CheckNullFunction<Boolean>("CheckNullFunctionBoolean"));
22+
}
23+
}
24+
25+
}

common/core/src/main/java/zingg/common/core/hash/HashFunctionRegistry.java

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ public interface HashFunctionRegistry<D,R,C,T> {
66
/*
77
init(new IdentityString());
88
init(new IdentityInteger());
9+
init(new IdentityBoolean());
910
init(new First1Chars());
1011
init(new First2Chars());
1112
init(new First3Chars());
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package zingg.common.core.hash;
2+
3+
/**
 * Identity hash function for {@link Boolean} values: the output of
 * {@link #call(Boolean)} is the input itself.
 *
 * <p>Extends {@code BaseHash<Boolean, Boolean>}, i.e. both the input and the
 * output type are {@code Boolean}.
 */
public class IdentityBoolean extends BaseHash<Boolean, Boolean>{
4+
5+
private static final long serialVersionUID = 1L;
6+
7+
/** Creates the hash function and sets its name to "identityBoolean". */
public IdentityBoolean() {
8+
setName("identityBoolean");
9+
}
10+
11+
/**
 * Returns the given value unchanged.
 *
 * @param field the Boolean value to hash; a {@code null} argument is returned as {@code null}
 * @return the same reference that was passed in
 */
public Boolean call(Boolean field) {
12+
return field;
13+
}
14+
15+
}

docs/stepbystep/configuration/field-definitions.md

+6-4
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,17 @@ The way to match the given field. Multiple match types, separated by commas, can
3232

3333
| Match Type | Description | Applicable To |
3434
| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- |
35-
| FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, long, double, date |
36-
| EXACT | No tolerance with variations, Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string, integer, long, date |
35+
| FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, long, double, date |
36+
| EXACT | No tolerance for variations. Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string, integer, long, date, boolean |
3737
| DONT\_USE | Appears in the output but no computation is done on these. Helpful for fields like ids that are required in the output. DONT\_USE fields are not shown to the user while labeling, if [showConcise](field-definitions.md#showconcise) is set to true. | any |
3838
| EMAIL | Matches only the id part of the email before the @ character | any |
3939
| PINCODE | Matches pin codes like xxxxx-xxxx with xxxxx | string |
40-
| NULL\_OR\_BLANK | By default Zingg treats nulls as matches, but if we add this to a field which has other match type like FUZZY, Zingg will build a feature for null values and learn | string, integer, long, date |
40+
| NULL\_OR\_BLANK | By default Zingg treats nulls as matches, but if we add this to a field which has other match type like FUZZY, Zingg will build a feature for null values and learn | string, integer, long, date, boolean |
4141
| TEXT | Compares word overlap between two strings. Good for descriptive fields without many typos | string |
4242
| NUMERIC | extracts numbers from strings and compares how many of them are same across both strings, for example apartment numbers. | string |
4343
| NUMERIC\_WITH\_UNITS | extracts product codes or numbers with units, for example 16gb from strings and compares how many are same across both strings | string |
4444
| ONLY\_ALPHABETS\_EXACT | only looks at the alphabetical characters and compares if they are exactly the same. when the numbers inside strings do not matter, for example if you are looking at buildings but want to ignore flat numbers | string |
45-
| ONLY\_ALPHABETS\_FUZZY | ignores any numbers in the strings and then does a fuzzy comparison, useful for fields like addresses with typos where you want to look at street number separately using NUMERIC | string |
45+
| ONLY\_ALPHABETS\_FUZZY | ignores any numbers in the strings and then does a fuzzy comparison, useful for fields like addresses with typos where you want to look at street number separately using NUMERIC | string |
47+
4648
####

spark/core/src/main/java/zingg/spark/core/feature/SparkFeatureFactory.java

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import org.apache.spark.sql.types.DataTypes;
77

88
import zingg.common.core.feature.ArrayDoubleFeature;
9+
import zingg.common.core.feature.BooleanFeature;
910
import zingg.common.core.feature.DateFeature;
1011
import zingg.common.core.feature.DoubleFeature;
1112
import zingg.common.core.feature.FeatureFactory;
@@ -28,6 +29,7 @@ public void init() {
2829
map.put(DataTypes.FloatType, FloatFeature.class);
2930
map.put(DataTypes.LongType, LongFeature.class);
3031
map.put(DataTypes.createArrayType(DataTypes.DoubleType), ArrayDoubleFeature.class);
32+
map.put(DataTypes.BooleanType,BooleanFeature.class);
3133
}
3234

3335
@Override

0 commit comments

Comments
 (0)