-
Notifications
You must be signed in to change notification settings - Fork 125
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* match type changes * refactoring * code refactoring * working changes * working tests * fixing junits * fixing junits * refactoring * refactoring changes * test changes * refactoring * working changes * working changes * code clean * code cleanup * refactoring code * changes for preprocess * working changes * preprocessor changes * fixing junits * refactoring junits * fixing junits * working changes * refactoring * updating stopword docs * code cleanup * stopwords junit * junits * working junits * added register udf as part of the process * cleaning code * documentation changes * added init so that preprocs can be prepared before invocation * latest changes * preproc order * report generated * changed preproc in pos samples to aply to individual rows instead of pairs * code cleanup * code cleanup * added cache (#1041) * Case normalize (#1027) * added Case normalizer preprocessor * removed toLowerCase() in sim call() * fixed junits * added junits for case normalizer * added for spark driver memory in spark session builder * added log * added log * logged memory in GB * abstracted out stopWord files names * added logging * added exception * changes (#1040) * changes * added select to make original order * Mapping final (#1049) * added cache * changes * added select to make original order * added cache * Case normalize (#1027) * added Case normalizer preprocessor * removed toLowerCase() in sim call() * fixed junits * added junits for case normalizer * added for spark driver memory in spark session builder * added log * added log * logged memory in GB * abstracted out stopWord files names * added logging * added exception * mapping fix for labeller * added cache * changes * added select to make original order * added cache * Case normalize (#1027) * added Case normalizer preprocessor * removed toLowerCase() in sim call() * fixed junits * added junits for case normalizer * added for spark driver memory in spark session builder * added log * added log * logged memory 
in GB * abstracted out stopWord files names * added logging * added exception * made join on both id and cluster * test not needed as we are already case normalizing at start * pulled changes from main * removed TestDSUtil.java * added spark driver memory from env variable * added LabellerUtil * added default method for executing preprocessor * removed redundant constructor * added no args constructor * fixing issue #1065 #1066 * fixing blocker validator * simplified assertion * changed dir name * added support for withColumns in ZFrame * updated to protected variable * updated dir name * changed method for caseNormalizer * simplified assertions * simplified assertions * removed casting * removed casting --------- Co-authored-by: sania-16 <[email protected]> Co-authored-by: Sania Goyal <[email protected]> Co-authored-by: Sonal Goyal <[email protected]> Co-authored-by: nitish <[email protected]>
- Loading branch information
1 parent
c6cb1c8
commit fd596bc
Showing
117 changed files
with
1,567 additions
and
682 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
8 changes: 8 additions & 0 deletions
8
common/client/src/main/java/zingg/common/client/IMatchType.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package zingg.common.client; | ||
|
||
public interface IMatchType extends Named { | ||
|
||
public String toString(); | ||
|
||
} | ||
|
106 changes: 44 additions & 62 deletions
106
common/client/src/main/java/zingg/common/client/MatchType.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,86 +1,68 @@ | ||
package zingg.common.client; | ||
|
||
import java.io.Serializable; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
import com.fasterxml.jackson.annotation.JsonCreator; | ||
import com.fasterxml.jackson.annotation.JsonValue; | ||
|
||
/** | ||
* Field types used in defining the types of fields for matching. See the field | ||
* definitions and the user guide for more details | ||
*/ | ||
|
||
public enum MatchType implements Serializable { | ||
/** | ||
* Short words like first names and organizations with focus on first | ||
* characters matching | ||
*/ | ||
FUZZY("FUZZY"), | ||
|
||
/** | ||
* Fields needing exact matches | ||
*/ | ||
EXACT("EXACT"), | ||
public class MatchType implements IMatchType, Serializable{ | ||
|
||
|
||
/** | ||
* Many times pin code is xxxxx-xxxx and has to be matched with xxxxx. | ||
*/ | ||
PINCODE("PINCODE"), | ||
private static final long serialVersionUID = 1L; | ||
protected String name; | ||
|
||
/** | ||
* an email type which is supposed to look at only the first part of the email and ignore the domain. | ||
*/ | ||
EMAIL("EMAIL"), | ||
|
||
/** | ||
* Long descriptive text, usually more than a couple of words for example | ||
* product descriptions | ||
*/ | ||
TEXT("TEXT"), | ||
public MatchType(){ | ||
|
||
} | ||
|
||
/** | ||
* Strings containing numbers which need to be same. Example in addresses, | ||
* we dont want 4th street to match 5th street | ||
* Matching numbers with deviations | ||
*/ | ||
NUMERIC("NUMERIC"), | ||
/*eg P301d, P00231*/ | ||
NUMERIC_WITH_UNITS("NUMBER_WITH_UNITS"), | ||
NULL_OR_BLANK("NULL_OR_BLANK"), | ||
ONLY_ALPHABETS_EXACT("ONLY_ALPHABETS_EXACT"), | ||
ONLY_ALPHABETS_FUZZY("ONLY_ALPHABETS_FUZZY"), | ||
DONT_USE("DONT_USE"); | ||
// Creates a match type named n and registers it by passing this instance to MatchTypes.put.
// NOTE(review): `this` reaches the registry before the constructor has returned, and
// MatchTypes.put calls getName() on it; confirm no subclass overrides getName() in a
// way that relies on subclass state that is not yet initialised at this point.
public MatchType(String n){ | ||
this.name = n; | ||
MatchTypes.put(this); | ||
} | ||
|
||
private String value; | ||
private static Map<String, MatchType> types; | ||
@Override | ||
public String getName() { | ||
return this.name; | ||
} | ||
|
||
MatchType(String type) { | ||
this.value = type; | ||
@Override | ||
public void setName(String name) { | ||
this.name = name; | ||
} | ||
|
||
private static void init() { | ||
types = new HashMap<String, MatchType>(); | ||
for (MatchType f : MatchType.values()) { | ||
types.put(f.value, f); | ||
} | ||
|
||
// Derives the hash from the name alone (0 when the name is null), using
// String.hashCode(), which is case-sensitive.
// NOTE(review): equals() in this class compares names with equalsIgnoreCase, so two
// instances whose names differ only in case are equal yet can produce different
// hashes here; hashing a case-folded name would keep the two methods consistent
// for use in HashMap/HashSet.
@Override | ||
public int hashCode() { | ||
final int prime = 31; | ||
int result = 1; | ||
result = prime * result + ((name == null) ? 0 : name.hashCode()); | ||
return result; | ||
} | ||
|
||
@JsonCreator | ||
public static MatchType getMatchType(String t) throws ZinggClientException{ | ||
if (types == null) { | ||
init(); | ||
@Override | ||
public boolean equals(Object obj) { | ||
if (this == obj) | ||
return true; | ||
if (obj == null) | ||
return false; | ||
if (getClass() != obj.getClass()) | ||
return false; | ||
MatchType other = (MatchType) obj; | ||
if (name == null) { | ||
if (other.name != null){ | ||
return false; | ||
} | ||
} | ||
else if (!name.equalsIgnoreCase(other.name)){ | ||
return false; | ||
} | ||
MatchType type = types.get(t.trim().toUpperCase()); | ||
if (type == null) throw new ZinggClientException("Unsupported Match Type: " + t); | ||
return type; | ||
return true; | ||
} | ||
|
||
@JsonValue | ||
public String value() { | ||
return value; | ||
@Override | ||
public String toString() { | ||
return name; | ||
} | ||
|
||
} |
54 changes: 54 additions & 0 deletions
54
common/client/src/main/java/zingg/common/client/MatchTypes.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
package zingg.common.client; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
public class MatchTypes { | ||
|
||
public final static IMatchType FUZZY = new MatchType("FUZZY"); | ||
public final static IMatchType EXACT = new MatchType("EXACT"); | ||
public final static IMatchType PINCODE = new MatchType("PINCODE"); | ||
public final static IMatchType EMAIL = new MatchType("EMAIL"); | ||
public final static IMatchType TEXT = new MatchType("TEXT"); | ||
public final static IMatchType NUMERIC = new MatchType("NUMERIC"); | ||
public final static IMatchType NUMERIC_WITH_UNITS = new MatchType("NUMERIC_WITH_UNITS"); | ||
public final static IMatchType NULL_OR_BLANK = new MatchType("NULL_OR_BLANK"); | ||
public final static IMatchType ONLY_ALPHABETS_EXACT = new MatchType("ONLY_ALPHABETS_EXACT"); | ||
public final static IMatchType ONLY_ALPHABETS_FUZZY = new MatchType("ONLY_ALPHABETS_FUZZY"); | ||
public final static IMatchType DONT_USE = new MatchType("DONT_USE"); | ||
|
||
public static Map<String, IMatchType> allMatchTypes;// = new HashMap<String, IMatchType>(); | ||
|
||
protected MatchTypes(){ | ||
|
||
} | ||
|
||
public static final void put(IMatchType o) { | ||
|
||
if (allMatchTypes == null) { | ||
allMatchTypes = new HashMap<String, IMatchType>(); | ||
} | ||
|
||
allMatchTypes.put(o.getName().toUpperCase(), o); | ||
} | ||
|
||
public static String[] getAllMatchTypes() { | ||
IMatchType[] zo = allMatchTypes.values().toArray(new IMatchType[allMatchTypes.size()]); | ||
int i = 0; | ||
String[] s = new String[zo.length]; | ||
for (IMatchType z: zo) { | ||
s[i++] = z.getName(); | ||
} | ||
return s; | ||
} | ||
|
||
public static IMatchType getByName(String name) throws Exception{ | ||
for (IMatchType zo: MatchTypes.allMatchTypes.values()) { | ||
if (zo.getName().equalsIgnoreCase(name)) { | ||
return zo; | ||
} | ||
} | ||
return null; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.