Skip to content

Commit

Permalink
Mapping main synced 24 feb (#1061)
Browse files Browse the repository at this point in the history
* match type changes

* refactoring

* code refactoring

* working changes

* working tests

* fixing junits

* fixing junits

* refactoring

* refactoring changes

* test changes

* refactoring

* working changes

* working changes

* code clean

* code cleanup

* refactoring code

* changes for preprocess

* working changes

* preprocessor changes

* fixing junits

* refactoring junits

* fixing junits

* working changes

* refactoring

* updating stopword docs

* code cleanup

* stopwords junit

* junits

* working junits

* added register udf as part of the process

* cleaning code

* documentation changes

* added init so that preprocs can be prepared before invocation

* latest changes

* preproc order

* report generated

* changed preproc in pos samples to apply to individual rows instead of pairs

* code cleanup

* code cleanup

* added cache (#1041)

* Case normalize (#1027)

* added Case normalizer preprocessor

* removed toLowerCase() in sim call()

* fixed junits

* added junits for case normalizer

* added for spark driver memory in spark session builder

* added log

* added log

* logged memory in GB

* abstracted out stopWord files names

* added logging

* added exception

* changes (#1040)

* changes

* added select to make original order

* Mapping final (#1049)

* added cache

* changes

* added select to make original order

* added cache

* Case normalize (#1027)

* added Case normalizer preprocessor

* removed toLowerCase() in sim call()

* fixed junits

* added junits for case normalizer

* added for spark driver memory in spark session builder

* added log

* added log

* logged memory in GB

* abstracted out stopWord files names

* added logging

* added exception

* mapping fix for labeller

* added cache

* changes

* added select to make original order

* added cache

* Case normalize (#1027)

* added Case normalizer preprocessor

* removed toLowerCase() in sim call()

* fixed junits

* added junits for case normalizer

* added for spark driver memory in spark session builder

* added log

* added log

* logged memory in GB

* abstracted out stopWord files names

* added logging

* added exception

* made join on both id and cluster

* test not needed as we are already case normalizing at start

* pulled changes from main

* removed TestDSUtil.java

* added spark driver memory from env variable

* added LabellerUtil

* added default method for executing preprocessor

* removed redundant constructor

* added no args constructor

* fixing issue #1065 #1066

* fixing blocker validator

* simplified assertion

* changed dir name

* added support for withColumns in ZFrame

* updated to protected variable

* updated dir name

* changed method for caseNormalizer

* simplified assertions

* simplified assertions

* removed casting

* removed casting

---------

Co-authored-by: sania-16 <[email protected]>
Co-authored-by: Sania Goyal <[email protected]>
Co-authored-by: Sonal Goyal <[email protected]>
Co-authored-by: nitish <[email protected]>
  • Loading branch information
5 people authored Mar 5, 2025
1 parent c6cb1c8 commit fd596bc
Show file tree
Hide file tree
Showing 117 changed files with 1,567 additions and 682 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ public void setLabelDataSampleSize(float labelDataSampleSize) throws ZinggClient
*/
@Override
public List<? extends FieldDefinition> getFieldDefinition() {
return fieldDefinition;
return this.fieldDefinition;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@ public class FieldDefUtil implements Serializable{

public List<? extends FieldDefinition> getFieldDefinitionDontUse(List<? extends FieldDefinition> fieldDefinition) {
return fieldDefinition.stream()
.filter(x->x.matchType.contains(MatchType.DONT_USE))
.filter(x->x.matchType.contains(MatchTypes.DONT_USE))
.collect(Collectors.toList());
}

public List<? extends FieldDefinition> getFieldDefinitionToUse(List<? extends FieldDefinition> fieldDefinition) {
return fieldDefinition.stream()
.filter(x->!x.matchType.contains(MatchType.DONT_USE))
.filter(x->!x.matchType.contains(MatchTypes.DONT_USE))
.collect(Collectors.toList());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,15 @@
* @author sgoyal
*
*/
public class FieldDefinition implements Named,
Serializable {
public class FieldDefinition implements Named, Serializable {

private static final long serialVersionUID = 1L;

public static final Log LOG = LogFactory.getLog(FieldDefinition.class);

@JsonDeserialize(using = MatchTypeDeserializer.class)
@JsonSerialize(using = MatchTypeSerializer.class)
public List<MatchType> matchType;
public List<? extends IMatchType> matchType;

//@JsonSerialize(using = DataTypeSerializer.class)
public String dataType;
Expand All @@ -52,17 +51,21 @@ public class FieldDefinition implements Named,
public FieldDefinition() {
}

public String getFields() { return fields; }
public String getFields() {
return fields;
}

public void setFields(String fields) { this.fields = fields;}
public void setFields(String fields) {
this.fields = fields;
}

/**
* Get the field type of the class
*
* @return the type
*/
public List<MatchType> getMatchType() {
return matchType;
public List<? extends IMatchType> getMatchType() {
return this.matchType;
}

/**
Expand All @@ -73,12 +76,12 @@ public List<MatchType> getMatchType() {
* the type to set
*/
@JsonDeserialize(using = MatchTypeDeserializer.class)
public void setMatchType(List<MatchType> type) {
public void setMatchType(List<? extends IMatchType> type) {
this.matchType = type; //MatchTypeDeserializer.getMatchTypeFromString(type);
}


public void setMatchTypeInternal(MatchType... type) {
public void setMatchTypeInternal(IMatchType... type) {
this.matchType = Arrays.asList(type);
}

Expand Down Expand Up @@ -113,7 +116,7 @@ public void setAbbreviations(String abbreviations) {
}

public String getFieldName() {
return fieldName;
return this.fieldName;
}

public void setFieldName(String fieldName) {
Expand All @@ -122,7 +125,7 @@ public void setFieldName(String fieldName) {

@JsonIgnore
public boolean isDontUse() {
return (matchType != null && matchType.contains(MatchType.DONT_USE));
return (matchType != null && matchType.contains(MatchTypes.DONT_USE));
}

@Override
Expand Down Expand Up @@ -185,17 +188,17 @@ public void serialize(DataType dType, JsonGenerator jsonGenerator,
}
}*/

public static class MatchTypeSerializer extends StdSerializer<List<MatchType>> {
public static class MatchTypeSerializer extends StdSerializer<List<IMatchType>> {
public MatchTypeSerializer() {
this(null);
}

public MatchTypeSerializer(Class<List<MatchType>> t) {
public MatchTypeSerializer(Class<List<IMatchType>> t) {
super(t);
}

@Override
public void serialize(List<MatchType> matchType, JsonGenerator jsonGen, SerializerProvider provider)
public void serialize(List<IMatchType> matchType, JsonGenerator jsonGen, SerializerProvider provider)
throws IOException, JsonProcessingException {
try {
jsonGen.writeObject(getStringFromMatchType(matchType));
Expand All @@ -205,14 +208,14 @@ public void serialize(List<MatchType> matchType, JsonGenerator jsonGen, Serializ
}
}

public static String getStringFromMatchType(List<MatchType> matchType) throws ZinggClientException {
public static String getStringFromMatchType(List<IMatchType> matchType) throws ZinggClientException {
return String.join(",", matchType.stream()
.map(p -> p.value())
.map(p -> p.getName())
.collect(Collectors.toList()));
}
}

public static class MatchTypeDeserializer extends StdDeserializer<List<MatchType>> {
public static class MatchTypeDeserializer extends StdDeserializer<List<IMatchType>> {
private static final long serialVersionUID = 1L;

public MatchTypeDeserializer() {
Expand All @@ -222,24 +225,24 @@ public MatchTypeDeserializer(Class<String> t) {
super(t);
}
@Override
public List<MatchType> deserialize(JsonParser parser, DeserializationContext context)
public List<IMatchType> deserialize(JsonParser parser, DeserializationContext context)
throws IOException, JsonProcessingException {
ObjectMapper mapper = new ObjectMapper();
try{
mapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
LOG.debug("Deserializing custom type");
return getMatchTypeFromString(mapper.readValue(parser, String.class));
}
catch(ZinggClientException e) {
catch(Exception | ZinggClientException e) {
throw new IOException(e);
}
}

public static List<MatchType> getMatchTypeFromString(String m) throws ZinggClientException{
List<MatchType> matchTypes = new ArrayList<MatchType>();
public static List<IMatchType> getMatchTypeFromString(String m) throws ZinggClientException, Exception{
List<IMatchType> matchTypes = new ArrayList<IMatchType>();
String[] matchTypeFromConfig = m.split(",");
for (String s: matchTypeFromConfig) {
MatchType mt = MatchType.getMatchType(s);
IMatchType mt = MatchTypes.getByName(s);
matchTypes.add(mt);
}
return matchTypes;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package zingg.common.client;

/**
 * Contract for a match type that can be attached to a field definition.
 *
 * <p>The identity-related operations come from {@link Named}, which is
 * declared elsewhere in the project. This interface adds nothing beyond
 * restating {@link Object#toString()} as part of the contract.
 */
public interface IMatchType extends Named {

    /**
     * Returns the textual form of this match type.
     *
     * @return the string representation of this match type
     */
    String toString();

}

106 changes: 44 additions & 62 deletions common/client/src/main/java/zingg/common/client/MatchType.java
Original file line number Diff line number Diff line change
@@ -1,86 +1,68 @@
package zingg.common.client;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;

/**
* Field types used in defining the types of fields for matching. See the field
* definitions and the user guide for more details
*/

public enum MatchType implements Serializable {
/**
* Short words like first names and organizations with focus on first
* characters matching
*/
FUZZY("FUZZY"),

/**
* Fields needing exact matches
*/
EXACT("EXACT"),
public class MatchType implements IMatchType, Serializable{


/**
* Many times pin code is xxxxx-xxxx and has to be matched with xxxxx.
*/
PINCODE("PINCODE"),
private static final long serialVersionUID = 1L;
protected String name;

/**
* an email type which is supposed to look at only the first part of the email and ignore the domain.
*/
EMAIL("EMAIL"),

/**
* Long descriptive text, usually more than a couple of words for example
* product descriptions
*/
TEXT("TEXT"),
public MatchType(){

}

/**
* Strings containing numbers which need to be same. Example in addresses,
* we dont want 4th street to match 5th street
* Matching numbers with deviations
*/
NUMERIC("NUMERIC"),
/*eg P301d, P00231*/
NUMERIC_WITH_UNITS("NUMBER_WITH_UNITS"),
NULL_OR_BLANK("NULL_OR_BLANK"),
ONLY_ALPHABETS_EXACT("ONLY_ALPHABETS_EXACT"),
ONLY_ALPHABETS_FUZZY("ONLY_ALPHABETS_FUZZY"),
DONT_USE("DONT_USE");
public MatchType(String n){
this.name = n;
MatchTypes.put(this);
}

private String value;
private static Map<String, MatchType> types;
@Override
public String getName() {
return this.name;
}

MatchType(String type) {
this.value = type;
@Override
public void setName(String name) {
this.name = name;
}

private static void init() {
types = new HashMap<String, MatchType>();
for (MatchType f : MatchType.values()) {
types.put(f.value, f);
}

@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((name == null) ? 0 : name.hashCode());
return result;
}

@JsonCreator
public static MatchType getMatchType(String t) throws ZinggClientException{
if (types == null) {
init();
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
MatchType other = (MatchType) obj;
if (name == null) {
if (other.name != null){
return false;
}
}
else if (!name.equalsIgnoreCase(other.name)){
return false;
}
MatchType type = types.get(t.trim().toUpperCase());
if (type == null) throw new ZinggClientException("Unsupported Match Type: " + t);
return type;
return true;
}

@JsonValue
public String value() {
return value;
@Override
public String toString() {
return name;
}

}
54 changes: 54 additions & 0 deletions common/client/src/main/java/zingg/common/client/MatchTypes.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package zingg.common.client;

import java.util.HashMap;
import java.util.Map;

/**
 * Registry of the match types known to the application.
 *
 * <p>The built-in match types are exposed as constants. Each one is created
 * through {@code new MatchType(name)}, whose constructor registers the new
 * instance here by calling {@link #put(IMatchType)}. Additional match types
 * can be registered the same way.
 */
public class MatchTypes {

    public final static IMatchType FUZZY = new MatchType("FUZZY");
    public final static IMatchType EXACT = new MatchType("EXACT");
    public final static IMatchType PINCODE = new MatchType("PINCODE");
    public final static IMatchType EMAIL = new MatchType("EMAIL");
    public final static IMatchType TEXT = new MatchType("TEXT");
    public final static IMatchType NUMERIC = new MatchType("NUMERIC");
    public final static IMatchType NUMERIC_WITH_UNITS = new MatchType("NUMERIC_WITH_UNITS");
    public final static IMatchType NULL_OR_BLANK = new MatchType("NULL_OR_BLANK");
    public final static IMatchType ONLY_ALPHABETS_EXACT = new MatchType("ONLY_ALPHABETS_EXACT");
    public final static IMatchType ONLY_ALPHABETS_FUZZY = new MatchType("ONLY_ALPHABETS_FUZZY");
    public final static IMatchType DONT_USE = new MatchType("DONT_USE");

    /**
     * All registered match types, keyed by upper-cased name.
     *
     * <p>This field deliberately has NO initializer. Static initializers run
     * in textual order, so an initializer here would execute after the
     * constants above have already registered themselves through
     * {@link #put(IMatchType)} and would replace the populated map with an
     * empty one. The map is created lazily inside {@code put} instead.
     */
    public static Map<String, IMatchType> allMatchTypes;// = new HashMap<String, IMatchType>();

    /** Not meant to be instantiated directly; left protected so subclasses remain possible. */
    protected MatchTypes(){

    }

    /**
     * Registers a match type under its upper-cased name, replacing any
     * previously registered type with the same key.
     *
     * @param o the match type to register; its name must not be null
     */
    public static final void put(IMatchType o) {

        // Lazy creation: see the note on allMatchTypes for why this cannot be a field initializer.
        if (allMatchTypes == null) {
            allMatchTypes = new HashMap<String, IMatchType>();
        }

        allMatchTypes.put(o.getName().toUpperCase(), o);
    }

    /**
     * Returns the names of all registered match types.
     *
     * @return a new array of names, in the map's iteration order; empty if
     *         nothing is registered
     */
    public static String[] getAllMatchTypes() {
        // The field is public and mutable, so guard against it having been reset.
        if (allMatchTypes == null) {
            return new String[0];
        }
        String[] names = new String[allMatchTypes.size()];
        int i = 0;
        for (IMatchType type : allMatchTypes.values()) {
            names[i++] = type.getName();
        }
        return names;
    }

    /**
     * Looks up a registered match type by name, ignoring case and
     * surrounding whitespace.
     *
     * <p>Trimming matters because callers split comma separated configuration
     * values such as {@code "FUZZY, EXACT"}; without it the second token
     * ({@code " EXACT"}) would not resolve.
     *
     * @param name the match type name; may be null
     * @return the matching type, or {@code null} when {@code name} is null or
     *         no registered type has that name
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static IMatchType getByName(String name) throws Exception{
        if (name == null || allMatchTypes == null) {
            return null;
        }
        String wanted = name.trim();
        for (IMatchType candidate : allMatchTypes.values()) {
            // Called on the non-null side so a type whose name was cleared cannot cause an NPE.
            if (wanted.equalsIgnoreCase(candidate.getName())) {
                return candidate;
            }
        }
        return null;
    }

}
2 changes: 2 additions & 0 deletions common/client/src/main/java/zingg/common/client/ZFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ public interface ZFrame<D, R, C> {

public static final String RIGHT_JOIN = "right";
public static final String LEFT_JOIN = "left";
public static final String INNER_JOIN = "inner";

public static final String COL_COUNT = "count";
public static final String COL_VALUE = "VALUE";
Expand Down Expand Up @@ -75,6 +76,7 @@ public interface ZFrame<D, R, C> {
public ZFrame<D, R, C> unionByName(ZFrame<D, R, C> other, boolean flag);

public <A> ZFrame<D, R, C> withColumn(String s, A c);
public ZFrame<D, R, C> withColumns(String[] columns, C[] columnValues);


public ZFrame<D, R, C> repartition(int num);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ public interface ColName {
public static final String MODEL_ID_COL = COL_PREFIX + "modelId";
public static final String RAW_PREDICTION="rawPrediction";
public static final String COL_COUNT = COL_PREFIX + "count";
public static final String COL_WORD = COL_PREFIX + "word";
public static final String COL_WORD = COL_PREFIX + "stopword";
public static final String COL_SPLIT = COL_PREFIX + "split";
public static final String HASH_COUNTS_COL = ColName.HASH_COL + "_count";
public static final String BLOCK_SAMPLES = "blockSamples/";
public static final String STOPWORD_COL = COL_PREFIX + "word";

}
Loading

0 comments on commit fd596bc

Please sign in to comment.