From b630f2010e3139ebf1d026bb2ac81d549e5725d5 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 21 Apr 2023 19:48:59 +0530 Subject: [PATCH] fromDDL instead of JSON for schema --- .../configWithMultipleMatchTypes.json | 20 ++++----- ...nfigWithMultipleMatchTypesUnsupported.json | 20 ++++----- .../src/test/resources/documenter/config.json | 22 +++++----- .../test/resources/testDocumenter/config.json | 18 ++++---- .../src/test/resources/testFebrl/config.json | 22 +++++----- .../test/resources/testPeekModel/config.json | 22 +++++----- docs/accuracy/stopWordsRemoval.md | 2 +- .../stopwordsremoval/README.md | 2 +- docs/running/databricks.md | 42 +++++++++---------- examples/amazon-google/config.json | 10 ++--- .../amazon-google/configWithStopWords.json | 10 ++--- examples/beerAdvo-rateBeer/config.json | 10 ++--- examples/databricks/config120.json | 18 ++++---- examples/febrl/config.json | 22 +++++----- examples/febrl/configLink.json | 20 ++++----- examples/febrl/configSnow.json | 20 ++++----- examples/febrl/configWithTrainingSamples.json | 20 ++++----- examples/febrl/findTrainingData.json | 20 ++++----- examples/febrl120k/config.json | 22 +++++----- examples/febrl120k/config120k.json | 20 ++++----- examples/febrl120k/config500k.json | 20 ++++----- examples/iTunes-amazon/config.json | 18 ++++---- examples/ncVoters5M/config.json | 10 ++--- perf/120kconfig.json | 18 ++++---- python/zingg/client.py | 3 +- scala/examples/FebrlExample.scala | 20 ++++----- .../zingg/spark/client/TestArguments.java | 4 +- .../zingg/spark/core/block/SparkBlock.java | 2 +- .../core/feature/SparkFeatureFactory.java | 7 +--- .../src/test/java/zingg/block/TestBlock.java | 8 ++-- .../zingg/common/core/util/TestDSUtil.java | 24 +++++------ .../src/test/resources/documenter/config.json | 22 +++++----- .../test/resources/testDocumenter/config.json | 18 ++++---- .../src/test/resources/testFebrl/config.json | 22 +++++----- .../test/resources/testPeekModel/config.json | 22 +++++----- 35 files changed, 
288 insertions(+), 292 deletions(-) diff --git a/common/client/src/test/resources/testArguments/configWithMultipleMatchTypes.json b/common/client/src/test/resources/testArguments/configWithMultipleMatchTypes.json index dcc7e3f39..82f75127e 100644 --- a/common/client/src/test/resources/testArguments/configWithMultipleMatchTypes.json +++ b/common/client/src/test/resources/testArguments/configWithMultipleMatchTypes.json @@ -4,61 +4,61 @@ "fieldName" : "fname", "matchType" : "fuzzy,null_or_blank", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/common/client/src/test/resources/testArguments/configWithMultipleMatchTypesUnsupported.json b/common/client/src/test/resources/testArguments/configWithMultipleMatchTypesUnsupported.json index 6dfcec90e..02c7e3584 100644 --- a/common/client/src/test/resources/testArguments/configWithMultipleMatchTypesUnsupported.json +++ 
b/common/client/src/test/resources/testArguments/configWithMultipleMatchTypesUnsupported.json @@ -4,61 +4,61 @@ "fieldName" : "fname", "matchType" : "fuzzy,null_wrong_blank", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/common/core/src/test/resources/documenter/config.json b/common/core/src/test/resources/documenter/config.json index ed55402de..6ecb4df5b 100644 --- a/common/core/src/test/resources/documenter/config.json +++ b/common/core/src/test/resources/documenter/config.json @@ -4,67 +4,67 @@ "fieldName" : "recId", "matchType" : "dont_use", "fields" : "recId", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { 
"fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/common/core/src/test/resources/testDocumenter/config.json b/common/core/src/test/resources/testDocumenter/config.json index d9f4d4ba4..5d966a2a4 100644 --- a/common/core/src/test/resources/testDocumenter/config.json +++ b/common/core/src/test/resources/testDocumenter/config.json @@ -4,55 +4,55 @@ "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "fuzzy", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { 
"fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/common/core/src/test/resources/testFebrl/config.json b/common/core/src/test/resources/testFebrl/config.json index 97645a1cb..cdcbe82ef 100644 --- a/common/core/src/test/resources/testFebrl/config.json +++ b/common/core/src/test/resources/testFebrl/config.json @@ -4,67 +4,67 @@ "fieldName" : "id", "matchType" : "dont_use", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "exact", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "exact", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "exact", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git 
a/common/core/src/test/resources/testPeekModel/config.json b/common/core/src/test/resources/testPeekModel/config.json index b1b251f4b..0cd06f062 100644 --- a/common/core/src/test/resources/testPeekModel/config.json +++ b/common/core/src/test/resources/testPeekModel/config.json @@ -4,67 +4,67 @@ "fieldName" : "id", "matchType" : "dont_use", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "exact", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "exact", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "exact", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/docs/accuracy/stopWordsRemoval.md b/docs/accuracy/stopWordsRemoval.md index 702f4149f..752f7bb22 100644 --- a/docs/accuracy/stopWordsRemoval.md +++ b/docs/accuracy/stopWordsRemoval.md @@ -22,7 +22,7 @@ Once you have verified the above stop words, you can configure them in the JSON "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": 
"\"string\"", + "dataType": "string", "stopWords": "models/100/stopWords/fname.csv" }, ``` diff --git a/docs/improving-accuracy/stopwordsremoval/README.md b/docs/improving-accuracy/stopwordsremoval/README.md index 702f4149f..752f7bb22 100644 --- a/docs/improving-accuracy/stopwordsremoval/README.md +++ b/docs/improving-accuracy/stopwordsremoval/README.md @@ -22,7 +22,7 @@ Once you have verified the above stop words, you can configure them in the JSON "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"", + "dataType": "string", "stopWords": "models/100/stopWords/fname.csv" }, ``` diff --git a/docs/running/databricks.md b/docs/running/databricks.md index 99af05022..288c46113 100644 --- a/docs/running/databricks.md +++ b/docs/running/databricks.md @@ -58,55 +58,55 @@ The config file for Databricks needs modifications to accept dbfs locations. Her "fieldName" : "fname", "matchType" : "email", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "fuzzy", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ @@ -127,18 +127,18 @@ 
The config file for Databricks needs modifications to accept dbfs locations. Her "header":false }, "schema": - "{\"type\" : \"struct\", - \"fields\" : [ - {\"name\":\"id\", \"type\":\"string\", \"nullable\":false}, - {\"name\":\"fname\", \"type\":\"string\", \"nullable\":true}, - {\"name\":\"lname\",\"type\":\"string\",\"nullable\":true} , - {\"name\":\"stNo\", \"type\":\"string\", \"nullable\":true}, - {\"name\":\"add1\", \"type\":\"string\", \"nullable\":true}, - {\"name\":\"add2\",\"type\":\"string\",\"nullable\":true} , - {\"name\":\"city\", \"type\":\"string\", \"nullable\":true}, - {\"name\":\"state\", \"type\":\"string\", \"nullable\":true}, - {\"name\":\"dob\",\"type\":\"string\",\"nullable\":true} , - {\"name\":\"ssn\",\"type\":\"string\",\"nullable\":true} + "{\"type\" : \"struct\", + \"fields\" : [ + {\"name\":\"id\", \"type\":\"string\", \"nullable\":false}, + {\"name\":\"fname\", \"type\":\"string\", \"nullable\":true}, + {\"name\":\"lname\",\"type\":\"string\",\"nullable\":true} , + {\"name\":\"stNo\", \"type\":\"string\", \"nullable\":true}, + {\"name\":\"add1\", \"type\":\"string\", \"nullable\":true}, + {\"name\":\"add2\",\"type\":\"string\",\"nullable\":true} , + {\"name\":\"city\", \"type\":\"string\", \"nullable\":true}, + {\"name\":\"state\", \"type\":\"string\", \"nullable\":true}, + {\"name\":\"dob\",\"type\":\"string\",\"nullable\":true} , + {\"name\":\"ssn\",\"type\":\"string\",\"nullable\":true} ] }" }], diff --git a/examples/amazon-google/config.json b/examples/amazon-google/config.json index 087ce219b..335f90718 100644 --- a/examples/amazon-google/config.json +++ b/examples/amazon-google/config.json @@ -4,31 +4,31 @@ "fieldName" : "id", "matchType" : "DONT_USE", "fields" : "id", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "title", "matchType" : "number_with_units", "fields" : "title", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "description", "matchType" : "text", "fields" : "description", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "manufacturer", "matchType": "fuzzy", "fields" : "manufacturer", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : 
"price", "matchType": "fuzzy", "fields" : "price", - "dataType": "\"double\"" + "dataType": "double" }], "output" : [{ "name":"output", diff --git a/examples/amazon-google/configWithStopWords.json b/examples/amazon-google/configWithStopWords.json index cb38f1f10..fcda34476 100644 --- a/examples/amazon-google/configWithStopWords.json +++ b/examples/amazon-google/configWithStopWords.json @@ -4,32 +4,32 @@ "fieldName" : "id", "matchType" : "DONT_USE", "fields" : "id", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "title", "matchType" : "number_with_units", "fields" : "title", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "description", "matchType" : "text", "fields" : "description", - "dataType": "\"string\"" , + "dataType": "string" , "stopWords" : "examples/amazon-google/stopWords.csv" }, { "fieldName" : "manufacturer", "matchType": "fuzzy", "fields" : "manufacturer", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "price", "matchType": "fuzzy", "fields" : "price", - "dataType": "\"double\"" + "dataType": "double" }], "output" : [{ "name":"output", diff --git a/examples/beerAdvo-rateBeer/config.json b/examples/beerAdvo-rateBeer/config.json index d1679daee..52048518e 100644 --- a/examples/beerAdvo-rateBeer/config.json +++ b/examples/beerAdvo-rateBeer/config.json @@ -4,31 +4,31 @@ "fieldName" : "id", "matchType" : "DONT_USE", "fields" : "id", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Beer_Name", "matchType" : "fuzzy", "fields" : "Beer_Name", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Brew_Factory_Name", "matchType" : "fuzzy", "fields" : "Brew_Factory_Name", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Style", "matchType": "fuzzy", "fields" : "Style", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ABV", "matchType": "fuzzy", "fields" : "ABV", - "dataType": "\"double\"" + "dataType": "double" }], 
"output" : [{ "name":"output", diff --git a/examples/databricks/config120.json b/examples/databricks/config120.json index b1aa971b5..14f2ccbdb 100644 --- a/examples/databricks/config120.json +++ b/examples/databricks/config120.json @@ -4,55 +4,55 @@ "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "fuzzy", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl/config.json b/examples/febrl/config.json index 70bfb166d..4853f2551 100644 --- a/examples/febrl/config.json +++ b/examples/febrl/config.json @@ -4,67 +4,67 @@ "fieldName" : "recId", "matchType" : "dont_use", "fields" : "recId", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "fuzzy", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, 
{ "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "fuzzy", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl/configLink.json b/examples/febrl/configLink.json index 6c8e83a72..78c6f5188 100644 --- a/examples/febrl/configLink.json +++ b/examples/febrl/configLink.json @@ -4,61 +4,61 @@ "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": 
"string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl/configSnow.json b/examples/febrl/configSnow.json index dcc82e031..4a43ff767 100644 --- a/examples/febrl/configSnow.json +++ b/examples/febrl/configSnow.json @@ -4,61 +4,61 @@ "fieldName" : "FNAME", "matchType" : "fuzzy", "fields" : "FNAME", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "LNAME", "matchType" : "fuzzy", "fields" : "LNAME", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "STREETNUMBER", "matchType": "exact", "fields" : "STREETNUMBER", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "STREET", "matchType": "fuzzy", "fields" : "STREET", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ADDRESS", "matchType": "fuzzy", "fields" : "ADDRESS", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "LOCALITY", "matchType": "fuzzy", "fields" : "LOCALITY", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "AREACODE", "matchType": "exact", "fields" : "AREACODE", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "STATE", "matchType": "fuzzy", "fields" : "STATE", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "DATEOFBIRTH", "matchType": "fuzzy", "fields" : "DATEOFBIRTH", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "SSN", "matchType": "dont_use", "fields" : "SSN", - "dataType": "\"string\"" + "dataType": "string" }], "output" : [{ "name":"unifiedCustomers", diff --git a/examples/febrl/configWithTrainingSamples.json b/examples/febrl/configWithTrainingSamples.json index 67d90a0d5..3e9534911 100644 --- a/examples/febrl/configWithTrainingSamples.json +++ b/examples/febrl/configWithTrainingSamples.json @@ 
-15,61 +15,61 @@ "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl/findTrainingData.json b/examples/febrl/findTrainingData.json index 26e508ff3..4422b181b 100644 --- a/examples/febrl/findTrainingData.json +++ b/examples/febrl/findTrainingData.json @@ -4,61 +4,61 @@ "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - 
"dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl120k/config.json b/examples/febrl120k/config.json index 47ed65c35..9dd2cc027 100644 --- a/examples/febrl120k/config.json +++ b/examples/febrl120k/config.json @@ -4,67 +4,67 @@ "fieldName" : "id", "matchType" : "dont_use", "fields" : "id", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : 
"dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl120k/config120k.json b/examples/febrl120k/config120k.json index 21b5737a1..1b7b3e773 100644 --- a/examples/febrl120k/config120k.json +++ b/examples/febrl120k/config120k.json @@ -4,61 +4,61 @@ "fieldName" : "fname", "matchType" : "email", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/febrl120k/config500k.json b/examples/febrl120k/config500k.json index 961f455e8..a361d843d 100644 --- a/examples/febrl120k/config500k.json +++ b/examples/febrl120k/config500k.json @@ -4,61 +4,61 @@ "fieldName" : "fname", "matchType" : "email", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : 
"fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/examples/iTunes-amazon/config.json b/examples/iTunes-amazon/config.json index 2007a7995..40f0b599f 100644 --- a/examples/iTunes-amazon/config.json +++ b/examples/iTunes-amazon/config.json @@ -4,55 +4,55 @@ "fieldName" : "id", "matchType" : "DONT_USE", "fields" : "id", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Song_Name", "matchType" : "fuzzy", "fields" : "Song_Name", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Artist_Name", "matchType" : "fuzzy", "fields" : "Artist_Name", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Album_Name", "matchType": "fuzzy", "fields" : "Album_Name", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Genre", "matchType": "fuzzy", "fields" : "Genre", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Price", "matchType": "fuzzy", "fields" : "Price", - "dataType": "\"double\"" + 
"dataType": "double" }, { "fieldName" : "CopyRight", "matchType": "fuzzy", "fields" : "CopyRight", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Time", "matchType": "fuzzy", "fields" : "Time", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "Released", "matchType": "fuzzy", "fields" : "Released", - "dataType": "\"string\"" + "dataType": "string" }], "output" : [{ "name":"output", diff --git a/examples/ncVoters5M/config.json b/examples/ncVoters5M/config.json index 03ad71604..3fe06d994 100644 --- a/examples/ncVoters5M/config.json +++ b/examples/ncVoters5M/config.json @@ -4,31 +4,31 @@ "fieldName" : "recid", "matchType" : "dont_use", "fields" : "recid", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "givenname", "matchType" : "fuzzy", "fields" : "givenname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "surname", "matchType": "exact", "fields" : "surname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "suburb", "matchType": "fuzzy", "fields" : "suburb", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "postcode", "matchType": "exact", "fields" : "postcode", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/perf/120kconfig.json b/perf/120kconfig.json index b4a443ace..e15e703df 100644 --- a/perf/120kconfig.json +++ b/perf/120kconfig.json @@ -4,55 +4,55 @@ "fieldName" : "fname", "matchType" : "email", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "fuzzy", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": 
"\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/python/zingg/client.py b/python/zingg/client.py index 24b897a42..7e7f6e29b 100644 --- a/python/zingg/client.py +++ b/python/zingg/client.py @@ -534,7 +534,8 @@ def stringify(self, str): :return: The stringify'ed value of the dataType :rtype: String """ - return '"' + str + '"' + + return str def parseArguments(argv): diff --git a/scala/examples/FebrlExample.scala b/scala/examples/FebrlExample.scala index 2ca6a0b56..64273cb92 100644 --- a/scala/examples/FebrlExample.scala +++ b/scala/examples/FebrlExample.scala @@ -9,61 +9,61 @@ val args = new Arguments(); //set field definitions val fname = new FieldDefinition(); fname.setFieldName("fname"); -fname.setDataType("\"string\""); +fname.setDataType("string"); fname.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); fname.setFields("fname"); val lname = new FieldDefinition(); lname.setFieldName("lname"); -lname.setDataType("\"string\""); +lname.setDataType("string"); lname.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); lname.setFields("lname"); val stNo = new FieldDefinition(); stNo.setFieldName("stNo"); -stNo.setDataType("\"string\""); +stNo.setDataType("string"); stNo.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.EXACT))); stNo.setFields("stNo"); val add1 = new FieldDefinition(); add1.setFieldName("add1"); -add1.setDataType("\"string\""); +add1.setDataType("string"); add1.setMatchType(new 
ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); add1.setFields("add1"); val add2 = new FieldDefinition(); add2.setFieldName("add2"); -add2.setDataType("\"string\""); +add2.setDataType("string"); add2.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); add2.setFields("add2"); val city = new FieldDefinition(); city.setFieldName("city"); -city.setDataType("\"string\""); +city.setDataType("string"); city.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); city.setFields("city"); val areacode = new FieldDefinition(); areacode.setFieldName("areacode"); -areacode.setDataType("\"string\""); +areacode.setDataType("string"); areacode.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.EXACT))); areacode.setFields("areacode"); val state = new FieldDefinition(); state.setFieldName("state"); -state.setDataType("\"string\""); +state.setDataType("string"); state.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); state.setFields("state"); val dob = new FieldDefinition(); dob.setFieldName("dob"); -dob.setDataType("\"string\""); +dob.setDataType("string"); dob.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); dob.setFields("dob"); val ssn = new FieldDefinition(); ssn.setFieldName("ssn"); -ssn.setDataType("\"string\""); +ssn.setDataType("string"); ssn.setMatchType(new ArrayList[MatchType](Arrays.asList(MatchType.FUZZY))); ssn.setFields("ssn"); :silent diff --git a/spark/client/src/test/java/zingg/spark/client/TestArguments.java b/spark/client/src/test/java/zingg/spark/client/TestArguments.java index 6a9ae0400..a79193c5c 100644 --- a/spark/client/src/test/java/zingg/spark/client/TestArguments.java +++ b/spark/client/src/test/java/zingg/spark/client/TestArguments.java @@ -25,13 +25,13 @@ public void testWriteArgumentObjectToJSONFile() { try { FieldDefinition fname = new FieldDefinition(); fname.setFieldName("fname"); - fname.setDataType("\"string\""); + fname.setDataType("string"); 
fname.setMatchType(Arrays.asList(MatchType.EXACT, MatchType.FUZZY, MatchType.PINCODE)); //fname.setMatchType(Arrays.asList(MatchType.EXACT)); fname.setFields("fname"); FieldDefinition lname = new FieldDefinition(); lname.setFieldName("lname"); - lname.setDataType("\"string\""); + lname.setDataType("string"); lname.setMatchType(Arrays.asList(MatchType.FUZZY)); lname.setFields("lname"); args.setFieldDefinition(Arrays.asList(fname, lname)); diff --git a/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java b/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java index 99a94f4bd..38a2bd080 100644 --- a/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java +++ b/spark/core/src/main/java/zingg/spark/core/block/SparkBlock.java @@ -23,7 +23,7 @@ public SparkBlock(ZFrame, Row, Column> training, ZFrame getFieldDefList() { List fdList = new ArrayList<>(4); FieldDefinition idFD = new FieldDefinition(); - idFD.setDataType("\"integer\""); + idFD.setDataType("integer"); idFD.setFieldName("id"); ArrayList matchTypelistId = new ArrayList(); matchTypelistId.add(MatchType.DONT_USE); @@ -102,19 +102,19 @@ private List getFieldDefList() { FieldDefinition yearFD = new FieldDefinition(); - yearFD.setDataType("\"integer\""); + yearFD.setDataType("integer"); yearFD.setFieldName("year"); yearFD.setMatchType(matchTypelistFuzzy); fdList.add(yearFD); FieldDefinition eventFD = new FieldDefinition(); - eventFD.setDataType("\"string\""); + eventFD.setDataType("string"); eventFD.setFieldName("event"); eventFD.setMatchType(matchTypelistFuzzy); fdList.add(eventFD); FieldDefinition commentFD = new FieldDefinition(); - commentFD.setDataType("\"string\""); + commentFD.setDataType("string"); commentFD.setFieldName("comment"); commentFD.setMatchType(matchTypelistFuzzy); fdList.add(commentFD); diff --git a/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java b/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java index e0fb1a12b..a1683d7d6 100644 --- 
a/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java +++ b/spark/core/src/test/java/zingg/common/core/util/TestDSUtil.java @@ -34,19 +34,19 @@ public void testGetFieldDefColumnsWhenShowConciseIsTrue() throws ZinggClientExce FieldDefinition def1 = new FieldDefinition(); def1.setFieldName("field_fuzzy"); - def1.setDataType("\"string\""); + def1.setDataType("string"); def1.setMatchTypeInternal(MatchType.FUZZY); def1.setFields("field_fuzzy"); FieldDefinition def2 = new FieldDefinition(); def2.setFieldName("field_match_type_DONT_USE"); - def2.setDataType("\"string\""); + def2.setDataType("string"); def2.setMatchTypeInternal(MatchType.DONT_USE); def2.setFields("field_match_type_DONT_USE"); FieldDefinition def3 = new FieldDefinition(); def3.setFieldName("field_str_DONTspaceUSE"); - def3.setDataType("\"string\""); + def3.setDataType("string"); def3.setMatchTypeInternal(MatchType.getMatchType("DONT_USE")); def3.setFields("field_str_DONTspaceUSE"); @@ -62,9 +62,9 @@ public void testGetFieldDefColumnsWhenShowConciseIsTrue() throws ZinggClientExce e.printStackTrace(); } StructType schema = DataTypes.createStructType(new StructField[] { - DataTypes.createStructField(def1.getFieldName(), DataType.fromJson(def1.getDataType()), false), - DataTypes.createStructField(def2.getFieldName(), DataType.fromJson(def2.getDataType()), false), - DataTypes.createStructField(def3.getFieldName(), DataType.fromJson(def3.getDataType()), false), + DataTypes.createStructField(def1.getFieldName(), DataType.fromDDL(def1.getDataType()), false), + DataTypes.createStructField(def2.getFieldName(), DataType.fromDDL(def2.getDataType()), false), + DataTypes.createStructField(def3.getFieldName(), DataType.fromDDL(def3.getDataType()), false), DataTypes.createStructField(ColName.SOURCE_COL, DataTypes.StringType, false) }); List list = Arrays.asList(RowFactory.create("1", "first", "one", "Junit"), RowFactory.create("2", "second", "two", "Junit"), @@ -85,19 +85,19 @@ public void 
testGetFieldDefColumnsWhenShowConciseIsTrue() throws ZinggClientExce public void testGetFieldDefColumnsWhenShowConciseIsFalse() throws ZinggClientException { FieldDefinition def1 = new FieldDefinition(); def1.setFieldName("field_fuzzy"); - def1.setDataType("\"string\""); + def1.setDataType("string"); def1.setMatchTypeInternal(MatchType.FUZZY); def1.setFields("field_fuzzy"); FieldDefinition def2 = new FieldDefinition(); def2.setFieldName("field_match_type_DONT_USE"); - def2.setDataType("\"string\""); + def2.setDataType("string"); def2.setMatchTypeInternal(MatchType.DONT_USE); def2.setFields("field_match_type_DONT_USE"); FieldDefinition def3 = new FieldDefinition(); def3.setFieldName("field_str_DONTspaceUSE"); - def3.setDataType("\"string\""); + def3.setDataType("string"); def3.setMatchTypeInternal(MatchType.getMatchType("DONT_USE")); def3.setFields("field_str_DONTspaceUSE"); @@ -113,9 +113,9 @@ public void testGetFieldDefColumnsWhenShowConciseIsFalse() throws ZinggClientExc e.printStackTrace(); } StructType schema = DataTypes.createStructType(new StructField[] { - DataTypes.createStructField(def1.getFieldName(), DataType.fromJson(def1.getDataType()), false), - DataTypes.createStructField(def2.getFieldName(), DataType.fromJson(def2.getDataType()), false), - DataTypes.createStructField(def3.getFieldName(), DataType.fromJson(def3.getDataType()), false), + DataTypes.createStructField(def1.getFieldName(), DataType.fromDDL(def1.getDataType()), false), + DataTypes.createStructField(def2.getFieldName(), DataType.fromDDL(def2.getDataType()), false), + DataTypes.createStructField(def3.getFieldName(), DataType.fromDDL(def3.getDataType()), false), DataTypes.createStructField(ColName.SOURCE_COL, DataTypes.StringType, false) }); List list = Arrays.asList(RowFactory.create("1", "first", "one", "Junit"), RowFactory.create("2", "second", "two", "Junit"), diff --git a/spark/core/src/test/resources/documenter/config.json b/spark/core/src/test/resources/documenter/config.json index 
ed55402de..6ecb4df5b 100644 --- a/spark/core/src/test/resources/documenter/config.json +++ b/spark/core/src/test/resources/documenter/config.json @@ -4,67 +4,67 @@ "fieldName" : "recId", "matchType" : "dont_use", "fields" : "recId", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/spark/core/src/test/resources/testDocumenter/config.json b/spark/core/src/test/resources/testDocumenter/config.json index ec096b0a2..94070cf1a 100644 --- a/spark/core/src/test/resources/testDocumenter/config.json +++ b/spark/core/src/test/resources/testDocumenter/config.json @@ -4,55 +4,55 @@ "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - 
"dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "fuzzy", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "fuzzy", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "fuzzy", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "fuzzy", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/spark/core/src/test/resources/testFebrl/config.json b/spark/core/src/test/resources/testFebrl/config.json index 97645a1cb..cdcbe82ef 100644 --- a/spark/core/src/test/resources/testFebrl/config.json +++ b/spark/core/src/test/resources/testFebrl/config.json @@ -4,67 +4,67 @@ "fieldName" : "id", "matchType" : "dont_use", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": 
"string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "exact", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "exact", "fields" : "dob", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "ssn", "matchType": "exact", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{ diff --git a/spark/core/src/test/resources/testPeekModel/config.json b/spark/core/src/test/resources/testPeekModel/config.json index 4a869a670..469738d78 100644 --- a/spark/core/src/test/resources/testPeekModel/config.json +++ b/spark/core/src/test/resources/testPeekModel/config.json @@ -4,67 +4,67 @@ "fieldName" : "id", "matchType" : "dont_use", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "fname", "matchType" : "fuzzy", "fields" : "fname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "lname", "matchType" : "fuzzy", "fields" : "lname", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "stNo", "matchType": "exact", "fields" : "stNo", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add1", "matchType": "fuzzy", "fields" : "add1", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "add2", "matchType": "fuzzy", "fields" : "add2", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "city", "matchType": "fuzzy", "fields" : "city", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "areacode", "matchType": "exact", "fields" : "areacode", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "state", "matchType": "exact", "fields" : "state", - "dataType": "\"string\"" + "dataType": "string" }, { "fieldName" : "dob", "matchType": "exact", "fields" : "dob", - "dataType": "\"string\"" + "dataType": 
"string" }, { "fieldName" : "ssn", "matchType": "exact", "fields" : "ssn", - "dataType": "\"string\"" + "dataType": "string" } ], "output" : [{