Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved deduplication between USA ZIP vs ZIP+4 properties #1675

Merged
merged 2 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ jobs:
os:
- ${{ vars.UBUNTU_VERSION }}
node-version:
- 12.x
- 14.x
- 16.x
- 18.x
- 20.x
- 22.x
steps:
- uses: actions/checkout@v4
- name: 'Install node.js ${{ matrix.node-version }}'
Expand Down
2 changes: 1 addition & 1 deletion .jshintrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"node": true,
"curly": true,
"eqeqeq": true,
"esversion": 9,
"esversion": "2022",
"freeze": true,
"immed": true,
"indent": 2,
Expand Down
29 changes: 24 additions & 5 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ function isLayerDifferent(item1, item2){
return false;
}

/**
 * return true if the string value of the item's parent.country_a equals the given code
 *
 * `item`, `item.parent` and `item.parent.country_a` are all read through
 * optional chaining, so whatever is missing is passed on to
 * field.getStringValue() as undefined.
 */
function isCountryCode(item, code) {
  const countryCode = field.getStringValue( item?.parent?.country_a );
  return countryCode === code;
}

/**
 * return true if the item is a 'region' layer record whose parent country code is 'USA'
 */
function isUsState(item) {
  // the country check is delegated to isCountryCode(), which reads
  // item.parent.country_a through optional chaining; a record without a
  // `parent` therefore no longer raises a TypeError on property access here.
  // `item.layer` is only evaluated once the country check has passed.
  return isCountryCode(item, 'USA') && item.layer === 'region';
}

// Geonames records in the locality and localadmin layer are parented by themselves
Expand Down Expand Up @@ -206,7 +209,7 @@ function isAddressDifferent(item1, item2){
// only compare zip if both records have it, otherwise just ignore and assume it's the same
// since by this time we've already compared parent hierarchies
if( _.has(address1, 'zip') && _.has(address2, 'zip') ){
if( isPropertyDifferent(address1, address2, 'zip') ){ return true; }
if( isZipDifferent(item1, item2) ){ return true; }
}

return false;
Expand Down Expand Up @@ -255,10 +258,26 @@ function isDifferent(item1, item2, requestLanguage){
return false;
}

/**
 * return true if the zip codes of the two items are different
 *
 * The 'zip' property of each item's address_parts is compared via
 * isPropertyDifferent(). When both items have the country code 'USA', only
 * the first space-delimited word of the normalized zip is compared, so that
 * a ZIP+4 can match a plain ZIP (98036-6119 vs 98036).
 */
function isZipDifferent(item1, item2) {
  const [parts1, parts2] = [item1, item2].map((item) => _.get(item, 'address_parts'));
  const bothUsa = [item1, item2].every((item) => isCountryCode(item, 'USA'));

  // outside the USA the zip is compared with the default normalizer
  if (!bothUsa) {
    return isPropertyDifferent(parts1, parts2, 'zip');
  }

  // USA: keep only the leading word of the normalized zip
  const leadingWord = (zip) => {
    const words = normalizeString(zip).split(' ');
    return _.first(words);
  };
  return isPropertyDifferent(parts1, parts2, 'zip', leadingWord);
}

/**
* return true if properties are different
*/
function isPropertyDifferent(item1, item2, prop ){
function isPropertyDifferent(item1, item2, prop, normalizer = normalizeString ){

// if neither item has prop, we consider them the same
if( !_.has(item1, prop) && !_.has(item2, prop) ){ return false; }
Expand All @@ -274,7 +293,7 @@ function isPropertyDifferent(item1, item2, prop ){
let prop1StringValue = field.getStringValue( prop1[i] );
for( let j=0; j<prop2.length; j++ ){
let prop2StringValue = field.getStringValue( prop2[j] );
if( normalizeString( prop1StringValue ) === normalizeString( prop2StringValue ) ){
if( normalizer( prop1StringValue ) === normalizer( prop2StringValue ) ){
return false;
}
}
Expand Down
52 changes: 52 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,58 @@ module.exports.tests.dedupe = function(test, common) {
t.end();
});

test('ZIP vs ZIP+4', function(t) {
  // two USA records at the same house number and street; only the zip differs
  // (plain ZIP on one side, ZIP+4 on the other)
  const usaRecordWithZip = (zip) => ({
    parent: { country_a: ['USA'] },
    address_parts: { number: '1', street: 'Main Street', zip: zip }
  });
  const plainZip = usaRecordWithZip('90210');
  const zipPlusFour = usaRecordWithZip('90210-1111');

  t.false(isDifferent(plainZip, zipPlusFour), 'should be the same');
  t.end();
});

test('ZIP vs ZIP+4 functionality does not apply for non-USA documents', function(t) {
  // same house number and street on both records, but the country code is
  // not 'USA': the records are asserted to be different even though one zip
  // is a prefix of the other
  var item1 = {
    'parent': {
      'country_a': ['NOT']
    },
    'address_parts': {
      'number': '1',
      'street': 'Main Street',
      'zip': '90210'
    }
  };
  var item2 = {
    'parent': {
      'country_a': ['NOT']
    },
    'address_parts': {
      'number': '1',
      'street': 'Main Street',
      'zip': '90210-1111'
    }
  };

  // the message describes the expectation being asserted (t.true => different)
  t.true(isDifferent(item1, item2), 'should be different');
  t.end();
});

test('completely empty objects', function(t) {
var item1 = {};
var item2 = {};
Expand Down