From 85765c110a09c5427aff40889722138030d73f02 Mon Sep 17 00:00:00 2001 From: Joxit Date: Wed, 15 May 2019 19:03:48 +0200 Subject: [PATCH 1/2] feat(multi-lang): Add index for all wof languages (when different from default language) This importer did not index preferred names of all languages. We need this for the multi-lang search in autocomplete. Names that are already present in the default index are removed. That means, if we say New York in the default language and New York in FRA, I will drop the FRA entry. This should prevent a massive increase in the size of the ES indexes. --- src/components/extractFields.js | 24 +++++++++++++++++++++++- src/peliasDocGenerators.js | 25 +++++++++++++++++++++---- test/components/extractFieldsTest.js | 4 ++++ test/readStreamTest.js | 3 +++ 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/components/extractFields.js b/src/components/extractFields.js index e1c4dd89..d8f6b1e8 100644 --- a/src/components/extractFields.js +++ b/src/components/extractFields.js @@ -1,6 +1,7 @@ const through2 = require('through2'); const _ = require('lodash'); const util = require('util'); +const iso3166 = require('iso3166-1'); // hierarchy in importance-descending order of population fields const population_hierarchy = [ @@ -26,6 +27,8 @@ const NAME_ALIAS_FIELDS = [ 'label:%s_x_preferred' ]; +const WOF_NAMES_REGEX = /(name|label):[a-z]{3}_x_(preferred|variant)/; + // this function is used to verify that a US county QS altname is available function isUsCounty(base_record, wof_country, qs_a2_alt) { return 'US' === wof_country && @@ -131,6 +134,23 @@ function getNameAliases(properties) { return concatArrayFields(properties, nameFields); } +function getMultiLangNames(defaultName, properties) { + return Object.keys(properties) + .filter(key => WOF_NAMES_REGEX.test(key)) // get only name:.* keys + .map(key => { + return { + key: key.substring(key.indexOf(':') + 1, key.indexOf(':') + 4).toUpperCase(), // get the iso part of the key name:iso_x_preferred + 
value: properties[key] + .filter(name => !defaultName || defaultName.indexOf(name) < 0) // remove duplicate elements found in default name + }; + }) // + .filter(({ key, value }) => value.length > 0 && iso3166.is3(key)) // filter correct iso 3 keys + .map(({key, value}) => { return { key: iso3166.to2(key).toLowerCase(), value: value }; }) + .reduce((langs, { key, value }) => + _.set(langs, key, _.union(langs[key], value)), {} + ); // create the lang/value map +} + function getAbbreviation(properties) { if (properties['wof:placetype'] === 'country' && properties['wof:country']) { return properties['wof:country']; @@ -167,10 +187,12 @@ function getHierarchies(id, properties) { */ module.exports.create = function map_fields_stream() { return through2.obj(function(json_object, enc, callback) { + const default_names = getName(json_object.properties); var record = { id: json_object.id, - name: getName(json_object.properties), + name: default_names, name_aliases: getNameAliases(json_object.properties), + name_langs: getMultiLangNames(default_names, json_object.properties), abbreviation: getAbbreviation(json_object.properties), place_type: json_object.properties['wof:placetype'], lat: getLat(json_object.properties), diff --git a/src/peliasDocGenerators.js b/src/peliasDocGenerators.js index 1404b3ce..df3b4d34 100644 --- a/src/peliasDocGenerators.js +++ b/src/peliasDocGenerators.js @@ -57,6 +57,18 @@ function assignField(hierarchyElement, wofDoc) { } +function addMultiLangAliases(wofDoc, name_langs) { + for (let lang in name_langs) { + for (let i = 0; i < name_langs[lang].length; i++) { + if (i === 0) { + wofDoc.setName(lang, name_langs[lang][i]); + } else { + wofDoc.setNameAlias(lang, name_langs[lang][i]); + } + } + } +} + // method that extracts the logic for Document creation. 
`hierarchy` is optional function setupDocument(record, hierarchy) { var wofDoc = new Document( 'whosonfirst', record.place_type, record.id ); @@ -73,10 +85,15 @@ function setupDocument(record, hierarchy) { } // index name aliases for all other records (where available) - else if (record.name_aliases.length) { - record.name_aliases.forEach(alias => { - wofDoc.setNameAlias('default', alias); - }); + else { + if (record.name_aliases.length) { + record.name_aliases.forEach(alias => { + wofDoc.setNameAlias('default', alias); + }); + } + if (record.name_langs) { + addMultiLangAliases(wofDoc, record.name_langs); + } } } wofDoc.setCentroid({ lat: record.lat, lon: record.lon }); diff --git a/test/components/extractFieldsTest.js b/test/components/extractFieldsTest.js index 86ed100c..eeb0e2ff 100644 --- a/test/components/extractFieldsTest.js +++ b/test/components/extractFieldsTest.js @@ -54,6 +54,7 @@ tape('readStreamComponents', function(test) { id: 12345, name: 'name 1', name_aliases: [], + name_langs: {}, place_type: 'place type 1', lat: 12.121212, lon: 21.212121, @@ -92,6 +93,7 @@ tape('readStreamComponents', function(test) { id: 23456, name: undefined, name_aliases: [], + name_langs: {}, place_type: undefined, lat: undefined, lon: undefined, @@ -130,6 +132,7 @@ tape('readStreamComponents', function(test) { id: 12345, name: 'name 1', name_aliases: [], + name_langs: {}, place_type: 'place type 1', lat: 12.121212, lon: 21.212121, @@ -504,6 +507,7 @@ tape('readStreamComponents', function(test) { id: 12345, name: 'wof:name value', name_aliases: [], + name_langs: {}, place_type: 'country', lat: undefined, lon: undefined, diff --git a/test/readStreamTest.js b/test/readStreamTest.js index 2457dfe1..dcb477a7 100644 --- a/test/readStreamTest.js +++ b/test/readStreamTest.js @@ -79,6 +79,7 @@ tape('readStream', (test) => { id: 123, name: 'name 1', name_aliases: [], + name_langs: {}, place_type: 'place type 1', lat: 12.121212, lon: 21.212121, @@ -94,6 +95,7 @@ tape('readStream', 
(test) => { id: 456, name: 'name 2', name_aliases: [], + name_langs: {}, place_type: 'place type 2', lat: 13.131313, lon: 31.313131, @@ -234,6 +236,7 @@ tape('readStream', (test) => { id: 421302191, name: 'name 421302191', name_aliases: [], + name_langs: {}, abbreviation: undefined, place_type: undefined, lat: 45.240295, From 505997a434a712d0277db410b664ca4e3caf0b7e Mon Sep 17 00:00:00 2001 From: Joxit Date: Fri, 17 May 2019 11:36:22 +0200 Subject: [PATCH 2/2] test(multi-lang): Add some tests for the feature and use iso639 instead of iso3166. WOF uses iso639 codes in its names; using iso3166 caused failures for Spanish/Spain (spa vs es). --- src/components/extractFields.js | 8 +- src/helpers/iso639.js | 207 +++++++++++++++++++++++++++ test/components/extractFieldsTest.js | 72 ++++++++++ test/peliasDocGeneratorsTest.js | 54 +++++++ 4 files changed, 337 insertions(+), 4 deletions(-) create mode 100644 src/helpers/iso639.js diff --git a/src/components/extractFields.js b/src/components/extractFields.js index d8f6b1e8..7d710c3e 100644 --- a/src/components/extractFields.js +++ b/src/components/extractFields.js @@ -1,7 +1,7 @@ const through2 = require('through2'); const _ = require('lodash'); const util = require('util'); -const iso3166 = require('iso3166-1'); +const iso639 = require('../helpers/iso639'); // hierarchy in importance-descending order of population fields const population_hierarchy = [ @@ -139,13 +139,13 @@ function getMultiLangNames(defaultName, properties) { .filter(key => WOF_NAMES_REGEX.test(key)) // get only name:.* keys .map(key => { return { - key: key.substring(key.indexOf(':') + 1, key.indexOf(':') + 4).toUpperCase(), // get the iso part of the key name:iso_x_preferred + key: key.substring(key.indexOf(':') + 1, key.indexOf(':') + 4), // get the iso part of the key name:iso_x_preferred value: properties[key] .filter(name => !defaultName || defaultName.indexOf(name) < 0) // remove duplicate elements found in default name }; }) // - .filter(({ key, value }) => 
value.length > 0 && iso3166.is3(key)) // filter correct iso 3 keys - .map(({key, value}) => { return { key: iso3166.to2(key).toLowerCase(), value: value }; }) + .filter(({ key, value }) => value.length > 0 && iso639[key]) // filter correct iso 3 keys + .map(({key, value}) => { return { key: iso639[key], value: value }; }) .reduce((langs, { key, value }) => _.set(langs, key, _.union(langs[key], value)), {} ); // create the lang/value map diff --git a/src/helpers/iso639.js b/src/helpers/iso639.js new file mode 100644 index 00000000..14a735e3 --- /dev/null +++ b/src/helpers/iso639.js @@ -0,0 +1,207 @@ +// Based on https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +module.exports = { + 'abk': 'ab', + 'aar': 'aa', + 'afr': 'af', + 'aka': 'ak', + 'alb': 'sq', + 'sqi': 'sq', + 'amh': 'am', + 'ara': 'ar', + 'arg': 'an', + 'hye': 'hy', + 'arm': 'hy', + 'asm': 'as', + 'ava': 'av', + 'ave': 'ae', + 'aym': 'ay', + 'aze': 'az', + 'bam': 'bm', + 'bak': 'ba', + 'eus': 'eu', + 'baq': 'eu', + 'bel': 'be', + 'ben': 'bn', + 'bih': 'bh', + 'bis': 'bi', + 'bos': 'bs', + 'bre': 'br', + 'bul': 'bg', + 'mya': 'my', + 'bur': 'my', + 'cat': 'ca', + 'cha': 'ch', + 'che': 'ce', + 'nya': 'ny', + 'chi': 'zh', + 'zho': 'zh', + 'chv': 'cv', + 'cor': 'kw', + 'cos': 'co', + 'cre': 'cr', + 'hrv': 'hr', + 'ces': 'cs', + 'cze': 'cs', + 'dan': 'da', + 'div': 'dv', + 'nld': 'nl', + 'dut': 'nl', + 'dzo': 'dz', + 'eng': 'en', + 'epo': 'eo', + 'est': 'et', + 'ewe': 'ee', + 'fao': 'fo', + 'fij': 'fj', + 'fin': 'fi', + 'fra': 'fr', + 'fre': 'fr', + 'ful': 'ff', + 'glg': 'gl', + 'kat': 'ka', + 'geo': 'ka', + 'deu': 'de', + 'ger': 'de', + 'ell': 'el', + 'gre': 'el', + 'grn': 'gn', + 'guj': 'gu', + 'hat': 'ht', + 'hau': 'ha', + 'heb': 'he', + 'her': 'hz', + 'hin': 'hi', + 'hmo': 'ho', + 'hun': 'hu', + 'ina': 'ia', + 'ind': 'id', + 'ile': 'ie', + 'gle': 'ga', + 'ibo': 'ig', + 'ipk': 'ik', + 'ido': 'io', + 'isl': 'is', + 'ice': 'is', + 'ita': 'it', + 'iku': 'iu', + 'jpn': 'ja', + 'jav': 'jv', + 'kal': 'kl', + 
'kan': 'kn', + 'kau': 'kr', + 'kas': 'ks', + 'kaz': 'kk', + 'khm': 'km', + 'kik': 'ki', + 'kin': 'rw', + 'kir': 'ky', + 'kom': 'kv', + 'kon': 'kg', + 'kor': 'ko', + 'kur': 'ku', + 'kua': 'kj', + 'lat': 'la', + 'ltz': 'lb', + 'lug': 'lg', + 'lim': 'li', + 'lin': 'ln', + 'lao': 'lo', + 'lit': 'lt', + 'lub': 'lu', + 'lav': 'lv', + 'glv': 'gv', + 'mkd': 'mk', + 'mac': 'mk', + 'mlg': 'mg', + 'may': 'ms', + 'msa': 'ms', + 'mal': 'ml', + 'mlt': 'mt', + 'mri': 'mi', + 'mao': 'mi', + 'mar': 'mr', + 'mah': 'mh', + 'mon': 'mn', + 'nau': 'na', + 'nav': 'nv', + 'nde': 'nd', + 'nep': 'ne', + 'ndo': 'ng', + 'nob': 'nb', + 'nno': 'nn', + 'nor': 'no', + 'iii': 'ii', + 'nbl': 'nr', + 'oci': 'oc', + 'oji': 'oj', + 'chu': 'cu', + 'orm': 'om', + 'ori': 'or', + 'oss': 'os', + 'pan': 'pa', + 'pli': 'pi', + 'per': 'fa', + 'fas': 'fa', + 'pol': 'pl', + 'pus': 'ps', + 'por': 'pt', + 'que': 'qu', + 'roh': 'rm', + 'run': 'rn', + 'ron': 'ro', + 'rum': 'ro', + 'rus': 'ru', + 'san': 'sa', + 'srd': 'sc', + 'snd': 'sd', + 'sme': 'se', + 'smo': 'sm', + 'sag': 'sg', + 'srp': 'sr', + 'gla': 'gd', + 'sna': 'sn', + 'sin': 'si', + 'slk': 'sk', + 'slo': 'sk', + 'slv': 'sl', + 'som': 'so', + 'sot': 'st', + 'spa': 'es', + 'sun': 'su', + 'swa': 'sw', + 'ssw': 'ss', + 'swe': 'sv', + 'tam': 'ta', + 'tel': 'te', + 'tgk': 'tg', + 'tha': 'th', + 'tir': 'ti', + 'bod': 'bo', + 'tib': 'bo', + 'tuk': 'tk', + 'tgl': 'tl', + 'tsn': 'tn', + 'ton': 'to', + 'tur': 'tr', + 'tso': 'ts', + 'tat': 'tt', + 'twi': 'tw', + 'tah': 'ty', + 'uig': 'ug', + 'ukr': 'uk', + 'urd': 'ur', + 'uzb': 'uz', + 'ven': 've', + 'vie': 'vi', + 'vol': 'vo', + 'wln': 'wa', + 'cym': 'cy', + 'wel': 'cy', + 'wol': 'wo', + 'fry': 'fy', + 'xho': 'xh', + 'yid': 'yi', + 'yor': 'yo', + 'zha': 'za', + 'zul': 'zu' +}; \ No newline at end of file diff --git a/test/components/extractFieldsTest.js b/test/components/extractFieldsTest.js index eeb0e2ff..9ab1a380 100644 --- a/test/components/extractFieldsTest.js +++ b/test/components/extractFieldsTest.js @@ 
-985,3 +985,75 @@ tape('name alias tests', (test) => { test.end(); }); + +tape('multi-lang index test', (test) => { + test.test('all elements in default language should not be in other indexes', function (t) { + var input = [{ + id: 54321, + properties: { + 'wof:name': ['default1', 'default2'], + 'name:eng_x_preferred': ['preferredENG1'], + 'name:fra_x_preferred': ['default1', 'preferredFRA1', 'preferredFRA2'], + 'name:spa_x_variant': ['default2', 'variantSPA1', 'variantSPA2'], + } + }]; + + const expected_name_langs = { + 'en': ['preferredENG1'], + 'fr': ['preferredFRA1', 'preferredFRA2'], + 'es': ['variantSPA1', 'variantSPA2'] + }; + + test_stream(input, extractFields.create(), function (err, actual) { + t.deepEqual(actual[0].name_langs, expected_name_langs, 'name langs populated from fr preferred and SPA variant fields'); + t.end(); + }); + }); + + test.test('name langs should be without duplicates', function (t) { + var input = [ + { + id: 54321, + properties: { + 'name:fra_x_preferred': ['preferredFRA1', 'preferredFRA2', 'preferredFRA2'], + 'name:spa_x_preferred': ['variantSPA1', 'variantSPA1', 'variantSPA2'], + 'wof:name': ['prefered1'] + } + } + ]; + + const expected_name_langs = { + 'fr': ['preferredFRA1', 'preferredFRA2'], + 'es': ['variantSPA1', 'variantSPA2'] + }; + + test_stream(input, extractFields.create(), function (err, actual) { + t.deepEqual(actual[0].name_langs, expected_name_langs, 'should not have duplicates'); + t.end(); + }); + }); + + test.test('name langs should concat iso639-2B and iso639-2T', function (t) { + var input = [ + { + id: 54321, + properties: { + 'name:fra_x_preferred': ['preferredFRA1', 'preferredFRA2'], + 'name:fre_x_preferred': ['preferredFRE1'], + 'wof:name': ['prefered1'] + } + } + ]; + + const expected_name_langs = { + 'fr': ['preferredFRA1', 'preferredFRA2', 'preferredFRE1'] + }; + + test_stream(input, extractFields.create(), function (err, actual) { + t.deepEqual(actual[0].name_langs, expected_name_langs, 'should not 
have duplicates'); + t.end(); + }); + }); + + test.end(); +}); \ No newline at end of file diff --git a/test/peliasDocGeneratorsTest.js b/test/peliasDocGeneratorsTest.js index 880bd94d..913909fa 100644 --- a/test/peliasDocGeneratorsTest.js +++ b/test/peliasDocGeneratorsTest.js @@ -676,6 +676,60 @@ tape('create', function(test) { }); + test.test('name langs should be set on doc', function (t) { + var wofRecords = { + 1: { + id: 1, + name: 'Japan', + name_aliases: [], + name_langs: { + 'jp': [ + 'Nihon' + ], + 'fr': [ + 'Japon', 'Pays Du Soleil Levant' + ] + }, + lat: 12.121212, + lon: 21.212121, + place_type: 'country', + abbreviation: 'JP', + popularity: 25000 + } + }; + + var input = [ + wofRecords['1'] + ]; + + var expected = [ + new Document('whosonfirst', 'country', '1') + .setName('default', 'Japan') + .setName('jp', 'Nihon') + .setName('fr', 'Japon') + .setNameAlias('fr', 'Pays Du Soleil Levant') + .setCentroid({ lat: 12.121212, lon: 21.212121 }) + .addParent('country', 'Japan', '1', 'JPN') + .setPopularity(25000) + ]; + + var hierarchies_finder = function () { + return [ + [ + wofRecords['1'] + ] + ]; + }; + + var docGenerator = peliasDocGenerators.create(hierarchies_finder); + + test_stream(input, docGenerator, function (err, actual) { + t.deepEqual(actual, expected, 'population should not be set'); + t.end(); + }); + + }); + test.end(); });