From e5dc8288b6743d0ea398c84e1678e48ba28a28ad Mon Sep 17 00:00:00 2001 From: Marc Durdin Date: Mon, 3 Jun 2024 12:37:28 +0700 Subject: [PATCH 1/3] chore: split search-prepare-data.sql to make it easier to debug Part of #253. Also fixes null language description from keyboard_info --- tools/db/build/build.inc.php | 10 ++++++- tools/db/build/build_keyboards_script.inc.php | 5 +++- tools/db/build/search-prepare-data-1.sql | 6 ++++ tools/db/build/search-prepare-data-2.sql | 15 ++++++++++ tools/db/build/search-prepare-data-3.sql | 19 ++++++++++++ tools/db/build/search-prepare-data-4.sql | 29 +++++++++++++++++++ tools/db/build/search-prepare-data-5.sql | 16 ++++++++++ tools/db/build/search-prepare-data-6.sql | 20 +++++++++++++ tools/db/build/search-prepare-data-7.sql | 15 ++++++++++ tools/db/build/search-prepare-data-8.sql | 15 ++++++++++ tools/db/build/search-prepare-data-9.sql | 9 ++++++ 11 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 tools/db/build/search-prepare-data-1.sql create mode 100644 tools/db/build/search-prepare-data-2.sql create mode 100644 tools/db/build/search-prepare-data-3.sql create mode 100644 tools/db/build/search-prepare-data-4.sql create mode 100644 tools/db/build/search-prepare-data-5.sql create mode 100644 tools/db/build/search-prepare-data-6.sql create mode 100644 tools/db/build/search-prepare-data-7.sql create mode 100644 tools/db/build/search-prepare-data-8.sql create mode 100644 tools/db/build/search-prepare-data-9.sql diff --git a/tools/db/build/build.inc.php b/tools/db/build/build.inc.php index c53c4cf..2f5c654 100644 --- a/tools/db/build/build.inc.php +++ b/tools/db/build/build.inc.php @@ -68,7 +68,15 @@ function BuildDatabase($DBDataSources, $schema, $do_force) { $this->sqlrun("${data_path}keyboards.sql"); $this->sqlrun("${data_path}models.sql"); - $this->sqlrun(dirname(__FILE__)."/search-prepare-data.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-1.sql"); + 
$this->sqlrun(dirname(__FILE__)."/search-prepare-data-2.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-3.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-4.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-5.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-6.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-7.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-8.sql"); + $this->sqlrun(dirname(__FILE__)."/search-prepare-data-9.sql"); $this->sqlrun(dirname(__FILE__)."/indexes.sql"); $this->sqlrun(dirname(__FILE__)."/full-text-indexes.sql", false, false); diff --git a/tools/db/build/build_keyboards_script.inc.php b/tools/db/build/build_keyboards_script.inc.php index b4cfaba..da79824 100644 --- a/tools/db/build/build_keyboards_script.inc.php +++ b/tools/db/build/build_keyboards_script.inc.php @@ -249,6 +249,9 @@ function generate_keyboard_language_inserts() { assert(!is_array($keyboard->languages)); // array format was deprecated in 1.0.5, kmcomp should never generate it any more foreach($keyboard->languages as $id => $language) { $this->parse_bcp47($id, $lang, $region, $script); + $langName = empty($language->languageName) + ? (empty($language->displayName) ? 
'undefined' : $language->displayName) + : $language->languageName; $result .= <<sqlv($keyboard, 'id')}, @@ -256,7 +259,7 @@ function generate_keyboard_language_inserts() { {$this->sqlv(null, $lang)}, {$this->sqlv(null, $region)}, {$this->sqlv(null, $script)}, - {$this->sqlv($language, 'languageName')}); + {$this->sqlv(null, $langName)}); GO diff --git a/tools/db/build/search-prepare-data-1.sql b/tools/db/build/search-prepare-data-1.sql new file mode 100644 index 0000000..257dd72 --- /dev/null +++ b/tools/db/build/search-prepare-data-1.sql @@ -0,0 +1,6 @@ +UPDATE t_iso639_3 SET Part2B=NULL WHERE Part2B=''; +UPDATE t_iso639_3 SET Part2T=NULL WHERE Part2T=''; +UPDATE t_iso639_3 SET Part1=NULL WHERE Part1=''; +UPDATE t_iso639_3 SET _Comment=NULL WHERE _Comment=''; +UPDATE t_iso639_3 SET CanonicalId=COALESCE(CAST(Part1 AS NVARCHAR),CAST(Id AS NVARCHAR)) + diff --git a/tools/db/build/search-prepare-data-2.sql b/tools/db/build/search-prepare-data-2.sql new file mode 100644 index 0000000..37e77bf --- /dev/null +++ b/tools/db/build/search-prepare-data-2.sql @@ -0,0 +1,15 @@ +-- +-- We need to do some sanitisation of the t_language_index and t_iso639_3_names +-- to remove names marked as pejorative in the Ethnologue index. 
+-- + +delete + t_iso639_3_names +where exists (select * from t_ethnologue_language_index el where el.LangID = t_iso639_3_names.Id and (el.nametype='LP' or el.nametype='DP')) + +delete + t_language_index +where exists (select * from t_ethnologue_language_index el where el.LangID = t_language_index.language_id and (el.nametype='LP' or el.nametype='DP')) + +delete from t_ethnologue_language_index where nametype='LP' or nametype='DP'; + diff --git a/tools/db/build/search-prepare-data-3.sql b/tools/db/build/search-prepare-data-3.sql new file mode 100644 index 0000000..44b9562 --- /dev/null +++ b/tools/db/build/search-prepare-data-3.sql @@ -0,0 +1,19 @@ +-- +-- Deprecated keyboards and models should be flagged as such in the t_keyboard/t_model data +-- + +update t_keyboard + set deprecated = 1 + where exists (select * from t_keyboard_related kr where kr.related_keyboard_id = t_keyboard.keyboard_id and kr.deprecates = 1); + +update t_model + set deprecated = 1 + where exists (select * from t_model_related mr where mr.related_model_id = t_model.model_id and mr.deprecates = 1); + +-- +-- Any keyboard that has been replaced by another one, or is not Unicode, is marked as obsolete +-- + +update t_keyboard + set obsolete = 1 + where deprecated = 1 or is_unicode = 0 diff --git a/tools/db/build/search-prepare-data-4.sql b/tools/db/build/search-prepare-data-4.sql new file mode 100644 index 0000000..adc74e7 --- /dev/null +++ b/tools/db/build/search-prepare-data-4.sql @@ -0,0 +1,29 @@ +-- +-- Canonicalize bcp47 codes into langtags entries +-- +-- Fixup those that are missing from t_langtags, first +-- + +-- Find those that are missing where there is a matching base tag but not a matching full tag + +INSERT + t_langtag (tag, [full], iso639_3, region, regionname, name, sldr, script, windows) +SELECT DISTINCT + kl.bcp47, + kl.bcp47, + null, + t.region, + t.regionname, + kl.description, + 0, + kl.script_id, + kl.bcp47 +FROM + t_keyboard_language kl LEFT JOIN + t_langtag_tag tt ON 
kl.bcp47 = tt.tag LEFT JOIN + t_langtag_tag tt0 ON kl.language_id = tt0.tag LEFT JOIN + t_langtag t ON tt0.base_tag = t.tag +WHERE + tt.tag IS NULL AND + tt0.tag IS NOT NULL + diff --git a/tools/db/build/search-prepare-data-5.sql b/tools/db/build/search-prepare-data-5.sql new file mode 100644 index 0000000..b2af68f --- /dev/null +++ b/tools/db/build/search-prepare-data-5.sql @@ -0,0 +1,16 @@ +-- Insert the tags above for searching against + +INSERT + t_langtag_tag (base_tag, tag, tagtype) +SELECT DISTINCT + kl.bcp47, + kl.bcp47, + 5 -- custom (keyboard) tag type +FROM + t_keyboard_language kl LEFT JOIN + t_langtag_tag tt ON kl.bcp47 = tt.tag LEFT JOIN + t_langtag_tag tt0 ON kl.language_id = tt0.tag +WHERE + tt.tag IS NULL AND + tt0.tag IS NOT NULL + diff --git a/tools/db/build/search-prepare-data-6.sql b/tools/db/build/search-prepare-data-6.sql new file mode 100644 index 0000000..f66fb63 --- /dev/null +++ b/tools/db/build/search-prepare-data-6.sql @@ -0,0 +1,20 @@ +-- Fixup those where we cannot find any matching base tag at all (e.g. qa? 
tags will fit into this) + +INSERT + t_langtag (tag, [full], iso639_3, region, regionname, name, sldr, script, windows) +SELECT DISTINCT + kl.bcp47, + kl.bcp47, + null, + '001', --t.region, + 'World', --t.regionname, + kl.description, + 0, + kl.script_id, + kl.bcp47 +FROM + t_keyboard_language kl LEFT JOIN + t_langtag_tag tt ON kl.bcp47 = tt.tag +WHERE + tt.tag IS NULL + diff --git a/tools/db/build/search-prepare-data-7.sql b/tools/db/build/search-prepare-data-7.sql new file mode 100644 index 0000000..258f016 --- /dev/null +++ b/tools/db/build/search-prepare-data-7.sql @@ -0,0 +1,15 @@ +-- Insert the tags above for searching against + +INSERT + t_langtag_tag (base_tag, tag, tagtype) +SELECT DISTINCT + kl.bcp47, + kl.bcp47, + 5 -- custom (keyboard) tag type +FROM + t_keyboard_language kl LEFT JOIN + t_langtag_tag tt ON kl.bcp47 = tt.tag LEFT JOIN + t_langtag t ON kl.bcp47 = t.tag +WHERE + tt.tag IS NULL AND + t.tag IS NOT NULL diff --git a/tools/db/build/search-prepare-data-8.sql b/tools/db/build/search-prepare-data-8.sql new file mode 100644 index 0000000..905c809 --- /dev/null +++ b/tools/db/build/search-prepare-data-8.sql @@ -0,0 +1,15 @@ +-- Add new names that have been defined by keyboard authors + +INSERT + t_langtag_name (tag, name, name_kd, nametype) +SELECT DISTINCT + t.base_tag, + kl.description, + kl.description, -- TODO: we can't do full normalisation here, but we'll live with it for now + 4 -- custom +FROM + t_keyboard_language kl LEFT JOIN + t_langtag_tag t ON kl.bcp47 = t.tag LEFT JOIN + t_langtag_name n ON n.tag = t.base_tag AND n.name = kl.description +WHERE + n._id IS NULL and t.tag is not null diff --git a/tools/db/build/search-prepare-data-9.sql b/tools/db/build/search-prepare-data-9.sql new file mode 100644 index 0000000..72d08ab --- /dev/null +++ b/tools/db/build/search-prepare-data-9.sql @@ -0,0 +1,9 @@ +-- Finally, match up all the keyboards with langtags! 
+ +INSERT + t_keyboard_langtag +SELECT + kl.keyboard_id, tt.base_tag +FROM + t_keyboard_language kl INNER JOIN + t_langtag_tag tt ON kl.bcp47 = tt.tag From 40ca3e8ab892024d219e8b2791ad6f672807a30e Mon Sep 17 00:00:00 2001 From: Marc Durdin Date: Mon, 3 Jun 2024 12:49:08 +0700 Subject: [PATCH 2/3] fix: select first matching language name where there are conflicts Fixes: #253 --- tools/db/build/search-prepare-data-6.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/db/build/search-prepare-data-6.sql b/tools/db/build/search-prepare-data-6.sql index f66fb63..ebf9121 100644 --- a/tools/db/build/search-prepare-data-6.sql +++ b/tools/db/build/search-prepare-data-6.sql @@ -8,7 +8,7 @@ SELECT DISTINCT null, '001', --t.region, 'World', --t.regionname, - kl.description, + (select top 1 kl0.description from t_keyboard_language kl0 where kl0.bcp47 = kl.bcp47), 0, kl.script_id, kl.bcp47 @@ -17,4 +17,3 @@ FROM t_langtag_tag tt ON kl.bcp47 = tt.tag WHERE tt.tag IS NULL - From 286a0b9a5141260d5e4c679aae626e005d6a9ee2 Mon Sep 17 00:00:00 2001 From: Marc Durdin Date: Mon, 3 Jun 2024 12:52:26 +0700 Subject: [PATCH 3/3] chore: remove unused search-prepare-data.sql --- tools/db/build/search-prepare-data.sql | 147 ------------------------- 1 file changed, 147 deletions(-) delete mode 100644 tools/db/build/search-prepare-data.sql diff --git a/tools/db/build/search-prepare-data.sql b/tools/db/build/search-prepare-data.sql deleted file mode 100644 index dc3bf2c..0000000 --- a/tools/db/build/search-prepare-data.sql +++ /dev/null @@ -1,147 +0,0 @@ -UPDATE t_iso639_3 SET Part2B=NULL WHERE Part2B=''; -UPDATE t_iso639_3 SET Part2T=NULL WHERE Part2T=''; -UPDATE t_iso639_3 SET Part1=NULL WHERE Part1=''; -UPDATE t_iso639_3 SET _Comment=NULL WHERE _Comment=''; -UPDATE t_iso639_3 SET CanonicalId=COALESCE(CAST(Part1 AS NVARCHAR),CAST(Id AS NVARCHAR)) - --- --- We need to do some sanitisation of the t_language_index and t_iso639_3_names --- to remove names marked as 
pejorative in the Ethnologue index. --- - -delete - t_iso639_3_names -where exists (select * from t_ethnologue_language_index el where el.LangID = t_iso639_3_names.Id and (el.nametype='LP' or el.nametype='DP')) - -delete - t_language_index -where exists (select * from t_ethnologue_language_index el where el.LangID = t_language_index.language_id and (el.nametype='LP' or el.nametype='DP')) - -delete from t_ethnologue_language_index where nametype='LP' or nametype='DP'; - --- --- Deprecated keyboards and models should be flagged as such in the t_keyboard/t_model data --- - -update t_keyboard - set deprecated = 1 - where exists (select * from t_keyboard_related kr where kr.related_keyboard_id = t_keyboard.keyboard_id and kr.deprecates = 1); - -update t_model - set deprecated = 1 - where exists (select * from t_model_related mr where mr.related_model_id = t_model.model_id and mr.deprecates = 1); - --- --- Any keyboard that has been replaced by another one, or is not Unicode, is marked as obsolete --- - -update t_keyboard - set obsolete = 1 - where deprecated = 1 or is_unicode = 0 - --- --- Canonicalize bcp47 codes into langtags entries --- --- Fixup those that are missing from t_langtags, first --- - --- Find those that are missing where there is a matching base tag but not a matching full tag - -INSERT - t_langtag (tag, [full], iso639_3, region, regionname, name, sldr, script, windows) -SELECT DISTINCT - kl.bcp47, - kl.bcp47, - null, - t.region, - t.regionname, - kl.description, - 0, - kl.script_id, - kl.bcp47 -FROM - t_keyboard_language kl LEFT JOIN - t_langtag_tag tt ON kl.bcp47 = tt.tag LEFT JOIN - t_langtag_tag tt0 ON kl.language_id = tt0.tag LEFT JOIN - t_langtag t ON tt0.base_tag = t.tag -WHERE - tt.tag IS NULL AND - tt0.tag IS NOT NULL - --- Insert the tags above for searching against - -INSERT - t_langtag_tag (base_tag, tag, tagtype) -SELECT DISTINCT - kl.bcp47, - kl.bcp47, - 5 -- custom (keyboard) tag type -FROM - t_keyboard_language kl LEFT JOIN - 
t_langtag_tag tt ON kl.bcp47 = tt.tag LEFT JOIN - t_langtag_tag tt0 ON kl.language_id = tt0.tag -WHERE - tt.tag IS NULL AND - tt0.tag IS NOT NULL - --- Fixup those where we cannot find any matching base tag at all (e.g. qa? tags will fit into this) - -INSERT - t_langtag (tag, [full], iso639_3, region, regionname, name, sldr, script, windows) -SELECT DISTINCT - kl.bcp47, - kl.bcp47, - null, - '001', --t.region, - 'World', --t.regionname, - kl.description, - 0, - kl.script_id, - kl.bcp47 -FROM - t_keyboard_language kl LEFT JOIN - t_langtag_tag tt ON kl.bcp47 = tt.tag -WHERE - tt.tag IS NULL - --- Insert the tags above for searching against - -INSERT - t_langtag_tag (base_tag, tag, tagtype) -SELECT DISTINCT - kl.bcp47, - kl.bcp47, - 5 -- custom (keyboard) tag type -FROM - t_keyboard_language kl LEFT JOIN - t_langtag_tag tt ON kl.bcp47 = tt.tag LEFT JOIN - t_langtag t ON kl.bcp47 = t.tag -WHERE - tt.tag IS NULL AND - t.tag IS NOT NULL - --- Add new names that have been defined by keyboard authors - -INSERT - t_langtag_name (tag, name, name_kd, nametype) -SELECT DISTINCT - t.base_tag, - kl.description, - kl.description, -- TODO: we can't do full normalisation here, but we'll live with it for now - 4 -- custom -FROM - t_keyboard_language kl LEFT JOIN - t_langtag_tag t ON kl.bcp47 = t.tag LEFT JOIN - t_langtag_name n ON n.tag = t.base_tag AND n.name = kl.description -WHERE - n._id IS NULL and t.tag is not null - --- Finally, match up all the keyboards with langtags! - -INSERT - t_keyboard_langtag -SELECT - kl.keyboard_id, tt.base_tag -FROM - t_keyboard_language kl INNER JOIN - t_langtag_tag tt ON kl.bcp47 = tt.tag