From 1fae7dda4fe6594918029a787deec6def244dc83 Mon Sep 17 00:00:00 2001 From: Joseph White Date: Tue, 5 Nov 2024 11:02:24 -0500 Subject: [PATCH] Add ability to set character set for load data infile. --- classes/ETL/Ingestor/IngestorOptions.php | 3 +++ classes/ETL/Ingestor/pdoIngestor.php | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/classes/ETL/Ingestor/IngestorOptions.php b/classes/ETL/Ingestor/IngestorOptions.php index 0fd257d851..3b3d30fe57 100644 --- a/classes/ETL/Ingestor/IngestorOptions.php +++ b/classes/ETL/Ingestor/IngestorOptions.php @@ -95,6 +95,9 @@ public function __construct(array $options = null) // INFILE...REPLACE INTO instead. "force_load_data_infile_replace_into" => false, + // Character set override to use when loading data via a file. + "load_data_infile_character_set" => null, + // Hide all SQL warnings returned by the database. "hide_sql_warnings" => false, diff --git a/classes/ETL/Ingestor/pdoIngestor.php b/classes/ETL/Ingestor/pdoIngestor.php index 68e53875d4..5cb185ebc0 100644 --- a/classes/ETL/Ingestor/pdoIngestor.php +++ b/classes/ETL/Ingestor/pdoIngestor.php @@ -604,6 +604,15 @@ private function multiDatabaseIngest() // Keys are table columns (destination) and values are query result columns (source) $destColumnList = array_keys($destFieldToSourceFieldMap); + // The MySQL documentation claims that file contents are interpreted using the character set + // in the character_set_database system variable. However, I was not able to get this to work. + // Explicitly setting the CHARACTER SET does appear to work though. + + $characterSetOverride = ''; + if ( $this->options->load_data_infile_character_set ) { + $characterSetOverride = "CHARACTER SET '" . $this->options->load_data_infile_character_set . "' "; + } + // The default method for ingestion is INSERT INTO ON DUPLICATE KEY UPDATE because tests // have shown an approx 40% performance improvement when updating existing data over // REPLACE INTO. 
REPLACE INTO also may cause issues with auto increment keys because @@ -615,6 +624,7 @@ private function multiDatabaseIngest() if ( $this->options->force_load_data_infile_replace_into ) { $loadStatement = "LOAD DATA LOCAL INFILE '$infileName' replace into table $qualifiedDestTableName " + . $characterSetOverride . "FIELDS TERMINATED BY " . sprintf("0x%02x", ord($this->fieldSeparator)) . " OPTIONALLY ENCLOSED BY " . sprintf("0x%02x", ord($this->stringEnclosure)) . " ESCAPED BY " . sprintf("0x%02x", ord($this->escapeChar)) @@ -639,6 +649,7 @@ function ($s) { $loadStatement = "CREATE TABLE $tmpTable LIKE $qualifiedDestTableName; " . "ALTER TABLE $tmpTable DISABLE KEYS; " . "LOAD DATA LOCAL INFILE '$infileName' INTO TABLE $tmpTable " + . $characterSetOverride . "FIELDS TERMINATED BY " . sprintf("0x%02x", ord($this->fieldSeparator)) . " OPTIONALLY ENCLOSED BY " . sprintf("0x%02x", ord($this->stringEnclosure)) . " ESCAPED BY " . sprintf("0x%02x", ord($this->escapeChar))