From de9e6dcbf75025d59bf09af62c62fce3a4d1d1a3 Mon Sep 17 00:00:00 2001 From: ashitsalesforce Date: Wed, 18 Dec 2024 00:04:55 -0800 Subject: [PATCH] Output BOM in extraction results for UTF-32LE and UTF-32BE charsets Output BOM in extraction results for UTF-32LE and UTF-32BE charsets --- .../dataloader/dao/csv/CSVFileWriter.java | 3 +- .../salesforce/dataloader/dao/CsvTest.java | 76 ++++++++++++++---- .../testfiles/data/csvtext_BOM_UTF32BE.csv | Bin 0 -> 338 bytes .../testfiles/data/csvtext_BOM_UTF32LE.csv | Bin 0 -> 338 bytes 4 files changed, 63 insertions(+), 16 deletions(-) create mode 100644 src/test/resources/testfiles/data/csvtext_BOM_UTF32BE.csv create mode 100644 src/test/resources/testfiles/data/csvtext_BOM_UTF32LE.csv diff --git a/src/main/java/com/salesforce/dataloader/dao/csv/CSVFileWriter.java b/src/main/java/com/salesforce/dataloader/dao/csv/CSVFileWriter.java index 5b2e1e21..cf494d35 100644 --- a/src/main/java/com/salesforce/dataloader/dao/csv/CSVFileWriter.java +++ b/src/main/java/com/salesforce/dataloader/dao/csv/CSVFileWriter.java @@ -132,7 +132,8 @@ public void open() throws DataAccessObjectInitializationException { private byte[] getBOM() { if (StandardCharsets.UTF_8.equals(Charset.forName(this.encoding))) { return new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}; - } else if (this.encoding.startsWith(StandardCharsets.UTF_16.name())) { + } else if (this.encoding.startsWith(StandardCharsets.UTF_16.name()) + || this.encoding.startsWith("UTF-32")) { return new byte[]{(byte) 0xFE, (byte) 0xFF}; } return new byte[0]; diff --git a/src/test/java/com/salesforce/dataloader/dao/CsvTest.java b/src/test/java/com/salesforce/dataloader/dao/CsvTest.java index ad8c49a9..d2a6fe3c 100644 --- a/src/test/java/com/salesforce/dataloader/dao/CsvTest.java +++ b/src/test/java/com/salesforce/dataloader/dao/CsvTest.java @@ -26,8 +26,11 @@ package com.salesforce.dataloader.dao; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.io.FileInputStream; +import java.io.InputStream; import org.junit.Before; import org.junit.Ignore; @@ -57,24 +60,20 @@ public class CsvTest extends ConfigTestBase { @Before public void createTestData() { writeHeader = new ArrayList(3); - writeHeader.add("COL1"); - writeHeader.add("COL2"); - writeHeader.add("COL3"); - - ArrayList headerLabelList = new ArrayList(); - headerLabelList.add("COL1"); - headerLabelList.add("COL2"); - headerLabelList.add("COL3"); - TableHeader header = new TableHeader(headerLabelList); + writeHeader.add("column1"); + writeHeader.add("column2"); + writeHeader.add("column3"); + + TableHeader header = new TableHeader(writeHeader); row1 = new TableRow(header); - row1.put("COL1", "row1col1"); - row1.put("COL2", "row1col2"); - row1.put("COL3", "row1col3"); + row1.put(writeHeader.get(0), "row1-1"); + row1.put(writeHeader.get(1), "row1-2"); + row1.put(writeHeader.get(2), "row1-3"); row2 = new TableRow(header); - row2.put("COL1", "row2col1"); - row2.put("COL2", "row2col2"); - row2.put("COL3", "row2col3"); + row2.put(writeHeader.get(0), "row2-1"); + row2.put(writeHeader.get(1), "row2-2"); + row2.put(writeHeader.get(2), "row2-3"); } @Test public void testCSVReadBasic() throws Exception { @@ -84,12 +83,16 @@ public void testCSVReadBasic() throws Exception { @Test public void testCSVReadUTF8BOMBasic() throws Exception{ testCSVReadBasic("csvtext_BOM_UTF8.csv"); + assertTrue("did not find BOM in " + getTestDataDir() + "/csvtext_BOM_UTF8.csv", + hasBOM(getTestDataDir() + "/csvtext_BOM_UTF8.csv")); } @Test public void testCSVReadUTF16BEBOMBasic() throws Exception{ getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, "UTF-16BE"); testCSVReadBasic("csvtext_BOM_UTF16BE.csv"); + assertTrue("did not find BOM in " + getTestDataDir() + "/csvtext_BOM_UTF16BE.csv", + hasBOM(getTestDataDir() + "/csvtext_BOM_UTF16BE.csv")); getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, ""); } @@ -97,6 +100,26 @@ public void testCSVReadUTF16BEBOMBasic() throws Exception{ public void testCSVReadUTF16LEBOMBasic() throws Exception{ getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, "UTF-16LE"); testCSVReadBasic("csvtext_BOM_UTF16LE.csv"); + assertTrue("did not find BOM in " + getTestDataDir() + "/csvtext_BOM_UTF16LE.csv", + hasBOM(getTestDataDir() + "/csvtext_BOM_UTF16LE.csv")); + getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, ""); + } + + @Test + public void testCSVReadUTF32LEBOMBasic() throws Exception{ + getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, "UTF-32LE"); + testCSVReadBasic("csvtext_BOM_UTF32LE.csv"); + assertTrue("did not find BOM in " + getTestDataDir() + "/csvtext_BOM_UTF32LE.csv", + hasBOM(getTestDataDir() + "/csvtext_BOM_UTF32LE.csv")); + getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, ""); + } + + @Test + public void testCSVReadUTF32BEBOMBasic() throws Exception{ + getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, "UTF-32BE"); + testCSVReadBasic("csvtext_BOM_UTF32BE.csv"); + assertTrue("did not find BOM in " + getTestDataDir() + "/csvtext_BOM_UTF32BE.csv", + hasBOM(getTestDataDir() + "/csvtext_BOM_UTF32BE.csv")); getController().getAppConfig().setValue(AppConfig.PROP_READ_CHARSET, ""); } @@ -125,6 +148,28 @@ private void testCSVReadBasic(String csvFile) throws Exception { csv.close(); } + + + public static boolean hasBOM(String filePath) throws IOException { + try (InputStream is = new FileInputStream(filePath)) { + byte[] bom = new byte[3]; + if (is.read(bom) == 3) { + boolean bomFound = false; + // UTF-8 case + bomFound = bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF; + if (!bomFound) { + // UTF-16BE, UTF-32BE, UTF-32LE cases + bomFound = bom[0] == (byte)0xFE && bom[1] == (byte) 0xFF; + } + if (!bomFound) { + // UTF16-LE case + bomFound = bom[0] == (byte)0xFF && bom[1] == (byte) 0xFE; + } + return bomFound; + } + } + return false; + } @Test public void testCSVWriteBasic() throws Exception { @@ -196,6 +241,7 @@ private void doTestCSVWriteBasic(String delimiter) throws Exception { writer.writeRowList(rowList); writer.close(); + assertTrue("did not find BOM in " + path, hasBOM(path)); compareWriterFile(path, delimiter, false, false); // 3rd param false and 4th param false => CSV for a upload compareWriterFile(path, delimiter, false, true); // 3rd param false and 4th param true => query result CSV diff --git a/src/test/resources/testfiles/data/csvtext_BOM_UTF32BE.csv b/src/test/resources/testfiles/data/csvtext_BOM_UTF32BE.csv new file mode 100644 index 0000000000000000000000000000000000000000..528248683795f952262e88ab1542a0103dfb9345 GIT binary patch literal 338 zcmezOpMilv35b({I3I{}fVdQhbAdPyhz)@lB(DRdaj7#RQk^kW9T$`?0@5IR%YhhV gmoAh>wiBBgbo;QW!DkmpKOwtdYVg?wQvv8h3~51Sf%c7gO0vJ0jLpItCD$aVn$7Qz`j literal 0 HcmV?d00001