Skip to content

Commit

Permalink
Fix crash in parseSerdeParameters() (facebookincubator#8730)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#8730

We simply didn't check `nullStringIt` for being
invalid before using it.

Reviewed By: gggrace14

Differential Revision: D53676033

fbshipit-source-id: aea451e7995be84a06d86ae17eee39c26747e04b
  • Loading branch information
Sergey Pershin authored and facebook-github-bot committed Feb 13, 2024
1 parent 7227ff8 commit 42b10d9
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 1 deletion.
4 changes: 3 additions & 1 deletion velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,9 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
}
auto serDeOptions = std::make_unique<dwio::common::SerDeOptions>(
fieldDelim, collectionDelim, mapKeyDelim);
serDeOptions->nullString = nullStringIt->second;
if (nullStringIt != tableParameters.end()) {
serDeOptions->nullString = nullStringIt->second;
}
return serDeOptions;
}

Expand Down
1 change: 1 addition & 0 deletions velox/connectors/hive/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_executable(
FileHandleTest.cpp
HivePartitionUtilTest.cpp
HiveConnectorTest.cpp
HiveConnectorUtilTest.cpp
HiveConnectorSerDeTest.cpp
PartitionIdGeneratorTest.cpp
TableHandleTest.cpp
Expand Down
205 changes: 205 additions & 0 deletions velox/connectors/hive/tests/HiveConnectorUtilTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <gtest/gtest.h>
#include "velox/exec/tests/utils/HiveConnectorTestBase.h"

#include "velox/connectors/hive/HiveConfig.h"
#include "velox/connectors/hive/HiveConnectorSplit.h"
#include "velox/connectors/hive/HiveConnectorUtil.h"
#include "velox/connectors/hive/TableHandle.h"
#include "velox/core/Config.h"

namespace facebook::velox::connector {

using namespace dwio::common;

class HiveConnectorUtilTest : public exec::test::HiveConnectorTestBase {
protected:
static bool compareSerDeOptions(
const SerDeOptions& l,
const SerDeOptions& r) {
return l.isEscaped == r.isEscaped && l.escapeChar == r.escapeChar &&
l.lastColumnTakesRest == r.lastColumnTakesRest &&
l.nullString == r.nullString && l.separators == r.separators;
}

std::shared_ptr<memory::MemoryPool> pool_ =
memory::memoryManager()->addLeafPool();
};

TEST_F(HiveConnectorUtilTest, configureReaderOptions) {
core::MemConfig sessionProperties;
auto hiveConfig =
std::make_shared<hive::HiveConfig>(std::make_shared<core::MemConfig>());
const std::unordered_map<std::string, std::optional<std::string>>
partitionKeys;
const std::unordered_map<std::string, std::string> customSplitInfo;

// Dynamic parameters.
dwio::common::ReaderOptions readerOptions(pool_.get());
FileFormat fileFormat{FileFormat::DWRF};
std::unordered_map<std::string, std::string> tableParameters;
std::unordered_map<std::string, std::string> serdeParameters;
SerDeOptions expectedSerDe;

auto createTableHandle = [&]() {
return std::make_shared<hive::HiveTableHandle>(
"testConnectorId",
"testTable",
false,
hive::SubfieldFilters{},
nullptr,
nullptr,
tableParameters);
};

auto createSplit = [&]() {
return std::make_shared<hive::HiveConnectorSplit>(
"testConnectorId",
"/tmp/",
fileFormat,
0UL,
std::numeric_limits<uint64_t>::max(),
partitionKeys,
std::nullopt,
customSplitInfo,
nullptr,
serdeParameters);
};

auto performConfigure = [&]() {
auto tableHandle = createTableHandle();
auto split = createSplit();
configureReaderOptions(
readerOptions, hiveConfig, &sessionProperties, tableHandle, split);
};

auto clearDynamicParameters = [&](FileFormat newFileFormat) {
readerOptions = dwio::common::ReaderOptions(pool_.get());
fileFormat = newFileFormat;
tableParameters.clear();
serdeParameters.clear();
expectedSerDe = SerDeOptions{};
};

// Default.
performConfigure();
EXPECT_EQ(readerOptions.getFileFormat(), fileFormat);
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));
EXPECT_EQ(readerOptions.maxCoalesceBytes(), hiveConfig->maxCoalescedBytes());
EXPECT_EQ(
readerOptions.maxCoalesceDistance(),
hiveConfig->maxCoalescedDistanceBytes());
EXPECT_EQ(
readerOptions.isFileColumnNamesReadAsLowerCase(),
hiveConfig->isFileColumnNamesReadAsLowerCase(&sessionProperties));
EXPECT_EQ(
readerOptions.isUseColumnNamesForColumnMapping(),
hiveConfig->isOrcUseColumnNames(&sessionProperties));
EXPECT_EQ(
readerOptions.getFooterEstimatedSize(),
hiveConfig->footerEstimatedSize());
EXPECT_EQ(
readerOptions.getFilePreloadThreshold(),
hiveConfig->filePreloadThreshold());

// Modify field delimiter and change the file format.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kFieldDelim] = '\t';
expectedSerDe.separators[size_t(SerDeSeparator::FIELD_DELIM)] = '\t';
performConfigure();
EXPECT_EQ(readerOptions.getFileFormat(), fileFormat);
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));

// Modify collection delimiter.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kCollectionDelim] = '=';
expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '=';
performConfigure();
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));

// Modify map key delimiter.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kMapKeyDelim] = '&';
expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '&';
performConfigure();
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));

// Modify null string.
clearDynamicParameters(FileFormat::TEXT);
tableParameters[TableParameter::kSerializationNullFormat] = "x-x";
expectedSerDe.nullString = "x-x";
performConfigure();
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));

// Modify all previous together.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kFieldDelim] = '~';
expectedSerDe.separators[size_t(SerDeSeparator::FIELD_DELIM)] = '~';
serdeParameters[SerDeOptions::kCollectionDelim] = '$';
expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '$';
serdeParameters[SerDeOptions::kMapKeyDelim] = '*';
expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '*';
tableParameters[TableParameter::kSerializationNullFormat] = "";
expectedSerDe.nullString = "";
performConfigure();
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));
EXPECT_TRUE(
compareSerDeOptions(readerOptions.getSerDeOptions(), expectedSerDe));

// Tests other custom reader options.
clearDynamicParameters(FileFormat::TEXT);
std::unordered_map<std::string, std::string> customHiveConfigProps;
customHiveConfigProps[hive::HiveConfig::kMaxCoalescedBytes] = "129";
customHiveConfigProps[hive::HiveConfig::kMaxCoalescedDistanceBytes] = "513";
customHiveConfigProps[hive::HiveConfig::kFileColumnNamesReadAsLowerCase] =
"true";
customHiveConfigProps[hive::HiveConfig::kOrcUseColumnNames] = "true";
customHiveConfigProps[hive::HiveConfig::kFooterEstimatedSize] = "1111";
customHiveConfigProps[hive::HiveConfig::kFilePreloadThreshold] = "9999";
hiveConfig = std::make_shared<hive::HiveConfig>(
std::make_shared<core::MemConfig>(customHiveConfigProps));
performConfigure();
EXPECT_EQ(readerOptions.maxCoalesceBytes(), hiveConfig->maxCoalescedBytes());
EXPECT_EQ(
readerOptions.maxCoalesceDistance(),
hiveConfig->maxCoalescedDistanceBytes());
EXPECT_EQ(
readerOptions.isFileColumnNamesReadAsLowerCase(),
hiveConfig->isFileColumnNamesReadAsLowerCase(&sessionProperties));
EXPECT_EQ(
readerOptions.isUseColumnNamesForColumnMapping(),
hiveConfig->isOrcUseColumnNames(&sessionProperties));
EXPECT_EQ(
readerOptions.getFooterEstimatedSize(),
hiveConfig->footerEstimatedSize());
EXPECT_EQ(
readerOptions.getFilePreloadThreshold(),
hiveConfig->filePreloadThreshold());
}

}; // namespace facebook::velox::connector
7 changes: 7 additions & 0 deletions velox/dwio/common/Options.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,14 @@ class SerDeOptions {
};

struct TableParameter {
/// If present in the table parameters, the option is passed to the row reader
/// to instruct it to skip the number of rows from the current position. Used
/// to skip the column header row(s).
static constexpr const char* kSkipHeaderLineCount = "skip.header.line.count";
/// If present in the table parameters, the option overrides the default value
/// of the SerDeOptions::nullString. It causes any field read from the file
/// (usually of the TEXT format) to be considered NULL if it is equal to this
/// string.
static constexpr const char* kSerializationNullFormat =
"serialization.null.format";
};
Expand Down

0 comments on commit 42b10d9

Please sign in to comment.