Skip to content

Commit

Permalink
cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
karthikeyann committed Nov 4, 2024
1 parent 9e31b71 commit 82cf186
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 23 deletions.
37 changes: 22 additions & 15 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,11 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
not schema.value().column_order->empty()
? schema.value().column_order.value()
: json_col.column_order;

for (auto const& col_name : col_order) {
auto child_schema_element = get_child_schema(col_name);
auto const found_it = json_col.child_columns.find(col_name);

if (prune_columns and found_it == std::end(json_col.child_columns)) {
CUDF_EXPECTS(child_schema_element.has_value(),
"Column name not found in input schema map, but present in column order and "
Expand All @@ -438,6 +440,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
continue;
}
column_names.emplace_back(found_it->first);

auto& child_col = found_it->second;
if (!prune_columns or child_schema_element.has_value()) {
auto [child_column, names] = device_json_column_to_cudf_column(
Expand Down Expand Up @@ -599,9 +602,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
std::vector<column_name_info> out_column_names;
auto parse_opt = parsing_options(options, stream);

// Iterate over the struct's child columns and convert to cudf column
size_type column_index = 0;

bool const has_column_order =
options.is_enabled_prune_columns() and
std::holds_alternative<schema_element>(options.get_dtypes()) and
Expand All @@ -616,6 +616,9 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
"Input schema column order size mismatch with input schema child types");
}
auto root_col_size = root_struct_col.num_rows;

// Iterate over the struct's child columns/column_order and convert to cudf column
size_type column_index = 0;
for (auto const& col_name : col_order) {
std::optional<schema_element> child_schema_element = std::visit(
cudf::detail::visitor_overload{
Expand All @@ -642,24 +645,28 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
return std::nullopt;
}},
options.get_dtypes());

#ifdef NJP_DEBUG_PRINT
auto debug_schema_print = [](auto ret) {
std::cout << ", type id: "
<< (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
<< ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
<< "\n";
};
std::visit(
cudf::detail::visitor_overload{[column_index](std::vector<data_type> const&) {
std::cout << "Column by index: #" << column_index;
},
[col_name](std::map<std::string, data_type> const&) {
std::cout << "Column by flat name: '" << col_name;
},
[col_name](std::map<std::string, schema_element> const&) {
std::cout << "Column by nested name: #" << col_name;
}},
options.get_dtypes());
std::visit(cudf::detail::visitor_overload{
[column_index](std::vector<data_type> const&) {
std::cout << "Column by index: #" << column_index;
},
[col_name](std::map<std::string, data_type> const&) {
std::cout << "Column by flat name: '" << col_name;
},
[col_name](std::map<std::string, schema_element> const&) {
std::cout << "Column by nested name: #" << col_name;
},
[col_name](schema_element const&) {
std::cout << "Column by nested schema with column order: #" << col_name;
}},
options.get_dtypes());
debug_schema_print(child_schema_element);
#endif

Expand All @@ -669,7 +676,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
CUDF_EXPECTS(child_schema_element.has_value(),
"Column name not found in input schema map, but present in column order and "
"prune_columns is enabled");
// inserts empty null column
// inserts an all-null column
out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name));
auto all_null_column =
make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr);
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
* @param schema The schema of the column to create
* @param num_rows The number of rows in the column
* @param stream The CUDA stream to which kernels are dispatched
* @param mr Optional, resource with which to allocate
* @param mr Resource with which to allocate the returned column
* @return The all null column
*/
std::unique_ptr<column> make_all_nulls_column(schema_element const& schema,
Expand All @@ -443,6 +443,9 @@ std::unique_ptr<column> make_all_nulls_column(schema_element const& schema,
rmm::device_async_resource_ref mr);

/** @brief Create metadata for a column of a given schema
*
* @param schema The schema of the column
* @param col_name The name of the column
* @return The `column_name_info` metadata created for the column
*/
column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name);

Expand Down
10 changes: 3 additions & 7 deletions cpp/src/io/json/parser_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,13 @@
#include <string>
#include <vector>

/*
data_type type;
std::map<std::string, schema_element> child_types;
std::optional<std::vector<std::string>> column_order;
*/
namespace cudf::io {
namespace {
bool validate_column_order(schema_element const& types)
{
// for struct types, check if column_order size matches child_types size and all elements in
// For struct types, check if column_order size matches child_types size and all elements in
// column_order are in child_types; then, for each entry in child_types, call this function recursively.
// for list types, check if child_types size is 1 and call this function recursively.
// For list types, check if child_types size is 1 and call this function recursively.
if (types.type.id() == type_id::STRUCT) {
if (types.column_order.has_value()) {
if (types.column_order.value().size() != types.child_types.size()) { return false; }
Expand All @@ -63,6 +58,7 @@ bool validate_column_order(schema_element const& types)
return true;
}
} // namespace

void json_reader_options::set_dtypes(schema_element types)
{
CUDF_EXPECTS(validate_column_order(types), "Column order does not match child types");
Expand Down

0 comments on commit 82cf186

Please sign in to comment.