From 82cf186fc63ea1aedf82b8e6db8e691d257b369a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 4 Nov 2024 17:28:20 +0000 Subject: [PATCH] cleanup --- cpp/src/io/json/json_column.cu | 37 +++++++++++++++++------------ cpp/src/io/json/nested_json.hpp | 5 +++- cpp/src/io/json/parser_features.cpp | 10 +++----- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 0687b2dbc58..08d9bb840c8 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -420,9 +420,11 @@ std::pair, std::vector> device_json_co not schema.value().column_order->empty() ? schema.value().column_order.value() : json_col.column_order; + for (auto const& col_name : col_order) { auto child_schema_element = get_child_schema(col_name); auto const found_it = json_col.child_columns.find(col_name); + if (prune_columns and found_it == std::end(json_col.child_columns)) { CUDF_EXPECTS(child_schema_element.has_value(), "Column name not found in input schema map, but present in column order and " @@ -438,6 +440,7 @@ std::pair, std::vector> device_json_co continue; } column_names.emplace_back(found_it->first); + auto& child_col = found_it->second; if (!prune_columns or child_schema_element.has_value()) { auto [child_column, names] = device_json_column_to_cudf_column( @@ -599,9 +602,6 @@ table_with_metadata device_parse_nested_json(device_span d_input, std::vector out_column_names; auto parse_opt = parsing_options(options, stream); - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - bool const has_column_order = options.is_enabled_prune_columns() and std::holds_alternative(options.get_dtypes()) and @@ -616,6 +616,9 @@ table_with_metadata device_parse_nested_json(device_span d_input, "Input schema column order size mismatch with input schema child types"); } auto root_col_size = root_struct_col.num_rows; + + // Iterate over the struct's child columns/column_order and convert to cudf column + size_type column_index = 0; for (auto const& col_name : col_order) { std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ @@ -642,6 +645,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, return std::nullopt; }}, options.get_dtypes()); + #ifdef NJP_DEBUG_PRINT auto debug_schema_print = [](auto ret) { std::cout << ", type id: " @@ -649,17 +653,20 @@ table_with_metadata device_parse_nested_json(device_span d_input, << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" << "\n"; }; - std::visit( - cudf::detail::visitor_overload{[column_index](std::vector const&) { - std::cout << "Column by index: #" << column_index; - }, - [col_name](std::map const&) { - std::cout << "Column by flat name: '" << col_name; - }, - [col_name](std::map const&) { - std::cout << "Column by nested name: #" << col_name; - }}, - options.get_dtypes()); + std::visit(cudf::detail::visitor_overload{ + [column_index](std::vector const&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }, + [col_name](schema_element const&) { + std::cout << "Column by nested schema with column order: #" << col_name; + }}, + options.get_dtypes()); debug_schema_print(child_schema_element); #endif @@ -669,7 +676,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, CUDF_EXPECTS(child_schema_element.has_value(), "Column name not found in input schema map, but present in column order and " "prune_columns is enabled"); - // inserts empty null column + // inserts all null column out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name)); auto all_null_column = make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 29ea6e45db6..9a70c267be5 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -434,7 +434,7 @@ table_with_metadata device_parse_nested_json(device_span input, * @param schema The schema of the column to create * @param num_rows The number of rows in the column * @param stream The CUDA stream to which kernels are dispatched - * @param mr Optional, resource with which to allocate + * @param mr resource with which to allocate * @return The all null column */ std::unique_ptr make_all_nulls_column(schema_element const& schema, @@ -443,6 +443,9 @@ std::unique_ptr make_all_nulls_column(schema_element const& schema, rmm::device_async_resource_ref mr); /** @brief Create metadata for a column of a given schema + * + * @param schema The schema of the column + * @param col_name The name of the column */ column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 38363fbe296..60a3175d31b 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -30,18 +30,13 @@ #include #include -/* - data_type type; - std::map child_types; - std::optional> column_order; -*/ namespace cudf::io { namespace { bool validate_column_order(schema_element const& types) { - // for struct types, check if column_order size matches child_types size and all elements in + // For struct types, check if column_order size matches child_types size and all elements in // column_order are in child_types, in child_types, call this function recursively. - // for list types, check if child_types size is 1 and call this function recursively. + // For list types, check if child_types size is 1 and call this function recursively. if (types.type.id() == type_id::STRUCT) { if (types.column_order.has_value()) { if (types.column_order.value().size() != types.child_types.size()) { return false; } @@ -63,6 +58,7 @@ bool validate_column_order(schema_element const& types) return true; } } // namespace + void json_reader_options::set_dtypes(schema_element types) { CUDF_EXPECTS(validate_column_order(types), "Column order does not match child types");