Skip to content

Commit

Permalink
Merge branch 'branch-23.10' into unlock-p2p
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora authored Sep 22, 2023
2 parents 5f14ae5 + a6d014e commit e3d664e
Show file tree
Hide file tree
Showing 15 changed files with 177 additions and 73 deletions.
3 changes: 2 additions & 1 deletion cpp/benchmarks/text/ngrams.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt)
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::strings_column_view input(column->view());
auto const separator = cudf::string_scalar("_");

for (auto _ : state) {
cuda_event_timer raii(state, true);
switch (nt) {
case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
}
}
Expand Down
7 changes: 5 additions & 2 deletions cpp/benchmarks/text/tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state)
auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
});
} else if (tokenize_type == "ngrams") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); });
auto const delimiter = cudf::string_scalar("");
auto const separator = cudf::string_scalar("_");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator);
});
} else if (tokenize_type == "characters") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); });
Expand Down
38 changes: 21 additions & 17 deletions cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,19 @@ namespace nvtext {
* @throw cudf::logic_error if `separator` is invalid
* @throw cudf::logic_error if there are not enough strings to generate any ngrams
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate
* @param separator The string to use for separating ngram tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_ngrams(
cudf::strings_column_view const& strings,
cudf::size_type ngrams = 2,
cudf::string_scalar const& separator = cudf::string_scalar{"_"},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
cudf::strings_column_view const& input,
cudf::size_type ngrams,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Generates ngrams of characters within each string.
Expand All @@ -79,15 +79,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> generate_character_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 2,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -113,14 +115,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
* @throw cudf::logic_error if `ngrams < 2`
* @throw cudf::logic_error if there are not enough characters to generate any ngrams
*
* @param strings Strings column to produce ngrams from.
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate. Default is 5.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
*/
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& strings,
cudf::strings_column_view const& input,
cudf::size_type ngrams = 5,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
28 changes: 14 additions & 14 deletions cpp/include/nvtext/ngrams_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -66,22 +66,22 @@ namespace nvtext {
*
* All null row entries are ignored and the output contains all valid rows.
*
* @param strings Strings column to tokenize and produce ngrams from.
* @param ngrams The ngram number to generate.
* Default is 2 = bigram.
* @param input Strings column to tokenize and produce ngrams from
* @param ngrams The ngram number to generate
* @param delimiter UTF-8 characters used to separate each string into tokens.
* The default of empty string will separate tokens using whitespace.
* @param separator The string to use for separating ngram tokens.
* Default is "_" character.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings columns of tokens.
* An empty string will separate tokens using whitespace.
* @param separator The string to use for separating ngram tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column of tokens
*/
std::unique_ptr<cudf::column> ngrams_tokenize(
cudf::strings_column_view const& strings,
cudf::size_type ngrams = 2,
cudf::string_scalar const& delimiter = cudf::string_scalar{""},
cudf::string_scalar const& separator = cudf::string_scalar{"_"},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
cudf::strings_column_view const& input,
cudf::size_type ngrams,
cudf::string_scalar const& delimiter,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace nvtext
12 changes: 6 additions & 6 deletions cpp/src/lists/count_elements.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -36,12 +36,12 @@ namespace cudf {
namespace lists {
namespace detail {
/**
* @brief Returns a numeric column containing lengths of each element.
* @brief Returns a numeric column containing lengths of each element
*
* @param input Input lists column.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param input Input lists column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New INT32 column with lengths.
* @return New size_type column with lengths
*/
std::unique_ptr<column> count_elements(lists_column_view const& input,
rmm::cuda_stream_view stream,
Expand All @@ -52,7 +52,7 @@ std::unique_ptr<column> count_elements(lists_column_view const& input,
// create output column
auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
input.size(),
copy_bitmask(input.parent()),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count(),
stream,
mr);
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/replace/clamp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<cudf::column>> clamp
auto output =
detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
// mask will not change
if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); }
if (input.nullable()) {
output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
}

auto output_device_view =
cudf::mutable_column_device_view::create(output->mutable_view(), stream);
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
}

namespace detail {
Expand Down Expand Up @@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co

std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
return detail::generate_character_ngrams(strings, ngrams, stream, mr);
}

std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
return detail::hash_character_ngrams(strings, ngrams, stream, mr);
}

} // namespace nvtext
4 changes: 2 additions & 2 deletions cpp/src/text/jaccard.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
*
* This is called with a warp per row
*/
struct sorted_interset_fn {
struct sorted_intersect_fn {
cudf::column_device_view const d_input1;
cudf::column_device_view const d_input2;
cudf::size_type* d_results;
Expand Down Expand Up @@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
auto const d_input1 = cudf::column_device_view::create(input1, stream);
auto const d_input2 = cudf::column_device_view::create(input2, stream);
auto d_results = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
input1.size() * cudf::detail::warp_size,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
cudf::size_type ngrams,
cudf::string_scalar const& delimiter,
cudf::string_scalar const& separator,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::ngrams_tokenize(
strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
}

} // namespace nvtext
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ ConfigureTest(
STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
testing
)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)

# ##################################################################################################
# Install tests ####################################################################################
Expand Down
59 changes: 59 additions & 0 deletions cpp/tests/streams/text/ngrams_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <nvtext/generate_ngrams.hpp>
#include <nvtext/ngrams_tokenize.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

// Fixture shared by the nvtext ngram tests below; adds no state of its own.
struct TextNGramsTest : public cudf::test::BaseFixture {};

// Calls nvtext::generate_ngrams (ngrams = 3, "_" separator) with the test
// stream passed explicitly. The returned column is not inspected.
TEST_F(TextNGramsTest, GenerateNgrams)
{
  auto const words =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  auto const stream = cudf::test::get_default_stream();
  auto const joiner = cudf::string_scalar{"_", true, stream};
  auto const view   = cudf::strings_column_view(words);
  nvtext::generate_ngrams(view, 3, joiner, stream);
}

// Calls nvtext::generate_character_ngrams (ngrams = 3) with the test stream
// passed explicitly. The returned column is not inspected.
TEST_F(TextNGramsTest, GenerateCharacterNgrams)
{
  auto const words =
    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
  auto const view   = cudf::strings_column_view(words);
  auto const stream = cudf::test::get_default_stream();
  nvtext::generate_character_ngrams(view, 3, stream);
}

// Calls nvtext::hash_character_ngrams (ngrams = 5) with the test stream passed
// explicitly. The returned column is not inspected.
TEST_F(TextNGramsTest, HashCharacterNgrams)
{
  // const-qualified to match the other tests in this file; the wrapper is only read
  auto const input =
    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
  nvtext::hash_character_ngrams(
    cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
}

// Calls nvtext::ngrams_tokenize (ngrams = 2, " " delimiter, "_" separator) with
// the test stream passed explicitly. The returned column is not inspected.
TEST_F(TextNGramsTest, NgramsTokenize)
{
  // const-qualified to match the scalars below and the other tests in this file
  auto const input =
    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
  nvtext::ngrams_tokenize(
    cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream());
}
28 changes: 16 additions & 12 deletions cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
{
cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"};
cudf::strings_column_view strings_view(strings);
auto const separator = cudf::string_scalar("_");

{
cudf::test::strings_column_wrapper expected{
"the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"};
auto const results = nvtext::generate_ngrams(strings_view);
auto const results = nvtext::generate_ngrams(strings_view, 2, separator);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

{
cudf::test::strings_column_wrapper expected{
"the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"};
auto const results = nvtext::generate_ngrams(strings_view, 3);
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}
{
Expand Down Expand Up @@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
auto const separator = cudf::string_scalar("_");

cudf::strings_column_view strings_view(strings);
{
auto const results = nvtext::generate_ngrams(strings_view, 3);
auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
cudf::test::strings_column_wrapper expected{
"the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
Expand All @@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty)
{
auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();

auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column));
auto const separator = cudf::string_scalar("_");

auto results =
nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator);
cudf::test::expect_column_empty(results->view());
results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column));
cudf::test::expect_column_empty(results->view());
Expand All @@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty)
TEST_F(TextGenerateNgramsTest, Errors)
{
cudf::test::strings_column_wrapper strings{""};
auto const separator = cudf::string_scalar("_");
// invalid parameter value
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error);
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
cudf::logic_error);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
cudf::logic_error);
// not enough strings to generate ngrams
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error);
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
cudf::logic_error);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3),
cudf::logic_error);

std::vector<char const*> h_strings{"", nullptr, "", nullptr};
cudf::test::strings_column_wrapper strings_no_tokens(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)),
cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0});
EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator),
cudf::logic_error);
EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)),
cudf::logic_error);
Expand Down
Loading

0 comments on commit e3d664e

Please sign in to comment.