From f7d2ab6677de751183051e8b334fd9f1514df273 Mon Sep 17 00:00:00 2001 From: yah01 Date: Wed, 15 Nov 2023 18:08:20 +0800 Subject: [PATCH] enhance: reduce 1x copy for variable length field while retrieving (#28345) - Reduce 1x copy for varchar/string/JSON/array types while retrieving - Reduce 1x copy for int8/int16 while retrieving Signed-off-by: yah01 --- .../core/src/segcore/SegmentGrowingImpl.cpp | 115 +++++--- .../core/src/segcore/SegmentGrowingImpl.h | 18 +- .../core/src/segcore/SegmentSealedImpl.cpp | 172 ++++++------ internal/core/src/segcore/SegmentSealedImpl.h | 21 +- internal/core/src/segcore/Utils.cpp | 3 +- internal/core/unittest/test_float16.cpp | 4 +- internal/core/unittest/test_growing.cpp | 4 +- internal/core/unittest/test_query.cpp | 1 - internal/core/unittest/test_sealed.cpp | 255 ++++++++++++++---- internal/core/unittest/test_span.cpp | 14 +- 10 files changed, 400 insertions(+), 207 deletions(-) diff --git a/internal/core/src/segcore/SegmentGrowingImpl.cpp b/internal/core/src/segcore/SegmentGrowingImpl.cpp index 5411467f4b0d6..22b9f6cc62af7 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.cpp +++ b/internal/core/src/segcore/SegmentGrowingImpl.cpp @@ -383,9 +383,9 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, AssertInfo(!field_meta.is_vector(), "Scalar field meta type is vector type"); + auto result = CreateScalarDataArray(count, field_meta); switch (field_meta.get_data_type()) { case DataType::BOOL: { - auto result = CreateScalarDataArray(count, field_meta); bulk_subscript_impl(vec_ptr, seg_offsets, count, @@ -393,22 +393,29 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, ->mutable_bool_data() ->mutable_data() ->mutable_data()); - return result; + break; } case DataType::INT8: { - FixedVector output(count); - bulk_subscript_impl( - vec_ptr, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_impl(vec_ptr, + seg_offsets, + count, + result->mutable_scalars() + ->mutable_int_data() + ->mutable_data() + ->mutable_data()); + break; } case DataType::INT16: { - FixedVector output(count); - bulk_subscript_impl( - vec_ptr, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_impl(vec_ptr, + seg_offsets, + count, + result->mutable_scalars() + ->mutable_int_data() + ->mutable_data() + ->mutable_data()); + break; } case DataType::INT32: { - auto result = CreateScalarDataArray(count, field_meta); bulk_subscript_impl(vec_ptr, seg_offsets, count, @@ -416,10 +423,9 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, ->mutable_int_data() ->mutable_data() ->mutable_data()); - return result; + break; } case DataType::INT64: { - auto result = CreateScalarDataArray(count, field_meta); bulk_subscript_impl(vec_ptr, seg_offsets, count, @@ -427,10 +433,9 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, ->mutable_long_data() ->mutable_data() ->mutable_data()); - return result; + break; } case DataType::FLOAT: { - auto result = CreateScalarDataArray(count, field_meta); bulk_subscript_impl(vec_ptr, seg_offsets, count, @@ -438,10 +443,9 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, ->mutable_float_data() ->mutable_data() ->mutable_data()); - return result; + break; } case DataType::DOUBLE: { - auto result = CreateScalarDataArray(count, field_meta); bulk_subscript_impl(vec_ptr, seg_offsets, count, @@ -449,25 +453,34 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, ->mutable_double_data() ->mutable_data() ->mutable_data()); - return result; + break; } case DataType::VARCHAR: { - FixedVector output(count); - bulk_subscript_impl( - vec_ptr, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_ptr_impl(vec_ptr, + seg_offsets, + count, + result->mutable_scalars() + ->mutable_string_data() + ->mutable_data()); + break; } case DataType::JSON: { - FixedVector output(count); - bulk_subscript_impl( - vec_ptr, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_ptr_impl( + vec_ptr, + seg_offsets, + count, + result->mutable_scalars()->mutable_json_data()->mutable_data()); + break; } case DataType::ARRAY: { // element - FixedVector output(count); - bulk_subscript_impl(*vec_ptr, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_array_impl(*vec_ptr, + seg_offsets, + count, + result->mutable_scalars() + ->mutable_array_data() + ->mutable_data()); + break; } default: { PanicInfo( @@ -475,6 +488,22 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id, fmt::format("unsupported type {}", field_meta.get_data_type())); } } + return result; +} + +template +void +SegmentGrowingImpl::bulk_subscript_ptr_impl( + const VectorBase* vec_raw, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst) const { + auto vec = dynamic_cast*>(vec_raw); + auto& src = *vec; + for (int64_t i = 0; i < count; ++i) { + auto offset = seg_offsets[i]; + dst->at(i) = std::move(T(src[offset])); + } } template @@ -523,31 +552,31 @@ void SegmentGrowingImpl::bulk_subscript_impl(const VectorBase* vec_raw, const int64_t* seg_offsets, int64_t count, - void* output_raw) const { + T* output) const { static_assert(IsScalar); auto vec_ptr = dynamic_cast*>(vec_raw); AssertInfo(vec_ptr, "Pointer of vec_raw is nullptr"); auto& vec = *vec_ptr; - auto output = reinterpret_cast(output_raw); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; output[i] = vec[offset]; } } +template void -SegmentGrowingImpl::bulk_subscript_impl(const VectorBase& vec_raw, - const int64_t* seg_offsets, - int64_t count, - void* output_raw) const { +SegmentGrowingImpl::bulk_subscript_array_impl( + const VectorBase& vec_raw, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst) const { auto vec_ptr = dynamic_cast*>(&vec_raw); AssertInfo(vec_ptr, "Pointer of vec_raw is nullptr"); auto& vec = *vec_ptr; - auto output = reinterpret_cast(output_raw); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; if (offset != INVALID_SEG_OFFSET) { - output[i] = vec[offset].output_data(); + dst->at(i) = vec[offset].output_data(); } } } @@ -559,12 +588,16 @@ SegmentGrowingImpl::bulk_subscript(SystemFieldType system_type, void* output) const { switch (system_type) { case SystemFieldType::Timestamp: - bulk_subscript_impl( - &this->insert_record_.timestamps_, seg_offsets, count, output); + bulk_subscript_impl(&this->insert_record_.timestamps_, + seg_offsets, + count, + static_cast(output)); break; case SystemFieldType::RowId: - bulk_subscript_impl( - &this->insert_record_.row_ids_, seg_offsets, count, output); + bulk_subscript_impl(&this->insert_record_.row_ids_, + seg_offsets, + count, + static_cast(output)); break; default: PanicInfo(DataTypeInvalid, "unknown subscript fields"); diff --git a/internal/core/src/segcore/SegmentGrowingImpl.h b/internal/core/src/segcore/SegmentGrowingImpl.h index 1a050bfbb101e..95259d612fdbe 100644 --- a/internal/core/src/segcore/SegmentGrowingImpl.h +++ b/internal/core/src/segcore/SegmentGrowingImpl.h @@ -155,14 +155,22 @@ class SegmentGrowingImpl : public SegmentGrowing { bulk_subscript_impl(const VectorBase* vec_raw, const int64_t* seg_offsets, int64_t count, - void* output_raw) const; + T* output) const; + + template + void + bulk_subscript_ptr_impl(const VectorBase* vec_raw, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst) const; // for scalar array vectors + template void - bulk_subscript_impl(const VectorBase& vec_raw, - const int64_t* seg_offsets, - int64_t count, - void* output_raw) const; + bulk_subscript_array_impl(const VectorBase& vec_raw, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst) const; template void diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 2b0362ad8a85a..16173f640434d 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -27,11 +27,13 @@ #include "common/Json.h" #include "common/EasyAssert.h" #include "common/Array.h" +#include "google/protobuf/message_lite.h" #include "mmap/Column.h" #include "common/Consts.h" #include "common/FieldMeta.h" #include "common/Types.h" #include "log/Log.h" +#include "pb/schema.pb.h" #include "query/ScalarIndex.h" #include "query/SearchBruteForce.h" #include "query/SearchOnSealed.h" @@ -921,7 +923,7 @@ SegmentSealedImpl::bulk_subscript(SystemFieldType system_type, this->insert_record_.timestamps_.get_chunk_data(0), seg_offsets, count, - output); + static_cast(output)); break; case SystemFieldType::RowId: AssertInfo(insert_record_.row_ids_.num_chunk() == 1, @@ -930,7 +932,7 @@ SegmentSealedImpl::bulk_subscript(SystemFieldType system_type, this->insert_record_.row_ids_.get_chunk_data(0), seg_offsets, count, - output); + static_cast(output)); break; default: PanicInfo(DataTypeInvalid, @@ -938,16 +940,14 @@ SegmentSealedImpl::bulk_subscript(SystemFieldType system_type, } } -template +template void SegmentSealedImpl::bulk_subscript_impl(const void* src_raw, const int64_t* seg_offsets, int64_t count, - void* dst_raw) { + T* dst) { static_assert(IsScalar); - auto src = reinterpret_cast(src_raw); - auto dst = reinterpret_cast(dst_raw); - + auto src = static_cast(src_raw); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; dst[i] = src[offset]; @@ -968,18 +968,31 @@ SegmentSealedImpl::bulk_subscript_impl(const ColumnBase* column, } } +template void -SegmentSealedImpl::bulk_subscript_impl(const ColumnBase* column, - const int64_t* seg_offsets, - int64_t count, - void* dst_raw) { +SegmentSealedImpl::bulk_subscript_ptr_impl( + const ColumnBase* column, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst) { + auto field = reinterpret_cast*>(column); + for (int64_t i = 0; i < count; ++i) { + auto offset = seg_offsets[i]; + dst->at(i) = std::move(T(field->RawAt(offset))); + } +} + +template +void +SegmentSealedImpl::bulk_subscript_array_impl( + const ColumnBase* column, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst) { auto field = reinterpret_cast(column); - auto dst = reinterpret_cast(dst_raw); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; - if (offset != INVALID_SEG_OFFSET) { - dst[i] = std::move(field->RawAt(offset)); - } + dst->at(i) = std::move(field->RawAt(offset)); } } @@ -990,11 +1003,11 @@ SegmentSealedImpl::bulk_subscript_impl(int64_t element_sizeof, const int64_t* seg_offsets, int64_t count, void* dst_raw) { - auto src_vec = reinterpret_cast(src_raw); + auto column = reinterpret_cast(src_raw); auto dst_vec = reinterpret_cast(dst_raw); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; - auto src = src_vec + element_sizeof * offset; + auto src = column + element_sizeof * offset; auto dst = dst_vec + i * element_sizeof; memcpy(dst, src, element_sizeof); } @@ -1037,142 +1050,135 @@ SegmentSealedImpl::bulk_subscript(FieldId field_id, // we have to clone the shared pointer, // to make sure it won't get released if segment released auto column = fields_.at(field_id); - if (datatype_is_variable(field_meta.get_data_type())) { - switch (field_meta.get_data_type()) { - case DataType::VARCHAR: - case DataType::STRING: { - FixedVector output(count); - bulk_subscript_impl( - column.get(), seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom( - output.data(), count, field_meta); - } - - case DataType::JSON: { - FixedVector output(count); - bulk_subscript_impl( - column.get(), seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom( - output.data(), count, field_meta); - } + auto ret = fill_with_empty(field_id, count); + switch (field_meta.get_data_type()) { + case DataType::VARCHAR: + case DataType::STRING: { + bulk_subscript_ptr_impl( + column.get(), + seg_offsets, + count, + ret->mutable_scalars()->mutable_string_data()->mutable_data()); + break; + } - case DataType::ARRAY: { - FixedVector output(count); - bulk_subscript_impl( - column.get(), seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom( - output.data(), count, field_meta); - } + case DataType::JSON: { + bulk_subscript_ptr_impl( + column.get(), + seg_offsets, + count, + ret->mutable_scalars()->mutable_json_data()->mutable_data()); + break; + } - default: - PanicInfo( - DataTypeInvalid, - fmt::format("unsupported data type: {}", - datatype_name(field_meta.get_data_type()))); + case DataType::ARRAY: { + bulk_subscript_array_impl( + column.get(), + seg_offsets, + count, + ret->mutable_scalars()->mutable_array_data()->mutable_data()); + break; } - } - auto src_vec = column->Data(); - switch (field_meta.get_data_type()) { case DataType::BOOL: { - auto ret = fill_with_empty(field_id, count); - bulk_subscript_impl(src_vec, + bulk_subscript_impl(column->Data(), seg_offsets, count, ret->mutable_scalars() ->mutable_bool_data() ->mutable_data() ->mutable_data()); - return ret; + break; } case DataType::INT8: { - FixedVector output(count); - bulk_subscript_impl( - src_vec, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_impl(column->Data(), + seg_offsets, + count, + ret->mutable_scalars() + ->mutable_int_data() + ->mutable_data() + ->mutable_data()); + break; } case DataType::INT16: { - FixedVector output(count); - bulk_subscript_impl( - src_vec, seg_offsets, count, output.data()); - return CreateScalarDataArrayFrom(output.data(), count, field_meta); + bulk_subscript_impl(column->Data(), + seg_offsets, + count, + ret->mutable_scalars() + ->mutable_int_data() + ->mutable_data() + ->mutable_data()); + break; } case DataType::INT32: { - auto ret = fill_with_empty(field_id, count); - bulk_subscript_impl(src_vec, + bulk_subscript_impl(column->Data(), seg_offsets, count, ret->mutable_scalars() ->mutable_int_data() ->mutable_data() ->mutable_data()); - return ret; + break; } case DataType::INT64: { - auto ret = fill_with_empty(field_id, count); - bulk_subscript_impl(src_vec, + bulk_subscript_impl(column->Data(), seg_offsets, count, ret->mutable_scalars() ->mutable_long_data() ->mutable_data() ->mutable_data()); - return ret; + break; } case DataType::FLOAT: { - auto ret = fill_with_empty(field_id, count); - bulk_subscript_impl(src_vec, + bulk_subscript_impl(column->Data(), seg_offsets, count, ret->mutable_scalars() ->mutable_float_data() ->mutable_data() ->mutable_data()); - return ret; + break; } case DataType::DOUBLE: { - auto ret = fill_with_empty(field_id, count); - bulk_subscript_impl(src_vec, + bulk_subscript_impl(column->Data(), seg_offsets, count, ret->mutable_scalars() ->mutable_double_data() ->mutable_data() ->mutable_data()); - return ret; + break; } case DataType::VECTOR_FLOAT: { - auto ret = fill_with_empty(field_id, count); bulk_subscript_impl(field_meta.get_sizeof(), - src_vec, + column->Data(), seg_offsets, count, ret->mutable_vectors() ->mutable_float_vector() ->mutable_data() ->mutable_data()); - return ret; + break; } case DataType::VECTOR_FLOAT16: { - auto ret = fill_with_empty(field_id, count); bulk_subscript_impl( field_meta.get_sizeof(), - src_vec, + column->Data(), seg_offsets, count, ret->mutable_vectors()->mutable_float16_vector()->data()); - return ret; + break; } case DataType::VECTOR_BINARY: { - auto ret = fill_with_empty(field_id, count); bulk_subscript_impl( field_meta.get_sizeof(), - src_vec, + column->Data(), seg_offsets, count, ret->mutable_vectors()->mutable_binary_vector()->data()); - return ret; + break; } default: { @@ -1181,6 +1187,8 @@ SegmentSealedImpl::bulk_subscript(FieldId field_id, field_meta.get_data_type())); } } + + return ret; } bool diff --git a/internal/core/src/segcore/SegmentSealedImpl.h b/internal/core/src/segcore/SegmentSealedImpl.h index 6c849200abbbe..ff3754a55a87f 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.h +++ b/internal/core/src/segcore/SegmentSealedImpl.h @@ -29,6 +29,7 @@ #include "SegmentSealed.h" #include "TimestampIndex.h" #include "common/EasyAssert.h" +#include "google/protobuf/message_lite.h" #include "mmap/Column.h" #include "index/ScalarIndex.h" #include "sys/mman.h" @@ -166,12 +167,12 @@ class SegmentSealedImpl : public SegmentSealed { } private: - template + template static void bulk_subscript_impl(const void* src_raw, const int64_t* seg_offsets, int64_t count, - void* dst_raw); + T* dst_raw); template static void @@ -180,11 +181,19 @@ class SegmentSealedImpl : public SegmentSealed { int64_t count, void* dst_raw); + template static void - bulk_subscript_impl(const ColumnBase* column, - const int64_t* seg_offsets, - int64_t count, - void* dst_raw); + bulk_subscript_ptr_impl(const ColumnBase* field, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst_raw); + + template + static void + bulk_subscript_array_impl(const ColumnBase* column, + const int64_t* seg_offsets, + int64_t count, + google::protobuf::RepeatedPtrField* dst); static void bulk_subscript_impl(int64_t element_sizeof, diff --git a/internal/core/src/segcore/Utils.cpp b/internal/core/src/segcore/Utils.cpp index bede4e60bedd0..413f93ba8db37 100644 --- a/internal/core/src/segcore/Utils.cpp +++ b/internal/core/src/segcore/Utils.cpp @@ -261,7 +261,8 @@ CreateScalarDataArray(int64_t count, const FieldMeta& field_meta) { obj->mutable_data()->Resize(count, 0); break; } - case DataType::VARCHAR: { + case DataType::VARCHAR: + case DataType::STRING: { auto obj = scalar_array->mutable_string_data(); obj->mutable_data()->Reserve(count); for (auto i = 0; i < count; i++) { diff --git a/internal/core/unittest/test_float16.cpp b/internal/core/unittest/test_float16.cpp index e3d1e88af71cd..4069b8f376fc7 100644 --- a/internal/core/unittest/test_float16.cpp +++ b/internal/core/unittest/test_float16.cpp @@ -176,13 +176,13 @@ TEST(Float16, GetVector) { std::map type_params = {{"dim", "128"}}; FieldIndexMeta fieldIndexMeta( vec, std::move(index_params), std::move(type_params)); - auto& config = SegcoreConfig::default_config(); + auto config = SegcoreConfig::default_config(); config.set_chunk_rows(1024); config.set_enable_interim_segment_index(true); std::map filedMap = {{vec, fieldIndexMeta}}; IndexMetaPtr metaPtr = std::make_shared(100000, std::move(filedMap)); - auto segment_growing = CreateGrowingSegment(schema, metaPtr); + auto segment_growing = CreateGrowingSegment(schema, metaPtr, 1, config); auto segment = dynamic_cast(segment_growing.get()); int64_t per_batch = 5000; diff --git a/internal/core/unittest/test_growing.cpp b/internal/core/unittest/test_growing.cpp index 039c78a7ce4c9..671d5d23a78f9 100644 --- a/internal/core/unittest/test_growing.cpp +++ b/internal/core/unittest/test_growing.cpp @@ -132,13 +132,13 @@ TEST(Growing, FillData) { std::map type_params = {{"dim", "128"}}; FieldIndexMeta fieldIndexMeta( vec, std::move(index_params), std::move(type_params)); - auto& config = SegcoreConfig::default_config(); + auto config = SegcoreConfig::default_config(); config.set_chunk_rows(1024); config.set_enable_interim_segment_index(true); std::map filedMap = {{vec, fieldIndexMeta}}; IndexMetaPtr metaPtr = std::make_shared(100000, std::move(filedMap)); - auto segment_growing = CreateGrowingSegment(schema, metaPtr); + auto segment_growing = CreateGrowingSegment(schema, metaPtr, 1, config); auto segment = dynamic_cast(segment_growing.get()); int64_t per_batch = 1000; diff --git a/internal/core/unittest/test_query.cpp b/internal/core/unittest/test_query.cpp index de53f6299447c..afa6a618b635e 100644 --- a/internal/core/unittest/test_query.cpp +++ b/internal/core/unittest/test_query.cpp @@ -549,7 +549,6 @@ TEST(Query, FillSegment) { pb::schema::CollectionSchema proto; proto.set_name("col"); proto.set_description("asdfhsalkgfhsadg"); - proto.set_autoid(false); auto dim = 16; { diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index c1dd5842f4866..eb8359a851c6d 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -1405,27 +1405,27 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { auto pk_field_data = storage::CreateFieldData(DataType::INT64, 1, 10); pk_field_data->FillFieldData(pks.data(), N); segment->LoadPrimitiveSkipIndex( - pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); + pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); auto& skip_index = segment->GetSkipIndex(); - bool equal_5_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::Equal, 5); - bool equal_12_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::Equal, 12); - bool equal_10_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::Equal, 10); + bool equal_5_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::Equal, 5); + bool equal_12_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::Equal, 12); + bool equal_10_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::Equal, 10); ASSERT_FALSE(equal_5_skip); ASSERT_TRUE(equal_12_skip); ASSERT_FALSE(equal_10_skip); - bool less_than_1_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::LessThan, 1); - bool less_than_5_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::LessThan, 5); + bool less_than_1_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::LessThan, 1); + bool less_than_5_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::LessThan, 5); ASSERT_TRUE(less_than_1_skip); ASSERT_FALSE(less_than_5_skip); - bool less_equal_than_1_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::LessEqual, 1); - bool less_equal_than_15_skip = skip_index.CanSkipUnaryRange( - pk_fid, 0, OpType::LessThan, 15); + bool less_equal_than_1_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::LessEqual, 1); + bool less_equal_than_15_skip = + skip_index.CanSkipUnaryRange(pk_fid, 0, OpType::LessThan, 15); ASSERT_FALSE(less_equal_than_1_skip); ASSERT_FALSE(less_equal_than_15_skip); bool greater_than_10_skip = skip_index.CanSkipUnaryRange( @@ -1446,9 +1446,9 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { auto int32_field_data = storage::CreateFieldData(DataType::INT32, 1, 10); int32_field_data->FillFieldData(int32s.data(), N); segment->LoadPrimitiveSkipIndex( - i32_fid, 0, DataType::INT32, int32_field_data->Data(), N); - less_than_1_skip = skip_index.CanSkipUnaryRange( - i32_fid, 0, OpType::LessThan, 1); + i32_fid, 0, DataType::INT32, int32_field_data->Data(), N); + less_than_1_skip = + skip_index.CanSkipUnaryRange(i32_fid, 0, OpType::LessThan, 1); ASSERT_TRUE(less_than_1_skip); //test for int16 @@ -1456,9 +1456,9 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { auto int16_field_data = storage::CreateFieldData(DataType::INT16, 1, 10); int16_field_data->FillFieldData(int16s.data(), N); segment->LoadPrimitiveSkipIndex( - i16_fid, 0, DataType::INT16, int16_field_data->Data(), N); - bool less_than_12_skip = skip_index.CanSkipUnaryRange( - i16_fid, 0, OpType::LessThan, 12); + i16_fid, 0, DataType::INT16, int16_field_data->Data(), N); + bool less_than_12_skip = + skip_index.CanSkipUnaryRange(i16_fid, 0, OpType::LessThan, 12); ASSERT_FALSE(less_than_12_skip); //test for int8 @@ -1466,7 +1466,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { auto int8_field_data = storage::CreateFieldData(DataType::INT8, 1, 10); int8_field_data->FillFieldData(int8s.data(), N); segment->LoadPrimitiveSkipIndex( - i8_fid, 0, DataType::INT8, int8_field_data->Data(), N); + i8_fid, 0, DataType::INT8, int8_field_data->Data(), N); bool greater_than_12_skip = skip_index.CanSkipUnaryRange( i8_fid, 0, OpType::GreaterThan, 12); ASSERT_TRUE(greater_than_12_skip); @@ -1477,7 +1477,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { auto float_field_data = storage::CreateFieldData(DataType::FLOAT, 1, 10); float_field_data->FillFieldData(floats.data(), N); segment->LoadPrimitiveSkipIndex( - float_fid, 0, DataType::FLOAT, float_field_data->Data(), N); + float_fid, 0, DataType::FLOAT, float_field_data->Data(), N); greater_than_10_skip = skip_index.CanSkipUnaryRange( float_fid, 0, OpType::GreaterThan, 10.0); ASSERT_TRUE(greater_than_10_skip); @@ -1488,7 +1488,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) { auto double_field_data = storage::CreateFieldData(DataType::DOUBLE, 1, 10); double_field_data->FillFieldData(doubles.data(), N); segment->LoadPrimitiveSkipIndex( - double_fid, 0, DataType::DOUBLE, double_field_data->Data(), N); + double_fid, 0, DataType::DOUBLE, double_field_data->Data(), N); greater_than_10_skip = skip_index.CanSkipUnaryRange( double_fid, 0, OpType::GreaterThan, 10.0); ASSERT_TRUE(greater_than_10_skip); @@ -1511,22 +1511,22 @@ TEST(Sealed, SkipIndexSkipBinaryRange) { auto pk_field_data = storage::CreateFieldData(DataType::INT64, 1, 10); pk_field_data->FillFieldData(pks.data(), N); segment->LoadPrimitiveSkipIndex( - pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); + pk_fid, 0, DataType::INT64, pk_field_data->Data(), N); auto& skip_index = segment->GetSkipIndex(); - ASSERT_FALSE(skip_index.CanSkipBinaryRange( - pk_fid, 0, -3, 1, true, true)); - ASSERT_TRUE(skip_index.CanSkipBinaryRange( - pk_fid, 0, -3, 1, true, false)); - - ASSERT_FALSE(skip_index.CanSkipBinaryRange( - pk_fid, 0, 7, 9, true, true)); - ASSERT_FALSE(skip_index.CanSkipBinaryRange( - pk_fid, 0, 8, 12, true, false)); - - ASSERT_TRUE(skip_index.CanSkipBinaryRange( - pk_fid, 0, 10, 12, false, true)); - ASSERT_FALSE(skip_index.CanSkipBinaryRange( - pk_fid, 0, 10, 12, true, true)); + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(pk_fid, 0, -3, 1, true, true)); + ASSERT_TRUE( + skip_index.CanSkipBinaryRange(pk_fid, 0, -3, 1, true, false)); + + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(pk_fid, 0, 7, 9, true, true)); + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(pk_fid, 0, 8, 12, true, false)); + + ASSERT_TRUE( + skip_index.CanSkipBinaryRange(pk_fid, 0, 10, 12, false, true)); + ASSERT_FALSE( + skip_index.CanSkipBinaryRange(pk_fid, 0, 10, 12, true, true)); } TEST(Sealed, SkipIndexSkipStringRange) { @@ -1536,7 +1536,7 @@ TEST(Sealed, SkipIndexSkipStringRange) { auto pk_fid = schema->AddDebugField("pk", DataType::INT64); auto string_fid = schema->AddDebugField("string_field", DataType::VARCHAR); auto fake_vec_fid = schema->AddDebugField( - "fakeVec", DataType::VECTOR_FLOAT, dim, metrics_type); + "fakeVec", DataType::VECTOR_FLOAT, dim, metrics_type); size_t N = 5; auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); @@ -1546,28 +1546,165 @@ TEST(Sealed, SkipIndexSkipStringRange) { auto string_field_data = storage::CreateFieldData(DataType::VARCHAR, 1, N); string_field_data->FillFieldData(strings.data(), N); auto string_field_data_info = - FieldDataInfo{string_fid.get(), - N, - std::vector{string_field_data}}; + FieldDataInfo{string_fid.get(), + N, + std::vector{string_field_data}}; segment->LoadFieldData(string_fid, string_field_data_info); auto& skip_index = segment->GetSkipIndex(); - ASSERT_TRUE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::Equal, "w")); - ASSERT_FALSE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::Equal, "e")); - ASSERT_FALSE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::Equal, "j")); - - ASSERT_TRUE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::LessThan, "e")); - ASSERT_FALSE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::LessEqual, "e")); + ASSERT_TRUE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::Equal, "w")); + ASSERT_FALSE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::Equal, "e")); + ASSERT_FALSE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::Equal, "j")); + + ASSERT_TRUE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::LessThan, "e")); + ASSERT_FALSE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::LessEqual, "e")); + + ASSERT_TRUE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::GreaterThan, "j")); + ASSERT_FALSE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::GreaterEqual, "j")); + ASSERT_FALSE(skip_index.CanSkipUnaryRange( + string_fid, 0, OpType::GreaterEqual, 1)); + + ASSERT_TRUE(skip_index.CanSkipBinaryRange( + string_fid, 0, "a", "c", true, true)); + ASSERT_TRUE(skip_index.CanSkipBinaryRange( + string_fid, 0, "c", "e", true, false)); + ASSERT_FALSE(skip_index.CanSkipBinaryRange( + string_fid, 0, "c", "e", true, true)); + ASSERT_FALSE(skip_index.CanSkipBinaryRange( + string_fid, 0, "e", "k", false, true)); + ASSERT_FALSE(skip_index.CanSkipBinaryRange( + string_fid, 0, "j", "k", true, true)); + ASSERT_TRUE(skip_index.CanSkipBinaryRange( + string_fid, 0, "j", "k", false, true)); + ASSERT_FALSE(skip_index.CanSkipBinaryRange( + string_fid, 0, 1, 2, false, true)); +} - ASSERT_TRUE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::GreaterThan, "j")); - ASSERT_FALSE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::GreaterEqual, "j")); - ASSERT_FALSE(skip_index.CanSkipUnaryRange(string_fid, 0, OpType::GreaterEqual, 1)); +TEST(Sealed, QueryAllFields) { + auto schema = std::make_shared(); + auto metric_type = knowhere::metric::L2; + auto bool_field = schema->AddDebugField("bool", DataType::BOOL); + auto int8_field = schema->AddDebugField("int8", DataType::INT8); + auto int16_field = schema->AddDebugField("int16", DataType::INT16); + auto int32_field = schema->AddDebugField("int32", DataType::INT32); + auto int64_field = schema->AddDebugField("int64", DataType::INT64); + auto float_field = schema->AddDebugField("float", DataType::FLOAT); + auto double_field = schema->AddDebugField("double", DataType::DOUBLE); + auto varchar_field = schema->AddDebugField("varchar", DataType::VARCHAR); + auto json_field = schema->AddDebugField("json", DataType::JSON); + auto int_array_field = + schema->AddDebugField("int_array", DataType::ARRAY, DataType::INT8); + auto long_array_field = + schema->AddDebugField("long_array", DataType::ARRAY, DataType::INT64); + auto bool_array_field = + schema->AddDebugField("bool_array", DataType::ARRAY, DataType::BOOL); + auto string_array_field = schema->AddDebugField( + "string_array", DataType::ARRAY, DataType::VARCHAR); + auto double_array_field = schema->AddDebugField( + "double_array", DataType::ARRAY, DataType::DOUBLE); + auto float_array_field = + schema->AddDebugField("float_array", DataType::ARRAY, DataType::FLOAT); + auto vec = schema->AddDebugField( + "embeddings", DataType::VECTOR_FLOAT, 128, metric_type); + schema->set_primary_field_id(int64_field); + + std::map index_params = { + {"index_type", "IVF_FLAT"}, + {"metric_type", metric_type}, + {"nlist", "128"}}; + std::map type_params = {{"dim", "128"}}; + FieldIndexMeta fieldIndexMeta( + vec, std::move(index_params), std::move(type_params)); + std::map filedMap = {{vec, fieldIndexMeta}}; + IndexMetaPtr metaPtr = + std::make_shared(100000, std::move(filedMap)); + auto segment_sealed = CreateSealedSegment(schema, metaPtr); + auto segment = dynamic_cast(segment_sealed.get()); - ASSERT_TRUE(skip_index.CanSkipBinaryRange(string_fid, 0, "a", "c", true, true)); - ASSERT_TRUE(skip_index.CanSkipBinaryRange(string_fid, 0, "c", "e", true, false)); - ASSERT_FALSE(skip_index.CanSkipBinaryRange(string_fid, 0, "c", "e", true, true)); - ASSERT_FALSE(skip_index.CanSkipBinaryRange(string_fid, 0, "e", "k", false, true)); - ASSERT_FALSE(skip_index.CanSkipBinaryRange(string_fid, 0, "j", "k", true, true)); - ASSERT_TRUE(skip_index.CanSkipBinaryRange(string_fid, 0, "j", "k", false, true)); - ASSERT_FALSE(skip_index.CanSkipBinaryRange(string_fid, 0, 1, 2, false, true)); + int64_t dataset_size = 1000; + int64_t dim = 128; + auto dataset = DataGen(schema, dataset_size); + SealedLoadFieldData(dataset, *segment); + auto bool_values = dataset.get_col(bool_field); + auto int8_values = dataset.get_col(int8_field); + auto int16_values = dataset.get_col(int16_field); + auto int32_values = dataset.get_col(int32_field); + auto int64_values = dataset.get_col(int64_field); + auto float_values = dataset.get_col(float_field); + auto double_values = dataset.get_col(double_field); + auto varchar_values = dataset.get_col(varchar_field); + auto json_values = dataset.get_col(json_field); + auto int_array_values = dataset.get_col(int_array_field); + auto long_array_values = dataset.get_col(long_array_field); + auto bool_array_values = dataset.get_col(bool_array_field); + auto string_array_values = dataset.get_col(string_array_field); + auto double_array_values = dataset.get_col(double_array_field); + auto float_array_values = dataset.get_col(float_array_field); + auto vector_values = dataset.get_col(vec); + + auto ids_ds = GenRandomIds(dataset_size); + auto bool_result = + segment->bulk_subscript(bool_field, ids_ds->GetIds(), dataset_size); + auto int8_result = + segment->bulk_subscript(int8_field, ids_ds->GetIds(), dataset_size); + auto int16_result = + segment->bulk_subscript(int16_field, ids_ds->GetIds(), dataset_size); + auto int32_result = + segment->bulk_subscript(int32_field, ids_ds->GetIds(), dataset_size); + auto int64_result = + segment->bulk_subscript(int64_field, ids_ds->GetIds(), dataset_size); + auto float_result = + segment->bulk_subscript(float_field, ids_ds->GetIds(), dataset_size); + auto double_result = + segment->bulk_subscript(double_field, ids_ds->GetIds(), dataset_size); + auto varchar_result = + segment->bulk_subscript(varchar_field, ids_ds->GetIds(), dataset_size); + auto json_result = + segment->bulk_subscript(json_field, ids_ds->GetIds(), dataset_size); + auto int_array_result = segment->bulk_subscript( + int_array_field, ids_ds->GetIds(), dataset_size); + auto long_array_result = segment->bulk_subscript( + long_array_field, ids_ds->GetIds(), dataset_size); + auto bool_array_result = segment->bulk_subscript( + bool_array_field, ids_ds->GetIds(), dataset_size); + auto string_array_result = segment->bulk_subscript( + string_array_field, ids_ds->GetIds(), dataset_size); + auto double_array_result = segment->bulk_subscript( + double_array_field, ids_ds->GetIds(), dataset_size); + auto float_array_result = segment->bulk_subscript( + float_array_field, ids_ds->GetIds(), dataset_size); + auto vec_result = + segment->bulk_subscript(vec, ids_ds->GetIds(), dataset_size); + + EXPECT_EQ(bool_result->scalars().bool_data().data_size(), dataset_size); + EXPECT_EQ(int8_result->scalars().int_data().data_size(), dataset_size); + EXPECT_EQ(int16_result->scalars().int_data().data_size(), dataset_size); + EXPECT_EQ(int32_result->scalars().int_data().data_size(), dataset_size); + EXPECT_EQ(int64_result->scalars().long_data().data_size(), dataset_size); + EXPECT_EQ(float_result->scalars().float_data().data_size(), dataset_size); + EXPECT_EQ(double_result->scalars().double_data().data_size(), dataset_size); + EXPECT_EQ(varchar_result->scalars().string_data().data_size(), + dataset_size); + EXPECT_EQ(json_result->scalars().json_data().data_size(), dataset_size); + EXPECT_EQ(vec_result->vectors().float_vector().data_size(), + dataset_size * dim); + EXPECT_EQ(int_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(long_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(bool_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(string_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(double_array_result->scalars().array_data().data_size(), + dataset_size); + EXPECT_EQ(float_array_result->scalars().array_data().data_size(), + dataset_size); } \ No newline at end of file diff --git a/internal/core/unittest/test_span.cpp b/internal/core/unittest/test_span.cpp index 721cc18c547da..7c5e29c14e5fc 100644 --- a/internal/core/unittest/test_span.cpp +++ b/internal/core/unittest/test_span.cpp @@ -32,8 +32,7 @@ TEST(Span, Naive) { schema->set_primary_field_id(i64_fid); auto dataset = DataGen(schema, N); - auto seg_conf = SegcoreConfig::default_config(); - auto segment = CreateGrowingSegment(schema, empty_index_meta, -1, seg_conf); + auto segment = CreateGrowingSegment(schema, empty_index_meta, -1); segment->PreInsert(N); segment->Insert(0, N, @@ -43,17 +42,16 @@ TEST(Span, Naive) { auto vec_ptr = dataset.get_col(bin_vec_fid); auto age_ptr = dataset.get_col(float_fid); auto float_ptr = dataset.get_col(float_vec_fid); - SegmentInternalInterface& interface = *segment; - auto num_chunk = interface.num_chunk(); + auto num_chunk = segment->num_chunk(); ASSERT_EQ(num_chunk, upper_div(N, size_per_chunk)); - auto row_count = interface.get_row_count(); + auto row_count = segment->get_row_count(); ASSERT_EQ(N, row_count); for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) { auto vec_span = - interface.chunk_data(bin_vec_fid, chunk_id); - auto age_span = interface.chunk_data(float_fid, chunk_id); + segment->chunk_data(bin_vec_fid, chunk_id); + auto age_span = segment->chunk_data(float_fid, chunk_id); auto float_span = - interface.chunk_data(float_vec_fid, chunk_id); + segment->chunk_data(float_vec_fid, chunk_id); auto begin = chunk_id * size_per_chunk; auto end = std::min((chunk_id + 1) * size_per_chunk, N); auto size_of_chunk = end - begin;