From 435b67e475c0384592a3d03e46c59f42906d14bc Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 16 Dec 2023 11:10:04 -0500 Subject: [PATCH] GH-15060: [JS] Add LargeUtf8 type (#35780) This pull request adds support for the LargeUTF8 type in Arrow. Now we can create, decode, and encode these vectors. However, while the offset vectors support 64 bit integers, note that the value buffers are limited to a length of 32 bits meaning that LargeUTF8 vectors cannot yet be larger than UTF8 vectors. We will see how we can address this limitation in a follow up pull request. The issue is that JS typed arrays can be at most 2**31-1 elements long (implementation defined). This pull request also fixes a bug in a rounding method which prevented us from supporting large vectors so it's already a big step forward. Fixes #15060. * Closes: #15060 --------- Co-authored-by: Kyle Barron --- docs/source/status.rst | 2 +- js/src/Arrow.dom.ts | 4 +- js/src/Arrow.ts | 3 +- js/src/builder.ts | 27 +++++----- js/src/builder/buffer.ts | 52 ++++++++---------- js/src/builder/largeutf8.ts | 59 ++++++++++++++++++++ js/src/builder/list.ts | 4 +- js/src/data.ts | 18 +++++-- js/src/enum.ts | 6 +-- js/src/interfaces.ts | 22 +++++--- js/src/ipc/metadata/json.ts | 3 +- js/src/ipc/metadata/message.ts | 3 +- js/src/type.ts | 35 +++++++++--- js/src/util/buffer.ts | 10 ++-- js/src/visitor.ts | 4 ++ js/src/visitor/builderctor.ts | 2 + js/src/visitor/bytelength.ts | 3 +- js/src/visitor/get.ts | 19 ++++++- js/src/visitor/indexof.ts | 4 +- js/src/visitor/iterator.ts | 4 +- js/src/visitor/jsontypeassembler.ts | 5 +- js/src/visitor/jsonvectorassembler.ts | 8 ++- js/src/visitor/set.ts | 23 ++++++-- js/src/visitor/typeassembler.ts | 5 ++ js/src/visitor/typecomparator.ts | 4 +- js/src/visitor/typector.ts | 1 + js/src/visitor/vectorassembler.ts | 26 ++++++++- js/src/visitor/vectorloader.ts | 7 ++- js/test/data/tables.ts | 2 +- js/test/generate-test-data.ts | 60 +++++++++++++++++---- js/test/unit/builders/builder-tests.ts | 1 + js/test/unit/builders/largeUtf8-tests.ts | 65 +++++++++++++++++++++++ js/test/unit/generated-data-tests.ts | 1 + js/test/unit/generated-data-validators.ts | 20 +++++-- js/test/unit/vector/vector-tests.ts | 24 ++++++++- js/test/unit/visitor-tests.ts | 6 ++- 36 files changed, 432 insertions(+), 110 deletions(-) create mode 100644 js/src/builder/largeutf8.ts create mode 100644 js/test/unit/builders/largeUtf8-tests.ts diff --git a/docs/source/status.rst b/docs/source/status.rst index b8ee7eedbf284..e52e4e4cd49bc 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -66,7 +66,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Large Utf8 | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Binary View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts index 451bf6acb6186..9ec76fdd009f3 100644 --- a/js/src/Arrow.dom.ts +++ b/js/src/Arrow.dom.ts @@ -47,7 +47,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -96,5 +96,5 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, + Utf8Builder, LargeUtf8Builder } from './Arrow.js'; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 714861e764ccb..b7e5f63a6ab5a 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -36,7 +36,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -78,6 +78,7 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; export { ListBuilder } from './builder/list.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; diff --git a/js/src/builder.ts b/js/src/builder.ts index 93510eedf84ff..1a4c52f871bbf 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, Binary, List, Map_, + Utf8, LargeUtf8, Binary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -198,10 +198,10 @@ export abstract class Builder { return this.children.reduce((size, child) => size + child.reservedByteLength, size); } - declare protected _offsets: DataBufferBuilder; + declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } - declare protected _values: BufferBuilder; + declare protected _values: BufferBuilder; public get values() { return this._values ? this._values.buffer : null; } declare protected _nulls: BitmapBufferBuilder; @@ -277,18 +277,15 @@ export abstract class Builder { * @returns A `Data` of the buffers and children representing the values written. */ public flush(): Data { - - let data; - let typeIds; - let nullBitmap; - let valueOffsets; + let data: BufferBuilder | undefined; + let typeIds: Int8Array; + let nullBitmap: Uint8Array | undefined; + let valueOffsets: T['TOffsetArray']; const { type, length, nullCount, _typeIds, _offsets, _values, _nulls } = this; - if (typeIds = _typeIds?.flush(length)) { // Unions - // DenseUnions + if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions valueOffsets = _offsets?.flush(length); - } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists - // Binary, Utf8 + } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists data = _values?.flush(_offsets.last()); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval) data = _values?.flush(length); @@ -355,13 +352,13 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; constructor(opts: BuilderOptions) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public setValue(index: number, value: T['TValue']) { const pending = this._pending || (this._pending = new Map()); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 03d4f33349a7a..402172059682c 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -16,32 +16,21 @@ // under the License. import { memcpy } from '../util/buffer.js'; -import { - TypedArray, TypedArrayConstructor, - BigIntArray, BigIntArrayConstructor -} from '../interfaces.js'; - -/** @ignore */ type DataValue = T extends TypedArray ? number : T extends BigIntArray ? WideValue : T; -/** @ignore */ type WideValue = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never; -/** @ignore */ type ArrayCtor = - T extends TypedArray ? TypedArrayConstructor : - T extends BigIntArray ? BigIntArrayConstructor : - any; +import { TypedArray, BigIntArray, ArrayCtor } from '../interfaces.js'; +import { DataType } from '../type.js'; /** @ignore */ -const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; +function roundLengthUpToNearest64Bytes(len: number, BPE: number) { + const bytesMinus1 = Math.ceil(len) * BPE - 1; + return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; +} /** @ignore */ const sliceOrExtendArray = (arr: T, len = 0) => ( arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) ) as T; /** @ignore */ -export interface BufferBuilder> { - readonly offset: number; -} - -/** @ignore */ -export class BufferBuilder> { +export class BufferBuilder { constructor(buffer: T, stride = 1) { this.buffer = buffer; @@ -64,8 +53,8 @@ export class BufferBuilder 0) { this.length += extra; @@ -97,13 +86,11 @@ export class BufferBuilder extends BufferBuilder { +export class DataBufferBuilder extends BufferBuilder { public last() { return this.get(this.length - 1); } - public get(index: number) { return this.buffer[index]; } - public set(index: number, value: number) { + public get(index: number): T[0] { return this.buffer[index]; } + public set(index: number, value: T[0]) { this.reserve(index - this.length + 1); this.buffer[index * this.stride] = value; return this; @@ -134,15 +121,18 @@ export class BitmapBufferBuilder extends DataBufferBuilder { } /** @ignore */ -export class OffsetsBufferBuilder extends DataBufferBuilder { - constructor(data = new Int32Array(1)) { super(data, 1); } - public append(value: number) { +export class OffsetsBufferBuilder extends DataBufferBuilder { + constructor(type: T) { + super(new type.OffsetArrayType(1), 1); + } + + public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } - public set(index: number, value: number) { + public set(index: number, value: T['TOffsetArray'][0]) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; - if (offset < index++) { + if (offset < index++ && offset >= 0) { buffer.fill(buffer[offset], offset, index); } buffer[index] = buffer[index - 1] + value; @@ -150,7 +140,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder { } public flush(length = this.length - 1) { if (length > this.length) { - this.set(length - 1, 0); + this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? BigInt(0) : 0); } return super.flush(length + 1); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts new file mode 100644 index 0000000000000..fddfeaf8e7b17 --- /dev/null +++ b/js/src/builder/largeutf8.ts @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { LargeUtf8 } from '../type.js'; +import { encodeUtf8 } from '../util/utf8.js'; +import { BufferBuilder } from './buffer.js'; +import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; + +/** @ignore */ +export class LargeUtf8Builder extends VariableWidthBuilder { + constructor(opts: BuilderOptions) { + super(opts); + this._values = new BufferBuilder(new Uint8Array(0)); + } + public get byteLength(): number { + let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: string) { + return super.setValue(index, encodeUtf8(value) as any); + } + // @ts-ignore + // TODO: move to largeBinaryBuilder when implemented + // protected _flushPending(pending: Map, pendingLength: number): void { } + protected _flushPending(pending: Map, pendingLength: number) { + const offsets = this._offsets; + const data = this._values.reserve(pendingLength).buffer; + let offset = 0; + for (const [index, value] of pending) { + if (value === undefined) { + offsets.set(index, BigInt(0)); + } else { + const length = value.length; + data.set(value, offset); + offsets.set(index, BigInt(length)); + offset += length; + } + } + } +} + +// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending; diff --git a/js/src/builder/list.ts b/js/src/builder/list.ts index d83cac8e7b1c6..b2739cd5a3260 100644 --- a/js/src/builder/list.ts +++ b/js/src/builder/list.ts @@ -22,10 +22,10 @@ import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js'; /** @ignore */ export class ListBuilder extends VariableWidthBuilder, TNull> { - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder>; constructor(opts: BuilderOptions, TNull>) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public addChild(child: Builder, name = '0') { if (this.numChildren > 0) { diff --git a/js/src/data.ts b/js/src/data.ts index 1e9df71cff8a7..145ee9d049cb4 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -17,7 +17,7 @@ import { Vector } from './vector.js'; import { BufferType, Type, UnionMode } from './enum.js'; -import { DataType, strideForType } from './type.js'; +import { DataType, LargeUtf8, strideForType } from './type.js'; import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; // When slicing, we do not know the null count of the sliced range without @@ -30,11 +30,12 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; /** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; +/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ export interface Buffers { - [BufferType.OFFSET]: Int32Array; + [BufferType.OFFSET]: T['TOffsetArray']; [BufferType.DATA]: T['TArray']; [BufferType.VALIDITY]: Uint8Array; [BufferType.TYPE]: T['TArray']; @@ -264,7 +265,7 @@ import { } from './type.js'; import { Visitor } from './visitor.js'; -import { toArrayBufferView, toInt32Array, toUint8Array } from './util/buffer.js'; +import { toArrayBufferView, toBigInt64Array, toInt32Array, toUint8Array } from './util/buffer.js'; class MakeDataVisitor extends Visitor { public visit(props: any): Data { @@ -307,6 +308,14 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitLargeUtf8(props: LargeUtf8DataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const data = toUint8Array(props['data']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); + } public visitBinary(props: BinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -436,6 +445,7 @@ interface DurationDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } @@ -459,6 +469,7 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : + T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends List /* */ ? ListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : @@ -485,6 +496,7 @@ export function makeData(props: DurationDataProps): Data< export function makeData(props: FixedSizeBinaryDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; +export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: ListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; diff --git a/js/src/enum.ts b/js/src/enum.ts index 2a82dd4235c51..764ea64e63338 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -137,8 +137,7 @@ export enum MessageHeader { * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64). * - * **Note**: Only enum values 0-18 (NONE through Duration) are written to an Arrow - * IPC payload. + * **Note**: Only non-negative enum values are written to an Arrow IPC payload. * * The rest of the values are specified here so TypeScript can narrow the type * signatures further beyond the base Arrow Types. The Arrow DataTypes include @@ -175,6 +174,7 @@ export enum Type { FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */ + LargeUtf8 = 20, /** Large variable-length string as List */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -205,7 +205,7 @@ export enum Type { DurationSecond = -27, DurationMillisecond = -28, DurationMicrosecond = -29, - DurationNanosecond = -30 + DurationNanosecond = -30, } export enum BufferType { diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 95c5adbb2a25e..707d01bb14cca 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -33,6 +33,7 @@ import type { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuil import type { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; import type { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; import type { Utf8Builder } from './builder/utf8.js'; +import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { ListBuilder } from './builder/list.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; @@ -98,6 +99,12 @@ export interface BigIntArrayConstructor { from(arrayLike: ArrayLike, mapfn: (v: U, k: number) => bigint, thisArg?: any): T; } +/** @ignore */ +export type ArrayCtor = + T extends TypedArray ? TypedArrayConstructor : + T extends BigIntArray ? BigIntArrayConstructor : + any; + /** @ignore */ export type BuilderCtorArgs< T extends BuilderType, @@ -105,7 +112,7 @@ export type BuilderCtorArgs< TArgs extends any[] = any[], TCtor extends new (type: R, ...args: TArgs) => T = new (type: R, ...args: TArgs) => T - > = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; +> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; /** * Obtain the constructor function of an instance type @@ -115,7 +122,7 @@ export type ConstructorType< T, TCtor extends new (...args: any[]) => T = new (...args: any[]) => T - > = TCtor extends new (...args: any[]) => T ? TCtor : never; +> = TCtor extends new (...args: any[]) => T ? TCtor : never; /** @ignore */ export type BuilderCtorType< @@ -123,7 +130,7 @@ export type BuilderCtorType< R extends DataType = any, TCtor extends new (options: BuilderOptions) => T = new (options: BuilderOptions) => T - > = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; +> = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; /** @ignore */ export type BuilderType = @@ -201,6 +208,7 @@ export type TypeToDataType = { [Type.Float64]: type.Float64; [Type.Float]: type.Float; [Type.Utf8]: type.Utf8; + [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; @@ -254,6 +262,7 @@ type TypeToBuilder = { [Type.Float64]: Float64Builder; [Type.Float]: FloatBuilder; [Type.Utf8]: Utf8Builder; + [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; @@ -307,6 +316,7 @@ type DataTypeToBuilder = { [Type.Float64]: T extends type.Float64 ? Float64Builder : never; [Type.Float]: T extends type.Float ? FloatBuilder : never; [Type.Utf8]: T extends type.Utf8 ? Utf8Builder : never; + [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; @@ -329,11 +339,11 @@ type DataTypeToBuilder = { [Type.Interval]: T extends type.Interval ? IntervalBuilder : never; [Type.IntervalDayTime]: T extends type.IntervalDayTime ? IntervalDayTimeBuilder : never; [Type.IntervalYearMonth]: T extends type.IntervalYearMonth ? IntervalYearMonthBuilder : never; - [Type.Duration]: T extends type.Duration ? DurationBuilder: never; + [Type.Duration]: T extends type.Duration ? DurationBuilder : never; [Type.DurationSecond]: T extends type.DurationSecond ? DurationSecondBuilder : never; [Type.DurationMillisecond]: T extends type.DurationMillisecond ? DurationMillisecondBuilder : never; - [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder: never; - [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder: never; + [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder : never; + [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; [Type.Struct]: T extends type.Struct ? StructBuilder : never; diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index f1f306730ddba..b669c0c612f8a 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -20,7 +20,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -150,6 +150,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'utf8': return new Utf8(); + case 'largeutf8': return new LargeUtf8(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index 27c9b92d6897b..cf05bff54cfba 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -56,7 +56,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -433,6 +433,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['Utf8']: return new Utf8(); + case Type['LargeUtf8']: return new LargeUtf8(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); case Type['Struct_']: return new Struct(children || []); diff --git a/js/src/type.ts b/js/src/type.ts index 34bbf45bca728..6223d0316f17a 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -19,7 +19,7 @@ import { Field } from './schema.js'; import { Vector } from './vector.js'; import { MapRow } from './row/map.js'; import { StructRow, StructRowProxy } from './row/struct.js'; -import { TypedArrayConstructor } from './interfaces.js'; +import { ArrayCtor, BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js'; import { bigIntToNumber } from './util/bigint.js'; import { @@ -38,9 +38,11 @@ export type IsSigned = { 'true': true; 'false': false }; export interface DataType { readonly TType: TType; readonly TArray: any; + readonly TOffsetArray: any; readonly TValue: any; readonly TChildren: TChildren; readonly ArrayType: any; + readonly OffsetArrayType: ArrayCtor; readonly children: Field[]; } @@ -57,6 +59,7 @@ export abstract class DataType { (proto).children = null; (proto).ArrayType = Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'DataType'; })(DataType.prototype); } @@ -232,7 +236,7 @@ Object.defineProperty(Float32.prototype, 'ArrayType', { value: Float32Array }); Object.defineProperty(Float64.prototype, 'ArrayType', { value: Float64Array }); /** @ignore */ -export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor } +export interface Binary extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Binary extends DataType { constructor() { @@ -247,7 +251,7 @@ export class Binary extends DataType { } /** @ignore */ -export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; ArrayType: TypedArrayConstructor } +export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Utf8 extends DataType { constructor() { @@ -261,6 +265,22 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } +/** @ignore */ +export class LargeUtf8 extends DataType { + constructor() { + super(); + } + public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } + public toString() { return `LargeUtf8`; } + protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { + (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'LargeUtf8'; + })(LargeUtf8.prototype); +} + /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -458,13 +478,13 @@ export class Duration extends DataType { } /** @ignore */ -export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); }} +export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); } } /** @ignore */ -export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); }} +export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); } } /** @ignore */ -export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); }} +export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); } } /** @ignore */ -export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); }} +export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); } } /** @ignore */ @@ -581,6 +601,7 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index dd8edf11f9258..4f4379dedf6d8 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -83,9 +83,9 @@ export function joinUint8Arrays(chunks: Uint8Array[], size?: number | null): [Ui } /** @ignore */ -export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | ArrayLike | ByteBuffer | string | null | undefined | - IteratorResult | ArrayLike | ByteBuffer | string | null | undefined> | - ReadableStreamReadResult | ArrayLike | ByteBuffer | string | null | undefined>; +export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined | + IteratorResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined> | + ReadableStreamReadResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined>; /** @ignore */ export function toArrayBufferView< @@ -208,7 +208,9 @@ export async function* toArrayBufferViewAsyncIterator(Arra /** @ignore */ export const toUint8ClampedArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint8ClampedArray, input); /** @ignore */ -export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array) { +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array): Int32Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: BigInt64Array): BigInt64Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: any) { // If we have a non-zero offset, create a new offsets array with the values // shifted by the start offset, such that the new start offset is 0 if (offset !== 0) { diff --git a/js/src/visitor.ts b/js/src/visitor.ts index c63640b038e47..5b3cc4d3d0593 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -36,6 +36,7 @@ export abstract class Visitor { public visitInt(_node: any, ..._args: any[]): any { return null; } public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } + public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } @@ -89,6 +90,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; + case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; @@ -152,6 +154,7 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.Utf8: return Type.Utf8; + case Type.LargeUtf8: return Type.LargeUtf8; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -229,6 +232,7 @@ export interface Visitor { visitFloat32?(node: any, ...args: any[]): any; visitFloat64?(node: any, ...args: any[]): any; visitUtf8(node: any, ...args: any[]): any; + visitLargeUtf8(node: any, ...args: any[]): any; visitBinary(node: any, ...args: any[]): any; visitFixedSizeBinary(node: any, ...args: any[]): any; visitDate(node: any, ...args: any[]): any; diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 54b5610a50eed..83374712b2642 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -40,6 +40,7 @@ import { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from '../builder/time.js'; import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; +import { LargeUtf8Builder } from '../builder/largeutf8.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -67,6 +68,7 @@ export class GetBuilderCtor extends Visitor { public visitFloat32() { return Float32Builder; } public visitFloat64() { return Float64Builder; } public visitUtf8() { return Utf8Builder; } + public visitLargeUtf8() { return LargeUtf8Builder; } public visitBinary() { return BinaryBuilder; } public visitFixedSizeBinary() { return FixedSizeBinaryBuilder; } public visitDate() { return DateBuilder; } diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts index 72d6148a52fd8..c3bfadd50e155 100644 --- a/js/src/visitor/bytelength.ts +++ b/js/src/visitor/bytelength.ts @@ -26,7 +26,7 @@ import { Type, TimeUnit, UnionMode } from '../enum.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -40,6 +40,7 @@ export interface GetByteLengthVisitor extends Visitor { getVisitFn(node: T): (data: Data>, index: number) => number; visitBinary(data: Data, index: number): number; visitUtf8(data: Data, index: number): number; + visitLargeUtf8(data: Data, index: number): number; visitList(data: Data, index: number): number; visitDenseUnion(data: Data, index: number): number; visitSparseUnion(data: Data, index: number): number; diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 5aaaedf51a37e..a801c90047c89 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -21,6 +21,7 @@ import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; import { MapRow } from '../row/map.js'; import { StructRow, StructRowProxy } from '../row/struct.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { decodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { uint16ToFloat64 } from '../util/math.js'; @@ -35,7 +36,7 @@ import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, - Union, DenseUnion, SparseUnion, + Union, DenseUnion, SparseUnion, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -60,6 +61,7 @@ export interface GetVisitor extends Visitor { visitFloat32(data: Data, index: number): T['TValue'] | null; visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; + visitLargeUtf8(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; @@ -122,6 +124,15 @@ const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, ind const y = valueOffsets[index + 1]; return values.subarray(x, y); }; +/** @ignore */ +const getLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: BigInt64Array, index: number) => { + if (index + 1 >= valueOffsets.length) { + return null as any; + } + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); + return values.subarray(x, y); +}; /** @ignore */ const getBool = ({ offset, values }: Data, index: number): T['TValue'] => { @@ -155,6 +166,11 @@ const getUtf8 = ({ values, valueOffsets }: Data, index: numbe const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getLargeUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { + const bytes = getLargeVariableWidthBytes(values, valueOffsets, index); + return bytes !== null ? decodeUtf8(bytes) : null as any; +}; /* istanbul ignore next */ /** @ignore */ @@ -328,6 +344,7 @@ GetVisitor.prototype.visitFloat16 = wrapGet(getFloat16); GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitLargeUtf8 = wrapGet(getLargeUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 4cf0076b3c8e2..76f95788c7953 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,6 +57,7 @@ export interface IndexOfVisitor extends Visitor { visitFloat32(data: Data, value: T['TValue'] | null, index?: number): number; visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; @@ -172,6 +173,7 @@ IndexOfVisitor.prototype.visitFloat16 = indexOfValue; IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index e38bb907695d0..09dfcb0b565ae 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,6 +55,7 @@ export interface IteratorVisitor extends Visitor { visitFloat32(vector: Vector): IterableIterator; visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; + visitLargeUtf8(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; @@ -158,6 +159,7 @@ IteratorVisitor.prototype.visitFloat16 = vectorIterator; IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; +IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index 6e6cfb07413c3..a6746a858ecb4 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -48,6 +48,9 @@ export class JSONTypeAssembler extends Visitor { public visitUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeUtf8({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } @@ -64,7 +67,7 @@ export class JSONTypeAssembler extends Visitor { return { 'name': ArrowType[typeId].toLowerCase(), 'unit': IntervalUnit[unit] }; } public visitDuration({ typeId, unit }: T) { - return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit]}; + return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit] }; } public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 0af954e4adacc..9a3cb8601a434 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -27,7 +27,7 @@ import { BitIterator, getBit, getBool } from '../util/bit.js'; import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -42,6 +42,7 @@ export interface JSONVectorAssembler extends Visitor { visitInt(data: Data): { DATA: number[] | string[] }; visitFloat(data: Data): { DATA: number[] }; visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; + visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; @@ -100,6 +101,9 @@ export class JSONVectorAssembler extends Visitor { public visitUtf8(data: Data) { return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; } + public visitLargeUtf8(data: Data) { + return { 'DATA': [...new Vector([data])], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; + } public visitBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], OFFSET: [...data.valueOffsets] }; } @@ -148,7 +152,7 @@ export class JSONVectorAssembler extends Visitor { return { 'DATA': [...data.values] }; } public visitDuration(data: Data) { - return { 'DATA': [...bigNumsToStrings(data.values, 2)]}; + return { 'DATA': [...bigNumsToStrings(data.values, 2)] }; } public visitFixedSizeList(data: Data) { return { diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index 1a0eddc556899..a439ec8311fd6 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -19,13 +19,14 @@ import { Data } from '../data.js'; import { Field } from '../schema.js'; import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { encodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -58,6 +59,7 @@ export interface SetVisitor extends Visitor { visitFloat32(data: Data, index: number, value: T['TValue']): void; visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; + visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; @@ -123,9 +125,19 @@ export const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { - const { [index]: x, [index + 1]: y } = valueOffsets; + const x = valueOffsets[index]; + const y = valueOffsets[index + 1]; + values.set(value.subarray(0, y - x), x); + } +}; + +/** @ignore */ +export const setLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { + if (index + 1 < valueOffsets.length) { + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); values.set(value.subarray(0, y - x), x); } }; @@ -167,6 +179,10 @@ const setBinary = ({ values, valueOffsets }: Data, index: n const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); }; +/** @ignore */ +const setLargeUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { + setLargeVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -365,6 +381,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitLargeUtf8 = wrapSet(setLargeUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); diff --git a/js/src/visitor/typeassembler.ts b/js/src/visitor/typeassembler.ts index c2262d20531b9..f072714222739 100644 --- a/js/src/visitor/typeassembler.ts +++ b/js/src/visitor/typeassembler.ts @@ -27,6 +27,7 @@ import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; import { Time } from '../fb/time.js'; @@ -78,6 +79,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitLargeUtf8(_node: T, b: Builder) { + LargeUtf8.startLargeUtf8(b); + return LargeUtf8.endLargeUtf8(b); + } public visitDecimal(node: T, b: Builder) { Decimal.startDecimal(b); Decimal.addScale(b, node.scale); diff --git a/js/src/visitor/typecomparator.ts b/js/src/visitor/typecomparator.ts index 1de8e218dae4f..2417dec09c6e9 100644 --- a/js/src/visitor/typecomparator.ts +++ b/js/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -53,6 +53,7 @@ export interface TypeComparator extends Visitor { visitFloat32(type: T, other?: DataType | null): other is T; visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; + visitLargeUtf8(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; @@ -249,6 +250,7 @@ TypeComparator.prototype.visitFloat16 = compareFloat; TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; +TypeComparator.prototype.visitLargeUtf8 = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; diff --git a/js/src/visitor/typector.ts b/js/src/visitor/typector.ts index 077f66592fbfb..2e0bbc4147abb 100644 --- a/js/src/visitor/typector.ts +++ b/js/src/visitor/typector.ts @@ -49,6 +49,7 @@ export class GetDataTypeConstructor extends Visitor { public visitFloat32() { return type.Float32; } public visitFloat64() { return type.Float64; } public visitUtf8() { return type.Utf8; } + public visitLargeUtf8() { return type.LargeUtf8; } public visitBinary() { return type.Binary; } public visitFixedSizeBinary() { return type.FixedSizeBinary; } public visitDate() { return type.Date_; } diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index 949463272e718..7a9d3bdd57b0d 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -27,8 +27,9 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, LargeUtf8, } from '../type.js'; +import { bigIntToNumber } from '../util/bigint.js'; /** @ignore */ export interface VectorAssembler extends Visitor { @@ -204,9 +205,29 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const { [0]: begin, [length]: end } = valueOffsets; + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { + const { length, values, valueOffsets } = data; + const begin = bigIntToNumber(valueOffsets[0]); + const end = bigIntToNumber(valueOffsets[length]); + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function _assembleFlatListVector( + this: VectorAssembler, + length: number, + begin: number, + end: number, + values: T['TArray'], + valueOffsets: T['TOffsetArray'] +) { const byteLength = Math.min(end - begin, values.byteLength - begin); // Push in the order FlatList types read their buffers - addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets as any)); // valueOffsets buffer first addBuffer.call(this, values.subarray(begin, begin + byteLength)); // sliced values buffer second return this; } @@ -234,6 +255,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitLargeUtf8 = assembleLargeFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index db34edad9a1c1..35f28f49baada 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -71,6 +71,9 @@ export class VectorLoader extends Visitor { public visitUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } @@ -151,7 +154,7 @@ export class JSONVectorLoader extends VectorLoader { return nullCount <= 0 ? new Uint8Array(0) : packBools(this.sources[offset]); } protected readOffsets(_type: T, { offset } = this.nextBufferRange()) { - return toArrayBufferView(Uint8Array, toArrayBufferView(Int32Array, this.sources[offset])); + return toArrayBufferView(Uint8Array, toArrayBufferView(_type.OffsetArrayType, this.sources[offset])); } protected readTypeIds(type: T, { offset } = this.nextBufferRange()) { return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, this.sources[offset])); @@ -170,7 +173,7 @@ export class JSONVectorLoader extends VectorLoader { return binaryDataFromJSON(sources[offset] as string[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); - } else if (DataType.isUtf8(type)) { + } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { return encodeUtf8((sources[offset] as string[]).join('')); } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); diff --git a/js/test/data/tables.ts b/js/test/data/tables.ts index 28aed7e4feccf..449cfe1fb853a 100644 --- a/js/test/data/tables.ts +++ b/js/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 15fb715a31f95..9d7b038331fe6 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -24,7 +24,7 @@ import { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -52,6 +52,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; @@ -75,6 +76,7 @@ interface TestDataVectorGenerator extends Visitor { visitUint64: typeof generateBigInt; visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; + visitLargeUtf8: typeof generateLargeUtf8; visitBinary: typeof generateBinary; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; @@ -100,6 +102,7 @@ TestDataVectorGenerator.prototype.visitInt64 = generateBigInt; TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; +TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; @@ -214,6 +217,7 @@ export const float16 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float32(), length, nullCount); export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); +export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); @@ -242,7 +246,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -312,7 +316,7 @@ function generateFloat(this: TestDataVectorGenerator, type: T, function generateUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values: string[] = new Array(valueOffsets.length - 1).fill(null); [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) @@ -332,9 +336,31 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); + const values: string[] = new Array(valueOffsets.length - 1).fill(null); + [...valueOffsets.slice(1)] + .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) + .reduce((map, length, i) => { + if (length !== null) { + if (length > 0) { + do { + values[i] = randomString(Number(length)); + } while (map.has(values[i])); + return map.set(values[i], i); + } + values[i] = ''; + } + return map; + }, new Map()); + const data = createVariableWidthBytes(length, nullBitmap, valueOffsets, (i) => encodeUtf8(values[i])); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; +} + function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values = [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) .map((length) => length == null ? null : randomBytes(length)); @@ -443,7 +469,7 @@ function generateList(this: TestDataVectorGenerator, type: T, le const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues = child.values(); const values: (T['valueType'] | null)[] = [...valueOffsets.slice(1)] @@ -581,7 +607,7 @@ function generateMap(this: TestDataVectorGenerator, const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues: { key: K; value: V }[] = child.values(); const values: (Record | null)[] = [...valueOffsets.slice(1)] @@ -660,7 +686,7 @@ function createBitmap(length: number, nullCount: number) { return bytes; } -function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { +function createVariableWidthOffsets32(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { const offsets = new Int32Array(length + 1); iterateBitmap(length, nullBitmap, (i, valid) => { if (!valid) { @@ -674,10 +700,24 @@ function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min return offsets; } -function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array, getBytes: (index: number) => Uint8Array) { - const bytes = new Uint8Array(offsets[length]); +function createVariableWidthOffsets64(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { + const offsets = new BigInt64Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + BigInt(Math.min(max, Math.max(min, Math.trunc(rand() * max)))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; +} + +function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array | BigInt64Array, getBytes: (index: number) => Uint8Array) { + const bytes = new Uint8Array(Number(offsets[length])); iterateBitmap(length, nullBitmap, (i, valid) => { - valid && bytes.set(getBytes(i), offsets[i]); + valid && bytes.set(getBytes(i), Number(offsets[i])); }); return bytes; } diff --git a/js/test/unit/builders/builder-tests.ts b/js/test/unit/builders/builder-tests.ts index b261e4f815e3a..0137c7aa66635 100644 --- a/js/test/unit/builders/builder-tests.ts +++ b/js/test/unit/builders/builder-tests.ts @@ -44,6 +44,7 @@ describe('Generated Test Data', () => { describe('Float32Builder', () => { validateBuilder(generate.float32); }); describe('Float64Builder', () => { validateBuilder(generate.float64); }); describe('Utf8Builder', () => { validateBuilder(generate.utf8); }); + describe('LargeUtf8Builder', () => { validateBuilder(generate.largeUtf8); }); describe('BinaryBuilder', () => { validateBuilder(generate.binary); }); describe('FixedSizeBinaryBuilder', () => { validateBuilder(generate.fixedSizeBinary); }); describe('DateDayBuilder', () => { validateBuilder(generate.dateDay); }); diff --git a/js/test/unit/builders/largeUtf8-tests.ts b/js/test/unit/builders/largeUtf8-tests.ts new file mode 100644 index 0000000000000..c789d5dbb1671 --- /dev/null +++ b/js/test/unit/builders/largeUtf8-tests.ts @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import 'web-streams-polyfill'; + +import { validateVector } from './utils.js'; +import { + encodeAll, + encodeEach, + encodeEachDOM, + encodeEachNode, + stringsNoNulls, + stringsWithNAs, + stringsWithNulls, + stringsWithEmpties +} from './utils.js'; + +import { Vector, LargeUtf8 } from 'apache-arrow'; + +const testDOMStreams = process.env.TEST_DOM_STREAMS === 'true'; +const testNodeStreams = process.env.TEST_NODE_STREAMS === 'true'; + +describe('LargeUtf8Builder', () => { + runTestsWithEncoder('encodeAll', encodeAll(() => new LargeUtf8())); + runTestsWithEncoder('encodeEach: 5', encodeEach(() => new LargeUtf8(), 5)); + runTestsWithEncoder('encodeEach: 25', encodeEach(() => new LargeUtf8(), 25)); + runTestsWithEncoder('encodeEach: undefined', encodeEach(() => new LargeUtf8(), void 0)); + testDOMStreams && runTestsWithEncoder('encodeEachDOM: 25', encodeEachDOM(() => new LargeUtf8(), 25)); + testNodeStreams && runTestsWithEncoder('encodeEachNode: 25', encodeEachNode(() => new LargeUtf8(), 25)); +}); + +function runTestsWithEncoder(name: string, encode: (vals: (string | null)[], nullVals?: any[]) => Promise>) { + describe(`${encode.name} ${name}`, () => { + it(`encodes strings no nulls`, async () => { + const vals = stringsNoNulls(20); + validateVector(vals, await encode(vals, []), []); + }); + it(`encodes strings with nulls`, async () => { + const vals = stringsWithNulls(20); + validateVector(vals, await encode(vals, [null]), [null]); + }); + it(`encodes strings using n/a as the null value rep`, async () => { + const vals = stringsWithNAs(20); + validateVector(vals, await encode(vals, ['n/a']), ['n/a']); + }); + it(`encodes strings using \\0 as the null value rep`, async () => { + const vals = stringsWithEmpties(20); + validateVector(vals, await encode(vals, ['\0']), ['\0']); + }); + }); +} diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts index d64c7c188d3ed..0a06bcbab8ee0 100644 --- a/js/test/unit/generated-data-tests.ts +++ b/js/test/unit/generated-data-tests.ts @@ -38,6 +38,7 @@ describe('Generated Test Data', () => { describe('Float32', () => { validateVector(generate.float32()); }); describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); + describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); diff --git a/js/test/unit/generated-data-validators.ts b/js/test/unit/generated-data-validators.ts index 52f642d2a6e89..57ee94876c300 100644 --- a/js/test/unit/generated-data-validators.ts +++ b/js/test/unit/generated-data-validators.ts @@ -113,7 +113,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); if (keys && keys.length > 0) { test(`dictionary indices should match`, () => { @@ -126,7 +128,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { ? expect(indices.get(i)).toBe(keys[i]) : expect(indices.get(i)).toBeNull(); } - } catch (e) { throw new Error(`${indices}[${i}]: ${e}`); } + } catch (e) { + throw new Error(`${indices}[${i}]: ${e}`); + } }); } test(`sets expected values`, () => { @@ -139,7 +143,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { actual = vector.get(i); expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`iterates expected values`, () => { expect.hasAssertions(); @@ -149,7 +155,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[++i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`indexOf returns expected values`, () => { expect.hasAssertions(); @@ -169,7 +177,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expect(vector.indexOf('purple elephants')).toBe(-1); expect(vector.indexOf('whistling wombats')).toBe(-1); expect(vector.indexOf('carnivorous novices')).toBe(-1); - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index a259cbef87772..bfcf0d8547861 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -196,6 +196,28 @@ describe(`Utf8Vector`, () => { }); }); +describe(`LargeUtf8Vector`, () => { + const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; + const vector = vectorFromArray(values, new LargeUtf8); + + test(`has largeUtf8 type`, () => { + expect(vector.type).toBeInstanceOf(LargeUtf8); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, ['abc', '123']); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), ['foo', 'abc']); + }); +}); + describe(`ListVector`, () => { const values = [[1, 2], [1, 2, 3]]; const vector = vectorFromArray(values); diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts index 8a7ba1ed778aa..f78adc59f8e98 100644 --- a/js/test/unit/visitor-tests.ts +++ b/js/test/unit/visitor-tests.ts @@ -18,7 +18,7 @@ import { Field, Visitor, DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -36,6 +36,7 @@ class BasicVisitor extends Visitor { public visitInt(type: T) { return (this.type = type); } public visitFloat(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDate(type: T) { return (this.type = type); } @@ -68,6 +69,7 @@ class FeatureVisitor extends Visitor { public visitFloat32(type: T) { return (this.type = type); } public visitFloat64(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDateDay(type: T) { return (this.type = type); } @@ -104,6 +106,7 @@ describe('Visitor', () => { test(`visits Int types`, () => validateBasicVisitor(new Int(true, 32))); test(`visits Float types`, () => validateBasicVisitor(new Float(0))); test(`visits Utf8 types`, () => validateBasicVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateBasicVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateBasicVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateBasicVisitor(new FixedSizeBinary(128))); test(`visits Date types`, () => validateBasicVisitor(new Date_(0))); @@ -144,6 +147,7 @@ describe('Visitor', () => { test(`visits Float32 types`, () => validateFeatureVisitor(new Float32())); test(`visits Float64 types`, () => validateFeatureVisitor(new Float64())); test(`visits Utf8 types`, () => validateFeatureVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateFeatureVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateFeatureVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateFeatureVisitor(new FixedSizeBinary(128))); test(`visits DateDay types`, () => validateFeatureVisitor(new DateDay()));