diff --git a/docs/source/status.rst b/docs/source/status.rst index b8ee7eedbf284..e52e4e4cd49bc 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -66,7 +66,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Large Utf8 | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Binary View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts index 451bf6acb6186..9ec76fdd009f3 100644 --- a/js/src/Arrow.dom.ts +++ b/js/src/Arrow.dom.ts @@ -47,7 +47,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -96,5 +96,5 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, + Utf8Builder, LargeUtf8Builder } from './Arrow.js'; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 714861e764ccb..b7e5f63a6ab5a 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -36,7 +36,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -78,6 +78,7 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; export { ListBuilder } from './builder/list.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; diff --git a/js/src/builder.ts b/js/src/builder.ts index 93510eedf84ff..1a4c52f871bbf 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, Duration, - Utf8, Binary, List, Map_, + Utf8, LargeUtf8, Binary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -198,10 +198,10 @@ export abstract class Builder { return this.children.reduce((size, child) => size + child.reservedByteLength, size); } - declare protected _offsets: DataBufferBuilder; + declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } - declare protected _values: BufferBuilder; + declare protected _values: BufferBuilder; public get values() { return this._values ? this._values.buffer : null; } declare protected _nulls: BitmapBufferBuilder; @@ -277,18 +277,15 @@ export abstract class Builder { * @returns A `Data` of the buffers and children representing the values written. */ public flush(): Data { - - let data; - let typeIds; - let nullBitmap; - let valueOffsets; + let data: BufferBuilder | undefined; + let typeIds: Int8Array; + let nullBitmap: Uint8Array | undefined; + let valueOffsets: T['TOffsetArray']; const { type, length, nullCount, _typeIds, _offsets, _values, _nulls } = this; - if (typeIds = _typeIds?.flush(length)) { // Unions - // DenseUnions + if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions valueOffsets = _offsets?.flush(length); - } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists - // Binary, Utf8 + } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists data = _values?.flush(_offsets.last()); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval) data = _values?.flush(length); @@ -355,13 +352,13 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; constructor(opts: BuilderOptions) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public setValue(index: number, value: T['TValue']) { const pending = this._pending || (this._pending = new Map()); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 03d4f33349a7a..402172059682c 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -16,32 +16,21 @@ // under the License. import { memcpy } from '../util/buffer.js'; -import { - TypedArray, TypedArrayConstructor, - BigIntArray, BigIntArrayConstructor -} from '../interfaces.js'; - -/** @ignore */ type DataValue = T extends TypedArray ? number : T extends BigIntArray ? WideValue : T; -/** @ignore */ type WideValue = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never; -/** @ignore */ type ArrayCtor = - T extends TypedArray ? TypedArrayConstructor : - T extends BigIntArray ? BigIntArrayConstructor : - any; +import { TypedArray, BigIntArray, ArrayCtor } from '../interfaces.js'; +import { DataType } from '../type.js'; /** @ignore */ -const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; +function roundLengthUpToNearest64Bytes(len: number, BPE: number) { + const bytesMinus1 = Math.ceil(len) * BPE - 1; + return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; +} /** @ignore */ const sliceOrExtendArray = (arr: T, len = 0) => ( arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) ) as T; /** @ignore */ -export interface BufferBuilder> { - readonly offset: number; -} - -/** @ignore */ -export class BufferBuilder> { +export class BufferBuilder { constructor(buffer: T, stride = 1) { this.buffer = buffer; @@ -64,8 +53,8 @@ export class BufferBuilder 0) { this.length += extra; @@ -97,13 +86,11 @@ export class BufferBuilder extends BufferBuilder { +export class DataBufferBuilder extends BufferBuilder { public last() { return this.get(this.length - 1); } - public get(index: number) { return this.buffer[index]; } - public set(index: number, value: number) { + public get(index: number): T[0] { return this.buffer[index]; } + public set(index: number, value: T[0]) { this.reserve(index - this.length + 1); this.buffer[index * this.stride] = value; return this; @@ -134,15 +121,18 @@ export class BitmapBufferBuilder extends DataBufferBuilder { } /** @ignore */ -export class OffsetsBufferBuilder extends DataBufferBuilder { - constructor(data = new Int32Array(1)) { super(data, 1); } - public append(value: number) { +export class OffsetsBufferBuilder extends DataBufferBuilder { + constructor(type: T) { + super(new type.OffsetArrayType(1), 1); + } + + public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } - public set(index: number, value: number) { + public set(index: number, value: T['TOffsetArray'][0]) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; - if (offset < index++) { + if (offset < index++ && offset >= 0) { buffer.fill(buffer[offset], offset, index); } buffer[index] = buffer[index - 1] + value; @@ -150,7 +140,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder { } public flush(length = this.length - 1) { if (length > this.length) { - this.set(length - 1, 0); + this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? BigInt(0) : 0); } return super.flush(length + 1); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts new file mode 100644 index 0000000000000..fddfeaf8e7b17 --- /dev/null +++ b/js/src/builder/largeutf8.ts @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { LargeUtf8 } from '../type.js'; +import { encodeUtf8 } from '../util/utf8.js'; +import { BufferBuilder } from './buffer.js'; +import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; + +/** @ignore */ +export class LargeUtf8Builder extends VariableWidthBuilder { + constructor(opts: BuilderOptions) { + super(opts); + this._values = new BufferBuilder(new Uint8Array(0)); + } + public get byteLength(): number { + let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: string) { + return super.setValue(index, encodeUtf8(value) as any); + } + // @ts-ignore + // TODO: move to largeBinaryBuilder when implemented + // protected _flushPending(pending: Map, pendingLength: number): void { } + protected _flushPending(pending: Map, pendingLength: number) { + const offsets = this._offsets; + const data = this._values.reserve(pendingLength).buffer; + let offset = 0; + for (const [index, value] of pending) { + if (value === undefined) { + offsets.set(index, BigInt(0)); + } else { + const length = value.length; + data.set(value, offset); + offsets.set(index, BigInt(length)); + offset += length; + } + } + } +} + +// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending; diff --git a/js/src/builder/list.ts b/js/src/builder/list.ts index d83cac8e7b1c6..b2739cd5a3260 100644 --- a/js/src/builder/list.ts +++ b/js/src/builder/list.ts @@ -22,10 +22,10 @@ import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js'; /** @ignore */ export class ListBuilder extends VariableWidthBuilder, TNull> { - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder>; constructor(opts: BuilderOptions, TNull>) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public addChild(child: Builder, name = '0') { if (this.numChildren > 0) { diff --git a/js/src/data.ts b/js/src/data.ts index 1e9df71cff8a7..145ee9d049cb4 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -17,7 +17,7 @@ import { Vector } from './vector.js'; import { BufferType, Type, UnionMode } from './enum.js'; -import { DataType, strideForType } from './type.js'; +import { DataType, LargeUtf8, strideForType } from './type.js'; import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; // When slicing, we do not know the null count of the sliced range without @@ -30,11 +30,12 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; /** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; +/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ export interface Buffers { - [BufferType.OFFSET]: Int32Array; + [BufferType.OFFSET]: T['TOffsetArray']; [BufferType.DATA]: T['TArray']; [BufferType.VALIDITY]: Uint8Array; [BufferType.TYPE]: T['TArray']; @@ -264,7 +265,7 @@ import { } from './type.js'; import { Visitor } from './visitor.js'; -import { toArrayBufferView, toInt32Array, toUint8Array } from './util/buffer.js'; +import { toArrayBufferView, toBigInt64Array, toInt32Array, toUint8Array } from './util/buffer.js'; class MakeDataVisitor extends Visitor { public visit(props: any): Data { @@ -307,6 +308,14 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitLargeUtf8(props: LargeUtf8DataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const data = toUint8Array(props['data']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); + } public visitBinary(props: BinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -436,6 +445,7 @@ interface DurationDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } @@ -459,6 +469,7 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : + T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends List /* */ ? ListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : @@ -485,6 +496,7 @@ export function makeData(props: DurationDataProps): Data< export function makeData(props: FixedSizeBinaryDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; +export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: ListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; diff --git a/js/src/enum.ts b/js/src/enum.ts index 2a82dd4235c51..764ea64e63338 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -137,8 +137,7 @@ export enum MessageHeader { * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64). * - * **Note**: Only enum values 0-18 (NONE through Duration) are written to an Arrow - * IPC payload. + * **Note**: Only non-negative enum values are written to an Arrow IPC payload. * * The rest of the values are specified here so TypeScript can narrow the type * signatures further beyond the base Arrow Types. The Arrow DataTypes include @@ -175,6 +174,7 @@ export enum Type { FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */ + LargeUtf8 = 20, /** Large variable-length string as List */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -205,7 +205,7 @@ export enum Type { DurationSecond = -27, DurationMillisecond = -28, DurationMicrosecond = -29, - DurationNanosecond = -30 + DurationNanosecond = -30, } export enum BufferType { diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 95c5adbb2a25e..707d01bb14cca 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -33,6 +33,7 @@ import type { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuil import type { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; import type { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; import type { Utf8Builder } from './builder/utf8.js'; +import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { ListBuilder } from './builder/list.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; @@ -98,6 +99,12 @@ export interface BigIntArrayConstructor { from(arrayLike: ArrayLike, mapfn: (v: U, k: number) => bigint, thisArg?: any): T; } +/** @ignore */ +export type ArrayCtor = + T extends TypedArray ? TypedArrayConstructor : + T extends BigIntArray ? BigIntArrayConstructor : + any; + /** @ignore */ export type BuilderCtorArgs< T extends BuilderType, @@ -105,7 +112,7 @@ export type BuilderCtorArgs< TArgs extends any[] = any[], TCtor extends new (type: R, ...args: TArgs) => T = new (type: R, ...args: TArgs) => T - > = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; +> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; /** * Obtain the constructor function of an instance type @@ -115,7 +122,7 @@ export type ConstructorType< T, TCtor extends new (...args: any[]) => T = new (...args: any[]) => T - > = TCtor extends new (...args: any[]) => T ? TCtor : never; +> = TCtor extends new (...args: any[]) => T ? TCtor : never; /** @ignore */ export type BuilderCtorType< @@ -123,7 +130,7 @@ export type BuilderCtorType< R extends DataType = any, TCtor extends new (options: BuilderOptions) => T = new (options: BuilderOptions) => T - > = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; +> = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; /** @ignore */ export type BuilderType = @@ -201,6 +208,7 @@ export type TypeToDataType = { [Type.Float64]: type.Float64; [Type.Float]: type.Float; [Type.Utf8]: type.Utf8; + [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; @@ -254,6 +262,7 @@ type TypeToBuilder = { [Type.Float64]: Float64Builder; [Type.Float]: FloatBuilder; [Type.Utf8]: Utf8Builder; + [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; @@ -307,6 +316,7 @@ type DataTypeToBuilder = { [Type.Float64]: T extends type.Float64 ? Float64Builder : never; [Type.Float]: T extends type.Float ? FloatBuilder : never; [Type.Utf8]: T extends type.Utf8 ? Utf8Builder : never; + [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; @@ -329,11 +339,11 @@ type DataTypeToBuilder = { [Type.Interval]: T extends type.Interval ? IntervalBuilder : never; [Type.IntervalDayTime]: T extends type.IntervalDayTime ? IntervalDayTimeBuilder : never; [Type.IntervalYearMonth]: T extends type.IntervalYearMonth ? IntervalYearMonthBuilder : never; - [Type.Duration]: T extends type.Duration ? DurationBuilder: never; + [Type.Duration]: T extends type.Duration ? DurationBuilder : never; [Type.DurationSecond]: T extends type.DurationSecond ? DurationSecondBuilder : never; [Type.DurationMillisecond]: T extends type.DurationMillisecond ? DurationMillisecondBuilder : never; - [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder: never; - [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder: never; + [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder : never; + [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; [Type.Struct]: T extends type.Struct ? StructBuilder : never; diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index f1f306730ddba..b669c0c612f8a 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -20,7 +20,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -150,6 +150,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'utf8': return new Utf8(); + case 'largeutf8': return new LargeUtf8(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index 27c9b92d6897b..cf05bff54cfba 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -56,7 +56,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -433,6 +433,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['Utf8']: return new Utf8(); + case Type['LargeUtf8']: return new LargeUtf8(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); case Type['Struct_']: return new Struct(children || []); diff --git a/js/src/type.ts b/js/src/type.ts index 34bbf45bca728..6223d0316f17a 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -19,7 +19,7 @@ import { Field } from './schema.js'; import { Vector } from './vector.js'; import { MapRow } from './row/map.js'; import { StructRow, StructRowProxy } from './row/struct.js'; -import { TypedArrayConstructor } from './interfaces.js'; +import { ArrayCtor, BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js'; import { bigIntToNumber } from './util/bigint.js'; import { @@ -38,9 +38,11 @@ export type IsSigned = { 'true': true; 'false': false }; export interface DataType { readonly TType: TType; readonly TArray: any; + readonly TOffsetArray: any; readonly TValue: any; readonly TChildren: TChildren; readonly ArrayType: any; + readonly OffsetArrayType: ArrayCtor; readonly children: Field[]; } @@ -57,6 +59,7 @@ export abstract class DataType { (proto).children = null; (proto).ArrayType = Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'DataType'; })(DataType.prototype); } @@ -232,7 +236,7 @@ Object.defineProperty(Float32.prototype, 'ArrayType', { value: Float32Array }); Object.defineProperty(Float64.prototype, 'ArrayType', { value: Float64Array }); /** @ignore */ -export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor } +export interface Binary extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Binary extends DataType { constructor() { @@ -247,7 +251,7 @@ export class Binary extends DataType { } /** @ignore */ -export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; ArrayType: TypedArrayConstructor } +export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Utf8 extends DataType { constructor() { @@ -261,6 +265,22 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } +/** @ignore */ +export class LargeUtf8 extends DataType { + constructor() { + super(); + } + public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } + public toString() { return `LargeUtf8`; } + protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { + (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = BigInt64Array; + return proto[Symbol.toStringTag] = 'LargeUtf8'; + })(LargeUtf8.prototype); +} + /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -458,13 +478,13 @@ export class Duration extends DataType { } /** @ignore */ -export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); }} +export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); } } /** @ignore */ -export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); }} +export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); } } /** @ignore */ -export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); }} +export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); } } /** @ignore */ -export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); }} +export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); } } /** @ignore */ @@ -581,6 +601,7 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index dd8edf11f9258..4f4379dedf6d8 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -83,9 +83,9 @@ export function joinUint8Arrays(chunks: Uint8Array[], size?: number | null): [Ui } /** @ignore */ -export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | ArrayLike | ByteBuffer | string | null | undefined | - IteratorResult | ArrayLike | ByteBuffer | string | null | undefined> | - ReadableStreamReadResult | ArrayLike | ByteBuffer | string | null | undefined>; +export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined | + IteratorResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined> | + ReadableStreamReadResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined>; /** @ignore */ export function toArrayBufferView< @@ -208,7 +208,9 @@ export async function* toArrayBufferViewAsyncIterator(Arra /** @ignore */ export const toUint8ClampedArrayAsyncIterator = (input: ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint8ClampedArray, input); /** @ignore */ -export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array) { +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array): Int32Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: BigInt64Array): BigInt64Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: any) { // If we have a non-zero offset, create a new offsets array with the values // shifted by the start offset, such that the new start offset is 0 if (offset !== 0) { diff --git a/js/src/visitor.ts b/js/src/visitor.ts index c63640b038e47..5b3cc4d3d0593 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -36,6 +36,7 @@ export abstract class Visitor { public visitInt(_node: any, ..._args: any[]): any { return null; } public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } + public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } @@ -89,6 +90,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; + case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; @@ -152,6 +154,7 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.Utf8: return Type.Utf8; + case Type.LargeUtf8: return Type.LargeUtf8; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -229,6 +232,7 @@ export interface Visitor { visitFloat32?(node: any, ...args: any[]): any; visitFloat64?(node: any, ...args: any[]): any; visitUtf8(node: any, ...args: any[]): any; + visitLargeUtf8(node: any, ...args: any[]): any; visitBinary(node: any, ...args: any[]): any; visitFixedSizeBinary(node: any, ...args: any[]): any; visitDate(node: any, ...args: any[]): any; diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 54b5610a50eed..83374712b2642 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -40,6 +40,7 @@ import { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from '../builder/time.js'; import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; +import { LargeUtf8Builder } from '../builder/largeutf8.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -67,6 +68,7 @@ export class GetBuilderCtor extends Visitor { public visitFloat32() { return Float32Builder; } public visitFloat64() { return Float64Builder; } public visitUtf8() { return Utf8Builder; } + public visitLargeUtf8() { return LargeUtf8Builder; } public visitBinary() { return BinaryBuilder; } public visitFixedSizeBinary() { return FixedSizeBinaryBuilder; } public visitDate() { return DateBuilder; } diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts index 72d6148a52fd8..c3bfadd50e155 100644 --- a/js/src/visitor/bytelength.ts +++ b/js/src/visitor/bytelength.ts @@ -26,7 +26,7 @@ import { Type, TimeUnit, UnionMode } from '../enum.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -40,6 +40,7 @@ export interface GetByteLengthVisitor extends Visitor { getVisitFn(node: T): (data: Data>, index: number) => number; visitBinary(data: Data, index: number): number; visitUtf8(data: Data, index: number): number; + visitLargeUtf8(data: Data, index: number): number; visitList(data: Data, index: number): number; visitDenseUnion(data: Data, index: number): number; visitSparseUnion(data: Data, index: number): number; diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 5aaaedf51a37e..a801c90047c89 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -21,6 +21,7 @@ import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; import { MapRow } from '../row/map.js'; import { StructRow, StructRowProxy } from '../row/struct.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { decodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { uint16ToFloat64 } from '../util/math.js'; @@ -35,7 +36,7 @@ import { Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Duration, DurationSecond, DurationMillisecond, DurationMicrosecond, DurationNanosecond, - Union, DenseUnion, SparseUnion, + Union, DenseUnion, SparseUnion, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -60,6 +61,7 @@ export interface GetVisitor extends Visitor { visitFloat32(data: Data, index: number): T['TValue'] | null; visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; + visitLargeUtf8(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; @@ -122,6 +124,15 @@ const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, ind const y = valueOffsets[index + 1]; return values.subarray(x, y); }; +/** @ignore */ +const getLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: BigInt64Array, index: number) => { + if (index + 1 >= valueOffsets.length) { + return null as any; + } + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); + return values.subarray(x, y); +}; /** @ignore */ const getBool = ({ offset, values }: Data, index: number): T['TValue'] => { @@ -155,6 +166,11 @@ const getUtf8 = ({ values, valueOffsets }: Data, index: numbe const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getLargeUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { + const bytes = getLargeVariableWidthBytes(values, valueOffsets, index); + return bytes !== null ? decodeUtf8(bytes) : null as any; +}; /* istanbul ignore next */ /** @ignore */ @@ -328,6 +344,7 @@ GetVisitor.prototype.visitFloat16 = wrapGet(getFloat16); GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitLargeUtf8 = wrapGet(getLargeUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 4cf0076b3c8e2..76f95788c7953 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,6 +57,7 @@ export interface IndexOfVisitor extends Visitor { visitFloat32(data: Data, value: T['TValue'] | null, index?: number): number; visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; @@ -172,6 +173,7 @@ IndexOfVisitor.prototype.visitFloat16 = indexOfValue; IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index e38bb907695d0..09dfcb0b565ae 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,6 +55,7 @@ export interface IteratorVisitor extends Visitor { visitFloat32(vector: Vector): IterableIterator; visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; + visitLargeUtf8(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; @@ -158,6 +159,7 @@ IteratorVisitor.prototype.visitFloat16 = vectorIterator; IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; +IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index 6e6cfb07413c3..a6746a858ecb4 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -48,6 +48,9 @@ export class JSONTypeAssembler extends Visitor { public visitUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeUtf8({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } @@ -64,7 +67,7 @@ export class JSONTypeAssembler extends Visitor { return { 'name': ArrowType[typeId].toLowerCase(), 'unit': IntervalUnit[unit] }; } public visitDuration({ typeId, unit }: T) { - return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit]}; + return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit] }; } public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 0af954e4adacc..9a3cb8601a434 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -27,7 +27,7 @@ import { BitIterator, getBit, getBool } from '../util/bit.js'; import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -42,6 +42,7 @@ export interface JSONVectorAssembler extends Visitor { visitInt(data: Data): { DATA: number[] | string[] }; visitFloat(data: Data): { DATA: number[] }; visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; + visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; @@ -100,6 +101,9 @@ export class JSONVectorAssembler extends Visitor { public visitUtf8(data: Data) { return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; } + public visitLargeUtf8(data: Data) { + return { 'DATA': [...new Vector([data])], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; + } public visitBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], OFFSET: [...data.valueOffsets] }; } @@ -148,7 +152,7 @@ export class JSONVectorAssembler extends Visitor { return { 'DATA': [...data.values] }; } public visitDuration(data: Data) { - return { 'DATA': [...bigNumsToStrings(data.values, 2)]}; + return { 'DATA': [...bigNumsToStrings(data.values, 2)] }; } public visitFixedSizeList(data: Data) { return { diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index 1a0eddc556899..a439ec8311fd6 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -19,13 +19,14 @@ import { Data } from '../data.js'; import { Field } from '../schema.js'; import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { encodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -58,6 +59,7 @@ export interface SetVisitor extends Visitor { visitFloat32(data: Data, index: number, value: T['TValue']): void; visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; + visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; @@ -123,9 +125,19 @@ export const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { - const { [index]: x, [index + 1]: y } = valueOffsets; + const x = valueOffsets[index]; + const y = valueOffsets[index + 1]; + values.set(value.subarray(0, y - x), x); + } +}; + +/** @ignore */ +export const setLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { + if (index + 1 < valueOffsets.length) { + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); values.set(value.subarray(0, y - x), x); } }; @@ -167,6 +179,10 @@ const setBinary = ({ values, valueOffsets }: Data, index: n const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); }; +/** @ignore */ +const setLargeUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { + setLargeVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -365,6 +381,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitLargeUtf8 = wrapSet(setLargeUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); diff --git a/js/src/visitor/typeassembler.ts b/js/src/visitor/typeassembler.ts index c2262d20531b9..f072714222739 100644 --- a/js/src/visitor/typeassembler.ts +++ b/js/src/visitor/typeassembler.ts @@ -27,6 +27,7 @@ import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; import { Time } from '../fb/time.js'; @@ -78,6 +79,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitLargeUtf8(_node: T, b: Builder) { + LargeUtf8.startLargeUtf8(b); + return LargeUtf8.endLargeUtf8(b); + } public visitDecimal(node: T, b: Builder) { Decimal.startDecimal(b); Decimal.addScale(b, node.scale); diff --git a/js/src/visitor/typecomparator.ts b/js/src/visitor/typecomparator.ts index 1de8e218dae4f..2417dec09c6e9 100644 --- a/js/src/visitor/typecomparator.ts +++ b/js/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -53,6 +53,7 @@ export interface TypeComparator extends Visitor { visitFloat32(type: T, other?: DataType | null): other is T; visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; + visitLargeUtf8(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; @@ -249,6 +250,7 @@ TypeComparator.prototype.visitFloat16 = compareFloat; TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; +TypeComparator.prototype.visitLargeUtf8 = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; diff --git a/js/src/visitor/typector.ts b/js/src/visitor/typector.ts index 077f66592fbfb..2e0bbc4147abb 100644 --- a/js/src/visitor/typector.ts +++ b/js/src/visitor/typector.ts @@ -49,6 +49,7 @@ export class GetDataTypeConstructor extends Visitor { public visitFloat32() { return type.Float32; } public visitFloat64() { return type.Float64; } public visitUtf8() { return type.Utf8; } + public visitLargeUtf8() { return type.LargeUtf8; } public visitBinary() { return type.Binary; } public visitFixedSizeBinary() { return type.FixedSizeBinary; } public visitDate() { return type.Date_; } diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index 949463272e718..7a9d3bdd57b0d 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -27,8 +27,9 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, LargeUtf8, } from '../type.js'; +import { bigIntToNumber } from '../util/bigint.js'; /** @ignore */ export interface VectorAssembler extends Visitor { @@ -204,9 +205,29 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const { [0]: begin, [length]: end } = valueOffsets; + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { + const { length, values, valueOffsets } = data; + const begin = bigIntToNumber(valueOffsets[0]); + const end = bigIntToNumber(valueOffsets[length]); + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function _assembleFlatListVector( + this: VectorAssembler, + length: number, + begin: number, + end: number, + values: T['TArray'], + valueOffsets: T['TOffsetArray'] +) { const byteLength = Math.min(end - begin, values.byteLength - begin); // Push in the order FlatList types read their buffers - addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets as any)); // valueOffsets buffer first addBuffer.call(this, values.subarray(begin, begin + byteLength)); // sliced values buffer second return this; } @@ -234,6 +255,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitLargeUtf8 = assembleLargeFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index db34edad9a1c1..35f28f49baada 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -71,6 +71,9 @@ export class VectorLoader extends Visitor { public visitUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } @@ -151,7 +154,7 @@ export class JSONVectorLoader extends VectorLoader { return nullCount <= 0 ? new Uint8Array(0) : packBools(this.sources[offset]); } protected readOffsets(_type: T, { offset } = this.nextBufferRange()) { - return toArrayBufferView(Uint8Array, toArrayBufferView(Int32Array, this.sources[offset])); + return toArrayBufferView(Uint8Array, toArrayBufferView(_type.OffsetArrayType, this.sources[offset])); } protected readTypeIds(type: T, { offset } = this.nextBufferRange()) { return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, this.sources[offset])); @@ -170,7 +173,7 @@ export class JSONVectorLoader extends VectorLoader { return binaryDataFromJSON(sources[offset] as string[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); - } else if (DataType.isUtf8(type)) { + } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { return encodeUtf8((sources[offset] as string[]).join('')); } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); diff --git a/js/test/data/tables.ts b/js/test/data/tables.ts index 28aed7e4feccf..449cfe1fb853a 100644 --- a/js/test/data/tables.ts +++ b/js/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 15fb715a31f95..9d7b038331fe6 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -24,7 +24,7 @@ import { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -52,6 +52,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; @@ -75,6 +76,7 @@ interface TestDataVectorGenerator extends Visitor { visitUint64: typeof generateBigInt; visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; + visitLargeUtf8: typeof generateLargeUtf8; visitBinary: typeof generateBinary; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; @@ -100,6 +102,7 @@ TestDataVectorGenerator.prototype.visitInt64 = generateBigInt; TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; +TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; @@ -214,6 +217,7 @@ export const float16 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float32(), length, nullCount); export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); +export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); @@ -242,7 +246,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -312,7 +316,7 @@ function generateFloat(this: TestDataVectorGenerator, type: T, function generateUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values: string[] = new Array(valueOffsets.length - 1).fill(null); [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) @@ -332,9 +336,31 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); + const values: string[] = new Array(valueOffsets.length - 1).fill(null); + [...valueOffsets.slice(1)] + .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) + .reduce((map, length, i) => { + if (length !== null) { + if (length > 0) { + do { + values[i] = randomString(Number(length)); + } while (map.has(values[i])); + return map.set(values[i], i); + } + values[i] = ''; + } + return map; + }, new Map()); + const data = createVariableWidthBytes(length, nullBitmap, valueOffsets, (i) => encodeUtf8(values[i])); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; +} + function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values = [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) .map((length) => length == null ? null : randomBytes(length)); @@ -443,7 +469,7 @@ function generateList(this: TestDataVectorGenerator, type: T, le const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues = child.values(); const values: (T['valueType'] | null)[] = [...valueOffsets.slice(1)] @@ -581,7 +607,7 @@ function generateMap(this: TestDataVectorGenerator, const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues: { key: K; value: V }[] = child.values(); const values: (Record | null)[] = [...valueOffsets.slice(1)] @@ -660,7 +686,7 @@ function createBitmap(length: number, nullCount: number) { return bytes; } -function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { +function createVariableWidthOffsets32(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { const offsets = new Int32Array(length + 1); iterateBitmap(length, nullBitmap, (i, valid) => { if (!valid) { @@ -674,10 +700,24 @@ function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min return offsets; } -function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array, getBytes: (index: number) => Uint8Array) { - const bytes = new Uint8Array(offsets[length]); +function createVariableWidthOffsets64(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { + const offsets = new BigInt64Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + BigInt(Math.min(max, Math.max(min, Math.trunc(rand() * max)))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; +} + +function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array | BigInt64Array, getBytes: (index: number) => Uint8Array) { + const bytes = new Uint8Array(Number(offsets[length])); iterateBitmap(length, nullBitmap, (i, valid) => { - valid && bytes.set(getBytes(i), offsets[i]); + valid && bytes.set(getBytes(i), Number(offsets[i])); }); return bytes; } diff --git a/js/test/unit/builders/builder-tests.ts b/js/test/unit/builders/builder-tests.ts index b261e4f815e3a..0137c7aa66635 100644 --- a/js/test/unit/builders/builder-tests.ts +++ b/js/test/unit/builders/builder-tests.ts @@ -44,6 +44,7 @@ describe('Generated Test Data', () => { describe('Float32Builder', () => { validateBuilder(generate.float32); }); describe('Float64Builder', () => { validateBuilder(generate.float64); }); describe('Utf8Builder', () => { validateBuilder(generate.utf8); }); + describe('LargeUtf8Builder', () => { validateBuilder(generate.largeUtf8); }); describe('BinaryBuilder', () => { validateBuilder(generate.binary); }); describe('FixedSizeBinaryBuilder', () => { validateBuilder(generate.fixedSizeBinary); }); describe('DateDayBuilder', () => { validateBuilder(generate.dateDay); }); diff --git a/js/test/unit/builders/largeUtf8-tests.ts b/js/test/unit/builders/largeUtf8-tests.ts new file mode 100644 index 0000000000000..c789d5dbb1671 --- /dev/null +++ b/js/test/unit/builders/largeUtf8-tests.ts @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import 'web-streams-polyfill'; + +import { validateVector } from './utils.js'; +import { + encodeAll, + encodeEach, + encodeEachDOM, + encodeEachNode, + stringsNoNulls, + stringsWithNAs, + stringsWithNulls, + stringsWithEmpties +} from './utils.js'; + +import { Vector, LargeUtf8 } from 'apache-arrow'; + +const testDOMStreams = process.env.TEST_DOM_STREAMS === 'true'; +const testNodeStreams = process.env.TEST_NODE_STREAMS === 'true'; + +describe('LargeUtf8Builder', () => { + runTestsWithEncoder('encodeAll', encodeAll(() => new LargeUtf8())); + runTestsWithEncoder('encodeEach: 5', encodeEach(() => new LargeUtf8(), 5)); + runTestsWithEncoder('encodeEach: 25', encodeEach(() => new LargeUtf8(), 25)); + runTestsWithEncoder('encodeEach: undefined', encodeEach(() => new LargeUtf8(), void 0)); + testDOMStreams && runTestsWithEncoder('encodeEachDOM: 25', encodeEachDOM(() => new LargeUtf8(), 25)); + testNodeStreams && runTestsWithEncoder('encodeEachNode: 25', encodeEachNode(() => new LargeUtf8(), 25)); +}); + +function runTestsWithEncoder(name: string, encode: (vals: (string | null)[], nullVals?: any[]) => Promise>) { + describe(`${encode.name} ${name}`, () => { + it(`encodes strings no nulls`, async () => { + const vals = stringsNoNulls(20); + validateVector(vals, await encode(vals, []), []); + }); + it(`encodes strings with nulls`, async () => { + const vals = stringsWithNulls(20); + validateVector(vals, await encode(vals, [null]), [null]); + }); + it(`encodes strings using n/a as the null value rep`, async () => { + const vals = stringsWithNAs(20); + validateVector(vals, await encode(vals, ['n/a']), ['n/a']); + }); + it(`encodes strings using \\0 as the null value rep`, async () => { + const vals = stringsWithEmpties(20); + validateVector(vals, await encode(vals, ['\0']), ['\0']); + }); + }); +} diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts index d64c7c188d3ed..0a06bcbab8ee0 100644 --- a/js/test/unit/generated-data-tests.ts +++ b/js/test/unit/generated-data-tests.ts @@ -38,6 +38,7 @@ describe('Generated Test Data', () => { describe('Float32', () => { validateVector(generate.float32()); }); describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); + describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); diff --git a/js/test/unit/generated-data-validators.ts b/js/test/unit/generated-data-validators.ts index 52f642d2a6e89..57ee94876c300 100644 --- a/js/test/unit/generated-data-validators.ts +++ b/js/test/unit/generated-data-validators.ts @@ -113,7 +113,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); if (keys && keys.length > 0) { test(`dictionary indices should match`, () => { @@ -126,7 +128,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { ? expect(indices.get(i)).toBe(keys[i]) : expect(indices.get(i)).toBeNull(); } - } catch (e) { throw new Error(`${indices}[${i}]: ${e}`); } + } catch (e) { + throw new Error(`${indices}[${i}]: ${e}`); + } }); } test(`sets expected values`, () => { @@ -139,7 +143,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { actual = vector.get(i); expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`iterates expected values`, () => { expect.hasAssertions(); @@ -149,7 +155,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[++i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`indexOf returns expected values`, () => { expect.hasAssertions(); @@ -169,7 +177,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expect(vector.indexOf('purple elephants')).toBe(-1); expect(vector.indexOf('whistling wombats')).toBe(-1); expect(vector.indexOf('carnivorous novices')).toBe(-1); - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); } diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index a259cbef87772..bfcf0d8547861 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -196,6 +196,28 @@ describe(`Utf8Vector`, () => { }); }); +describe(`LargeUtf8Vector`, () => { + const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; + const vector = vectorFromArray(values, new LargeUtf8); + + test(`has largeUtf8 type`, () => { + expect(vector.type).toBeInstanceOf(LargeUtf8); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, ['abc', '123']); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), ['foo', 'abc']); + }); +}); + describe(`ListVector`, () => { const values = [[1, 2], [1, 2, 3]]; const vector = vectorFromArray(values); diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts index 8a7ba1ed778aa..f78adc59f8e98 100644 --- a/js/test/unit/visitor-tests.ts +++ b/js/test/unit/visitor-tests.ts @@ -18,7 +18,7 @@ import { Field, Visitor, DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -36,6 +36,7 @@ class BasicVisitor extends Visitor { public visitInt(type: T) { return (this.type = type); } public visitFloat(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDate(type: T) { return (this.type = type); } @@ -68,6 +69,7 @@ class FeatureVisitor extends Visitor { public visitFloat32(type: T) { return (this.type = type); } public visitFloat64(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDateDay(type: T) { return (this.type = type); } @@ -104,6 +106,7 @@ describe('Visitor', () => { test(`visits Int types`, () => validateBasicVisitor(new Int(true, 32))); test(`visits Float types`, () => validateBasicVisitor(new Float(0))); test(`visits Utf8 types`, () => validateBasicVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateBasicVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateBasicVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateBasicVisitor(new FixedSizeBinary(128))); test(`visits Date types`, () => validateBasicVisitor(new Date_(0))); @@ -144,6 +147,7 @@ describe('Visitor', () => { test(`visits Float32 types`, () => validateFeatureVisitor(new Float32())); test(`visits Float64 types`, () => validateFeatureVisitor(new Float64())); test(`visits Utf8 types`, () => validateFeatureVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateFeatureVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateFeatureVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateFeatureVisitor(new FixedSizeBinary(128))); test(`visits DateDay types`, () => validateFeatureVisitor(new DateDay()));