Skip to content

Commit

Permalink
GH-15060: [JS] Add LargeUtf8 type
Browse files Browse the repository at this point in the history
  • Loading branch information
domoritz committed May 26, 2023
1 parent 2d32efe commit 179e858
Show file tree
Hide file tree
Showing 17 changed files with 131 additions and 27 deletions.
4 changes: 2 additions & 2 deletions js/src/Arrow.dom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export {
Bool,
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8,
Utf8, LargeUtf8,
Binary,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Expand Down Expand Up @@ -94,5 +94,5 @@ export {
TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder,
TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder,
UnionBuilder, DenseUnionBuilder, SparseUnionBuilder,
Utf8Builder,
Utf8Builder, LargeUtf8Builder
} from './Arrow.js';
3 changes: 2 additions & 1 deletion js/src/Arrow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export {
Bool,
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8,
Utf8, LargeUtf8,
Binary,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Expand Down Expand Up @@ -76,6 +76,7 @@ export { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecond
export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './builder/timestamp.js';
export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js';
export { Utf8Builder } from './builder/utf8.js';
export { LargeUtf8Builder } from './builder/largeutf8.js';
export { BinaryBuilder } from './builder/binary.js';
export { ListBuilder } from './builder/list.js';
export { FixedSizeListBuilder } from './builder/fixedsizelist.js';
Expand Down
9 changes: 5 additions & 4 deletions js/src/builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import {
DataType, strideForType,
Float, Int, Decimal, FixedSizeBinary,
Date_, Time, Timestamp, Interval,
Utf8, Binary, List, Map_,
Utf8, LargeUtf8, Binary, List, Map_,
} from './type.js';
import { createIsValidFunction } from './builder/valid.js';
import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js';
Expand Down Expand Up @@ -163,6 +163,7 @@ export abstract class Builder<T extends DataType = any, TNull = any> {
public toVector() { return new Vector([this.flush()]); }

public get ArrayType() { return this.type.ArrayType; }
public get OffsetType() { return this.type.OffsetType; }
public get nullCount() { return this._nulls.numInvalid; }
public get numChildren() { return this.children.length; }

Expand Down Expand Up @@ -355,13 +356,13 @@ export abstract class FixedWidthBuilder<T extends Int | Float | FixedSizeBinary
}

/** @ignore */
export abstract class VariableWidthBuilder<T extends Binary | Utf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
export abstract class VariableWidthBuilder<T extends Binary | Utf8 | LargeUtf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
protected _pendingLength = 0;
protected _offsets: OffsetsBufferBuilder;
protected _offsets: OffsetsBufferBuilder<T>;
protected _pending: Map<number, any> | undefined;
constructor(opts: BuilderOptions<T, TNull>) {
super(opts);
this._offsets = new OffsetsBufferBuilder();
this._offsets = new OffsetsBufferBuilder(opts.type);
}
public setValue(index: number, value: T['TValue']) {
const pending = this._pending || (this._pending = new Map());
Expand Down
5 changes: 3 additions & 2 deletions js/src/builder/buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
TypedArray, TypedArrayConstructor,
BigIntArray, BigIntArrayConstructor
} from '../interfaces.js';
import { DataType } from '../type.js';

/** @ignore */ type DataValue<T> = T extends TypedArray ? number : T extends BigIntArray ? WideValue<T> : T;
/** @ignore */ type WideValue<T extends BigIntArray> = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never;
Expand Down Expand Up @@ -134,8 +135,8 @@ export class BitmapBufferBuilder extends DataBufferBuilder<Uint8Array> {
}

/** @ignore */
export class OffsetsBufferBuilder extends DataBufferBuilder<Int32Array> {
constructor(data = new Int32Array(1)) { super(data, 1); }
export class OffsetsBufferBuilder<T extends DataType> extends DataBufferBuilder<T['TOffset']> {
constructor(type: T) { super(new type.OffsetType(1), 1); }
public append(value: number) {
return this.set(this.length - 1, value);
}
Expand Down
44 changes: 44 additions & 0 deletions js/src/builder/largeutf8.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { LargeUtf8 } from '../type.js';
import { encodeUtf8 } from '../util/utf8.js';
import { BinaryBuilder } from './binary.js';
import { BufferBuilder } from './buffer.js';
import { VariableWidthBuilder, BuilderOptions } from '../builder.js';

/**
 * Builder for `LargeUtf8` columns. Strings handed to `setValue` are encoded to
 * UTF-8 bytes and queued through `VariableWidthBuilder`; the routine that
 * drains that queue is borrowed from `BinaryBuilder` (see the prototype patch
 * below the class).
 *
 * @ignore
 */
export class LargeUtf8Builder<TNull = any> extends VariableWidthBuilder<LargeUtf8, TNull> {
    constructor(opts: BuilderOptions<LargeUtf8, TNull>) {
        super(opts);
        // The value bytes accumulate in a buffer builder seeded with an empty Uint8Array.
        this._values = new BufferBuilder(new Uint8Array(0));
    }

    /**
     * Running size figure for this builder: pending bytes, plus 4 per value
     * appended so far, plus the byte lengths of whichever of the offsets,
     * values and validity builders currently exist.
     */
    public get byteLength(): number {
        // NOTE(review): the factor 4 matches a 32-bit offset width, whereas
        // LargeUtf8 declares BigUint64Array offsets — confirm whether 8 is intended.
        let total = this._pendingLength + (this.length * 4);
        if (this._offsets) {
            total += this._offsets.byteLength;
        }
        if (this._values) {
            total += this._values.byteLength;
        }
        if (this._nulls) {
            total += this._nulls.byteLength;
        }
        return total;
    }

    /** Encode `value` as UTF-8 and hand the resulting bytes to the base builder. */
    public setValue(index: number, value: string) {
        const encoded = encodeUtf8(value);
        return super.setValue(index, encoded as any);
    }

    // Placeholder only: the real implementation is installed on the prototype below.
    // @ts-ignore
    protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number): void { }
}

// Reuse BinaryBuilder's flush routine instead of duplicating it here.
(LargeUtf8Builder.prototype as any)._flushPending = (BinaryBuilder.prototype as any)._flushPending;
4 changes: 2 additions & 2 deletions js/src/builder/list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js';

/** @ignore */
export class ListBuilder<T extends DataType = any, TNull = any> extends VariableWidthBuilder<List<T>, TNull> {
protected _offsets: OffsetsBufferBuilder;
protected _offsets: OffsetsBufferBuilder<List<T>>;
constructor(opts: BuilderOptions<List<T>, TNull>) {
super(opts);
this._offsets = new OffsetsBufferBuilder();
this._offsets = new OffsetsBufferBuilder(opts.type);
}
public addChild(child: Builder<T>, name = '0') {
if (this.numChildren > 0) {
Expand Down
14 changes: 12 additions & 2 deletions js/src/data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import { Vector } from './vector.js';
import { BufferType, Type } from './enum.js';
import { DataType, strideForType } from './type.js';
import { DataType, LargeUtf8, strideForType } from './type.js';
import { popcnt_bit_range, truncateBitmap } from './util/bit.js';

// When slicing, we do not know the null count of the sliced range without
Expand All @@ -34,7 +34,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js';

/** @ignore */
export interface Buffers<T extends DataType> {
[BufferType.OFFSET]: Int32Array;
[BufferType.OFFSET]: T['TOffset'];
[BufferType.DATA]: T['TArray'];
[BufferType.VALIDITY]: Uint8Array;
[BufferType.TYPE]: T['TArray'];
Expand Down Expand Up @@ -259,6 +259,14 @@ class MakeDataVisitor extends Visitor {
const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props;
return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]);
}
/**
 * Build a `Data` instance for a LargeUtf8 column from caller-supplied props.
 * `offset` defaults to 0, `length` to one less than the number of value
 * offsets, and `nullCount` to -1 when a null bitmap was supplied (0 otherwise).
 */
public visitLargeUtf8<T extends LargeUtf8>(props: LargeUtf8DataProps<T>) {
    const type = props['type'];
    const { ['offset']: offset = 0 } = props;
    const data = toUint8Array(props['data']);
    const nullBitmap = toUint8Array(props['nullBitmap']);
    // NOTE(review): LargeUtf8 declares a 64-bit offset array (BigUint64Array),
    // yet the offsets are coerced with toInt32Array here — confirm whether a
    // 64-bit conversion helper should be used instead.
    const valueOffsets = toInt32Array(props['valueOffsets']);
    const {
        ['length']: length = valueOffsets.length - 1,
        ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0,
    } = props;
    return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]);
}
public visitBinary<T extends Binary>(props: BinaryDataProps<T>) {
const { ['type']: type, ['offset']: offset = 0 } = props;
const data = toUint8Array(props['data']);
Expand Down Expand Up @@ -381,6 +389,7 @@ interface IntervalDataProps<T extends Interval> extends DataProps_<T> { data?: D
interface FixedSizeBinaryDataProps<T extends FixedSizeBinary> extends DataProps_<T> { data?: DataBuffer<T> }
interface BinaryDataProps<T extends Binary> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
interface Utf8DataProps<T extends Utf8> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
interface LargeUtf8DataProps<T extends LargeUtf8> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
interface ListDataProps<T extends List> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; child: Data<T['valueType']> }
interface FixedSizeListDataProps<T extends FixedSizeList> extends DataProps_<T> { child: Data<T['valueType']> }
interface StructDataProps<T extends Struct> extends DataProps_<T> { children: Data[] }
Expand All @@ -403,6 +412,7 @@ export type DataProps<T extends DataType> = (
T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps<T> :
T extends Binary /* */ ? BinaryDataProps<T> :
T extends Utf8 /* */ ? Utf8DataProps<T> :
T extends LargeUtf8 /* */ ? LargeUtf8DataProps<T> :
T extends List /* */ ? ListDataProps<T> :
T extends FixedSizeList /* */ ? FixedSizeListDataProps<T> :
T extends Struct /* */ ? StructDataProps<T> :
Expand Down
1 change: 1 addition & 0 deletions js/src/enum.ts
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ export enum Type {
SparseUnion = -24,
IntervalDayTime = -25,
IntervalYearMonth = -26,
LargeUtf8 = -27,
}

export enum BufferType {
Expand Down
10 changes: 7 additions & 3 deletions js/src/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import type { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicros
import type { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './builder/timestamp.js';
import type { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js';
import type { Utf8Builder } from './builder/utf8.js';
import type { LargeUtf8Builder } from './builder/largeutf8.js';
import type { BinaryBuilder } from './builder/binary.js';
import type { ListBuilder } from './builder/list.js';
import type { FixedSizeListBuilder } from './builder/fixedsizelist.js';
Expand Down Expand Up @@ -104,7 +105,7 @@ export type BuilderCtorArgs<
TArgs extends any[] = any[],
TCtor extends new (type: R, ...args: TArgs) => T =
new (type: R, ...args: TArgs) => T
> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never;
> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never;

/**
* Obtain the constructor function of an instance type
Expand All @@ -114,15 +115,15 @@ export type ConstructorType<
T,
TCtor extends new (...args: any[]) => T =
new (...args: any[]) => T
> = TCtor extends new (...args: any[]) => T ? TCtor : never;
> = TCtor extends new (...args: any[]) => T ? TCtor : never;

/** @ignore */
export type BuilderCtorType<
T extends BuilderType<R, any>,
R extends DataType = any,
TCtor extends new (options: BuilderOptions<R, any>) => T =
new (options: BuilderOptions<R, any>) => T
> = TCtor extends new (options: BuilderOptions<R, any>) => T ? TCtor : never;
> = TCtor extends new (options: BuilderOptions<R, any>) => T ? TCtor : never;

/** @ignore */
export type BuilderType<T extends Type | DataType = any, TNull = any> =
Expand Down Expand Up @@ -200,6 +201,7 @@ export type TypeToDataType<T extends Type> = {
[Type.Float64]: type.Float64;
[Type.Float]: type.Float;
[Type.Utf8]: type.Utf8;
[Type.LargeUtf8]: type.LargeUtf8;
[Type.Binary]: type.Binary;
[Type.FixedSizeBinary]: type.FixedSizeBinary;
[Type.Date]: type.Date_;
Expand Down Expand Up @@ -248,6 +250,7 @@ type TypeToBuilder<T extends Type = any, TNull = any> = {
[Type.Float64]: Float64Builder<TNull>;
[Type.Float]: FloatBuilder<any, TNull>;
[Type.Utf8]: Utf8Builder<TNull>;
[Type.LargeUtf8]: LargeUtf8Builder<TNull>;
[Type.Binary]: BinaryBuilder<TNull>;
[Type.FixedSizeBinary]: FixedSizeBinaryBuilder<TNull>;
[Type.Date]: DateBuilder<any, TNull>;
Expand Down Expand Up @@ -296,6 +299,7 @@ type DataTypeToBuilder<T extends DataType = any, TNull = any> = {
[Type.Float64]: T extends type.Float64 ? Float64Builder<TNull> : never;
[Type.Float]: T extends type.Float ? FloatBuilder<T, TNull> : never;
[Type.Utf8]: T extends type.Utf8 ? Utf8Builder<TNull> : never;
[Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder<TNull> : never;
[Type.Binary]: T extends type.Binary ? BinaryBuilder<TNull> : never;
[Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder<TNull> : never;
[Type.Date]: T extends type.Date_ ? DateBuilder<T, TNull> : never;
Expand Down
26 changes: 24 additions & 2 deletions js/src/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import { Field } from './schema.js';
import { Vector } from './vector.js';
import { MapRow } from './row/map.js';
import { StructRow, StructRowProxy } from './row/struct.js';
import { TypedArrayConstructor } from './interfaces.js';
import { BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js';
import { BigInt64Array, BigUint64Array } from './util/compat.js';
import { bigIntToNumber } from './util/bigint.js';

Expand All @@ -39,9 +39,11 @@ export type IsSigned = { 'true': true; 'false': false };
export interface DataType<TType extends Type = Type, TChildren extends TypeMap = any> {
readonly TType: TType;
readonly TArray: any;
readonly TOffset: any;
readonly TValue: any;
readonly TChildren: TChildren;
readonly ArrayType: any;
readonly OffsetType: TypedArrayConstructor<Uint32Array> | BigIntArrayConstructor<BigUint64Array>;
readonly children: Field<TChildren[keyof TChildren]>[];
}

Expand All @@ -58,6 +60,7 @@ export abstract class DataType<TType extends Type = Type, TChildren extends Type
/** @nocollapse */ static isFloat(x: any): x is Float { return x?.typeId === Type.Float; }
/** @nocollapse */ static isBinary(x: any): x is Binary { return x?.typeId === Type.Binary; }
/** @nocollapse */ static isUtf8(x: any): x is Utf8 { return x?.typeId === Type.Utf8; }
/** @nocollapse */ static isLargeUtf8(x: any): x is LargeUtf8 { return x?.typeId === Type.LargeUtf8; }
/** @nocollapse */ static isBool(x: any): x is Bool { return x?.typeId === Type.Bool; }
/** @nocollapse */ static isDecimal(x: any): x is Decimal { return x?.typeId === Type.Decimal; }
/** @nocollapse */ static isDate(x: any): x is Date_ { return x?.typeId === Type.Date; }
Expand All @@ -80,6 +83,7 @@ export abstract class DataType<TType extends Type = Type, TChildren extends Type
protected static [Symbol.toStringTag] = ((proto: DataType) => {
(<any>proto).children = null;
(<any>proto).ArrayType = Array;
(<any>proto).OffsetType = Array;
return proto[Symbol.toStringTag] = 'DataType';
})(DataType.prototype);
}
Expand Down Expand Up @@ -247,7 +251,7 @@ export class Binary extends DataType<Type.Binary> {
}

/** @ignore */
export interface Utf8 extends DataType<Type.Utf8> { TArray: Uint8Array; TValue: string; ArrayType: TypedArrayConstructor<Uint8Array> }
export interface Utf8 extends DataType<Type.Utf8> { TArray: Uint8Array; TOffset: Uint32Array; TValue: string; ArrayType: TypedArrayConstructor<Uint8Array>; OffsetType: TypedArrayConstructor<Uint32Array> }
/** @ignore */
export class Utf8 extends DataType<Type.Utf8> {
constructor() {
Expand All @@ -257,10 +261,27 @@ export class Utf8 extends DataType<Type.Utf8> {
public toString() { return `Utf8`; }
protected static [Symbol.toStringTag] = ((proto: Utf8) => {
(<any>proto).ArrayType = Uint8Array;
(<any>proto).OffsetType = Uint32Array;
return proto[Symbol.toStringTag] = 'Utf8';
})(Utf8.prototype);
}

/**
 * Type-level shape of `LargeUtf8`: values are JavaScript strings, value bytes
 * are held in a `Uint8Array`, and offsets are held in a `BigUint64Array`.
 * This interface merges with the class of the same name declared below.
 * @ignore
 */
export interface LargeUtf8 extends DataType<Type.LargeUtf8> { TArray: Uint8Array; TOffset: BigUint64Array; TValue: string; ArrayType: TypedArrayConstructor<Uint8Array>; OffsetType: BigIntArrayConstructor<BigUint64Array> }
/**
 * Runtime `DataType` for string columns whose offsets are 64-bit.
 * NOTE(review): offsets are declared as unsigned (`BigUint64Array`); confirm
 * against the Arrow columnar format whether signed 64-bit offsets are expected.
 * @ignore
 */
export class LargeUtf8 extends DataType<Type.LargeUtf8> {
constructor() {
super();
}
// The `as` cast keeps the literal enum member type rather than the wider `Type`.
public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; }
public toString() { return `LargeUtf8`; }
// Runs once when the class is defined: stores the array constructors on the
// prototype so every instance shares them, and sets the toStringTag.
protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => {
(<any>proto).ArrayType = Uint8Array;
(<any>proto).OffsetType = BigUint64Array;
return proto[Symbol.toStringTag] = 'LargeUtf8';
})(LargeUtf8.prototype);
}

/** @ignore */
export interface Bool extends DataType<Type.Bool> { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor<Uint8Array> }
/** @ignore */
Expand Down Expand Up @@ -548,6 +569,7 @@ export class FixedSizeBinary extends DataType<Type.FixedSizeBinary> {
protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => {
(<any>proto).byteWidth = null;
(<any>proto).ArrayType = Uint8Array;
(<any>proto).OffsetType = Uint32Array;
return proto[Symbol.toStringTag] = 'FixedSizeBinary';
})(FixedSizeBinary.prototype);
}
Expand Down
4 changes: 2 additions & 2 deletions js/src/visitor/get.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,13 @@ function wrapGet<T extends DataType>(fn: (data: Data<T>, _1: any) => any) {
/** @ignore */
const getNull = <T extends Null>(_data: Data<T>, _index: number): T['TValue'] => null;
/**
 * Slice the bytes of the `index`-th variable-width value out of `values`.
 *
 * The value spans `valueOffsets[index]` (inclusive) to
 * `valueOffsets[index + 1]` (exclusive). Both 32-bit and 64-bit offset arrays
 * are accepted, so callers written against either signature keep working.
 *
 * @param values Contiguous byte buffer holding every value back to back.
 * @param valueOffsets Start offsets; value `i` needs entries `i` and `i + 1`.
 * @param index Position of the value to read.
 * @returns A `Uint8Array` view over the value's bytes (no copy), or `null`
 * when `valueOffsets` has no entry at `index + 1`.
 * @ignore
 */
const getVariableWidthBytes = (
    values: Uint8Array,
    valueOffsets: Int32Array | Uint32Array | BigInt64Array | BigUint64Array,
    index: number
) => {
    // The end of value `index` is stored at `index + 1`; without it there is nothing to read.
    if (index + 1 >= valueOffsets.length) {
        return null as any;
    }
    const x = valueOffsets[index];
    const y = valueOffsets[index + 1];
    // 64-bit offset arrays yield bigint elements, but subarray() takes numbers.
    return values.subarray(Number(x), Number(y));
};

/** @ignore */
Expand Down
5 changes: 4 additions & 1 deletion js/src/visitor/jsonvectorassembler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import { BitIterator, getBit, getBool } from '../util/bit.js';
import {
DataType,
Float, Int, Date_, Interval, Time, Timestamp, Union,
Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray,
Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, LargeUtf8,
} from '../type.js';

/** @ignore */
Expand Down Expand Up @@ -98,6 +98,9 @@ export class JSONVectorAssembler extends Visitor {
/**
 * Produce the JSON fields for a Utf8 chunk: the values read through a
 * one-chunk `Vector`, and the raw value offsets copied into a plain array.
 */
public visitUtf8<T extends Utf8>(data: Data<T>) {
    const values = [...new Vector([data])];
    const offsets = [...data.valueOffsets];
    return { 'DATA': values, 'OFFSET': offsets };
}
/**
 * Produce the JSON fields for a LargeUtf8 chunk: the values read through a
 * one-chunk `Vector`, and the value offsets rendered as decimal strings.
 */
public visitLargeUtf8<T extends LargeUtf8>(data: Data<T>) {
    const values = [...new Vector([data])];
    // LargeUtf8 offsets come from a 64-bit array whose elements are bigint.
    // JSON.stringify throws on bigint, so each offset is emitted as a decimal
    // string, the representation the Arrow integration JSON format uses for
    // 64-bit offsets.
    const offsets = [...data.valueOffsets].map((offset) => `${offset}`);
    return { 'DATA': values, 'OFFSET': offsets };
}
/**
 * Produce the JSON fields for a Binary chunk: the values of a one-chunk
 * `Vector` passed through `binaryToString`, and the raw value offsets copied
 * into a plain array.
 */
public visitBinary<T extends Binary>(data: Data<T>) {
    const values = [...binaryToString(new Vector([data]))];
    const offsets = [...data.valueOffsets];
    return { 'DATA': values, OFFSET: offsets };
}
Expand Down
Loading

0 comments on commit 179e858

Please sign in to comment.