From 074ac1ad7a2c495e76310405cf61738985c08441 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 18 Jul 2024 23:28:25 -0400 Subject: [PATCH] add "prepare" script --- package.json | 1 + src/selection.ts | 116 +++++++++------------------ src/tixrixqid.ts | 201 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 240 insertions(+), 78 deletions(-) create mode 100644 src/tixrixqid.ts diff --git a/package.json b/package.json index 47ec08cd6..f8de2cf47 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "scripts": { "dev": "vite --mode dev --port 3344 --host", "format": "prettier --write src", + "prepare": "npm run build", "build": "vite build && tsc", "prepublishOnly": "vite build && tsc && typedoc --skipErrorChecking src/*", "test": "vite build && npm run test:node", diff --git a/src/selection.ts b/src/selection.ts index af4c7b9a7..32af8a80d 100644 --- a/src/selection.ts +++ b/src/selection.ts @@ -2,17 +2,16 @@ import { Deeptable } from './Deeptable'; import { Scatterplot } from './scatterplot'; import { Tile } from './tile'; +import { getTileFromRow } from './tixrixqid'; import type * as DS from './shared.d'; import { Bool, - DataType, StructRowProxy, - Type, Utf8, Vector, makeData, } from 'apache-arrow'; -import { bisectLeft, range } from 'd3-array'; +import { range } from 'd3-array'; interface SelectParams { name: string; useNameCache?: boolean; // If true and a selection with that name already exists, use it and ignore all passed parameters. Otherwise, throw an error. @@ -501,14 +500,14 @@ export class DataSelection { return this; } - async removePoints(name: string, ixes: bigint[]): Promise { - return this.add_or_remove_points(name, ixes, 'remove'); + async removePoints(name: string, points: StructRowProxy[]): Promise { + return this.add_or_remove_points(name, points, 'remove'); } // Non-editable behavior: // if a single point is added, will also adjust the cursor. 
- async addPoints(name: string, ixes: bigint[]): Promise { - return this.add_or_remove_points(name, ixes, 'add'); + async addPoints(name: string, points: StructRowProxy[]): Promise { + return this.add_or_remove_points(name, points, 'add'); } /** @@ -539,27 +538,20 @@ export class DataSelection { // } public moveCursorToPoint( - point: StructRowProxy<{ ix: DataType }>, + point: StructRowProxy, ) { // The point contains a field called 'ix', which increases in each tile; // we use this for moving because it lets us do binary search for relevant tile. const rowNumber = point[Symbol.for('rowIndex')] as number; - const ix = point.ix as bigint; - if (point.ix === undefined) { - throw new Error( - 'Unable to move cursor to point, because it has no `ix` property.', - ); - } + const relevantTile = getTileFromRow(point, this.deeptable); + let currentOffset = 0; - let relevantTile: Tile = undefined; - let current_tile_ix = 0; let positionInTile: number; + + let current_tile_ix = 0; for (const match_length of this.match_count) { const tile = this.tiles[current_tile_ix]; - - const ixcol = tile.record_batch.getChild('ix').data[0]; - if (ixcol[rowNumber] === ix) { - relevantTile = tile; + if (tile.key === relevantTile.key) { positionInTile = rowNumber; break; } @@ -567,10 +559,6 @@ export class DataSelection { currentOffset += match_length; } - if (relevantTile === undefined || positionInTile === undefined) { - return null; - } - const column = relevantTile.record_batch.getChild( this.name, ) as Vector; @@ -586,76 +574,48 @@ export class DataSelection { private async add_or_remove_points( newName: string, - ixes: bigint[], + points: StructRowProxy[], which: 'add' | 'remove', - ) { - let newCursor = 0; - let tileOfMatch = undefined; + ) : Promise{ + + const matches : Record= {}; + for (const point of points) { + const t = getTileFromRow(point, this.deeptable); + const rowNum = point[Symbol.for('rowIndex')] as number; + if (!matches[t.key]) { + matches[t.key] = [rowNum]; + } else { 
+ matches[t.key].push(rowNum); + } + } + const tileFunction = async (tile: Tile) => { - newCursor = -1; await this.ready; // First, get the current version of the tile. const original = (await tile.get_column(this.name)) as Vector; - // Then locate the ix column and look for matches. - const ixcol = tile.record_batch.getChild('ix').data[0] - .values as BigInt64Array; - const mask = Bitmask.from_arrow(original); - for (const ix of ixes) { - // Since ix is ordered, we can do a fast binary search to see if the - // point is there--no need for a full scan. - - //@ts-expect-error d3.bisect is not aware it works with bigints as well as numbers - const mid = bisectLeft([...ixcol], ix as unknown as number); - const val = tile.record_batch.get(mid); - // We have to check that there's actually a match, - // because the binary search identifies where it *would* be. - if (val !== null && val.ix === ix) { - // Copy the buffer so we don't overwrite the old one. - // Set the specific value. + + // Then if there are matches. + if (matches[tile.key] !== undefined) { + const mask = Bitmask.from_arrow(original); + for (const rowNum of matches[tile.key]) { if (which === 'add') { - mask.set(mid); - if (ixes.length === 1) { - tileOfMatch = tile.key; - // For single additions, we also move the cursor to the - // newly added point. - // First we see the number of points earlier on the current tile. - let offset_in_tile = 0; - for (let i = 0; i < mid; i++) { - if (mask.get(i)) { - offset_in_tile += 1; - } - } - // Then, we count the number of matches already seen - newCursor = offset_in_tile; - } + mask.set(rowNum); } else { - // If deleting, we set it to zero. 
- mask.unset(mid); - } + mask.unset(rowNum); + } } + return mask.to_arrow(); + } else { + return original; } - return mask.to_arrow(); }; + const selection = new DataSelection(this.deeptable, { name: newName, tileFunction, }); - selection.on('tile loaded', () => { - // The new cursor gets moved when we encounter a singleton - if (newCursor >= 0) { - selection.cursor = newCursor; - for (let i = 0; i < selection.tiles.length; i++) { - const tile = selection.tiles[i]; - if (tile.key === tileOfMatch) { - // Don't add the full number of matches here. - break; - } - selection.cursor += this.match_count[i]; - } - } - }); await selection.ready; for (const tile of this.tiles) { // This one we actually apply. We'll see if that gets to be slow. diff --git a/src/tixrixqid.ts b/src/tixrixqid.ts new file mode 100644 index 000000000..1522b6343 --- /dev/null +++ b/src/tixrixqid.ts @@ -0,0 +1,201 @@ +import type { Bool, Data, Field, Struct, StructRowProxy, Vector } from 'apache-arrow'; + +import type { Tile } from './deepscatter'; +import { Bitmask, DataSelection, Deeptable } from './deepscatter'; + +// The type below indicates that a Qid is not valid if +// there are zero rows selected in the tile. + +// A Tix is a tile index, which is an integer identifier for a tile in quadtree. +// It uses the formula (4^z - 1) / 3 + y * 2^z + x, where z is the zoom level, +// and x and y are the tile coordinates. +type Tix = number; + +// An Rix is a row index, which is an integer identifier for a row in a tile. +type Rix = number; + +// A Rixen is a list of row indices. It must be non-empty. +type Rixen = [Rix, ...Rix[]]; + +// A Qid is a pair of a Tix and a Rixen. It identifies a set of rows in a tile. 
export type Qid = [Tix, Rixen];

/** A list of Qids. */
export type QidArray = Qid[];

/**
 * The tile index of the first tile (x = 0, y = 0) at zoom level `z`,
 * i.e. the number of tiles on all shallower levels: (4^z - 1) / 3.
 */
function firstTixAtZoom(z: number): number {
  return (4 ** z - 1) / 3;
}

/**
 * Converts quadtree tile coordinates to the integer tile index,
 * using the formula (4^z - 1) / 3 + y * 2^z + x.
 *
 * @param z The zoom level.
 * @param x The tile's x coordinate at that zoom level.
 * @param y The tile's y coordinate at that zoom level.
 * @returns The tile index.
 */
export function zxyToTix(z: number, x: number, y: number): number {
  return firstTixAtZoom(z) + y * 2 ** z + x;
}

/**
 * The tile index of the tile one zoom level up that contains `tix`.
 * Only meaningful for tix > 0; the root tile has no parent.
 */
function parentTix(tix: number): number {
  const [z, x, y] = tixToZxy(tix);
  return zxyToTix(z - 1, Math.floor(x / 2), Math.floor(y / 2));
}

/**
 * Resolves a tile index to the tile, loading ancestors on the way.
 *
 * @param tix The numeric tile index
 * @param dataset The deepscatter dataset
 * @returns The tile, if it exists.
 * @throws Error if `tix` is NaN, is not a non-negative integer, or if no tile
 *   with the corresponding key is present in the dataset.
 */
export async function tixToTile(tix: Tix, dataset: Deeptable): Promise<Tile> {
  if (Number.isNaN(tix)) {
    throw new Error('NaN tile index');
  }
  // A fractional or negative index never reaches the root (0) when walking up
  // through parentTix, so reject it here instead of recursing without end.
  if (!Number.isSafeInteger(tix) || tix < 0) {
    throw new Error(`Invalid tile index: ${tix}`);
  }
  if (tix === 0) {
    return dataset.root_tile;
  }
  // We need all parents to exist to find their children. So
  // we fetch the tiles here to ensure they've loaded.
  const parent = await tixToTile(parentTix(tix), dataset);
  await parent.populateManifest();
  // Now that the parents are loaded, we can find the child.
  const [z, x, y] = tixToZxy(tix);
  const key = `${z}/${x}/${y}`;
  const match = dataset
    .map((tile: Tile) => tile)
    .find((tile: Tile) => tile.key === key);
  if (match !== undefined) {
    return match;
  }
  throw new Error(`Tile ${key} not found in dataset.`);
}

/**
 * Fetches the row addressed by the FIRST row index of a Qid.
 *
 * @param qid a quadtree id
 * @param dataset The deepscatter dataset
 * @returns Whatever `record_batch.get` yields for the first row index in the Qid.
 */
export async function qidToRowProxy(qid: Qid, dataset: Deeptable) {
  const tile = await tixToTile(qid[0], dataset);
  // Awaiting a column before touching record_batch, as the original code does.
  await tile.get_column('x');
  return tile.record_batch.get(qid[1][0]);
}

/**
 * Converts a tile key of the form "z/x/y" into its tile index.
 *
 * @param key The tile key, three base-10 integers separated by slashes.
 * @returns The tile index (NaN if the key cannot be parsed).
 */
export function tileKey_to_tix(key: string): number {
  const [z, x, y] = key.split('/').map((d) => parseInt(d, 10));
  return zxyToTix(z, x, y);
}

/**
 * The inverse of `zxyToTix`: goes from a quadtree tile's integer identifier
 * to the [z, x, y] tuple.
 *
 * @param tix The tile index.
 * @returns The `[z, x, y]` coordinates of the tile.
 */
export function tixToZxy(tix: Tix): [number, number, number] {
  // Javascript has no base-4 logarithm, so divide the natural log by the
  // natural log of 4 to get a first estimate of the zoom level.
  let z = Math.floor(Math.log(tix * 3 + 1) / Math.log(4));

  // The ratio of two floating-point logs is not guaranteed to be exact at
  // level boundaries, so correct the estimate with exact integer comparisons.
  // Restricted to safe non-negative integers so both loops must terminate.
  if (Number.isSafeInteger(tix) && tix >= 0) {
    while (firstTixAtZoom(z + 1) <= tix) {
      z += 1;
    }
    while (firstTixAtZoom(z) > tix) {
      z -= 1;
    }
  }

  // The index inside the level is the offset from the level's first tile.
  const blockPosition = tix - firstTixAtZoom(z);

  // Modulo operations turn this into x and y coordinates.
  const side = 2 ** z;
  const x = blockPosition % side;
  const y = Math.floor(blockPosition / side);
  return [z, x, y];
}

/**
 * Identifies a single row as a (tile index, row index) pair.
 *
 * @param row the row returned from a point event, etc.
 * @param dataset a deepscatter dataset.
 * @returns `[tix, rix]` -- note the second element is a single row index,
 *   not a list of row indices as in a `Qid`.
 * @throws Error if no loaded tile matches the row (see `getTileFromRow`).
 */
export function getQidFromRow(
  row: StructRowProxy,
  dataset: Deeptable,
): [number, number] {
  const tile = getTileFromRow(row, dataset);
  const rix = row[Symbol.for('rowIndex')] as number;
  return [tileKey_to_tix(tile.key), rix];
}

/**
 * Finds the tile whose record batch a row was read from.
 *
 * @param row A struct row; its parent data is read from `Symbol.for('parent')`.
 * @param dataset a deepscatter dataset.
 * @returns The tile sharing the most column buffers with the row's parent.
 * @throws Error if no tile with an existing batch shares any column buffer
 *   with the row.
 */
export function getTileFromRow(row: StructRowProxy, dataset: Deeptable): Tile {
  const parent = row[Symbol.for('parent')] as Data<Struct>;
  const parentsColumns = parent.children;

  // Since columns are immutable, we can just compare the memory location of the
  // value buffers to find the tile. BUT since columns can be added, we
  // need to find the tile that matches the most columns, not assume
  // that every column matches exactly.
  const parentNames: [string, Data][] = parent.type.children.map(
    (d: Field, i: number) => [d.name, parentsColumns[i]],
  );

  // Held in an object so the assignment made inside the callback is
  // visible to the type checker after the traversal.
  const best: { tile: Tile | null; count: number } = { tile: null, count: 0 };

  dataset.map((t: Tile) => {
    // @ts-expect-error NOM-1667 expose existence of record batch without generating it.
    const batch_exists = t._batch !== undefined;
    if (!batch_exists) {
      return;
    }
    let matching_columns = 0;
    for (const [name, column] of parentNames) {
      const candidate = t.record_batch.getChild(name);
      if (candidate !== null && candidate.data[0].values === column.values) {
        matching_columns++;
      }
    }
    if (matching_columns > best.count) {
      best.tile = t;
      best.count = matching_columns;
    }
  });

  // The sentinel is null, so the check must be against null: comparing with
  // undefined could never fire and let a null escape as a `Tile`.
  if (best.tile === null) {
    throw new Error(
      'No tiles found for this row.' + JSON.stringify({ ...row }),
    );
  }
  return best.tile;
}

/**
 * Groups rows by owning tile.
 *
 * @param rows Struct rows, e.g. from point events.
 * @param dataset a deepscatter dataset.
 * @returns One `[tix, rixen]` entry per tile, in order of first appearance,
 *   with row indices in input order.
 * @throws Error if any row cannot be matched to a tile.
 */
export function getQidArrayFromRows(
  rows: StructRowProxy[],
  dataset: Deeptable,
): QidArray {
  // The tile lookup depends only on the row's parent data and the dataset,
  // so rows sharing a parent can share one lookup instead of each rescanning
  // every tile.
  const tixByParent = new Map<unknown, Tix>();
  const grouped = new Map<Tix, Rixen>();
  for (const row of rows) {
    const parent: unknown = row[Symbol.for('parent')];
    let tix = tixByParent.get(parent);
    if (tix === undefined) {
      tix = getQidFromRow(row, dataset)[0];
      tixByParent.set(parent, tix);
    }
    const rix = row[Symbol.for('rowIndex')] as number;
    const existing = grouped.get(tix);
    if (existing === undefined) {
      grouped.set(tix, [rix]);
    } else {
      existing.push(rix);
    }
  }
  return Array.from(grouped.entries());
}

/**
 * Builds a boolean column for one tile marking the rows listed for it.
 *
 * @param tile The tile to build the mask for.
 * @param qidList Qids for any number of tiles; entries for other tiles are ignored.
 * @returns The result of `Bitmask.to_arrow()` for a mask sized to the tile's
 *   row count with the listed row indices set.
 */
export function selectQixOnTile(tile: Tile, qidList: QidArray) {
  const mask = new Bitmask(tile.record_batch.numRows);
  const tix = tileKey_to_tix(tile.key);
  for (const [qidTix, rixen] of qidList) {
    if (qidTix !== tix) {
      continue;
    }
    for (const rix of rixen) {
      mask.set(rix);
    }
  }
  return mask.to_arrow();
}

/**
 * Tests whether a row is flagged in a selection.
 *
 * @param hoverDatum A struct row.
 * @param selection A DataSelection
 * @param deeptable A Deepscatter dataset
 * @returns false when there is no selection; otherwise the truthiness of the
 *   row's value in the tile column named after the selection.
 */
export async function isDatumInSelection(
  hoverDatum: StructRowProxy,
  selection: DataSelection | null,
  deeptable: Deeptable,
): Promise<boolean> {
  if (!selection) return false;
  const [tix, rix] = getQidFromRow(hoverDatum, deeptable);
  const owningTile = await tixToTile(tix, deeptable);
  const array = (await owningTile.get_column(selection.name)) as Vector<Bool>;
  return !!array.get(rix);
}