Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement serializing to duckdb #1578

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@
"dependencies": {
"@dodona/dolos-lib": "3.3.1",
"@dodona/dolos-web": "2.7.1",
"apache-arrow": "^17.0.0",
"chalk": "^5.3.0",
"cliui": "^8.0.1",
"commander": "^12.1.0",
"csv-stringify": "^6.5.0",
"duckdb-async": "^0.9.1",
"open": "^10.1.0",
"tree-sitter": "^0.21.1"
},
Expand Down
7 changes: 7 additions & 0 deletions cli/src/cli/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ import {
} from "../util/utils.js";

import { DEFAULT_HOST, DEFAULT_PORT } from "../server.js";
import { DbView } from "../views/dbView.js";
import { TerminalView } from "../views/terminalView.js";
import { FileView } from "../views/fileView.js";
import { WebView } from "../views/webView.js";
import { Command } from "commander";
import * as Utils from "../util/utils.js";
import { Dolos, Options } from "@dodona/dolos-lib";
import { DbWebView } from "../views/dbWebView.js";

export function runCommand(program: Command): Command {
return new Command("run")
Expand Down Expand Up @@ -219,12 +221,17 @@ export async function run(locations: string[], options: RunOptions): Promise<voi
"csv": () => new FileView(report, options),
"html": () => new WebView(report, options),
"web": () => new WebView(report, options),
"db": () => new DbView(report, options),
"dbweb": () => new DbWebView(report, options),
});

if (view == null) {
throw new Error(`Invalid output format: ${options.outputFormat}`);
}

const startTime = new Date().getTime();
await view().show();
const endTime = new Date().getTime();
console.log(`Write-out took ${endTime - startTime} ms`);
});
}
14 changes: 12 additions & 2 deletions cli/src/cli/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ export interface Options {
}

const MIME: { [k: string]: string } = {
"db": "application/octet-stream",
"wasm": "application/wasm",
"html": "text/html",
"css": "text/css",
"js": "text/javascript",
Expand All @@ -32,7 +34,8 @@ function notFound(response: http.ServerResponse): void {

export default async function runServer(
reportDir: string,
options: Options
options: Options,
doneCallback?: () => void
): Promise<void> {
const port = options.port || DEFAULT_PORT;
const host = options.host || DEFAULT_HOST;
Expand All @@ -48,7 +51,14 @@ export default async function runServer(
const reqPath = path.normalize(new URL(request.url, baseURL).pathname);

let filePath;
if (reqPath.startsWith("/data")) {
if (reqPath === "/data/dolos.db") {
filePath = reportDir; // actually path to dolos.db
} else if (reqPath.startsWith("/data/done")) {
if (doneCallback) doneCallback();
response.writeHead(202);
response.end();
return;
} else if (reqPath.startsWith("/data")) {
filePath = path.join(reportDir, reqPath.slice(5));
} else if (reqPath.endsWith("/")) {
filePath = path.join(webDir, reqPath, "index.html");
Expand Down
146 changes: 146 additions & 0 deletions cli/src/cli/views/dbView.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import { View } from "./view.js";
import { Database } from "duckdb-async";
import * as arrow from "apache-arrow";
import { promises as fs } from "fs";
import {
Region,
Report,
} from "@dodona/dolos-lib";


export interface Options {
outputDestination?: string;
}

/**
 * A {@link View} that serializes an analysis {@link Report} into a single
 * DuckDB database file.
 *
 * The database contains four tables: `metadata`, `pairs`, `kgrams` and
 * `files`. The three bulk tables are first built as Apache Arrow tables,
 * handed to DuckDB as Arrow IPC buffers and then materialized with
 * `CREATE TABLE ... AS SELECT`.
 */
export class DbView extends View {

  protected outputDestination: string;

  /**
   * @param report the analysis report to serialize
   * @param options output options; when `outputDestination` is missing or
   *   empty, a file name is generated by {@link createName}
   */
  constructor(protected report: Report, options: Options) {
    super();
    this.outputDestination =
      options.outputDestination || this.createName();
  }

  /**
   * Generate a default database file name from the current time and the
   * report name (spaces become dashes, other non-alphanumerics are dropped).
   */
  private createName(): string {
    const dashedName = this.report.name.replace(/ /g, "-").replace(/[^a-zA-Z0-9-]/g, "");
    const timestamp = new Date().toISOString().replace(/[.:-]/g, "");
    return `dolos-report-${ timestamp }-${ dashedName }.db`;
  }

  /**
   * Create the `pairs` table with one row for every pair returned by
   * `report.allPairs()`.
   *
   * @param db an open database connection
   */
  public async writePairs(db: Database): Promise<void> {
    const pairs = this.report.allPairs();

    const table = arrow.tableFromArrays({
      id: new Uint32Array(pairs.map(p => p.id)),
      leftFileId: new Uint32Array(pairs.map(p => p.leftFile.id)),
      leftFilePath: pairs.map(p => p.leftFile.path),
      rightFileId: new Uint32Array(pairs.map(p => p.rightFile.id)),
      rightFilePath: pairs.map(p => p.rightFile.path),
      similarity: pairs.map(p => p.similarity),
      totalOverlap: new Uint32Array(pairs.map(p => p.overlap)),
      longestFragment: new Uint32Array(pairs.map(p => p.longest)),
      leftCovered: new Uint32Array(pairs.map(p => p.leftCovered)),
      rightCovered: new Uint32Array(pairs.map(p => p.rightCovered)),
    });

    await db.register_buffer("arrow_pairs", [arrow.tableToIPC(table)], true);
    await db.exec("CREATE TABLE pairs AS SELECT * FROM arrow_pairs");
  }

  /**
   * Create the `kgrams` table with one row for every fingerprint returned by
   * `report.sharedFingerprints()`.
   *
   * The ids of the files containing a fingerprint are passed to DuckDB as a
   * JSON array string and converted to a list of INTEGER by the query.
   *
   * @param db an open database connection
   */
  public async writeKgrams(db: Database): Promise<void> {
    const fingerprints = this.report.sharedFingerprints();

    const table = arrow.tableFromArrays({
      id: new Uint32Array(fingerprints.map(f => f.id)),
      hash: new BigUint64Array(fingerprints.map(f => BigInt(f.hash))),
      ignored: fingerprints.map(f => f.ignored),
      data: fingerprints.map(f => f.kgram?.join(" ") || ""),
      files: fingerprints.map(f => JSON.stringify(f.files().map(file => file.id)))
    });

    await db.register_buffer("arrow_kgrams", [arrow.tableToIPC(table)], true);
    // The statement is split over two literals only to respect the maximum
    // line length; the SQL sent to DuckDB is unchanged.
    await db.exec(
      "CREATE TABLE kgrams AS SELECT id, hash, ignored, data, " +
      "list_transform(files->'$[*]', f -> f::INTEGER) AS files FROM arrow_kgrams"
    );
  }

  /**
   * Create the `files` table with one row for every entry of the report,
   * ignored entries included.
   *
   * `ast`, `mapping` and `extra` are passed to DuckDB as JSON strings; the
   * query converts `ast` to a list of STRING, `mapping` to a list of INT2 and
   * keeps `extra` as JSON.
   *
   * @param db an open database connection
   */
  public async writeFiles(db: Database): Promise<void> {
    const entries = this.report.entries().concat(this.report.ignoredEntries());

    const table = arrow.tableFromArrays({
      id: new Uint32Array(entries.map(e => e.file.id)),
      ignored: entries.map(e => e.isIgnored),
      path: entries.map(e => e.file.path),
      content: entries.map(e => e.file.content),
      kgramCount: new Uint32Array(entries.map(e => e.kgrams.length)),
      ast: entries.map(e => JSON.stringify(e.file.tokens)),
      mapping: entries.map(e => JSON.stringify(Region.toUInt16(e.file.mapping))),
      extra: entries.map(e => JSON.stringify(e.file.extra || {}))
    });

    await db.register_buffer("arrow_files", [arrow.tableToIPC(table)], true);
    await db.exec(`
      CREATE TABLE files
      AS SELECT
        id,
        ignored,
        path,
        content,
        kgramCount,
        list_transform(ast->>'$[*]', s->s::STRING) AS ast,
        list_transform(mapping->'$[*]', m -> m::INT2) AS mapping,
        extra->'$' as extra
      FROM arrow_files
    `);
  }

  /**
   * Create the `metadata` table and insert one row per entry of
   * `report.metadata()`: the key, the value converted to a string (`"null"`
   * for `null`/`undefined`) and the JavaScript `typeof` of the value.
   *
   * @param db an open database connection
   */
  public async writeMetadata(db: Database): Promise<void> {
    await db.exec(`
      CREATE TABLE metadata (
        key STRING,
        value STRING,
        type STRING
      );
    `);
    const stmt = await db.prepare("INSERT INTO metadata VALUES (?, ?, ?)");
    try {
      for (const [key, value] of Object.entries(this.report.metadata())) {
        await stmt.run(
          key,
          value == null ? "null" : value.toString(),
          typeof value
        );
      }
    } finally {
      // finalize() is asynchronous: await it so that a failure is reported
      // here instead of surfacing as an unhandled rejection.
      await stmt.finalize();
    }
  }

  /**
   * Write the complete report to a new database file at `outputDestination`.
   * All tables are written inside a single transaction.
   *
   * @returns the path of the database file that was written
   * @throws Error when a file already exists at the output destination
   */
  async writeToDb(): Promise<string> {
    const dbName = this.outputDestination;
    // Any stat() failure is treated as "the file does not exist yet".
    if (await fs.stat(dbName).catch(() => false)) {
      throw new Error(`File ${dbName} already exists. Please specify a different output destination.`);
    }
    const db = await Database.create(dbName);
    try {
      await db.exec("INSTALL arrow; LOAD arrow; BEGIN TRANSACTION;");

      console.log(`Writing results to database: ${dbName}`);
      await this.writeMetadata(db);
      console.log("Metadata written.");
      await this.writePairs(db);
      console.log("Pairs written.");
      await this.writeKgrams(db);
      console.log("Kgrams written.");
      await this.writeFiles(db);
      console.log("Files written.");

      await db.exec("COMMIT;");
      console.log("Completed");
    } finally {
      // Always release the database handle, also when a write step failed.
      await db.close();
    }
    return dbName;
  }

  /**
   * Show this view by writing the report to the database file.
   */
  async show(): Promise<void> {
    await this.writeToDb();
  }

}
26 changes: 26 additions & 0 deletions cli/src/cli/views/dbWebView.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { FileView, Options as FVOptions } from "./fileView.js";

Check failure on line 1 in cli/src/cli/views/dbWebView.ts

View workflow job for this annotation

GitHub Actions / CLI 💻️ - lint 📏 (22)

'FileView' is defined but never used
import runServer, { Options as ServerOptions } from "../server.js";
import { Report } from "@dodona/dolos-lib";
import {DbView} from "./dbView.js";

Check failure on line 4 in cli/src/cli/views/dbWebView.ts

View workflow job for this annotation

GitHub Actions / CLI 💻️ - lint 📏 (22)

A space is required after '{'

Check failure on line 4 in cli/src/cli/views/dbWebView.ts

View workflow job for this annotation

GitHub Actions / CLI 💻️ - lint 📏 (22)

A space is required before '}'

/**
 * This {@link View} writes the results of an analysis to a DuckDB database
 * file (see {@link DbView}) and then launches a webserver which hosts a web
 * application to display those results.
 */
export class DbWebView extends DbView {

  /**
   * @param report the analysis report to serialize and show
   * @param options options for both the database output and the webserver
   */
  constructor(report: Report, private options: ServerOptions & FVOptions) {
    super(report, options);
  }

  /**
   * Write the database file and schedule the webserver to start one second
   * later. This method resolves once the database has been written; it does
   * not wait for the server.
   */
  async show(): Promise<void> {
    // writeToDb() returns the path of the database file itself, not a directory.
    const dbPath = await this.writeToDb();
    const start = Date.now();
    // Passed to the server as its "done" callback: logs the time elapsed
    // since the database was written.
    const done = (): void => {
      const stop = Date.now();
      console.log(`Shown in ${stop - start}ms`);
    };
    setTimeout(async () => await runServer(dbPath, this.options, done), 1000);
  }
}
7 changes: 6 additions & 1 deletion cli/src/cli/views/webView.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@

/**
 * Write the report to a directory and schedule the webserver to start one
 * second later. This method resolves once the report has been written; it
 * does not wait for the server.
 */
async show(): Promise<void> {
  const reportDir = await this.writeToDirectory();
  const start = Date.now();
  // Passed to the server as its "done" callback: logs the time elapsed
  // since the report was written.
  const done = (): void => {
    const stop = Date.now();
    console.log(`Shown in ${stop - start}ms`);
  };
  // Schedule the server exactly once; scheduling it twice would start two
  // servers for the same host and port.
  setTimeout(async () => await runServer(reportDir, this.options, done), 1000);
}
}
17 changes: 17 additions & 0 deletions core/src/algorithm/fingerprintIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -241,4 +241,21 @@ export class FingerprintIndex {
}
return pairs;
}

/**
 * Build at most `n` pairs, combining the files with the largest `shared`
 * collections first (presumably the fingerprints a file shares with other
 * files; verify against the file entry type).
 *
 * Files are sorted by descending `shared.size`. Pairs are then enumerated
 * as (0,1), (0,2), (1,2), (0,3), (1,3), (2,3), ... over that ordering, so
 * every new pair combines the next file with all files ranked before it.
 *
 * @param n maximum number of pairs to return
 * @returns up to `n` pairs; empty when fewer than two files are indexed
 */
public pairsByOverlap(n: number = 100): Array<Pair> {
  const byShared = Array.from(this.files.values());
  // Descending order: the files that share the most come first. Sorting
  // ascending would return the least interesting pairs once `n` cuts off
  // the enumeration.
  byShared.sort((a, b) => b.shared.size - a.shared.size);

  const pairs: Array<Pair> = [];
  for (let k = 1; k < byShared.length; k += 1) {
    for (let j = 0; j < k; j += 1) {
      if (pairs.length >= n) {
        return pairs;
      }
      pairs.push(new Pair(byShared[j], byShared[k]));
    }
  }
  return pairs;
}
}
20 changes: 20 additions & 0 deletions core/src/util/region.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,26 @@ export class Region {
return new Region(startRow, startCol, endRow, endCol);
}

/**
 * Flatten a list of regions into a plain array of numbers.
 *
 * Every region contributes four consecutive values, in the order
 * startRow, startCol, endRow, endCol.
 *
 * @param array the regions to flatten
 * @returns an array holding four numbers per region
 */
public static toUInt16(array: Region[]): number[] {
  const flat: number[] = [];
  for (const region of array) {
    flat.push(region.startRow, region.startCol, region.endRow, region.endCol);
  }
  return flat;
}

/**
 * Rebuild regions from a flat sequence of numbers, reading four consecutive
 * values per region in the order startRow, startCol, endRow, endCol.
 *
 * Any array-like of numbers is accepted, so both a `Uint16Array` and the
 * plain `number[]` produced by `toUInt16` can be passed in.
 *
 * @param array the flattened region coordinates
 * @returns one region per group of four values
 * @throws Error when the number of values is not a multiple of four
 */
public static fromUInt16(array: ArrayLike<number>): Region[] {
  // A trailing partial group would otherwise build a region from undefined
  // coordinates.
  if (array.length % 4 !== 0) {
    throw new Error(`Expected a multiple of 4 values to build regions, but got ${array.length}`);
  }
  const out: Region[] = [];
  for (let i = 0; i < array.length; i += 4) {
    out.push(new Region(array[i], array[i + 1], array[i + 2], array[i + 3]));
  }
  return out;
}

constructor(
public startRow: number,
public startCol: number,
Expand Down
10 changes: 10 additions & 0 deletions lib/package.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Yarn package whose name and version are read from the package.json that
# sits next to this file.
# NOTE(review): `nodejs` and `pkg-config` are accepted but not used below.
{ lib, mkYarnPackage, nodejs, pkg-config }:

let
  manifest = lib.importJSON ./package.json;
in
mkYarnPackage {
  name = manifest.name;
  version = manifest.version;
}
2 changes: 1 addition & 1 deletion lib/src/lib/dolos.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ export class Dolos {
private languageDetected = false;
private language: Language | null = null;
private tokenizer: Tokenizer | null = null;
private index: FingerprintIndex | null = null;
public index: FingerprintIndex | null = null;

private readonly languagePicker = new LanguagePicker();

Expand Down
16 changes: 16 additions & 0 deletions lib/src/test/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import test from "ava";
import { Dolos } from "../lib/dolos.js";

// Checks that pairsByOverlap() returns pairs in non-increasing order of overlap.
test("rank by total overlap", async t => {
  const dolos = new Dolos();
  const report = await dolos.analyzePaths(["../cli/exercise - Pyramidal constants/info.csv"]);
  t.is(report.files.length, 392);

  const index = dolos.index;
  if (index == null) {
    t.fail("Expected the index to be available after analyzing");
    return;
  }

  const ranked = index.pairsByOverlap();
  // Compare every pair with its successor; stop one short of the end so
  // that ranked[i + 1] always exists.
  for (let i = 0; i + 1 < ranked.length; i++) {
    const current = ranked[i].overlap;
    const next = ranked[i + 1].overlap;
    t.true(current >= next, `Fail for pair ${i}: ${current} < ${next}`);
  }
});
Loading
Loading