From ff7c4bf532539dce2580fc1ef4f8b3804dddc362 Mon Sep 17 00:00:00 2001 From: arkw Date: Tue, 17 Sep 2024 00:52:34 +0900 Subject: [PATCH 01/15] add opfs feature --- examples/esbuild-node/index.ts | 2 +- lib/include/duckdb/web/config.h | 2 + lib/src/arrow_type_mapping.cc | 4 + lib/src/config.cc | 3 + lib/src/io/web_filesystem.cc | 14 +- lib/src/json_typedef.cc | 6 + lib/src/webdb.cc | 1 + .../src/system/arquero_benchmarks.ts | 104 +++++------ .../duckdb-wasm/src/bindings/bindings_base.ts | 26 ++- .../src/bindings/bindings_interface.ts | 3 +- packages/duckdb-wasm/src/bindings/config.ts | 4 + packages/duckdb-wasm/src/bindings/runtime.ts | 8 + .../src/bindings/runtime_browser.ts | 166 ++++++++++++++---- .../src/parallel/worker_dispatcher.ts | 12 +- packages/duckdb-wasm/test/index_browser.ts | 2 + packages/duckdb-wasm/test/opfs.test.ts | 157 +++++++++++++++++ patches/duckdb/bind_copy_direct_io.patch | 15 ++ patches/duckdb/fix_load_database.patch | 20 +++ 18 files changed, 450 insertions(+), 99 deletions(-) create mode 100644 packages/duckdb-wasm/test/opfs.test.ts create mode 100644 patches/duckdb/bind_copy_direct_io.patch create mode 100644 patches/duckdb/fix_load_database.patch diff --git a/examples/esbuild-node/index.ts b/examples/esbuild-node/index.ts index 734d22c2b..183de2a7a 100644 --- a/examples/esbuild-node/index.ts +++ b/examples/esbuild-node/index.ts @@ -1,11 +1,11 @@ import * as duckdb from '@duckdb/duckdb-wasm'; import * as arrow from 'apache-arrow'; import path from 'path'; -import Worker from 'web-worker'; import { createRequire } from 'module'; const require = createRequire(import.meta.url); const DUCKDB_DIST = path.dirname(require.resolve('@duckdb/duckdb-wasm')); +const Worker = require('web-worker'); (async () => { try { diff --git a/lib/include/duckdb/web/config.h b/lib/include/duckdb/web/config.h index c112cb390..a8826eda0 100644 --- a/lib/include/duckdb/web/config.h +++ b/lib/include/duckdb/web/config.h @@ -81,6 +81,8 @@ struct WebDBConfig { std::optional access_mode = std::nullopt; /// The thread count uint32_t maximum_threads = (STATIC_WEBDB_FEATURES & (1 << WebDBFeature::THREADS)) ? 
4 : 1; + /// The direct io flag + bool use_direct_io = false; /// The query config QueryConfig query = { .cast_bigint_to_double = std::nullopt, diff --git a/lib/src/arrow_type_mapping.cc b/lib/src/arrow_type_mapping.cc index 4ecbf43fc..10c995304 100644 --- a/lib/src/arrow_type_mapping.cc +++ b/lib/src/arrow_type_mapping.cc @@ -122,6 +122,10 @@ arrow::Result mapArrowTypeToDuckDB(const arrow::DataType& t case arrow::Type::type::EXTENSION: case arrow::Type::type::SPARSE_UNION: case arrow::Type::type::DENSE_UNION: + case arrow::Type::type::STRING_VIEW: + case arrow::Type::type::BINARY_VIEW: + case arrow::Type::type::LIST_VIEW: + case arrow::Type::type::LARGE_LIST_VIEW: return arrow::Status::NotImplemented("DuckDB type mapping for: ", type.ToString()); } return duckdb::LogicalTypeId::INVALID; diff --git a/lib/src/config.cc b/lib/src/config.cc index ea984f26b..887f28448 100644 --- a/lib/src/config.cc +++ b/lib/src/config.cc @@ -76,6 +76,9 @@ WebDBConfig WebDBConfig::ReadFrom(std::string_view args_json) { if (doc.HasMember("allowUnsignedExtensions") && doc["allowUnsignedExtensions"].IsBool()) { config.allow_unsigned_extensions = doc["allowUnsignedExtensions"].GetBool(); } + if (doc.HasMember("useDirectIO") && doc["useDirectIO"].IsBool()) { + config.use_direct_io = doc["useDirectIO"].GetBool(); + } if (doc.HasMember("query") && doc["query"].IsObject()) { auto q = doc["query"].GetObject(); if (q.HasMember("queryPollingInterval") && q["queryPollingInterval"].IsNumber()) { diff --git a/lib/src/io/web_filesystem.cc b/lib/src/io/web_filesystem.cc index e46f7f14e..8faaa8ca5 100644 --- a/lib/src/io/web_filesystem.cc +++ b/lib/src/io/web_filesystem.cc @@ -226,6 +226,8 @@ WebFileSystem::DataProtocol WebFileSystem::inferDataProtocol(std::string_view ur proto = WebFileSystem::DataProtocol::HTTP; } else if (hasPrefix(url, "s3://")) { proto = WebFileSystem::DataProtocol::S3; + } else if (hasPrefix(url, "opfs://")) { + proto = WebFileSystem::DataProtocol::BROWSER_FSACCESS; } else if (hasPrefix(url, "file://")) { data_url = std::string_view{url}.substr(7); proto = default_data_protocol_; @@ -793,7 +795,7 @@ void WebFileSystem::Write(duckdb::FileHandle &handle, void *buffer, int64_t nr_b auto file_size = file_hdl.file_->file_size_; auto writer = static_cast(buffer); file_hdl.position_ = location; - while (nr_bytes > 0 && location < file_size) { + while (nr_bytes > 0) { auto n = Write(handle, writer, nr_bytes); writer += n; nr_bytes -= n; @@ -1006,10 +1008,12 @@ void WebFileSystem::FileSync(duckdb::FileHandle &handle) { vector WebFileSystem::Glob(const std::string &path, FileOpener *opener) { std::unique_lock fs_guard{fs_mutex_}; std::vector results; - auto glob = glob_to_regex(path); - for (auto [name, file] : files_by_name_) { - if (std::regex_match(file->file_name_, glob)) { - results.push_back(std::string{name}); + if (!FileSystem::IsRemoteFile(path)) { + auto glob = glob_to_regex(path); + for (auto [name, file] : files_by_name_) { + if (std::regex_match(file->file_name_, glob)) { + results.push_back(std::string{name}); + } } } auto &state = GetLocalState(); diff --git a/lib/src/json_typedef.cc b/lib/src/json_typedef.cc index 890ee8a73..1b98e21ee 100644 --- a/lib/src/json_typedef.cc +++ b/lib/src/json_typedef.cc @@ -396,6 +396,12 @@ arrow::Result WriteSQLType(rapidjson::Document& doc, const duc case duckdb::LogicalTypeId::AGGREGATE_STATE: case duckdb::LogicalTypeId::BIT: case duckdb::LogicalTypeId::LAMBDA: + case duckdb::LogicalTypeId::STRING_LITERAL: + case duckdb::LogicalTypeId::INTEGER_LITERAL: + case 
duckdb::LogicalTypeId::UHUGEINT: + case duckdb::LogicalTypeId::UNION: + case duckdb::LogicalTypeId::ARRAY: + case duckdb::LogicalTypeId::VARINT: break; } return out; diff --git a/lib/src/webdb.cc b/lib/src/webdb.cc index 0161443aa..981e5d618 100644 --- a/lib/src/webdb.cc +++ b/lib/src/webdb.cc @@ -827,6 +827,7 @@ arrow::Status WebDB::Open(std::string_view args_json) { db_config.options.access_mode = access_mode; db_config.options.duckdb_api = "wasm"; db_config.options.custom_user_agent = config_->custom_user_agent; + db_config.options.use_direct_io = config_->use_direct_io; auto db = make_shared_ptr(config_->path, &db_config); #ifndef WASM_LOADABLE_EXTENSIONS duckdb_web_parquet_init(db.get()); diff --git a/packages/benchmarks/src/system/arquero_benchmarks.ts b/packages/benchmarks/src/system/arquero_benchmarks.ts index b18f32747..5a742a21d 100644 --- a/packages/benchmarks/src/system/arquero_benchmarks.ts +++ b/packages/benchmarks/src/system/arquero_benchmarks.ts @@ -84,12 +84,12 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { case 2: { const tmp = this.tables['region'] .filter((d: any) => d.op.equal(d.r_name, 'EUROPE')) - .join(this.tables['nation'], ['r_regionkey', 'n_regionkey']) - .join(this.tables['supplier'], ['n_nationkey', 's_nationkey']); - const sub = tmp.join(this.tables['partsupp'], ['s_suppkey', 'ps_suppkey']); + .join(this.tables['nation'], [['r_regionkey'], ['n_regionkey']]) + .join(this.tables['supplier'], [['n_nationkey'], ['s_nationkey']]); + const sub = tmp.join(this.tables['partsupp'], [['s_suppkey'], ['ps_suppkey']]); const sub2 = this.tables['part'] .filter((d: any) => d.p_size == 15 && aq.op.match(d.p_type, /.*BRASS$/g, 0) != null) - .join(sub, ['p_partkey', 'ps_partkey']) + .join(sub, [['p_partkey'], ['ps_partkey']]) .groupby('p_partkey') .rollup({ min_ps_supplycost: (d: any) => aq.op.min(d.ps_supplycost), @@ -99,7 +99,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { (a: any, b: any) => a.p_partkey == b.ps_partkey && a.min_ps_supplycost == b.ps_supplycost, ); const query = tmp - .join(sub2, ['s_suppkey', 'ps_suppkey']) + .join(sub2, [['s_suppkey'], ['ps_suppkey']]) .orderby(aq.desc('s_acctbal'), 'n_name', 's_name', 'p_partkey'); for (const v of query.objects()) { noop(v); @@ -111,8 +111,8 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { const o = this.tables['orders'].filter((d: any) => d.o_orderdate < aq.op.timestamp('1995-03-15')); const l = this.tables['lineitem'].filter((d: any) => d.l_shipdate < aq.op.timestamp('1995-03-15')); const query = c - .join(o, ['c_custkey', 'o_custkey']) - .join(l, ['o_orderkey', 'l_orderkey']) + .join(o, [['c_custkey'], ['o_custkey']]) + .join(l, [['o_orderkey'], ['l_orderkey']]) .derive({ disc_price: (d: any) => d.l_extendedprice * (1 - d.l_discount), }) @@ -133,7 +133,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { ); const l = this.tables['lineitem'].filter((d: any) => d.l_commitdate < d.l_receiptdate); const query = o - .join(l, ['o_orderkey', 'l_orderkey']) + .join(l, [['o_orderkey'], ['l_orderkey']]) .groupby('o_orderpriority') .rollup({ order_count: aq.op.count(), @@ -156,10 +156,10 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { const n = this.tables['nation']; const right = r - .join(n, ['r_regionkey', 'n_regionkey']) - .join(c, ['n_nationkey', 'c_nationkey']) - .join(o, ['c_custkey', 'o_custkey']) - .join(l, ['o_orderkey', 'l_orderkey']); + .join(n, [['r_regionkey'], ['n_regionkey']]) + .join(c, [['n_nationkey'], ['c_nationkey']]) + 
.join(o, [['c_custkey'], ['o_custkey']]) + .join(l, [['o_orderkey'], ['l_orderkey']]); const query = s .join( right, @@ -232,11 +232,11 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { (a.n1_nationkey == 'GERMANY' && b.n2_nationkey == 'FRANCE'), ); const right = nations - .join(c, ['n2_nationkey', 'c_nationkey']) - .join(o, ['c_custkey', 'o_custkey']) - .join(l, ['o_orderkey', 'l_orderkey']); + .join(c, [['n2_nationkey'], ['c_nationkey']]) + .join(o, [['c_custkey'], ['o_custkey']]) + .join(l, [['o_orderkey'], ['l_orderkey']]); const query = s - .join(right, ['s_suppkey', 'l_suppkey']) + .join(right, [['s_suppkey'], ['l_suppkey']]) .groupby('n1_name', 'n2_name', 'l_year') .rollup({ revenue: (d: any) => aq.op.sum(d.volume), @@ -255,9 +255,9 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { d.o_orderdate <= aq.op.timestamp('1996-12-31'), ); const sub = p - .join(this.tables['lineitem'], ['p_partkey', 'l_partkey']) - .join(o, ['l_orderkey', 'o_orderkey']) - .join(this.tables['customer'], ['o_custkey', 'c_custkey']); + .join(this.tables['lineitem'], [['p_partkey'], ['l_partkey']]) + .join(o, [['l_orderkey'], ['o_orderkey']]) + .join(this.tables['customer'], [['o_custkey'], ['c_custkey']]); const r2 = this.tables['region'] .filter((d: any) => d.r_name == 'AMERICA') .rename({ @@ -269,11 +269,11 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { n_name: 'n2_name', }); const sub2 = r2 - .join(n2, ['r2_regionkey', 'n2_regionkey']) - .join(sub, ['n2_nationkey', 'c_nationkey']) - .join(this.tables['supplier'], ['l_suppkey', 's_suppkey']); + .join(n2, [['r2_regionkey'], ['n2_regionkey']]) + .join(sub, [['n2_nationkey'], ['c_nationkey']]) + .join(this.tables['supplier'], [['l_suppkey'], ['s_suppkey']]); const query = this.tables['nation'] - .join(sub2, ['n_nationkey', 's_nationkey']) + .join(sub2, [['n_nationkey'], ['s_nationkey']]) .derive({ o_year: (d: any) => aq.op.year(d.o_orderdate), volume: (d: any) => d.l_extendedprice * (1 - d.l_discount), @@ -290,16 +290,16 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { break; } case 9: { - const sub = this.tables['nation'].join(this.tables['supplier'], ['n_nationkey', 's_nationkey']); + const sub = this.tables['nation'].join(this.tables['supplier'], [['n_nationkey'], ['s_nationkey']]); const query = this.tables['part'] .filter((d: any) => aq.op.match(d.p_name, /.*green.*/g, 0) != null) - .join(this.tables['partsupp'], ['p_partkey', 'ps_partkey']) - .join(sub, ['ps_suppkey', 's_suppkey']) + .join(this.tables['partsupp'], [['p_partkey'], ['ps_partkey']]) + .join(sub, [['ps_suppkey'], ['s_suppkey']]) .join( this.tables['lineitem'], (a: any, b: any) => a.p_partkey == b.l_partkey && a.s_suppkey == b.l_suppkey, ) - .join(this.tables['orders'], ['l_orderkey', 'o_orderkey']) + .join(this.tables['orders'], [['l_orderkey'], ['o_orderkey']]) .derive({ o_year: (d: any) => aq.op.year(d.o_orderdate), amount: (d: any) => d.l_extendedprice * (1 - d.l_discount) - d.ps_supplycost * d.l_quantity, @@ -323,10 +323,10 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { ) .join( this.tables['lineitem'].filter((d: any) => d.l_returnflag == 'R'), - ['o_orderkey', 'l_orderkey'], + [['o_orderkey'], ['l_orderkey']], ) - .join(this.tables['customer'], ['o_custkey', 'c_custkey']) - .join(this.tables['nation'], ['c_nationkey', 'n_nationkey']) + .join(this.tables['customer'], [['o_custkey'], ['c_custkey']]) + .join(this.tables['nation'], [['c_nationkey'], ['n_nationkey']]) .derive({ realprice: (d: any) => 
d.l_extendedprice * (1 - d.l_discount), }) @@ -343,8 +343,8 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { case 11: { const temp = this.tables['nation'] .filter((d: any) => d.n_name == 'GERMANY') - .join(this.tables['supplier'], ['n_nationkey', 's_nationkey']) - .join(this.tables['partsupp'], ['s_suppkey', 'ps_suppkey']) + .join(this.tables['supplier'], [['n_nationkey'], ['s_nationkey']]) + .join(this.tables['partsupp'], [['s_suppkey'], ['ps_suppkey']]) .derive({ value: (d: any) => d.ps_supplycost * d.ps_availqty, }); @@ -373,7 +373,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { d.l_receiptdate >= aq.op.timestamp('1994-01-01') && d.l_receiptdate <= aq.op.timestamp('1994-12-31'), ) - .join(this.tables['orders'], ['l_orderkey', 'o_orderkey']) + .join(this.tables['orders'], [['l_orderkey'], ['o_orderkey']]) .derive({ high_line: (d: any) => d.o_orderpriority == '1-URGENT' || d.o_orderpriority == '2-HIGH' ? 1 : 0, @@ -396,7 +396,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { (d: any) => aq.op.match(d.o_comment, /^.*special.*requests.*$/g, 0) == null, ); const query = this.tables['customer'] - .join_left(o, ['c_custkey', 'o_custkey']) + .join_left(o, [['c_custkey'], ['o_custkey']]) .derive({ o_orderkey_not_null: (d: any) => (d.o_orderkey != null ? 1 : 0), }) @@ -421,7 +421,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { d.l_receiptdate >= aq.op.timestamp('1995-09-01') && d.l_receiptdate <= aq.op.timestamp('1995-09-30'), ) - .join(this.tables['part'], ['l_partkey', 'p_partkey']) + .join(this.tables['part'], [['l_partkey'], ['p_partkey']]) .derive({ realprice: (d: any) => d.l_extendedprice * (1 - d.l_discount), promoprice: (d: any) => @@ -458,7 +458,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { total_revenue: (d: any) => aq.op.max(d.revenue), }) .join(temp, (a: any, b: any) => aq.op.equal(a.total_revenue, b.revenue)) - .join(this.tables['supplier'], ['l_suppkey', 's_suppkey']) + .join(this.tables['supplier'], [['l_suppkey'], ['s_suppkey']]) .orderby('s_suppkey'); for (const v of query.objects({ grouped: true })) { noop(v); @@ -482,8 +482,8 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { d.p_size == 45 || d.p_size == 19), ) - .join(this.tables['partsupp'], ['p_partkey', 'ps_partkey']) - .antijoin(supplier, ['ps_partkey', 's_suppkey']) + .join(this.tables['partsupp'], [['p_partkey'], ['ps_partkey']]) + .antijoin(supplier, [['ps_partkey'], ['s_suppkey']]) .groupby('p_brand', 'p_type', 'p_size') .rollup({ supplier_cnt: aq.op.distinct('ps_suppkey'), @@ -497,7 +497,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { case 17: { const tmp = this.tables['part'] .filter((d: any) => d.p_brand == 'Brand#23' && d.p_container == 'MED BOX') - .join(this.tables['lineitem'], ['p_partkey', 'l_partkey']); + .join(this.tables['lineitem'], [['p_partkey'], ['l_partkey']]); const agg = tmp.groupby('p_partkey').rollup({ avg_qty: aq.op.mean('l_quantity'), }); @@ -518,9 +518,9 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { quantity: aq.op.sum('l_quantity'), }) .filter((d: any) => d.quantity > 300) - .join(this.tables['orders'], ['l_orderkey', 'o_orderkey']) - .join(this.tables['customer'], ['o_custkey', 'c_custkey']) - .join(this.tables['lineitem'], ['o_orderkey', 'l_orderkey']) + .join(this.tables['orders'], [['l_orderkey'], ['o_orderkey']]) + .join(this.tables['customer'], [['o_custkey'], ['c_custkey']]) + .join(this.tables['lineitem'], [['o_orderkey'], 
['l_orderkey']]) .groupby('c_name', 'c_custkey', 'o_orderkey', 'o_orderdate', 'o_totalprice') .rollup({ quantity: aq.op.sum('l_quantity'), @@ -609,7 +609,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { }); const sub = this.tables['part'] .filter((d: any) => aq.op.match(d.p_name, /^forest.*$/, 0) != null) - .join(this.tables['partsupp'], ['p_partkey', 'ps_partkey']) + .join(this.tables['partsupp'], [['p_partkey'], ['ps_partkey']]) .join( qty, (a: any, b: any) => @@ -619,8 +619,8 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { ); const query = this.tables['nation'] .filter((d: any) => d.n_name == 'CANADA') - .join(this.tables['supplier'], ['n_nationkey', 's_nationkey']) - .semijoin(sub, ['s_suppkey', 'ps_suppkey']) + .join(this.tables['supplier'], [['n_nationkey'], ['s_nationkey']]) + .semijoin(sub, [['s_suppkey'], ['ps_suppkey']]) .orderby('s_name'); for (const v of query.objects({ grouped: true })) { noop(v); @@ -632,9 +632,9 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { const orders = this.tables['orders'].filter((d: any) => d.o_orderstatus == 'F'); const query = this.tables['nation'] .filter((d: any) => d.n_name == 'SAUDI ARABIA') - .join(this.tables['supplier'], ['n_nationkey', 's_nationkey']) - .join(lineitem, ['s_suppkey', 'l_suppkey']) - .join(orders, ['l_orderkey', 'o_orderkey']) + .join(this.tables['supplier'], [['n_nationkey'], ['s_nationkey']]) + .join(lineitem, [['s_suppkey'], ['l_suppkey']]) + .join(orders, [['l_orderkey'], ['o_orderkey']]) .antijoin(lineitem, (a: any, b: any) => a.l_suppkey != b.l_suppkey && a.l_orderkey == b.l_orderkey) .semijoin( this.tables['lineitem'], @@ -661,7 +661,7 @@ export class ArqueroTPCHBenchmark implements SystemBenchmark { }); const query = customers .join(total_avg, (a: any, b: any) => a.c_acctbal > b.avg_c_acctbal) - .antijoin(this.tables['orders'], ['c_custkey', 'o_custkey']) + .antijoin(this.tables['orders'], [['c_custkey'], ['o_custkey']]) .derive({ cntrycode: (d: any) => aq.op.substring(d.c_phone, 0, 2), }) @@ -979,7 +979,7 @@ export class ArqueroIntegerJoin2Benchmark implements SystemBenchmark { .params({ filter }) .filter((row: any) => row.v0 < filter) .rename({ v0: 'a0' }) - .join(this.tables['B'].rename({ v0: 'b0', v1: 'b1' }), ['a0', 'b1']); + .join(this.tables['B'].rename({ v0: 'b0', v1: 'b1' }), [['a0'], ['b1']]); let n = 0; for (const v of result) { noop(v); @@ -1048,8 +1048,8 @@ export class ArqueroIntegerJoin3Benchmark implements SystemBenchmark { .params({ filter }) .filter((row: any) => row.v0 < filter) .rename({ v0: 'a0' }) - .join(this.tables['B'].rename({ v0: 'b0', v1: 'b1' }), ['a0', 'b1']) - .join(this.tables['C'].rename({ v0: 'c0', v1: 'c1' }), ['b0', 'c1']); + .join(this.tables['B'].rename({ v0: 'b0', v1: 'b1' }), [['a0'], ['b1']]) + .join(this.tables['C'].rename({ v0: 'c0', v1: 'c1' }), [['b0'], ['c1']]); let n = 0; for (const v of result) { noop(v); diff --git a/packages/duckdb-wasm/src/bindings/bindings_base.ts b/packages/duckdb-wasm/src/bindings/bindings_base.ts index 0823f0e7a..571f9fe48 100644 --- a/packages/duckdb-wasm/src/bindings/bindings_base.ts +++ b/packages/duckdb-wasm/src/bindings/bindings_base.ts @@ -449,13 +449,32 @@ export abstract class DuckDBBindingsBase implements DuckDBBindings { } dropResponseBuffers(this.mod); } + /** Prepare a file handle that could only be acquired aschronously */ + public async prepareDBFileHandle(path: string, protocol: DuckDBDataProtocol): Promise { + if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS && 
this._runtime.prepareDBFileHandle) {
+            const list = await this._runtime.prepareDBFileHandle(path, DuckDBDataProtocol.BROWSER_FSACCESS);
+            for (const item of list) {
+                const { handle, path: filePath, fromCached } = item;
+                if (!fromCached && handle.getSize()) {
+                    await this.registerFileHandle(filePath, handle, DuckDBDataProtocol.BROWSER_FSACCESS, true);
+                }
+            }
+            return;
+        }
+        throw new Error(`prepareDBFileHandle: unsupported protocol ${protocol}`);
+    }
     /** Register a file object URL */
-    public registerFileHandle<HandleType>(
+    public async registerFileHandle<HandleType>(
         name: string,
         handle: HandleType,
         protocol: DuckDBDataProtocol,
         directIO: boolean,
-    ): void {
+    ): Promise<void> {
+        if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS && handle instanceof FileSystemFileHandle) {
+            // handle is an async handle, should convert to sync handle
+            const fileHandle: FileSystemFileHandle = handle as any;
+            handle = (await fileHandle.createSyncAccessHandle()) as any;
+        }
         const [s, d, n] = callSRet(
             this.mod,
             'duckdb_web_fs_register_file_url',
@@ -467,6 +486,9 @@
             }
         dropResponseBuffers(this.mod);
         globalThis.DUCKDB_RUNTIME._files = (globalThis.DUCKDB_RUNTIME._files || new Map()).set(name, handle);
+        if (globalThis.DUCKDB_RUNTIME._preparedHandles?.[name]) {
+            delete globalThis.DUCKDB_RUNTIME._preparedHandles[name];
+        }
         if (this.pthread) {
             for (const worker of this.pthread.runningWorkers) {
                 worker.postMessage({
diff --git a/packages/duckdb-wasm/src/bindings/bindings_interface.ts b/packages/duckdb-wasm/src/bindings/bindings_interface.ts
index 9dcf422db..b33d637d6 100644
--- a/packages/duckdb-wasm/src/bindings/bindings_interface.ts
+++ b/packages/duckdb-wasm/src/bindings/bindings_interface.ts
@@ -41,7 +41,8 @@ export interface DuckDBBindings {
         handle: HandleType,
         protocol: DuckDBDataProtocol,
         directIO: boolean,
-    ): void;
+    ): Promise<void>;
+    prepareDBFileHandle(path: string, protocol: DuckDBDataProtocol): Promise<void>;
     globFiles(path: string): WebFile[];
     dropFile(name: string): void;
     dropFiles(): void;
diff --git a/packages/duckdb-wasm/src/bindings/config.ts b/packages/duckdb-wasm/src/bindings/config.ts
index ed5bb3cdb..ce29ca0f5 100644
--- a/packages/duckdb-wasm/src/bindings/config.ts
+++ b/packages/duckdb-wasm/src/bindings/config.ts
@@ -50,6 +50,10 @@ export interface DuckDBConfig {
      * Note that this will only work with cross-origin isolated sites since it requires SharedArrayBuffers.
      */
     maximumThreads?: number;
+    /**
+     * The direct io flag
+     */
+    useDirectIO?: boolean;
     /**
      * The query config
      */
diff --git a/packages/duckdb-wasm/src/bindings/runtime.ts b/packages/duckdb-wasm/src/bindings/runtime.ts
index 83632deca..cb501c3a6 100644
--- a/packages/duckdb-wasm/src/bindings/runtime.ts
+++ b/packages/duckdb-wasm/src/bindings/runtime.ts
@@ -89,6 +89,12 @@ export interface DuckDBGlobalFileInfo {
     s3Config?: S3Config;
 }
 
+export interface PreparedDBFileHandle {
+    path: string;
+    handle: any;
+    fromCached: boolean;
+}
+
 /** Call a function with packed response buffer */
 export function callSRet(
     mod: DuckDBModule,
@@ -149,6 +155,8 @@
     checkFile(mod: DuckDBModule, pathPtr: number, pathLen: number): boolean;
     removeFile(mod: DuckDBModule, pathPtr: number, pathLen: number): void;
 
+    prepareDBFileHandle?: (path: string, protocol: DuckDBDataProtocol) => Promise<PreparedDBFileHandle[]>;
+
     // Call a scalar UDF function
     callScalarUDF(
         mod: DuckDBModule,
diff --git a/packages/duckdb-wasm/src/bindings/runtime_browser.ts b/packages/duckdb-wasm/src/bindings/runtime_browser.ts
index 2c0c270d3..ef01d6b4c 100644
--- a/packages/duckdb-wasm/src/bindings/runtime_browser.ts
+++ b/packages/duckdb-wasm/src/bindings/runtime_browser.ts
@@ -11,13 +11,19 @@ import {
     failWith,
     FileFlags,
     readString,
+    PreparedDBFileHandle,
 } from './runtime';
 import { DuckDBModule } from './duckdb_module';
 import * as udf from './udf_runtime';
 
+const OPFS_PREFIX_LEN = 'opfs://'.length;
+const PATH_SEP_REGEX = /\/|\\/;
+
 export const BROWSER_RUNTIME: DuckDBRuntime & {
+    _files: Map<string, any>;
     _fileInfoCache: Map<number, DuckDBFileInfo>;
     _globalFileInfo: DuckDBGlobalFileInfo | null;
+    _preparedHandles: Record<string, any>;
 
     getFileInfo(mod: DuckDBModule, fileId: number): DuckDBFileInfo | null;
     getGlobalFileInfo(mod: DuckDBModule): DuckDBGlobalFileInfo | null;
@@ -26,6 +32,7 @@
    _fileInfoCache: new Map(),
    _udfFunctions: new Map(),
    _globalFileInfo: null,
+    _preparedHandles: {} as any,

    getFileInfo(mod: DuckDBModule, fileId: number): DuckDBFileInfo | null {
        try {
@@ -47,13 +54,17 @@
         try {
             const info = JSON.parse(infoStr);
             if (info == null) {
-              return null;
+                return null;
             }
             const file = { ...info, blob: null } as DuckDBFileInfo;
             BROWSER_RUNTIME._fileInfoCache.set(fileId, file);
+            if (!BROWSER_RUNTIME._files.has(file.fileName) && BROWSER_RUNTIME._preparedHandles[file.fileName]) {
+                BROWSER_RUNTIME._files.set(file.fileName, BROWSER_RUNTIME._preparedHandles[file.fileName]);
+                delete BROWSER_RUNTIME._preparedHandles[file.fileName];
+            }
             return file;
         } catch (error) {
-          console.warn(error);
+            console.warn(error);
             return null;
         }
     } catch (e: any) {
@@ -91,6 +102,59 @@
         }
     },
 
+    /** Prepare a file handle that can only be acquired asynchronously */
+    async prepareDBFileHandle(dbPath: string, protocol: DuckDBDataProtocol): Promise<PreparedDBFileHandle[]> {
+        if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS) {
+            const filePaths = [dbPath, `${dbPath}.wal`];
+            const prepare = async (path: string): Promise<PreparedDBFileHandle> => {
+                if (BROWSER_RUNTIME._files.has(path)) {
+                    return {
+                        path,
+                        handle: BROWSER_RUNTIME._files.get(path),
+                        fromCached: true,
+                    };
+                }
+                const opfsRoot = await navigator.storage.getDirectory();
+                let dirHandle: FileSystemDirectoryHandle = opfsRoot;
+                // check if mkdir -p is needed
+                const opfsPath = path.slice(OPFS_PREFIX_LEN);
+                let fileName = opfsPath;
+                if (PATH_SEP_REGEX.test(opfsPath)) {
+                    const folders = opfsPath.split(PATH_SEP_REGEX);
+                    fileName = folders.pop()!;
+                    if (!fileName) {
+                        throw new Error(`Invalid path ${path}`);
+                    }
+                    // mkdir -p
+                    for (const folder of folders) {
+                        dirHandle = await dirHandle.getDirectoryHandle(folder, { create: true });
+                    }
+                }
+                const fileHandle = await dirHandle.getFileHandle(fileName, { create: false }).catch(e => {
+                    if (e?.name === 'NotFoundError') {
+                        console.debug(`File ${path} does not exist yet, creating...`);
+                        return dirHandle.getFileHandle(fileName, { create: true });
+                    }
+                    throw e;
+                });
+                const handle = await fileHandle.createSyncAccessHandle();
+                BROWSER_RUNTIME._preparedHandles[path] = handle;
+                return {
+                    path,
+                    handle,
+                    fromCached: false,
+                };
+            };
+            const result: PreparedDBFileHandle[] = [];
+            for (const filePath of filePaths) {
+                const res = await prepare(filePath);
+                result.push(res);
+            }
+            return result;
+        }
+        throw new Error(`Unsupported protocol ${protocol} for path ${dbPath}`);
+    },
+
     testPlatformFeature: (_mod: DuckDBModule, feature: number): boolean => {
         switch (feature) {
             case 1:
@@ -160,32 +224,32 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
                 let contentLength = null;
                 let error: any | null = null;
                 if (file.reliableHeadRequests || !file.allowFullHttpReads) {
-                  try {
-                    // Send a dummy HEAD request with range protocol
-                    // -> good IFF status is 206 and contentLenght is present
-                    const xhr = new XMLHttpRequest();
-                    if (file.dataProtocol == DuckDBDataProtocol.S3) {
-                        xhr.open('HEAD', getHTTPUrl(file.s3Config, file.dataUrl!), false);
-                        addS3Headers(xhr, file.s3Config, file.dataUrl!, 'HEAD');
-                    } else {
-                        xhr.open('HEAD', file.dataUrl!, false);
-                    }
-                    xhr.setRequestHeader('Range', `bytes=0-`);
-                    xhr.send(null);
+                    try {
+                        // Send a dummy HEAD request with range protocol
+                        // -> good IFF status is 206 and contentLenght is present
+                        const xhr = new XMLHttpRequest();
+                        if (file.dataProtocol == DuckDBDataProtocol.S3) {
+                            xhr.open('HEAD', getHTTPUrl(file.s3Config, file.dataUrl!), false);
+                            addS3Headers(xhr, file.s3Config, file.dataUrl!, 'HEAD');
+                        } else {
+                            xhr.open('HEAD', file.dataUrl!, false);
+                        }
+                        xhr.setRequestHeader('Range', `bytes=0-`);
+                        xhr.send(null);
 
-                    // Supports range requests
-                    contentLength = xhr.getResponseHeader('Content-Length');
-                    if (contentLength !== null && xhr.status == 206) {
-                        const result = mod._malloc(2 * 8);
-                        mod.HEAPF64[(result >> 3) + 0] = +contentLength;
-                        mod.HEAPF64[(result >> 3) + 1] = 0;
-                        return result;
-                    }
+                        // Supports range requests
+                        contentLength = xhr.getResponseHeader('Content-Length');
+                        if (contentLength !== null && xhr.status == 206) {
+                            const result = mod._malloc(2 * 8);
+                            mod.HEAPF64[(result >> 3) + 0] = +contentLength;
+                            mod.HEAPF64[(result >> 3) + 1] = 0;
+                            return result;
+                        }
 
-                  } catch (e: any) {
-                    error = e;
-                    console.warn(`HEAD request with range header failed: ${e}`);
-                  }
+                    } catch (e: any) {
+                        error = e;
+                        console.warn(`HEAD request with range header failed: ${e}`);
+                    }
                 }
 
                 // Try to fallback to full read?
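For context outside the patch itself: the prepareDBFileHandle hunk above builds on the browser File System Access API for the origin-private file system (OPFS). Below is a minimal standalone sketch of the same walk-and-open pattern; the helper name openOpfsFile is illustrative and not part of this PR, and createSyncAccessHandle() is only available inside dedicated workers:

    // Resolve 'dir/sub/file.db' below the OPFS root, creating missing
    // directories along the way ('mkdir -p'), and return a synchronous
    // access handle for byte-level reads and writes.
    async function openOpfsFile(opfsPath: string): Promise<FileSystemSyncAccessHandle> {
        const segments = opfsPath.split(/\/|\\/);
        const fileName = segments.pop()!;
        let dir = await navigator.storage.getDirectory(); // OPFS root
        for (const segment of segments) {
            dir = await dir.getDirectoryHandle(segment, { create: true });
        }
        const file = await dir.getFileHandle(fileName, { create: true });
        return await file.createSyncAccessHandle(); // sync read/write/truncate/flush/close
    }

The sync access handle is what lets the synchronous WASM filesystem callbacks further down in this file serve reads and writes for OPFS-backed files without awaiting promises.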
@@ -223,7 +287,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { } head.setRequestHeader('Range', `bytes=0-`); head.send(null); - + // Supports range requests contentLength = head.getResponseHeader('Content-Length'); if (contentLength !== null && +contentLength > 1) { @@ -296,6 +360,20 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { mod.HEAPF64[(result >> 3) + 1] = buffer; return result; } + case DuckDBDataProtocol.BROWSER_FSACCESS: { + const handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files?.get(file.fileName); + if (!handle) { + throw new Error(`No OPFS access handle registered with name: ${file.fileName}`); + } + if (flags & FileFlags.FILE_FLAGS_FILE_CREATE_NEW) { + handle.truncate(0); + } + const result = mod._malloc(2 * 8); + const fileSize = handle.getSize(); + mod.HEAPF64[(result >> 3) + 0] = fileSize; + mod.HEAPF64[(result >> 3) + 1] = 0; + return result; + } } } catch (e: any) { // TODO (samansmink): this path causes the WASM code to hang @@ -343,11 +421,19 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { return 0; } const contentLength = xhr2.getResponseHeader('Content-Length'); - if (contentLength && (+contentLength > 1)) { - console.warn(`Range request for ${path} did not return a partial response: ${xhr2.status} "${xhr2.statusText}"`); + if (contentLength && +contentLength > 1) { + console.warn( + `Range request for ${path} did not return a partial response: ${xhr2.status} "${xhr2.statusText}"`, + ); } } mod.ccall('duckdb_web_fs_glob_add_path', null, ['string'], [path]); + } else { + for (const [filePath] of BROWSER_RUNTIME._files!.entries() || []) { + if (filePath.startsWith(path)) { + mod.ccall('duckdb_web_fs_glob_add_path', null, ['string'], [filePath]); + } + } } } catch (e: any) { console.log(e); @@ -372,6 +458,8 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { } xhr.send(null); return xhr.status == 206 || xhr.status == 200; + } else { + return BROWSER_RUNTIME._files.has(path); } } catch (e: any) { console.log(e); @@ -393,11 +481,13 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { // XXX Remove from registry return; case DuckDBDataProtocol.BROWSER_FSACCESS: { - const handle = BROWSER_RUNTIME._files?.get(file.fileName); + const handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files?.get(file.fileName); if (!handle) { throw new Error(`No OPFS access handle registered with name: ${file.fileName}`); } - return handle.flush(); + handle.flush(); + handle.close(); + BROWSER_RUNTIME._files.delete(file.fileName); } } }, @@ -461,8 +551,14 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { } else if (xhr.status == 200) { // TODO: here we are actually throwing away all non-relevant bytes, but this is still better than failing // proper solution would require notifying duckdb-wasm cache, while we are piggybackign on browser cache - console.warn(`Range request for ${file.dataUrl} did not return a partial response: ${xhr.status} "${xhr.statusText}"`); - const src = new Uint8Array(xhr.response, location, Math.min(xhr.response.byteLength-location, bytes)); + console.warn( + `Range request for ${file.dataUrl} did not return a partial response: ${xhr.status} "${xhr.statusText}"`, + ); + const src = new Uint8Array( + xhr.response, + location, + Math.min(xhr.response.byteLength - location, bytes), + ); mod.HEAPU8.set(src, buf); return src.byteLength; } else { @@ -486,7 +582,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { return data.byteLength; } case DuckDBDataProtocol.BROWSER_FSACCESS: { - const handle = BROWSER_RUNTIME._files?.get(file.fileName); + const 
handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files.get(file.fileName); if (!handle) { throw new Error(`No OPFS access handle registered with name: ${file.fileName}`); } @@ -523,7 +619,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { failWith(mod, 'cannot write using the html5 file reader api'); return 0; case DuckDBDataProtocol.BROWSER_FSACCESS: { - const handle = BROWSER_RUNTIME._files?.get(file.fileName); + const handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files?.get(file.fileName); if (!handle) { throw new Error(`No OPFS access handle registered with name: ${file.fileName}`); } diff --git a/packages/duckdb-wasm/src/parallel/worker_dispatcher.ts b/packages/duckdb-wasm/src/parallel/worker_dispatcher.ts index 5d1ea6b1c..188ab9973 100644 --- a/packages/duckdb-wasm/src/parallel/worker_dispatcher.ts +++ b/packages/duckdb-wasm/src/parallel/worker_dispatcher.ts @@ -1,4 +1,4 @@ -import { DuckDBBindings } from '../bindings'; +import { DuckDBBindings, DuckDBDataProtocol } from '../bindings'; import { WorkerResponseVariant, WorkerRequestVariant, WorkerRequestType, WorkerResponseType } from './worker_request'; import { Logger, LogEntryVariant } from '../log'; import { InstantiationProgress } from '../bindings/progress'; @@ -134,10 +134,16 @@ export abstract class AsyncDuckDBDispatcher implements Logger { this.sendOK(request); break; - case WorkerRequestType.OPEN: + case WorkerRequestType.OPEN: { + const path = request.data.path; + if (path?.startsWith('opfs://')) { + await this._bindings.prepareDBFileHandle(path, DuckDBDataProtocol.BROWSER_FSACCESS); + request.data.useDirectIO = true; + } this._bindings.open(request.data); this.sendOK(request); break; + } case WorkerRequestType.DROP_FILE: this._bindings.dropFile(request.data); this.sendOK(request); @@ -322,7 +328,7 @@ export abstract class AsyncDuckDBDispatcher implements Logger { break; case WorkerRequestType.REGISTER_FILE_HANDLE: - this._bindings.registerFileHandle( + await this._bindings.registerFileHandle( request.data[0], request.data[1], request.data[2], diff --git a/packages/duckdb-wasm/test/index_browser.ts b/packages/duckdb-wasm/test/index_browser.ts index 0e3ad0c75..1588c4fdd 100644 --- a/packages/duckdb-wasm/test/index_browser.ts +++ b/packages/duckdb-wasm/test/index_browser.ts @@ -100,6 +100,7 @@ import { testBindings, testAsyncBindings } from './bindings.test'; import { testBatchStream } from './batch_stream.test'; import { testAsyncBatchStream } from './batch_stream_async.test'; import { testFilesystem } from './filesystem.test'; +import { testOPFS } from './opfs.test'; import { testArrowInsert, testArrowInsertAsync } from './insert_arrow.test'; import { testJSONInsert, testJSONInsertAsync } from './insert_json.test'; import { testCSVInsert, testCSVInsertAsync } from './insert_csv.test'; @@ -128,6 +129,7 @@ testAsyncBindings(() => adb!, dataURL, duckdb.DuckDBDataProtocol.HTTP); testBatchStream(() => db!); testAsyncBatchStream(() => adb!); testFilesystem(() => adb!, resolveData, dataURL, duckdb.DuckDBDataProtocol.HTTP); +testOPFS(dataURL, () => DUCKDB_BUNDLE!); testArrowInsert(() => db!); testArrowInsertAsync(() => adb!); testJSONInsert(() => db!); diff --git a/packages/duckdb-wasm/test/opfs.test.ts b/packages/duckdb-wasm/test/opfs.test.ts new file mode 100644 index 000000000..407e5fc4a --- /dev/null +++ b/packages/duckdb-wasm/test/opfs.test.ts @@ -0,0 +1,157 @@ +import * as duckdb from '../src/'; +import {LogLevel} from '../src/'; +import * as arrow from 'apache-arrow'; + +export function testOPFS(baseDir: 
string, bundle: () => duckdb.DuckDBBundle): void {
+    let db: duckdb.AsyncDuckDB;
+    let conn: duckdb.AsyncDuckDBConnection;
+
+    beforeAll(async () => {
+        await removeFiles();
+    });
+
+    afterAll(async () => {
+        if (conn) {
+            await conn.close();
+        }
+        if (db) {
+            await db.terminate();
+        }
+        await removeFiles();
+    });
+
+    beforeEach(async () => {
+        await removeFiles();
+        const logger = new duckdb.ConsoleLogger(LogLevel.ERROR);
+        const worker = new Worker(bundle().mainWorker!);
+        db = new duckdb.AsyncDuckDB(logger, worker);
+        await db.instantiate(bundle().mainModule, bundle().pthreadWorker);
+        await db.open({
+            path: 'opfs://test.db',
+            accessMode: duckdb.DuckDBAccessMode.READ_WRITE
+        });
+        conn = await db.connect();
+    });
+
+    afterEach(async () => {
+        if (conn) {
+            await conn.close();
+        }
+        if (db) {
+            await db.terminate();
+        }
+        await removeFiles();
+    });
+
+    describe('Load Data', () => {
+        it('Import Small Parquet file', async () => {
+            await conn.send(`CREATE TABLE stu AS SELECT * FROM "${baseDir}/uni/studenten.parquet"`);
+            await conn.send(`CHECKPOINT;`);
+            const result = await conn.send(`SELECT matrnr FROM stu;`);
+            const batches = [];
+            for await (const batch of result) {
+                batches.push(batch);
+            }
+            const table = new arrow.Table<{ cnt: arrow.Int }>(batches);
+            expect(table.getChildAt(0)?.toArray()).toEqual(
+                new Int32Array([24002, 25403, 26120, 26830, 27550, 28106, 29120, 29555]),
+            );
+        });
+
+        it('Import Large Parquet file', async () => {
+            await conn.send(`CREATE TABLE lineitem AS SELECT * FROM "${baseDir}/tpch/0_01/parquet/lineitem.parquet"`);
+            await conn.send(`CHECKPOINT;`);
+            const result = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem;`);
+            const batches = [];
+            for await (const batch of result) {
+                batches.push(batch);
+            }
+            const table = new arrow.Table<{ cnt: arrow.Int }>(batches);
+            expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000);
+        });
+
+        it('Load Existing DB File in OPFS', async () => {
+            await conn.send(`CREATE TABLE tmp AS SELECT * FROM "${baseDir}/tpch/0_01/parquet/lineitem.parquet"`);
+            await conn.send(`CHECKPOINT;`);
+            await conn.close();
+            await db.terminate();
+
+            const logger = new duckdb.ConsoleLogger(LogLevel.ERROR);
+            const worker = new Worker(bundle().mainWorker!);
+            db = new duckdb.AsyncDuckDB(logger, worker);
+            await db.instantiate(bundle().mainModule, bundle().pthreadWorker);
+            await db.open({
+                path: 'opfs://test.db',
+                accessMode: duckdb.DuckDBAccessMode.READ_WRITE
+            });
+            conn = await db.connect();
+
+            const result = await conn.send(`SELECT count(*)::INTEGER as cnt FROM tmp;`);
+            const batches = [];
+            for await (const batch of result) {
+                batches.push(batch);
+            }
+            const table = new arrow.Table<{ cnt: arrow.Int }>(batches);
+            expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000);
+        });
+
+        it('Export as CSV to OPFS + Load CSV that is already in OPFS', async () => {
+            const opfsRoot = await navigator.storage.getDirectory();
+            const testHandle = await opfsRoot.getFileHandle('test.csv', {create: true});
+            await db.registerFileHandle('test.csv', testHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true);
+            await conn.send(`CREATE TABLE zzz AS SELECT * FROM "${baseDir}/tpch/0_01/parquet/lineitem.parquet"`);
+            await conn.send(`COPY (SELECT * FROM zzz) TO 'test.csv'`);
+            await conn.close();
+            await db.dropFile('test.csv');
+            await db.reset();
+
+            await db.open({});
+            conn = await db.connect();
+            await db.registerFileHandle('test.csv', testHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true);
+
+            const result = await conn.send(`SELECT count(*)::INTEGER as cnt FROM 'test.csv';`);
+            const batches = [];
+            for await (const batch of result) {
+                batches.push(batch);
+            }
+            const table = new arrow.Table<{ cnt: arrow.Int }>(batches);
+            expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000);
+        });
+
+        it('Load Parquet file that is already in OPFS', async () => {
+            const parquetBuffer = await fetch(`${baseDir}/tpch/0_01/parquet/lineitem.parquet`).then(res =>
+                res.arrayBuffer(),
+            );
+            const opfsRoot = await navigator.storage.getDirectory();
+            const fileHandle = await opfsRoot.getFileHandle('test.parquet', {create: true});
+            const writable = await fileHandle.createWritable();
+            await writable.write(parquetBuffer);
+            await writable.close();
+
+            await db.registerFileHandle('test.parquet', fileHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true);
+            await conn.send(`CREATE TABLE lineitem AS SELECT * FROM read_parquet('test.parquet')`);
+            await conn.send(`CHECKPOINT;`);
+
+            const result = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem;`);
+            const batches = [];
+            for await (const batch of result) {
+                batches.push(batch);
+            }
+            const table = new arrow.Table<{ cnt: arrow.Int }>(batches);
+            expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000);
+        });
+    });
+
+    async function removeFiles() {
+        const opfsRoot = await navigator.storage.getDirectory();
+        await opfsRoot.removeEntry('test.db').catch(() => {});
+        await opfsRoot.removeEntry('test.db.wal').catch(() => {});
+        await opfsRoot.removeEntry('test.csv').catch(() => {});
+        await opfsRoot.removeEntry('test.parquet').catch(() => {});
+    }
+}
diff --git a/patches/duckdb/bind_copy_direct_io.patch b/patches/duckdb/bind_copy_direct_io.patch
new file mode 100644
index 000000000..994917fe8
--- /dev/null
+++ b/patches/duckdb/bind_copy_direct_io.patch
@@ -0,0 +1,15 @@
+diff --git a/src/planner/binder/statement/bind_copy.cpp b/src/planner/binder/statement/bind_copy.cpp
+index 7db1db812d..60131d5916 100644
+--- a/src/planner/binder/statement/bind_copy.cpp
++++ b/src/planner/binder/statement/bind_copy.cpp
+@@ -137,7 +137,9 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt) {
+ 		throw NotImplementedException("Can't combine FILE_SIZE_BYTES and PARTITION_BY for COPY");
+ 	}
+ 	bool is_remote_file = FileSystem::IsRemoteFile(stmt.info->file_path);
+-	if (is_remote_file) {
++	if ( is_remote_file ) {
++		use_tmp_file = false;
++	} else if( config.options.use_direct_io ) {
+ 		use_tmp_file = false;
+ 	} else {
+ 		auto &fs = FileSystem::GetFileSystem(context);
diff --git a/patches/duckdb/fix_load_database.patch b/patches/duckdb/fix_load_database.patch
new file mode 100644
index 000000000..e9d785701
--- /dev/null
+++ b/patches/duckdb/fix_load_database.patch
@@ -0,0 +1,20 @@
+diff --git a/src/storage/storage_manager.cpp b/src/storage/storage_manager.cpp
+index 456cd22614..c4777dd28e 100644
+--- a/src/storage/storage_manager.cpp
++++ b/src/storage/storage_manager.cpp
+@@ -163,9 +163,12 @@ void SingleFileStorageManager::LoadDatabase(const optional_idx block_alloc_size)
+ 	options.use_direct_io = config.options.use_direct_io;
+ 	options.debug_initialize = config.options.debug_initialize;
+
+-	// Check if the database file already exists.
+-	// Note: a file can also exist if there was a ROLLBACK on a previous transaction creating that file.
+- if (!read_only && !fs.FileExists(path)) { ++ auto db_file_handle = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ | FileFlags::FILE_FLAGS_NULL_IF_NOT_EXISTS); ++ bool is_empty_file = db_file_handle->GetFileSize() == 0; ++ db_file_handle.reset(); ++ ++ // first check if the database exists ++ if (!read_only && ( !fs.FileExists(path) || ( options.use_direct_io && is_empty_file )) ) { + // file does not exist and we are in read-write mode + // create a new file + From 30669c402c1f15e9a733784ce668844a74cf6abe Mon Sep 17 00:00:00 2001 From: arkw Date: Tue, 17 Sep 2024 12:56:27 +0900 Subject: [PATCH 02/15] add test for url with long query string --- packages/duckdb-wasm/test/httpfs_test.ts | 18 ++++++++++++++++++ .../duckdb-wasm/test/string_test_helper.ts | 8 ++++++++ 2 files changed, 26 insertions(+) create mode 100644 packages/duckdb-wasm/test/string_test_helper.ts diff --git a/packages/duckdb-wasm/test/httpfs_test.ts b/packages/duckdb-wasm/test/httpfs_test.ts index 6a0762d13..3953effb9 100644 --- a/packages/duckdb-wasm/test/httpfs_test.ts +++ b/packages/duckdb-wasm/test/httpfs_test.ts @@ -2,6 +2,7 @@ import * as duckdb from '../src/'; import { getS3Params, S3Params, S3PayloadParams, createS3Headers, uriEncode, getHTTPUrl } from '../src/utils'; import { AsyncDuckDBConnection, DuckDBBindings, DuckDBBindingsBase, DuckDBModule } from '../src/'; import BROWSER_RUNTIME from '../src/bindings/runtime_browser'; +import {generateRandomString} from "./string_test_helper"; // S3 config for tests const BUCKET_NAME = 'test-bucket'; @@ -312,5 +313,22 @@ export function testHTTPFSAsync( ), ).toBeRejectedWithError('Invalid Error: File is not opened in write mode'); }); + + + it('can read parquet file from URL with long query string', async () => { + const queryString = generateRandomString(1500); + const result = await conn!.query( + `select * from "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.parquet?${queryString}";`, + ); + expect(Number((result.getChildAt(0)?.get(6)))).toEqual(Number(29120)); + }); + + it('can read csv file from URL with long query string', async () => { + const queryString = generateRandomString(1500); + const result = await conn!.query( + `select * from "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.csv?${queryString}";`, + ); + expect(Number((result.getChildAt(0)?.get(6)))).toEqual(Number(29120)); + }); }); } diff --git a/packages/duckdb-wasm/test/string_test_helper.ts b/packages/duckdb-wasm/test/string_test_helper.ts new file mode 100644 index 000000000..6ca13168e --- /dev/null +++ b/packages/duckdb-wasm/test/string_test_helper.ts @@ -0,0 +1,8 @@ +export function generateRandomString(length: number): string { + const chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'; + let result = ''; + for (let i = 0; i < length; i++) { + result += chars.charAt(Math.floor(Math.random() * chars.length)); + } + return result; +} From caaf8ad3015d459b3669615bcb29bd553624edcd Mon Sep 17 00:00:00 2001 From: arkw Date: Wed, 18 Sep 2024 14:55:48 +0900 Subject: [PATCH 03/15] update s3rver cors settings --- packages/duckdb-wasm/karma/s3rver/s3rver.js | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/duckdb-wasm/karma/s3rver/s3rver.js b/packages/duckdb-wasm/karma/s3rver/s3rver.js index d133af956..72a8c51ec 100644 --- a/packages/duckdb-wasm/karma/s3rver/s3rver.js +++ b/packages/duckdb-wasm/karma/s3rver/s3rver.js @@ -7,6 +7,7 @@ const CORS_CONFIG = "\n" + " GET\n" + " HEAD\n" + " *\n" + + " Content-Range\n" + " \n" + ""; From 0977660a5d6aacc5e75b2f871b67e99f1cad2f23 Mon Sep 
17 00:00:00 2001 From: arkw Date: Sat, 21 Sep 2024 15:23:59 +0900 Subject: [PATCH 04/15] update httpfs test. --- packages/duckdb-wasm/test/httpfs_test.ts | 11 +++++------ packages/duckdb-wasm/test/string_test_helper.ts | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/packages/duckdb-wasm/test/httpfs_test.ts b/packages/duckdb-wasm/test/httpfs_test.ts index 3953effb9..a48a11a50 100644 --- a/packages/duckdb-wasm/test/httpfs_test.ts +++ b/packages/duckdb-wasm/test/httpfs_test.ts @@ -2,7 +2,7 @@ import * as duckdb from '../src/'; import { getS3Params, S3Params, S3PayloadParams, createS3Headers, uriEncode, getHTTPUrl } from '../src/utils'; import { AsyncDuckDBConnection, DuckDBBindings, DuckDBBindingsBase, DuckDBModule } from '../src/'; import BROWSER_RUNTIME from '../src/bindings/runtime_browser'; -import {generateRandomString} from "./string_test_helper"; +import {generateLongQueryString} from "./string_test_helper"; // S3 config for tests const BUCKET_NAME = 'test-bucket'; @@ -314,19 +314,18 @@ export function testHTTPFSAsync( ).toBeRejectedWithError('Invalid Error: File is not opened in write mode'); }); - it('can read parquet file from URL with long query string', async () => { - const queryString = generateRandomString(1500); + const queryString = generateLongQueryString(); const result = await conn!.query( - `select * from "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.parquet?${queryString}";`, + `SELECT * FROM "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.parquet?${queryString}";`, ); expect(Number((result.getChildAt(0)?.get(6)))).toEqual(Number(29120)); }); it('can read csv file from URL with long query string', async () => { - const queryString = generateRandomString(1500); + const queryString = generateLongQueryString(); const result = await conn!.query( - `select * from "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.csv?${queryString}";`, + `SELECT * FROM "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.csv?${queryString}";`, ); expect(Number((result.getChildAt(0)?.get(6)))).toEqual(Number(29120)); }); diff --git a/packages/duckdb-wasm/test/string_test_helper.ts b/packages/duckdb-wasm/test/string_test_helper.ts index 6ca13168e..ebad5be99 100644 --- a/packages/duckdb-wasm/test/string_test_helper.ts +++ b/packages/duckdb-wasm/test/string_test_helper.ts @@ -1,3 +1,20 @@ +export function generateLongQueryString(): string { + const aaa = generateRandomString(512); + const ddd = generateRandomString(512); + const ccc = generateRandomString(256); + const eee = generateRandomString(128); + const ggg = generateRandomString(64); + const hhh = generateRandomString(32); + + return `T=Long` + + `&T-AAA=${aaa}` + + `&T-CCC=${ccc}` + + `&T-DDD=${ddd}` + + `&T-EEE=${eee}` + + `&T-GGG=${ggg}` + + `&T-HHH=${hhh}`; +} + export function generateRandomString(length: number): string { const chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'; let result = ''; From 426a1a41939b6a053028f4a6ff5201cad8e53b30 Mon Sep 17 00:00:00 2001 From: arkw Date: Tue, 24 Sep 2024 23:35:28 +0900 Subject: [PATCH 05/15] update httpfs test. 
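The long query string is now built from fixed repeated characters instead of random
ones, so the test is deterministic across runs, and the parameter names mimic the
shape of an S3 presigned URL. Illustrative form of the URLs under test (endpoint and
bucket come from the test's S3 config; the repeated values are abbreviated here):

    ${S3_ENDPOINT}/test-bucket/correct_auth_test.parquet?test=inline&Test-Security-Token=AAA...&Test-Algorithm=CCC...&Test-Date=DDD...&Test-SignedHeaders=host&Test-Expires=43200&Test-Credential=EEE...&Test-Signature=GGG...

With 512 characters for the token and date fields and 128-256 for the rest, the
query part alone is well over a kilobyte.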
--- packages/duckdb-wasm/test/httpfs_test.ts | 10 +++++ .../duckdb-wasm/test/string_test_helper.ts | 38 +++++++++---------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/packages/duckdb-wasm/test/httpfs_test.ts b/packages/duckdb-wasm/test/httpfs_test.ts index a48a11a50..7f5284b93 100644 --- a/packages/duckdb-wasm/test/httpfs_test.ts +++ b/packages/duckdb-wasm/test/httpfs_test.ts @@ -315,7 +315,12 @@ export function testHTTPFSAsync( }); it('can read parquet file from URL with long query string', async () => { + // Create S3 file + let data = await resolveData('/uni/studenten.parquet'); + await putTestFileToS3('correct_auth_test', 'parquet', data); + // Generate a long query string, similar to an S3 Presigned URL const queryString = generateLongQueryString(); + // Execute the query const result = await conn!.query( `SELECT * FROM "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.parquet?${queryString}";`, ); @@ -323,7 +328,12 @@ export function testHTTPFSAsync( }); it('can read csv file from URL with long query string', async () => { + // Create S3 file + let data = await resolveData('/uni/studenten.parquet'); + await putTestFileToS3('correct_auth_test', 'csv', data); + // Generate a long query string, similar to an S3 Presigned URL const queryString = generateLongQueryString(); + // Execute the query const result = await conn!.query( `SELECT * FROM "${S3_ENDPOINT}/${BUCKET_NAME}/correct_auth_test.csv?${queryString}";`, ); diff --git a/packages/duckdb-wasm/test/string_test_helper.ts b/packages/duckdb-wasm/test/string_test_helper.ts index ebad5be99..26a6cb430 100644 --- a/packages/duckdb-wasm/test/string_test_helper.ts +++ b/packages/duckdb-wasm/test/string_test_helper.ts @@ -1,25 +1,21 @@ export function generateLongQueryString(): string { - const aaa = generateRandomString(512); - const ddd = generateRandomString(512); - const ccc = generateRandomString(256); - const eee = generateRandomString(128); - const ggg = generateRandomString(64); - const hhh = generateRandomString(32); + const aaa = repeatCharacter('A', 512); + const ccc = repeatCharacter('C', 256); + const ddd = repeatCharacter('D', 512); + const eee = repeatCharacter('E', 256); + const ggg = repeatCharacter('G', 128); + const hhh = repeatCharacter('H', 64); - return `T=Long` + - `&T-AAA=${aaa}` + - `&T-CCC=${ccc}` + - `&T-DDD=${ddd}` + - `&T-EEE=${eee}` + - `&T-GGG=${ggg}` + - `&T-HHH=${hhh}`; + return `test=inline` + + `&Test-Security-Token=${aaa}` + + `&Test-Algorithm=${ccc}` + + `&Test-Date=${ddd}` + + `&Test-SignedHeaders=host` + + `&Test-Expires=43200` + + `&Test-Credential=${eee}` + + `&Test-Signature=${ggg}`; } -export function generateRandomString(length: number): string { - const chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'; - let result = ''; - for (let i = 0; i < length; i++) { - result += chars.charAt(Math.floor(Math.random() * chars.length)); - } - return result; -} +export function repeatCharacter(char: string, length: number): string { + return char.repeat(length); +} \ No newline at end of file From 2916b3bd9305c0b265a05e74bae18b26b4632331 Mon Sep 17 00:00:00 2001 From: arkw Date: Tue, 24 Sep 2024 23:40:20 +0900 Subject: [PATCH 06/15] update httpfs test for eslint --- packages/duckdb-wasm/test/httpfs_test.ts | 4 ++-- packages/duckdb-wasm/test/string_test_helper.ts | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/duckdb-wasm/test/httpfs_test.ts b/packages/duckdb-wasm/test/httpfs_test.ts index 7f5284b93..4ffe2ac0b 100644 --- 
a/packages/duckdb-wasm/test/httpfs_test.ts +++ b/packages/duckdb-wasm/test/httpfs_test.ts @@ -316,7 +316,7 @@ export function testHTTPFSAsync( it('can read parquet file from URL with long query string', async () => { // Create S3 file - let data = await resolveData('/uni/studenten.parquet'); + const data = await resolveData('/uni/studenten.parquet'); await putTestFileToS3('correct_auth_test', 'parquet', data); // Generate a long query string, similar to an S3 Presigned URL const queryString = generateLongQueryString(); @@ -329,7 +329,7 @@ export function testHTTPFSAsync( it('can read csv file from URL with long query string', async () => { // Create S3 file - let data = await resolveData('/uni/studenten.parquet'); + const data = await resolveData('/uni/studenten.parquet'); await putTestFileToS3('correct_auth_test', 'csv', data); // Generate a long query string, similar to an S3 Presigned URL const queryString = generateLongQueryString(); diff --git a/packages/duckdb-wasm/test/string_test_helper.ts b/packages/duckdb-wasm/test/string_test_helper.ts index 26a6cb430..96e83b823 100644 --- a/packages/duckdb-wasm/test/string_test_helper.ts +++ b/packages/duckdb-wasm/test/string_test_helper.ts @@ -4,7 +4,6 @@ export function generateLongQueryString(): string { const ddd = repeatCharacter('D', 512); const eee = repeatCharacter('E', 256); const ggg = repeatCharacter('G', 128); - const hhh = repeatCharacter('H', 64); return `test=inline` + `&Test-Security-Token=${aaa}` + From fda93d09753508975ee8c27dc4961dd9b4dcfa08 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Fri, 27 Sep 2024 11:26:49 +0200 Subject: [PATCH 07/15] Fixup patch, now allowing installing from other repositories via 'INSTALL x FROM community' --- patches/duckdb/extension_install_rework.patch | 93 +++++++++++++++++-- 1 file changed, 84 insertions(+), 9 deletions(-) diff --git a/patches/duckdb/extension_install_rework.patch b/patches/duckdb/extension_install_rework.patch index a689cabf6..ad04aeb74 100644 --- a/patches/duckdb/extension_install_rework.patch +++ b/patches/duckdb/extension_install_rework.patch @@ -1,3 +1,18 @@ +diff --git a/src/include/duckdb/main/database.hpp b/src/include/duckdb/main/database.hpp +index 222a36c051..fb895920ef 100644 +--- a/src/include/duckdb/main/database.hpp ++++ b/src/include/duckdb/main/database.hpp +@@ -91,6 +91,10 @@ private: + ValidChecker db_validity; + unique_ptr db_file_system; + shared_ptr db_cache_entry; ++public: ++ static void SetPreferredRepository(const string& extension, const string &repository); ++ static string GetPreferredRepository(const string& extension); ++ static unordered_map extensionsRepos; + }; + + //! The database object. This object holds the catalog and all the diff --git a/src/include/duckdb/main/extension_install_info.hpp b/src/include/duckdb/main/extension_install_info.hpp index 6ccd1a1156..8040f537b6 100644 --- a/src/include/duckdb/main/extension_install_info.hpp @@ -15,11 +30,44 @@ index 6ccd1a1156..8040f537b6 100644 //! 
Debugging repositories (target local, relative paths that are produced by DuckDB's build system) static constexpr const char *BUILD_DEBUG_REPOSITORY_PATH = "./build/debug/repository"; +diff --git a/src/main/database.cpp b/src/main/database.cpp +index 4308c4a016..fe23c36ead 100644 +--- a/src/main/database.cpp ++++ b/src/main/database.cpp +@@ -328,6 +328,28 @@ DuckDB::DuckDB(DatabaseInstance &instance_p) : instance(instance_p.shared_from_t + DuckDB::~DuckDB() { + } + ++unordered_map DatabaseInstance::extensionsRepos = {}; ++ ++void DatabaseInstance::SetPreferredRepository(const string& extension, const string &repository) { ++ auto &x = extensionsRepos; ++ auto it = x.find(extension); ++ if (it != x.end()) { ++ it->second=repository; ++ } else { ++ x.emplace(extension, repository); ++ } ++} ++ ++string DatabaseInstance::GetPreferredRepository(const string& extension) { ++ const auto &x = extensionsRepos; ++ auto it = x.find(extension); ++ if (it != x.end()) { ++ return it->second; ++ } ++ return ""; ++} ++ ++ + SecretManager &DatabaseInstance::GetSecretManager() { + return *config.secret_manager; + } diff --git a/src/main/extension/extension_helper.cpp b/src/main/extension/extension_helper.cpp -index c821caedea..aae791b786 100644 +index 494832417e..17a39d04b4 100644 --- a/src/main/extension/extension_helper.cpp +++ b/src/main/extension/extension_helper.cpp -@@ -319,7 +319,6 @@ vector ExtensionHelper::UpdateExtensions(ClientContext &c +@@ -328,7 +328,6 @@ vector ExtensionHelper::UpdateExtensions(ClientContext &c vector result; DatabaseInstance &db = DatabaseInstance::GetDatabase(context); @@ -27,7 +75,7 @@ index c821caedea..aae791b786 100644 case_insensitive_set_t seen_extensions; // scan the install directory for installed extensions -@@ -336,7 +335,6 @@ vector ExtensionHelper::UpdateExtensions(ClientContext &c +@@ -345,7 +344,6 @@ vector ExtensionHelper::UpdateExtensions(ClientContext &c result.push_back(UpdateExtensionInternal(context, db, fs, fs.JoinPath(ext_directory, path), extension_name)); }); @@ -36,10 +84,30 @@ index c821caedea..aae791b786 100644 return result; } diff --git a/src/main/extension/extension_install.cpp b/src/main/extension/extension_install.cpp -index d190ea197c..157db58641 100644 +index b0ca9fb775..67dfcdfb26 100644 --- a/src/main/extension/extension_install.cpp +++ b/src/main/extension/extension_install.cpp -@@ -204,7 +204,7 @@ string ExtensionHelper::ExtensionUrlTemplate(optional_ptr ExtensionHelper::InstallExtension(DatabaseInstance &db, FileSystem &fs, + const string &extension, + ExtensionInstallOptions &options) { ++ if (options.repository) { ++ DatabaseInstance::SetPreferredRepository(extension, options.repository->path); ++ } + #ifdef WASM_LOADABLE_EXTENSIONS + // Install is currently a no-op + return nullptr; +@@ -154,6 +157,9 @@ unique_ptr ExtensionHelper::InstallExtension(DatabaseInsta + + unique_ptr ExtensionHelper::InstallExtension(ClientContext &context, const string &extension, + ExtensionInstallOptions &options) { ++ if (options.repository) { ++ DatabaseInstance::SetPreferredRepository(extension, options.repository->path); ++ } + #ifdef WASM_LOADABLE_EXTENSIONS + // Install is currently a no-op + return nullptr; +@@ -198,7 +204,7 @@ string ExtensionHelper::ExtensionUrlTemplate(optional_ptr Date: Sun, 29 Sep 2024 13:38:41 +0900 Subject: [PATCH 08/15] fix dropfile --- lib/include/duckdb/web/io/web_filesystem.h | 2 + lib/js-stubs.js | 4 + lib/src/io/web_filesystem.cc | 9 ++ lib/src/webdb.cc | 21 ++- .../duckdb-wasm/src/bindings/bindings_base.ts | 6 +- 
packages/duckdb-wasm/src/bindings/runtime.ts | 3 + .../src/bindings/runtime_browser.ts | 48 +++++-- .../duckdb-wasm/src/bindings/runtime_node.ts | 1 + packages/duckdb-wasm/test/opfs.test.ts | 131 +++++++++++++++--- 9 files changed, 185 insertions(+), 40 deletions(-) diff --git a/lib/include/duckdb/web/io/web_filesystem.h b/lib/include/duckdb/web/io/web_filesystem.h index 71a1ef780..96638f992 100644 --- a/lib/include/duckdb/web/io/web_filesystem.h +++ b/lib/include/duckdb/web/io/web_filesystem.h @@ -209,6 +209,8 @@ class WebFileSystem : public duckdb::FileSystem { DataBuffer file_buffer); /// Try to drop a specific file bool TryDropFile(std::string_view file_name); + /// drop a specific file + void DropFile(std::string_view file_name); /// Drop all files without references (including buffers) void DropDanglingFiles(); /// Configure file statistics diff --git a/lib/js-stubs.js b/lib/js-stubs.js index 2371d1e93..86a8cfbfc 100644 --- a/lib/js-stubs.js +++ b/lib/js-stubs.js @@ -15,6 +15,10 @@ addToLibrary({ duckdb_web_fs_file_sync: function (fileId) { return globalThis.DUCKDB_RUNTIME.syncFile(Module, fileId); }, + duckdb_web_fs_file_drop_file__sig: 'vpi', + duckdb_web_fs_file_drop_file: function (fileName, fileNameLen) { + return globalThis.DUCKDB_RUNTIME.dropFile(Module, fileName, fileNameLen); + }, duckdb_web_fs_file_close__sig: 'vi', duckdb_web_fs_file_close: function (fileId) { return globalThis.DUCKDB_RUNTIME.closeFile(Module, fileId); diff --git a/lib/src/io/web_filesystem.cc b/lib/src/io/web_filesystem.cc index 8faaa8ca5..aa6cdfa28 100644 --- a/lib/src/io/web_filesystem.cc +++ b/lib/src/io/web_filesystem.cc @@ -117,6 +117,7 @@ RT_FN(void duckdb_web_fs_file_close(size_t file_id), { auto &infos = GetLocalState(); infos.handles.erase(file_id); }); +RT_FN(void duckdb_web_fs_file_drop_file(const char *fileName, size_t pathLen), {}); RT_FN(void duckdb_web_fs_file_truncate(size_t file_id, double new_size), { GetOrOpen(file_id).Truncate(new_size); }); RT_FN(time_t duckdb_web_fs_file_get_last_modified_time(size_t file_id), { auto &file = GetOrOpen(file_id); @@ -455,6 +456,7 @@ void WebFileSystem::DropDanglingFiles() { for (auto &[file_id, file] : files_by_id_) { if (file->handle_count_ == 0) { files_by_name_.erase(file->file_name_); + DropFile(file->file_name_); if (file->data_url_.has_value()) { files_by_url_.erase(file->data_url_.value()); } @@ -483,6 +485,13 @@ bool WebFileSystem::TryDropFile(std::string_view file_name) { return false; } +/// drop a file +void WebFileSystem::DropFile(std::string_view file_name) { + DEBUG_TRACE(); + std::string fileNameS = std::string{file_name}; + duckdb_web_fs_file_drop_file(fileNameS.c_str(), fileNameS.size()); +} + /// Write the global filesystem info rapidjson::Value WebFileSystem::WriteGlobalFileInfo(rapidjson::Document &doc, uint32_t cache_epoch) { DEBUG_TRACE(); diff --git a/lib/src/webdb.cc b/lib/src/webdb.cc index 981e5d618..23639047c 100644 --- a/lib/src/webdb.cc +++ b/lib/src/webdb.cc @@ -912,18 +912,29 @@ arrow::Status WebDB::RegisterFileBuffer(std::string_view file_name, std::unique_ /// Drop all files arrow::Status WebDB::DropFiles() { file_page_buffer_->DropDanglingFiles(); - pinned_web_files_.clear(); + std::vector files_to_drop; + for (const auto& [key, handle] : pinned_web_files_) { + files_to_drop.push_back(handle->GetName()); + } + for (const auto& fileName : files_to_drop) { + arrow::Status status = DropFile(fileName); + if (!status.ok()) { + return arrow::Status::Invalid("Failed to drop file: " + fileName); + } + } if (auto fs = 
io::WebFileSystem::Get()) { fs->DropDanglingFiles(); } return arrow::Status::OK(); } /// Drop a file -arrow::Status WebDB::DropFile(std::string_view file_name) { - file_page_buffer_->TryDropFile(file_name); - pinned_web_files_.erase(file_name); +arrow::Status WebDB::DropFile(std::string_view fileName) { + file_page_buffer_->TryDropFile(fileName); + pinned_web_files_.erase(fileName); if (auto fs = io::WebFileSystem::Get()) { - if (!fs->TryDropFile(file_name)) { + if (fs->TryDropFile(fileName)) { + fs->DropFile(fileName); + } else { return arrow::Status::Invalid("file is in use"); } } diff --git a/packages/duckdb-wasm/src/bindings/bindings_base.ts b/packages/duckdb-wasm/src/bindings/bindings_base.ts index 571f9fe48..af940318e 100644 --- a/packages/duckdb-wasm/src/bindings/bindings_base.ts +++ b/packages/duckdb-wasm/src/bindings/bindings_base.ts @@ -473,7 +473,11 @@ export abstract class DuckDBBindingsBase implements DuckDBBindings { if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS && handle instanceof FileSystemFileHandle) { // handle is an async handle, should convert to sync handle const fileHandle: FileSystemFileHandle = handle as any; - handle = (await fileHandle.createSyncAccessHandle()) as any; + try { + handle = (await fileHandle.createSyncAccessHandle()) as any; + } catch (e: any) { + throw new Error( e.message + ":" + name ); + } } const [s, d, n] = callSRet( this.mod, diff --git a/packages/duckdb-wasm/src/bindings/runtime.ts b/packages/duckdb-wasm/src/bindings/runtime.ts index cb501c3a6..4bc360db3 100644 --- a/packages/duckdb-wasm/src/bindings/runtime.ts +++ b/packages/duckdb-wasm/src/bindings/runtime.ts @@ -140,6 +140,7 @@ export interface DuckDBRuntime { openFile(mod: DuckDBModule, fileId: number, flags: FileFlags): void; syncFile(mod: DuckDBModule, fileId: number): void; closeFile(mod: DuckDBModule, fileId: number): void; + dropFile(mod: DuckDBModule, fileNamePtr: number, fileNameLen: number): void; getLastFileModificationTime(mod: DuckDBModule, fileId: number): number; truncateFile(mod: DuckDBModule, fileId: number, newSize: number): void; readFile(mod: DuckDBModule, fileId: number, buffer: number, bytes: number, location: number): number; @@ -155,6 +156,7 @@ export interface DuckDBRuntime { checkFile(mod: DuckDBModule, pathPtr: number, pathLen: number): boolean; removeFile(mod: DuckDBModule, pathPtr: number, pathLen: number): void; + // Prepare a file handle that can only be acquired asynchronously prepareDBFileHandle?: (path: string, protocol: DuckDBDataProtocol) => Promise<PreparedDBFileHandle[]>; // Call a scalar UDF function @@ -177,6 +179,7 @@ export const DEFAULT_RUNTIME: DuckDBRuntime = { openFile: (_mod: DuckDBModule, _fileId: number, flags: FileFlags): void => {}, syncFile: (_mod: DuckDBModule, _fileId: number): void => {}, closeFile: (_mod: DuckDBModule, _fileId: number): void => {}, + dropFile: (_mod: DuckDBModule, _fileNamePtr: number, _fileNameLen: number): void => {}, getLastFileModificationTime: (_mod: DuckDBModule, _fileId: number): number => { return 0; },
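For downstream runtimes, the new `dropFile` callback receives the dropped file's name as a pointer/length pair into WASM memory, mirroring `checkFile` and `removeFile` above. A minimal sketch of a custom implementation follows (module-internal imports as in this package's sources; the `openHandles` cache is illustrative and not part of this patch):

```ts
import { DEFAULT_RUNTIME, DuckDBRuntime, readString } from './runtime';
import { DuckDBModule } from './duckdb_module';

// Illustrative cache of OPFS sync access handles, keyed by file name.
const openHandles = new Map<string, FileSystemSyncAccessHandle>();

export const CUSTOM_RUNTIME: DuckDBRuntime = {
    ...DEFAULT_RUNTIME,
    dropFile: (mod: DuckDBModule, fileNamePtr: number, fileNameLen: number): void => {
        // Decode the UTF-8 file name handed over from C++ via duckdb_web_fs_file_drop_file.
        const fileName = readString(mod, fileNamePtr, fileNameLen);
        const handle = openHandles.get(fileName);
        if (handle !== undefined) {
            // Flush pending writes and release the OPFS lock before forgetting the handle.
            handle.flush();
            handle.close();
            openHandles.delete(fileName);
        }
    },
};
```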
diff --git a/packages/duckdb-wasm/src/bindings/runtime_browser.ts b/packages/duckdb-wasm/src/bindings/runtime_browser.ts index ef01d6b4c..8e46955fe 100644 --- a/packages/duckdb-wasm/src/bindings/runtime_browser.ts +++ b/packages/duckdb-wasm/src/bindings/runtime_browser.ts @@ -1,5 +1,5 @@ -import { StatusCode } from '../status'; -import { addS3Headers, getHTTPUrl } from '../utils'; +import {StatusCode} from '../status'; +import {addS3Headers, getHTTPUrl} from '../utils'; import { callSRet, @@ -23,7 +23,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { _files: Map; _fileInfoCache: Map; _globalFileInfo: DuckDBGlobalFileInfo | null; - _preparedHandles: Record; + _preparedHandles: Record; getFileInfo(mod: DuckDBModule, fileId: number): DuckDBFileInfo | null; getGlobalFileInfo(mod: DuckDBModule): DuckDBGlobalFileInfo | null; @@ -93,7 +93,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { if (info == null) { return null; } - BROWSER_RUNTIME._globalFileInfo = { ...info, blob: null } as DuckDBGlobalFileInfo; + BROWSER_RUNTIME._globalFileInfo = { ...info, blob: null} as DuckDBGlobalFileInfo; return BROWSER_RUNTIME._globalFileInfo; } catch (e: any) { @@ -137,13 +137,17 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { } throw e; }); - const handle = await fileHandle.createSyncAccessHandle(); - BROWSER_RUNTIME._preparedHandles[path] = handle; - return { - path, - handle, - fromCached: false, - }; + try { + const handle = await fileHandle.createSyncAccessHandle(); + BROWSER_RUNTIME._preparedHandles[path] = handle; + return { + path, + handle, + fromCached: false, + }; + } catch (e: any) { + throw new Error(e.message + ":" + path); + } }; const result: PreparedDBFileHandle[] = []; for (const filePath of filePaths) { @@ -485,9 +489,25 @@ export const BROWSER_RUNTIME: DuckDBRuntime & { if (!handle) { throw new Error(`No OPFS access handle registered with name: ${file.fileName}`); } - handle.flush(); - handle.close(); - BROWSER_RUNTIME._files.delete(file.fileName); + return handle.flush(); + } + } + }, + dropFile: (mod: DuckDBModule, fileNamePtr: number, fileNameLen: number) => { + const fileName = readString(mod, fileNamePtr, fileNameLen); + const handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files?.get(fileName); + if (handle) { + BROWSER_RUNTIME._files.delete(fileName); + if (handle instanceof FileSystemSyncAccessHandle) { + try { + handle.flush(); + handle.close(); + } catch (e: any) { + throw new Error(`Cannot drop file with name: ${fileName}`); + } + } + if (handle instanceof Blob) { + // nothing } } }, diff --git a/packages/duckdb-wasm/src/bindings/runtime_node.ts b/packages/duckdb-wasm/src/bindings/runtime_node.ts index aa63822e2..2f92368bc 100644 --- a/packages/duckdb-wasm/src/bindings/runtime_node.ts +++ b/packages/duckdb-wasm/src/bindings/runtime_node.ts @@ -127,6 +127,7 @@ export const NODE_RUNTIME: DuckDBRuntime & { } return 0; }, + dropFile: (mod: DuckDBModule, _fileNamePtr: number, _fileNameLen: number) => {}, truncateFile: (mod: DuckDBModule, fileId: number, newSize: number) => { try { const file = NODE_RUNTIME.resolveFileInfo(mod, fileId); diff --git a/packages/duckdb-wasm/test/opfs.test.ts b/packages/duckdb-wasm/test/opfs.test.ts index 407e5fc4a..ddf52eafb 100644 --- a/packages/duckdb-wasm/test/opfs.test.ts +++ b/packages/duckdb-wasm/test/opfs.test.ts @@ -44,7 +44,7 @@ export function testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo removeFiles(); }); - describe('Load Data', () => { + describe('Load Data in OPFS', () => { it('Imporet Small Parquet file', async () => { await conn.send(`CREATE TABLE stu AS SELECT * FROM "${baseDir}/uni/studenten.parquet"`); await conn.send(`CHECKPOINT;`); @@ -71,7 +71,7 @@ export function testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); }); - it('Load Existing DB File in OPFS', async () => { + it('Load Existing DB File', async () => { await conn.send(`CREATE TABLE tmp AS SELECT * FROM "${baseDir}/tpch/0_01/parquet/lineitem.parquet"`); await conn.send(`CHECKPOINT;`);
await conn.close(); @@ -96,7 +96,57 @@ export function testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); }); - it('Export as CSV to OPFS + Load CSV that are already in OPFS', async () => { + it('Load Parquet file that are already', async () => { + const parquetBuffer = await fetch(`${baseDir}/tpch/0_01/parquet/lineitem.parquet`).then(res => + res.arrayBuffer(), + ); + const opfsRoot = await navigator.storage.getDirectory(); + const fileHandle = await opfsRoot.getFileHandle('test.parquet', {create: true}); + const writable = await fileHandle.createWritable(); + await writable.write(parquetBuffer); + await writable.close(); + + await db.registerFileHandle('test.parquet', fileHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + await conn.send(`CREATE TABLE lineitem1 AS SELECT * FROM read_parquet('test.parquet')`); + await conn.send(`CHECKPOINT;`); + await conn.send(`CREATE TABLE lineitem2 AS SELECT * FROM read_parquet('test.parquet')`); + await conn.send(`CHECKPOINT;`); + await conn.send(`CREATE TABLE lineitem3 AS SELECT * FROM read_parquet('test.parquet')`); + await conn.send(`CHECKPOINT;`); + + { + const result1 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem1;`); + const batches1 = []; + for await (const batch of result1) { + batches1.push(batch); + } + const table1 = await new arrow.Table<{ cnt: arrow.Int }>(batches1); + expect(table1.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + } + + { + const result2 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem2;`); + const batches2 = []; + for await (const batch of result2) { + batches2.push(batch); + } + const table2 = await new arrow.Table<{ cnt: arrow.Int }>(batches2); + expect(table2.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + } + + { + const result3 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem3;`); + const batches3 = []; + for await (const batch of result3) { + batches3.push(batch); + } + const table3 = await new arrow.Table<{ cnt: arrow.Int }>(batches3); + expect(table3.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + } + + }); + + it('Drop File + Export as CSV to OPFS + Load CSV', async () => { const opfsRoot = await navigator.storage.getDirectory(); const testHandle = await opfsRoot.getFileHandle('test.csv', {create: true}); await db.registerFileHandle('test.csv', testHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); @@ -117,29 +167,64 @@ export function testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo } const table = await new arrow.Table<{ cnt: arrow.Int }>(batches); expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + + await db.dropFile('test.csv'); }); - it('Load Parquet file that are already in OPFS', async () => { - const parquetBuffer = await fetch(`${baseDir}/tpch/0_01/parquet/lineitem.parquet`).then(res => - res.arrayBuffer(), - ); + + it('Drop Files + Export as CSV to OPFS + Load CSV', async () => { const opfsRoot = await navigator.storage.getDirectory(); - const fileHandle = await opfsRoot.getFileHandle('test.parquet', {create: true}); - const writable = await fileHandle.createWritable(); - await writable.write(parquetBuffer); - await writable.close(); + const testHandle1 = await opfsRoot.getFileHandle('test1.csv', {create: true}); + const testHandle2 = await opfsRoot.getFileHandle('test2.csv', {create: true}); + const testHandle3 = await opfsRoot.getFileHandle('test3.csv', {create: true}); + await db.registerFileHandle('test1.csv', testHandle1, 
duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + await db.registerFileHandle('test2.csv', testHandle2, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + await db.registerFileHandle('test3.csv', testHandle3, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); - await db.registerFileHandle('test.parquet', fileHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); - await conn.send(`CREATE TABLE lineitem AS SELECT * FROM read_parquet('test.parquet')`); - await conn.send(`CHECKPOINT;`); + await conn.send(`CREATE TABLE zzz AS SELECT * FROM "${baseDir}/tpch/0_01/parquet/lineitem.parquet"`); + await conn.send(`COPY (SELECT * FROM zzz) TO 'test1.csv'`); + await conn.send(`COPY (SELECT * FROM zzz) TO 'test2.csv'`); + await conn.send(`COPY (SELECT * FROM zzz) TO 'test3.csv'`); + await conn.close(); - const result = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem;`); - const batches = []; - for await (const batch of result) { - batches.push(batch); + await db.dropFiles(); + await db.reset(); + + await db.open({}); + conn = await db.connect(); + await db.registerFileHandle('test1.csv', testHandle1, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + await db.registerFileHandle('test2.csv', testHandle2, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + await db.registerFileHandle('test3.csv', testHandle3, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + + { + const result1 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM 'test1.csv';`); + const batches1 = []; + for await (const batch of result1) { + batches1.push(batch); + } + const table1 = await new arrow.Table<{ cnt: arrow.Int }>(batches1); + expect(table1.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); } - const table = await new arrow.Table<{ cnt: arrow.Int }>(batches); - expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + { + const result2 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM 'test2.csv';`); + const batches2 = []; + for await (const batch of result2) { + batches2.push(batch); + } + const table2 = await new arrow.Table<{ cnt: arrow.Int }>(batches2); + expect(table2.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + } + { + const result3 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM 'test3.csv';`); + const batches3 = []; + for await (const batch of result3) { + batches3.push(batch); + } + const table3 = await new arrow.Table<{ cnt: arrow.Int }>(batches3); + expect(table3.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + } + + await db.dropFiles(); }); }); @@ -151,6 +236,12 @@ export function testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo }); await opfsRoot.removeEntry('test.csv').catch(() => { }); + await opfsRoot.removeEntry('test1.csv').catch(() => { + }); + await opfsRoot.removeEntry('test2.csv').catch(() => { + }); + await opfsRoot.removeEntry('test3.csv').catch(() => { + }); await opfsRoot.removeEntry('test.parquet').catch(() => { }); } From ecec0df24d6f6491d608ea84d46f3fd0c95cb752 Mon Sep 17 00:00:00 2001 From: Eiichi Arikawa <157803904+e1arikawa@users.noreply.github.com> Date: Sun, 6 Oct 2024 18:20:42 +0900 Subject: [PATCH 09/15] Update packages/duckdb-wasm/test/opfs.test.ts Co-authored-by: asrar --- packages/duckdb-wasm/test/opfs.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/duckdb-wasm/test/opfs.test.ts b/packages/duckdb-wasm/test/opfs.test.ts index ddf52eafb..f345ee448 100644 --- a/packages/duckdb-wasm/test/opfs.test.ts +++ b/packages/duckdb-wasm/test/opfs.test.ts @@ -45,7 +45,7 @@ export function 
testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo }); describe('Load Data in OPFS', () => { - it('Imporet Small Parquet file', async () => { + it('Import Small Parquet file', async () => { await conn.send(`CREATE TABLE stu AS SELECT * FROM "${baseDir}/uni/studenten.parquet"`); await conn.send(`CHECKPOINT;`); const result = await conn.send(`SELECT matrnr FROM stu;`); From 8964dc1afb21930d8edf47148c755fdd8f99ee10 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Mon, 7 Oct 2024 14:52:25 +0200 Subject: [PATCH 10/15] Improve README --- README.md | 47 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 469b7a2f1..1a3d7dc3e 100644 --- a/README.md +++ b/README.md @@ -32,23 +32,62 @@ DuckDB-Wasm brings DuckDB to every browser thanks to WebAssembly. Duckdb-Wasm speaks Arrow fluently, reads Parquet, CSV and JSON files backed by Filesystem APIs or HTTP requests and has been tested with Chrome, Firefox, Safari and Node.js. Learn more about DuckDB-Wasm from our [VLDB publication](https://www.vldb.org/pvldb/vol15/p3574-kohn.pdf) or the [recorded talk](https://www.youtube.com/watch?v=wm82b7PlM6s). -Try it out at [shell.duckdb.org](https://shell.duckdb.org) or [Observable](https://observablehq.com/@observablehq/duckdb), read the [API documentation](https://shell.duckdb.org/docs/modules/index.html), check out the [web-app examples](https://github.com/duckdb-wasm-examples), and chat with us on [Discord](https://discord.duckdb.org). +Try it out at [shell.duckdb.org](https://shell.duckdb.org) or in one of the [external third-party embeddings of DuckDB-Wasm](https://github.com/davidgasquez/awesome-duckdb?tab=readme-ov-file#web-clients), read the [API documentation](https://shell.duckdb.org/docs/modules/index.html), check out the [web-app examples](https://github.com/duckdb-wasm-examples), and chat with us on [Discord](https://discord.duckdb.org). ## DuckDB and DuckDB-Wasm DuckDB-Wasm is currently based on DuckDB v1.1.1. -## DuckDB-Wasm with DuckDB Extension +Relevant differences: +* The HTTP stack is different between the native and Wasm versions of DuckDB. Most relevant are: + * Requests are always upgraded to HTTPS + * Requests need the server to allow Cross-Origin access to a given resource + * The file system implementation (e.g. for S3) is different, and this might cause some behavioural differences +* Extension install is lazy, meaning that `INSTALL extension_name FROM 'https://repository.endpoint.org';` defers fetching the extension until the first `LOAD extension_name;` instruction (see the sketch after this list). `INSTALL x FROM community;` shorthands are also supported. +* DuckDB-Wasm builds are optimized for download speed. Core extensions like autocomplete, JSON, Parquet and ICU are usually bundled into DuckDB binaries, while in duckdb-wasm they are autoloaded (including being fetched) at runtime. In particular, autoloading of ICU does not work correctly in all cases; an explicit `LOAD icu;` might be needed to reproduce the same behaviour. +* DuckDB-Wasm is sandboxed and might not have the same level of support for out-of-core operations and access to the file system +* DuckDB-Wasm's default mode is single-threaded. Multithreading is at the moment still experimental.
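+
+For example, from JavaScript (a minimal sketch; it assumes an already-initialized `AsyncDuckDB` instance `db`):
+```ts
+// INSTALL only records the repository; the fetch happens on the first LOAD.
+const conn = await db.connect();
+await conn.query(`INSTALL h3 FROM community;`); // no extension download yet
+await conn.query(`LOAD h3;`); // the extension is fetched and loaded here
+await conn.close();
+```
+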
+Supported DuckDB features: +* DuckDB database files can be read from DuckDB-Wasm. +* Database files can be made available as simply as: `ATTACH 'https://blobs.duckdb.org/data/test.db'; FROM db.t;` (demo)[https://shell.duckdb.org/#queries=v0,ATTACH-'https%3A%2F%2Fblobs.duckdb.org%2Fdata%2Ftest.db'-as-db~,FROM-db.t~] +* Spatial support via `LOAD spatial` (spatial demo)[https://shell.duckdb.org/#queries=v0,%20%20-Spatial-extension-for-geospatial-support%0AINSTALL-spatial~%0ALOAD-spatial~,CREATE-TABLE-stations-AS%0A----FROM-'s3%3A%2F%2Fduckdb%20blobs%2Fstations.parquet'~,%20%20-What-are-the-top%203-closest-Intercity-stations%0A%20%20-using-aerial-distance%3F%0ASELECT%0A----s1.name_long-AS-station1%2C%0A----s2.name_long-AS-station2%2C%0A----ST_Distance(%0A--------ST_Point(s1.geo_lng%2C-s1.geo_lat)%2C%0A--------ST_Point(s2.geo_lng%2C-s2.geo_lat)%0A----)-*-111139-AS-distance%0AFROM-stations-s1%2C-stations-s2%0AWHERE-s1.type-LIKE-'%25Intercity%25'%0A--AND-s2.type-LIKE-'%25Intercity%25'%0A--AND-s1.id-%3C-s2.id%0AORDER-BY-distance-ASC%0ALIMIT-3~] +* A growing subset of extensions, either core, community or external, is supported in DuckDB-Wasm +* Multithreading works but it's still experimental and not enabled by default + +## DuckDB-Wasm and DuckDB Extensions + +DuckDB is extensible, which allows functionality to be delegated to [extensions](https://duckdb.org/docs/extensions/overview). + +Core extensions are available at https://extensions.duckdb.org, and community extensions are available at https://community-extensions.duckdb.org. ```sql --- Explicitly load extensions LOAD icu; - --- Or have them autoloaded when using relevant functions or settings -FROM read_json('https://some.url/file.json'); +DESCRIBE FROM read_parquet('https://blobs.duckdb.org/stations.parquet'); -- (this autoloads Parquet) + +--- Or register extensions +INSTALL h3 FROM community; +INSTALL sqlite_scanner FROM 'https://extensions.duckdb.org'; +INSTALL quack FROM 'https://community-extensions.duckdb.org'; + +--- And then load them +LOAD h3; +LOAD sqlite_scanner; +LOAD quack; ``` +```sql +FROM duckdb_extensions() WHERE loaded; +``` +This will show that h3, icu, parquet, quack and sqlite_scanner have been loaded. + +You can try the [Shell demo with loading of extensions](https://shell.duckdb.org/#queries=v0,%20%20%20-Excplicitly-load-extensions%0ALOAD-icu~%0A%0A%20%20%20-Or-have-them-autoloaded-when-using-relevant-functions-or-settings%0ADESCRIBE-FROM-read_parquet('https%3A%2F%2Fblobs.duckdb.org%2Fstations.parquet')~--%20%20-(this-autoloads-JSON)%0A%0A%20%20%20-Or-register-extensions%0AINSTALL-h3-FROM-community~%0AINSTALL-sqlite_scanner-FROM-'https%3A%2F%2Fextensions.duckdb.org'~%0AINSTALL-quack-FROM-'https%3A%2F%2Fcommunity%20extensions.duckdb.org'~%0A%0A%20%20%20-And-then-load-them%3A%0ALOAD-h3~%0ALOAD-sqlite_scanner~%0ALOAD-quack~,FROM-duckdb_extensions()-WHERE-loaded~) but this does require about 3.2 MB of compressed Wasm files to be transferred over the network (on first visit, caching might help). + +Extension sizes will vary depending on, among other things, the provided functionality and the toolchain used. +
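+As a JavaScript-side companion to the `ATTACH` bullet above, a minimal sketch (it assumes an already-initialized `AsyncDuckDB` instance `db`; the `remote` alias is illustrative):
+```ts
+// Attach a remote DuckDB database over HTTPS and query it.
+const conn = await db.connect();
+await conn.query(`ATTACH 'https://blobs.duckdb.org/data/test.db' AS remote;`);
+const table = await conn.query(`SELECT * FROM remote.t;`);
+console.log(table.toArray());
+await conn.close();
+```
+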
## Build from source ```shell From 48297e0a66619d7753ba4c128251aea880ceb6fb Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Mon, 7 Oct 2024 14:56:48 +0200 Subject: [PATCH 11/15] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1a3d7dc3e..6e232406b 100644 --- a/README.md +++ b/README.md @@ -50,8 +50,8 @@ Relevant differences: Supported DuckDB features: * DuckDB database files can be read from DuckDB-Wasm. -* Database files can be made available as simply as: `ATTACH 'https://blobs.duckdb.org/data/test.db'; FROM db.t;` (demo)[https://shell.duckdb.org/#queries=v0,ATTACH-'https%3A%2F%2Fblobs.duckdb.org%2Fdata%2Ftest.db'-as-db~,FROM-db.t~] -* Spatial support via `LOAD spatial` (spatial demo)[https://shell.duckdb.org/#queries=v0,%20%20-Spatial-extension-for-geospatial-support%0AINSTALL-spatial~%0ALOAD-spatial~,CREATE-TABLE-stations-AS%0A----FROM-'s3%3A%2F%2Fduckdb%20blobs%2Fstations.parquet'~,%20%20-What-are-the-top%203-closest-Intercity-stations%0A%20%20-using-aerial-distance%3F%0ASELECT%0A----s1.name_long-AS-station1%2C%0A----s2.name_long-AS-station2%2C%0A----ST_Distance(%0A--------ST_Point(s1.geo_lng%2C-s1.geo_lat)%2C%0A--------ST_Point(s2.geo_lng%2C-s2.geo_lat)%0A----)-*-111139-AS-distance%0AFROM-stations-s1%2C-stations-s2%0AWHERE-s1.type-LIKE-'%25Intercity%25'%0A--AND-s2.type-LIKE-'%25Intercity%25'%0A--AND-s1.id-%3C-s2.id%0AORDER-BY-distance-ASC%0ALIMIT-3~] +* Database files can be made available as simply as: `ATTACH 'https://blobs.duckdb.org/data/test.db'; FROM db.t;` [demo](https://shell.duckdb.org/#queries=v0,ATTACH-'https%3A%2F%2Fblobs.duckdb.org%2Fdata%2Ftest.db'-as-db~,FROM-db.t~) +* Spatial support via `LOAD spatial` [spatial demo](https://shell.duckdb.org/#queries=v0,%20%20-Spatial-extension-for-geospatial-support%0AINSTALL-spatial~%0ALOAD-spatial~,CREATE-TABLE-stations-AS%0A----FROM-'s3%3A%2F%2Fduckdb%20blobs%2Fstations.parquet'~,%20%20-What-are-the-top%203-closest-Intercity-stations%0A%20%20-using-aerial-distance%3F%0ASELECT%0A----s1.name_long-AS-station1%2C%0A----s2.name_long-AS-station2%2C%0A----ST_Distance(%0A--------ST_Point(s1.geo_lng%2C-s1.geo_lat)%2C%0A--------ST_Point(s2.geo_lng%2C-s2.geo_lat)%0A----)-*-111139-AS-distance%0AFROM-stations-s1%2C-stations-s2%0AWHERE-s1.type-LIKE-'%25Intercity%25'%0A--AND-s2.type-LIKE-'%25Intercity%25'%0A--AND-s1.id-%3C-s2.id%0AORDER-BY-distance-ASC%0ALIMIT-3~) * A growing subset of extensions, either core, community or external, is supported in DuckDB-Wasm * Multithreading works but it's still experimental and not enabled by default From 1fa2fae5a329c009bd5b5ee3727e66485f4c70b6 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Mon, 7 Oct 2024 17:09:19 +0200 Subject: [PATCH 12/15] Add npm_tags.yml --- .github/workflows/npm_tags.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/npm_tags.yml diff --git a/.github/workflows/npm_tags.yml b/.github/workflows/npm_tags.yml new file mode 100644 index 000000000..7183cf3a9 --- /dev/null +++ b/.github/workflows/npm_tags.yml @@ -0,0 +1,30 @@ +name: 'NPM tag' +on: + workflow_call: + inputs: + version: + type: string + tag: + type: string + workflow_dispatch: + inputs: + version: + type: string + tag: + type: string + +jobs: + change_tags: + name: Change tags + runs-on: ubuntu-latest + steps: + - name: Change tags + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_PUBLISH_TOKEN }} + TAG: ${{ inputs.tag }} + VERSION: ${{ inputs.version }} + if: env.NODE_AUTH_TOKEN != null + run: | + npm dist-tag ls @duckdb/duckdb-wasm@"${VERSION}" + npm dist-tag add @duckdb/duckdb-wasm@"${VERSION}" "${TAG}" + npm dist-tag ls @duckdb/duckdb-wasm@"${VERSION}" From b1a3449e4656018af19f510a2b3fb9461fb185f1 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Mon, 7 Oct 2024 17:34:48 +0200 Subject: [PATCH 13/15] Perform checkout --- .github/workflows/npm_tags.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/npm_tags.yml 
b/.github/workflows/npm_tags.yml index 7183cf3a9..c30cde60c 100644 --- a/.github/workflows/npm_tags.yml +++ b/.github/workflows/npm_tags.yml @@ -18,6 +18,16 @@ jobs: name: Change tags runs-on: ubuntu-latest steps: + - uses: actions/checkout@v4 + with: + submodules: 'recursive' + fetch-depth: 0 + + - uses: actions/setup-node@v4 + with: + node-version: '18.x' + registry-url: 'https://registry.npmjs.org' + - name: Change tags env: NODE_AUTH_TOKEN: ${{ secrets.NPM_PUBLISH_TOKEN }} From b78525d3d15b932eafb615997fafe8c19374f070 Mon Sep 17 00:00:00 2001 From: arkw Date: Wed, 9 Oct 2024 11:40:22 +0900 Subject: [PATCH 14/15] Fix registerFileHandle. --- .../duckdb-wasm/src/bindings/bindings_base.ts | 27 ++++++++++++++----- packages/duckdb-wasm/test/opfs.test.ts | 24 +++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/packages/duckdb-wasm/src/bindings/bindings_base.ts b/packages/duckdb-wasm/src/bindings/bindings_base.ts index af940318e..ac6a4567e 100644 --- a/packages/duckdb-wasm/src/bindings/bindings_base.ts +++ b/packages/duckdb-wasm/src/bindings/bindings_base.ts @@ -470,13 +470,26 @@ export abstract class DuckDBBindingsBase implements DuckDBBindings { protocol: DuckDBDataProtocol, directIO: boolean, ): Promise { - if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS && handle instanceof FileSystemFileHandle) { - // handle is an async handle, should convert to sync handle - const fileHandle: FileSystemFileHandle = handle as any; - try { - handle = (await fileHandle.createSyncAccessHandle()) as any; - } catch (e: any) { - throw new Error( e.message + ":" + name ); + if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS) { + if( handle instanceof FileSystemSyncAccessHandle ){ + //already a handle is sync handle. + } else if( handle instanceof FileSystemFileHandle ){ + // handle is an async handle, should convert to sync handle + const fileHandle: FileSystemFileHandle = handle as any; + try { + handle = (await fileHandle.createSyncAccessHandle()) as any; + } catch (e: any) { + throw new Error( e.message + ":" + name ); + } + } else if( name != null ){ + //handler is not an async handle, should get sync handle from the file name. + try { + const opfsRoot = await navigator.storage.getDirectory(); + const fileHandle = await opfsRoot.getFileHandle(name); + handle = (await fileHandle.createSyncAccessHandle()) as any; + } catch (e: any) { + throw new Error( e.message + ":" + name ); + } } } const [s, d, n] = callSRet(
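With this change, `registerFileHandle` accepts three shapes of `handle` for `BROWSER_FSACCESS`, exercised by the test below. A sketch of the corresponding call sites (it assumes an initialized `AsyncDuckDB` instance `db` backed by an OPFS-capable worker; the file name is illustrative):

```ts
import * as duckdb from '@duckdb/duckdb-wasm';

const opfsRoot = await navigator.storage.getDirectory();
const asyncHandle = await opfsRoot.getFileHandle('data.parquet');

// 1. An async FileSystemFileHandle: converted to a sync access handle internally.
await db.registerFileHandle('data.parquet', asyncHandle, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true);

// 2. A FileSystemSyncAccessHandle: used as-is
//    (createSyncAccessHandle() is only available inside dedicated workers).

// 3. An empty object: the worker resolves a sync access handle from the file name.
// await db.registerFileHandle('data.parquet', {}, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true);
```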
diff --git a/packages/duckdb-wasm/test/opfs.test.ts b/packages/duckdb-wasm/test/opfs.test.ts index f345ee448..2f0ec8c17 100644 --- a/packages/duckdb-wasm/test/opfs.test.ts +++ b/packages/duckdb-wasm/test/opfs.test.ts @@ -96,6 +96,30 @@ export function testOPFS(baseDir: string, bundle: () => duckdb.DuckDBBundle): vo expect(table.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); }); + it('Load Parquet file that is already in OPFS with an empty handle', async () => { + // 1. write the Parquet file to OPFS + const parquetBuffer = await fetch(`${baseDir}/tpch/0_01/parquet/lineitem.parquet`).then(res => + res.arrayBuffer(), + ); + const opfsRoot = await navigator.storage.getDirectory(); + const fileHandle = await opfsRoot.getFileHandle('test.parquet', {create: true}); + const writable = await fileHandle.createWritable(); + await writable.write(parquetBuffer); + await writable.close(); + // 2. the handle is an empty object; the worker obtains a sync access handle from the file name. + await db.registerFileHandle('test.parquet', {}, duckdb.DuckDBDataProtocol.BROWSER_FSACCESS, true); + await conn.send(`CREATE TABLE lineitem1 AS SELECT * FROM read_parquet('test.parquet')`); + await conn.send(`CHECKPOINT;`); + + const result1 = await conn.send(`SELECT count(*)::INTEGER as cnt FROM lineitem1;`); + const batches1 = []; + for await (const batch of result1) { + batches1.push(batch); + } + const table1 = await new arrow.Table<{ cnt: arrow.Int }>(batches1); + expect(table1.getChildAt(0)?.get(0)).toBeGreaterThan(60_000); + }); + it('Load Parquet file that are already', async () => { const parquetBuffer = await fetch(`${baseDir}/tpch/0_01/parquet/lineitem.parquet`).then(res => res.arrayBuffer(), From 847010709a3c6a651dc034bfed6fbfb3f6c2628f Mon Sep 17 00:00:00 2001 From: arkw Date: Wed, 9 Oct 2024 12:27:47 +0900 Subject: [PATCH 15/15] update comment --- packages/duckdb-wasm/src/bindings/bindings_base.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/duckdb-wasm/src/bindings/bindings_base.ts b/packages/duckdb-wasm/src/bindings/bindings_base.ts index ac6a4567e..83c91ec45 100644 --- a/packages/duckdb-wasm/src/bindings/bindings_base.ts +++ b/packages/duckdb-wasm/src/bindings/bindings_base.ts @@ -472,7 +472,7 @@ export abstract class DuckDBBindingsBase implements DuckDBBindings { ): Promise { if (protocol === DuckDBDataProtocol.BROWSER_FSACCESS) { if( handle instanceof FileSystemSyncAccessHandle ){ - //already a handle is sync handle. + // the handle is already a sync access handle. } else if( handle instanceof FileSystemFileHandle ){ // handle is an async handle, should convert to sync handle const fileHandle: FileSystemFileHandle = handle as any; @@ -482,7 +482,7 @@ throw new Error( e.message + ":" + name ); } } else if( name != null ){ - //handler is not an async handle, should get sync handle from the file name. + // otherwise, obtain a sync access handle from the file name. try { const opfsRoot = await navigator.storage.getDirectory(); const fileHandle = await opfsRoot.getFileHandle(name);