diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 89549467..d04e8689 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -35,24 +35,33 @@ Usage: make Targets: help: ## Show the help. +## +## ================ development ================ debug: ## Build vsag with debug options. -release: ## Build vsag with release options. -distribution: ## Build vsag with distribution options. -libcxx: ## Build vsag using libc++. -fmt: ## Format codes. test: ## Build and run unit tests. asan: ## Build with AddressSanitizer option. -test_asan_parallel: asan ## Run unit tests parallel with AddressSanitizer option. test_asan: asan ## Run unit tests with AddressSanitizer option. tsan: ## Build with ThreadSanitizer option. test_tsan: tsan ## Run unit tests with ThreadSanitizer option. -test_cov: cov ## Build and run unit tests with code coverage enabled. clean: ## Clear build/ directory. +## +## ================ integration ================ +fmt: ## Format codes. +cov: ## Build unit tests with code coverage enabled. +test_parallel: debug ## Run all tests parallel (used in CI). +test_asan_parallel: asan ## Run unit tests parallel with AddressSanitizer option. +test_tsan_parallel: tsan ## Run unit tests parallel with ThreadSanitizer option. +## +## ================ distribution ================ +release: ## Build vsag with release options. +distribution: ## Build vsag with distribution options. +libcxx: ## Build vsag using libc++. +pyvsag: ## Build pyvsag wheel. +clean-release: ## Clear build-release/ directory. install: ## Build and install the release version of vsag. ``` ## Project Structure -- `benchs/`: benchmark script in Python - `cmake/`: cmake util functions - `docker/`: the dockerfile to build develop and ci image - `docs/`: the design documents @@ -60,7 +69,9 @@ install: ## Build and install the release version of vsag. 
- `extern/`: third-party libraries - `include/`: export header files - `mockimpl/`: the mock implementation that can be used in interface test +- `python/`: the pyvsag package and setup tools - `python_bindings/`: the python bindings - `scripts/`: useful scripts - `src/`: the source codes and unit tests - `tests/`: the functional tests +- `tools/`: the tools diff --git a/Makefile b/Makefile index 80474508..7d360ef9 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,11 @@ CMAKE_INSTALL_PREFIX ?= "/usr/local/" COMPILE_JOBS ?= 6 DEBUG_BUILD_DIR ?= "./build/" RELEASE_BUILD_DIR ?= "./build-release/" -VSAG_CMAKE_ARGS = -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} -DNUM_BUILDING_JOBS=${COMPILE_JOBS} -DENABLE_TESTS=1 -DENABLE_PYBINDS=1 -G ${CMAKE_GENERATOR} -S. + +VSAG_CMAKE_ARGS := -DCMAKE_EXPORT_COMPILE_COMMANDS=1 +VSAG_CMAKE_ARGS := ${VSAG_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} -DNUM_BUILDING_JOBS=${COMPILE_JOBS} +VSAG_CMAKE_ARGS := ${VSAG_CMAKE_ARGS} -DENABLE_TESTS=1 -DENABLE_PYBINDS=1 -G ${CMAKE_GENERATOR} -S. + UT_FILTER = "" ifdef CASE UT_FILTER = $(CASE) @@ -22,22 +26,13 @@ help: ## Show the help. @echo "Targets:" @fgrep "##" Makefile | fgrep -v fgrep -# ================= development part ================= +## +## ================ development ================ .PHONY: debug debug: ## Build vsag with debug options. - cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_CCACHE=ON -DENABLE_ASAN=OFF + cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=OFF -DENABLE_CCACHE=ON cmake --build ${DEBUG_BUILD_DIR} --parallel ${COMPILE_JOBS} -.PHONY: fmt -fmt: ## Format codes. 
- find include/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - find src/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - find python_bindings/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - find examples/cpp/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - find mockimpl/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - find tests/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - find tools/ -iname "*.h" -o -iname "*.cpp" | xargs clang-format -i - .PHONY: test test: ## Build and run unit tests. cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_CCACHE=ON @@ -46,21 +41,11 @@ test: ## Build and run unit tests. ./build/tests/functests -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} -.PHONY: test_parallel -test_parallel: debug - @./scripts/test_parallel_bg.sh - ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} - .PHONY: asan asan: ## Build with AddressSanitizer option. - cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_CCACHE=ON + cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_TSAN=OFF -DENABLE_CCACHE=ON cmake --build ${DEBUG_BUILD_DIR} --parallel ${COMPILE_JOBS} -.PHONY: test_asan_parallel -test_asan_parallel: asan ## Run unit tests parallel with AddressSanitizer option. - @./scripts/test_parallel_bg.sh - ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} - .PHONY: test_asan test_asan: asan ## Run unit tests with AddressSanitizer option. ./build/tests/unittests -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} @@ -69,7 +54,7 @@ test_asan: asan ## Run unit tests with AddressSanitizer option. .PHONY: tsan tsan: ## Build with ThreadSanitizer option. 
- cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_TSAN=ON -DENABLE_CCACHE=ON + cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=OFF -DENABLE_TSAN=ON -DENABLE_CCACHE=ON cmake --build ${DEBUG_BUILD_DIR} --parallel ${COMPILE_JOBS} .PHONY: test_tsan @@ -78,24 +63,38 @@ test_tsan: tsan ## Run unit tests with ThreadSanitizer option. ./build/tests/functests -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} -.PHONY: cov # Build unit tests with code coverage enabled. -cov: +.PHONY: clean +clean: ## Clear build/ directory. + rm -rf $(if $(strip $(subst ",,${DEBUG_BUILD_DIR})),${DEBUG_BUILD_DIR},$(error DEBUG_BUILD_DIR is empty - refusing to run rm -rf on /*))/* + +## +## ================ integration ================ +.PHONY: fmt +fmt: ## Format codes. + @./scripts/format-cpp.sh + +.PHONY: cov +cov: ## Build unit tests with code coverage enabled. cmake ${VSAG_CMAKE_ARGS} -B${DEBUG_BUILD_DIR} -DCMAKE_BUILD_TYPE=Debug -DENABLE_COVERAGE=ON -DENABLE_CCACHE=ON -DENABLE_ASAN=OFF cmake --build ${DEBUG_BUILD_DIR} --parallel ${COMPILE_JOBS} -.PHONY: test_cov -test_cov: cov ## Build and run unit tests with code coverage enabled. - ./build/tests/unittests -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} - ./build/tests/functests -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} +.PHONY: test_parallel +test_parallel: debug ## Run all tests parallel (used in CI). + @./scripts/test_parallel_bg.sh ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} - bash scripts/collect_cpp_coverage.sh - genhtml --output-directory coverage/coverage/html coverage/coverage.info --ignore-errors inconsistent,inconsistent -.PHONY: clean -clean: ## Clear build/ directory. - rm -rf ${DEBUG_BUILD_DIR}/* +.PHONY: test_asan_parallel +test_asan_parallel: asan ## Run unit tests parallel with AddressSanitizer option.
+ @./scripts/test_parallel_bg.sh + ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} + +.PHONY: test_tsan_parallel +test_tsan_parallel: tsan ## Run unit tests parallel with ThreadSanitizer option. + @./scripts/test_parallel_bg.sh + ./build/mockimpl/tests_mockimpl -d yes ${UT_FILTER} --allow-running-no-tests ${UT_SHARD} -# ================= distribution part ================= +## +## ================ distribution ================ .PHONY: release release: ## Build vsag with release options. cmake ${VSAG_CMAKE_ARGS} -B${RELEASE_BUILD_DIR} -DCMAKE_BUILD_TYPE=Release @@ -111,15 +110,18 @@ libcxx: ## Build vsag using libc++. cmake ${VSAG_CMAKE_ARGS} -B${RELEASE_BUILD_DIR} -DCMAKE_BUILD_TYPE=Release -DENABLE_LIBCXX=on cmake --build ${RELEASE_BUILD_DIR} --parallel ${COMPILE_JOBS} -.PHONY: install -install: ## Build and install the release version of vsag. - cmake --install ${RELEASE_BUILD_DIR}/ - - PARAM1 := "-DNUM_BUILDING_JOBS=${COMPILE_JOBS} -DENABLE_PYBINDS=1 -S. -B${RELEASE_BUILD_DIR} -DCMAKE_BUILD_TYPE=Release" PARAM2 := "--build ${RELEASE_BUILD_DIR} --parallel ${COMPILE_JOBS}" PARAM3 := "${RELEASE_BUILD_DIR}" -.PHONY: pyvsag ## Build pyvsag wheel -pyvsag: +.PHONY: pyvsag +pyvsag: ## Build pyvsag wheel. bash ./scripts/build_pyvsag_multiple_version.sh $(PARAM1) $(PARAM2) $(PARAM3) + +.PHONY: clean-release +clean-release: ## Clear build-release/ directory. + rm -rf $(if $(strip $(subst ",,${RELEASE_BUILD_DIR})),${RELEASE_BUILD_DIR},$(error RELEASE_BUILD_DIR is empty - refusing to run rm -rf on /*))/* + +.PHONY: install +install: ## Build and install the release version of vsag.
+ cmake --install ${RELEASE_BUILD_DIR}/ diff --git a/benchs/README.md b/benchs/README.md deleted file mode 100644 index a8f87d96..00000000 --- a/benchs/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# vsag benchmark - -## Usage -```shell -$ python3 run.py -``` diff --git a/benchs/benchmark/__init__.py b/benchs/benchmark/__init__.py deleted file mode 100644 index af271731..00000000 --- a/benchs/benchmark/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2024-present the vsag project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/benchs/benchmark/bench_index.py b/benchs/benchmark/bench_index.py deleted file mode 100644 index 807f3cea..00000000 --- a/benchs/benchmark/bench_index.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2024-present the vsag project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import h5py -import logging -import numpy as np -import json -import time -from datetime import datetime -import pyvsag -from .dataset import download_and_open_dataset - - -from itertools import product - - -def build_index(index_name, datas, ids, index_parameters): - index = pyvsag.Index(index_name, json.dumps(index_parameters)) - index.build(datas, ids, datas.shape[0], datas.shape[1]) - return index - -def run(config): - - logging.basicConfig(encoding='utf-8', - level=logging.INFO, - format='%(asctime)s [%(levelname)s] %(message)s', - handlers=[logging.FileHandler('/tmp/bench-index.log'), - logging.StreamHandler()]) - logging.info(f'{__file__} running at {datetime.now()}') - logging.info(f'config: {config}') - - for dataset in config["index_test"]: - dataset_name = dataset["dataset_name"] - logging.info(f"dataset: {dataset_name}") - with download_and_open_dataset(dataset_name, logging) as file: - base = np.array(file["train"]) - data_len = base.shape[0] - ids = np.arange(data_len) - dim = base.shape[1] - - for index in dataset["index"]: - build_time = time.time() - index_name = index["index_name"] - instance = build_index(index_name, base, ids, { - "dtype": index["params"]["dtype"], - "metric_type": index["params"]["metric_type"], - "dim": dim, - index_name: index["params"]["build"] - }) - build_time = time.time() - build_time - - query_config = dataset["query"] - query_size = query_config["query_size"] - query = np.array(file["test"][:query_size]) - neighbors = np.array(file["neighbors"][:query_size]) - - if "knn" in query_config: - k = query_config["knn"]["k"] - correct = 0 - time_list = [] - for gt, item in zip(neighbors, query): - search_time = time.time() - labels, distances = instance.knn_search(item, k, json.dumps({ - index_name: index["params"]["search"] - })) - time_list.append(time.time() - search_time) - correct += len(set(labels) & set(gt[:k])) - - logging.info(f"datasize:{data_len}") - logging.info(f"building time: {build_time * 1000 / 
data_len}, searching time: {(np.sum(time_list)) * 1000 / query_size}") - logging.info(f"recall: {correct/(query_size * k)}") diff --git a/benchs/benchmark/dataset.py b/benchs/benchmark/dataset.py deleted file mode 100644 index 563ff677..00000000 --- a/benchs/benchmark/dataset.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2024-present the vsag project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import oss2 -import os -import h5py -import numpy as np -import pandas as pd -from sklearn.neighbors import NearestNeighbors -import datetime -from oss2.credentials import EnvironmentVariableCredentialsProvider -import ast -from tqdm import tqdm - -OSS_ACCESS_KEY_ID = os.environ.get('OSS_ACCESS_KEY_ID') -OSS_ACCESS_KEY_SECRET = os.environ.get('OSS_ACCESS_KEY_SECRET') -OSS_ENDPOINT = os.environ.get('OSS_ENDPOINT') -OSS_BUCKET = os.environ.get('OSS_BUCKET') -OSS_SOURCE_DIR = os.environ.get('OSS_SOURCE_DIR') - -_auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider()) -_bucket = oss2.Bucket(_auth, OSS_ENDPOINT, OSS_BUCKET) -target_dir = '/tmp/dataset' -def download_and_open_dataset(dataset_name, logging=None): - if None in [OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT, OSS_BUCKET, OSS_SOURCE_DIR]: - if logging is not None: - logging.error("missing oss env") - exit(-1) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - source_file = os.path.join(OSS_SOURCE_DIR, dataset_name) - target_file = os.path.join(target_dir, dataset_name) - - if not 
os.path.exists(target_file): - _bucket.get_object_to_file(source_file, target_file) - return h5py.File(target_file, 'r') - - -def read_dataset(dataset_name, logging=None): - with download_and_open_dataset(dataset_name, logging) as file: - train = np.array(file["train"]) - test = np.array(file["test"]) - neighbors = np.array(file["neighbors"]) - distances = np.array(file["distances"]) - return train, test, neighbors, distances - - -def create_dataset(ids, base, query, topk, dataset_name, distance): - if distance == "angular": - metric = "cosine" - elif distance == "euclidean": - metric = "euclidean" - print("data size:", len(ids), len(base)) - print("query size:", len(query)) - nbrs = NearestNeighbors(n_neighbors=topk, metric=metric, algorithm='brute').fit(base) - batch_size = 50 - n_query = len(query) - distances = [] - indices = [] - - for i in tqdm(range(0, n_query, batch_size)): - end = min(i + batch_size, n_query) - batch_query = query[i:end] - D_batch, I_batch = nbrs.kneighbors(batch_query) - distances.append(D_batch) - indices.append(I_batch) - - D = np.vstack(distances) - I = np.vstack(indices) - - with h5py.File(os.path.join(target_dir, dataset_name), "w") as f: - f.create_dataset("ids", data=ids) - f.create_dataset("train", data=base) - f.create_dataset("test", data=query) - f.create_dataset("neighbors", data=I) - f.create_dataset("distances", data=D) - f.attrs["type"] = "dense" - f.attrs["distance"] = distance - f.attrs["dimension"] = len(base[0]) - f.attrs["point_type"] = "float" - - -def csv_to_data(filename, id_column, base_column, dim): - parser = vector_parse_wrapper(dim) - df = pd.read_csv(filename=10000) - df[base_column] = df[base_column].apply(parser) - df_cleaned = df.dropna(subset=[base_column]) - base = np.array(df_cleaned[base_column].tolist()) - ids = np.array(df_cleaned[id_column].tolist()) - ids_dtype = h5py.string_dtype(encoding='utf-8', length=max(len(s) for s in ids)) - ids_array = np.array(ids, dtype=ids_dtype) - return ids_array, base 
- - - -def csv_to_dataset(base_filename, base_size, query_size, id_column, base_column, dim, distance, dataset_name): - data_ids, data = csv_to_data(base_filename, id_column, base_column, dim) - unique_ids, index = np.unique(data_ids, return_index=True) - unique_data = data[index] - base_ids, base = unique_ids[:base_size], unique_data[:base_size] - qeury_ids, query = unique_ids[-query_size:], unique_data[-query_size:] - create_dataset(base_ids, base, query, 100, dataset_name, distance) - - -# You can customize the parsing function for the vector column and then run the main function. -fail_count = 0 -def vector_parse_wrapper(dim): - def split_vector(x): - global fail_count - try: - data = np.array([float(i) for i in x.split(",")], dtype=np.float32) - if data.shape[0] != dim: - print(fail_count, x, dim) - fail_count += 1 - return None - return data - except: - print(fail_count, x) - fail_count += 1 - return None - return split_vector - - -""" -python script.py data/base.csv 1000000 --id_column=id --vector_column=base_vector --vector_size=2048 --output_file=test.hdf5 -""" - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Convert CSV to dataset") - parser.add_argument("csv_file", help="Path to the CSV file") - parser.add_argument("size", type=int, help="Size of dataset") - parser.add_argument("--query_size", type=int, default=10000, help="Index number") - parser.add_argument("--id_column", help="Name of the ID column") - parser.add_argument("--vector_column", help="Name of the vector column") - parser.add_argument("--vector_dim", type=int, help="Dim of the vector") - parser.add_argument("--index_type", choices=["angular", "euclidean"], default="angular", help="Type of index") - parser.add_argument("--output_file", help="Path to the output file") - - args = parser.parse_args() - csv_to_dataset(args.csv_file, args.size, args.column, args.query_size, args.id_column, args.vector_column, args.vector_dim, args.index_type, args.output_file) - - - 
- - - diff --git a/benchs/run.py b/benchs/run.py deleted file mode 100644 index 694118a7..00000000 --- a/benchs/run.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024-present the vsag project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import yaml -from benchmark import bench_index - - -if __name__ == '__main__': - with open('benchs/run.yaml', 'r') as f: - config = yaml.safe_load(f) - - bench_index.run(config) \ No newline at end of file diff --git a/benchs/run.yaml b/benchs/run.yaml deleted file mode 100644 index 11a78fb1..00000000 --- a/benchs/run.yaml +++ /dev/null @@ -1,16 +0,0 @@ -index_test: - - dataset_name: random-100k-128-euclidean.hdf5 - index: - - index_name: hnsw - params: - dtype: float32 - metric_type: l2 - build: - max_degree: 32 - ef_construction: 500 - search: - ef_search: 300 - query: - query_size: 1000 - knn: - k: 1
diff --git a/scripts/format-cpp.sh b/scripts/format-cpp.sh
new file mode 100755
index 00000000..fb4e1416
--- /dev/null
+++ b/scripts/format-cpp.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Format all C/C++ headers and sources under the listed directories in place with clang-format.
+# Paths are relative, so run it from the repository root (as `make fmt` does).
+set -eu
+
+# `-exec ... {} +` copes with paths containing whitespace and does not start clang-format when nothing matches.
+for dir in include/ src/ python_bindings/ examples/cpp/ mockimpl/ tests/ tools/; do
+    find "${dir}" \( -iname "*.h" -o -iname "*.cpp" \) -exec clang-format -i {} +
+done