diff --git a/.gitmodules b/.gitmodules
index 3522bbc56..c70a42b28 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -14,3 +14,6 @@
 [submodule "deps/bandit"]
 	path = deps/bandit
 	url = https://github.com/joakimkarlsson/bandit.git
+[submodule "deps/meta-cmake"]
+	path = deps/meta-cmake
+	url = https://github.com/meta-toolkit/meta-cmake.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32fd0dcd3..13ae6fa26 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,43 @@
+# [v2.2.0][2.2.0]
+## New features
+- Parallelized versions of PageRank and Personalized PageRank have been
+  added. A demo is available in `wiki-page-rank`; see the website for
+  more information on obtaining the required data.
+- Add a disk-based streaming minimal perfect hash function library. A
+  sub-component of this is a small memory-mapped succinct data structure
+  library for answering rank/select queries on bit vectors.
+- Much of our CMake magic has been moved into a separate project included
+  as a submodule: https://github.com/meta-toolkit/meta-cmake, which can
+  now be used in other projects to simplify initial build system
+  configuration.
+
+## Bug fixes
+- Fix parameter settings in language model rankers not being range checked
+  (issue #134).
+- Fix incorrect incoming edge insertion in `directed_graph::add_edge()`.
+- Fix `find_first_of` and `find_last_of` in `util::string_view`.
+
+## Enhancements
+- `forward_index` now knows how to tokenize a document down to a
+  `feature_vector`, provided it was generated with a non-LIBSVM analyzer.
+- Allow loading of an existing index where its corpus is no longer
+  available.
+- Data is no longer shuffled in `batch_train`. Shuffling the data
+  causes horrible access patterns in the postings file, so the data
+  should instead be shuffled before indexing.
+- `util::array_view`s can now be constructed as empty.
+- `util::multiway_merge` has been made more generic. You can now specify
+  both the comparison function and merging criteria as parameters, which
+  default to `operator<` and `operator==`, respectively.
+- Simple utility classes `io::mifstream` and `io::mofstream` have been
+  added for places where a moveable `ifstream` or `ofstream` is desired
+  as a workaround for older standard libraries lacking these move
+  constructors.
+- The number of indexing threads can be controlled via the configuration
+  key `indexer-num-threads` (which defaults to the number of threads on
+  the system), and the number of threads allowed to concurrently write to
+  disk can be controlled via `indexer-max-writers` (which defaults to 8).
+
 # [v2.1.0][2.1.0]
 ## New features
 - Add the [GloVe algorithm](http://www-nlp.stanford.edu/pubs/glove.pdf) for
@@ -341,7 +381,8 @@
 # [v1.0][1.0]
 - Initial release.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ad7cd1a4..e9d799c3a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,12 +13,11 @@
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
-include(CheckCXXCompilerFlag)
-include(CheckCXXSourceCompiles)
-include(CheckCXXSourceRuns)
 include(CMakePushCheckState)
 include(ExternalProject)
-include(cmake/FindOrBuildICU.cmake)
+include(deps/meta-cmake/FindOrBuildICU.cmake)
+include(deps/meta-cmake/SetClangOptions.cmake)
+include(deps/meta-cmake/CompilerKludges.cmake)
 
 find_package(Threads REQUIRED)
 find_package(ZLIB REQUIRED)
@@ -37,7 +36,8 @@
 if (NOT CMAKE_BUILD_TYPE AND CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/deps/findicu)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/deps/findicu)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/deps/meta-cmake/)
 
 # We require Unicode 8 for the unit tests, which was added in ICU 56.1
 FindOrBuildICU(
@@ -47,274 +47,66 @@
 )
 
 add_library(meta-definitions INTERFACE)
-target_include_directories(meta-definitions INTERFACE ${PROJECT_SOURCE_DIR}/include)
+target_include_directories(meta-definitions INTERFACE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 if(UNIX OR MINGW)
   target_compile_options(meta-definitions INTERFACE -Wall -Wextra -pedantic)
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    # Enable -Wconversion on clang, since it's not *too* noisy there.
-    #
-    # As of GCC 5.2.0, there are still too many spurious warnings to bother
-    # enabling this there.
-    target_compile_options(meta-definitions INTERFACE "-Wconversion")
-
-    if(CMAKE_GENERATOR STREQUAL "Ninja")
-      target_compile_options(meta-definitions INTERFACE "-fcolor-diagnostics")
-    endif()
-
-    if(ENABLE_LIBCXX)
-      message("-- Locating libc++...")
-      find_library(LIBCXX_LIBRARY NAMES c++ cxx)
-      if(LIBCXX_LIBRARY)
-        message("-- Located libc++: ${LIBCXX_LIBRARY}")
-        set(LIBCXX_OPTIONS "-stdlib=libc++")
-        get_filename_component(LIBCXX_LIB_PATH ${LIBCXX_LIBRARY}
-                               DIRECTORY)
-        find_path(LIBCXX_PREFIX c++/v1/algorithm
-                  PATHS ${LIBCXX_LIB_PATH}/../include
-                        ${CMAKE_SYSTEM_PREFIX_PATH})
-        set(LIBCXX_INCLUDE_DIR ${LIBCXX_PREFIX}/c++/v1/)
-        message("-- Located libc++ include path: ${LIBCXX_INCLUDE_DIR}")
-
-        message("-- Locating libc++'s abi...")
-        find_library(LIBCXXABI_LIBRARY NAMES c++abi)
-        find_library(LIBCXXRT_LIBRARY NAMES cxxrt)
-        if(LIBCXXABI_LIBRARY)
-          message("-- Found libc++abi: ${LIBCXXABI_LIBRARY}")
-          set(CXXABI_LIBRARY ${LIBCXXABI_LIBRARY})
-        elseif(LIBCXXRT_LIBRARY)
-          message("-- Found libcxxrt: ${LIBCXXRT_LIBRARY}")
-          set(CXXABI_LIBRARY ${LIBCXXRT_LIBRARY})
-        else()
-          message("-- No abi library found. 
-[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.1.0...develop +[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.2.0...develop +[2.2.0]: https://github.com/meta-toolkit/meta/compare/v2.1.0...v2.2.0 [2.1.0]: https://github.com/meta-toolkit/meta/compare/v2.0.1...v2.1.0 [2.0.1]: https://github.com/meta-toolkit/meta/compare/v2.0.0...v2.0.1 [2.0.0]: https://github.com/meta-toolkit/meta/compare/v1.3.8...v2.0.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ad7cd1a4..e9d799c3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,12 +13,11 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -include(CheckCXXCompilerFlag) -include(CheckCXXSourceCompiles) -include(CheckCXXSourceRuns) include(CMakePushCheckState) include(ExternalProject) -include(cmake/FindOrBuildICU.cmake) +include(deps/meta-cmake/FindOrBuildICU.cmake) +include(deps/meta-cmake/SetClangOptions.cmake) +include(deps/meta-cmake/CompilerKludges.cmake) find_package(Threads REQUIRED) find_package(ZLIB REQUIRED) @@ -37,7 +36,8 @@ if (NOT CMAKE_BUILD_TYPE AND CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) set(CMAKE_BUILD_TYPE "Release") endif() -set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/deps/findicu) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/deps/findicu) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/deps/meta-cmake/) # We require Unicode 8 for the unit tests, which was added in ICU 56.1 FindOrBuildICU( @@ -47,274 +47,66 @@ FindOrBuildICU( ) add_library(meta-definitions INTERFACE) -target_include_directories(meta-definitions INTERFACE ${PROJECT_SOURCE_DIR}/include) +target_include_directories(meta-definitions INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/include) if(UNIX OR MINGW) target_compile_options(meta-definitions INTERFACE -Wall -Wextra -pedantic) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - # Enable -Wconversion on clang, since it's not *too* noisy there. - # - # As of GCC 5.2.0, there are still too many spurious warnings to bother - # enabling this there. - target_compile_options(meta-definitions INTERFACE "-Wconversion") - - if(CMAKE_GENERATOR STREQUAL "Ninja") - target_compile_options(meta-definitions INTERFACE "-fcolor-diagnostics") - endif() - - if(ENABLE_LIBCXX) - message("-- Locating libc++...") - find_library(LIBCXX_LIBRARY NAMES c++ cxx) - if(LIBCXX_LIBRARY) - message("-- Located libc++: ${LIBCXX_LIBRARY}") - set(LIBCXX_OPTIONS "-stdlib=libc++") - get_filename_component(LIBCXX_LIB_PATH ${LIBCXX_LIBRARY} - DIRECTORY) - find_path(LIBCXX_PREFIX c++/v1/algorithm - PATHS ${LIBCXX_LIB_PATH}/../include - ${CMAKE_SYSTEM_PREFIX_PATH}) - set(LIBCXX_INCLUDE_DIR ${LIBCXX_PREFIX}/c++/v1/) - message("-- Located libc++ include path: ${LIBCXX_INCLUDE_DIR}") - - message("-- Locating libc++'s abi...") - find_library(LIBCXXABI_LIBRARY NAMES c++abi) - find_library(LIBCXXRT_LIBRARY NAMES cxxrt) - if(LIBCXXABI_LIBRARY) - message("-- Found libc++abi: ${LIBCXXABI_LIBRARY}") - set(CXXABI_LIBRARY ${LIBCXXABI_LIBRARY}) - elseif(LIBCXXRT_LIBRARY) - message("-- Found libcxxrt: ${LIBCXXRT_LIBRARY}") - set(CXXABI_LIBRARY ${LIBCXXRT_LIBRARY}) - else() - message("-- No abi library found. 
" - "Attempting to continue without one...") - endif() - else() - message("-- Could not find libc++, will not use it.") - endif() - endif() - - find_library(LIBDL_LIBRARY NAMES dl ldl) - if(LIBDL_LIBRARY) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBDL_LIBRARY}") - endif() - - if(LIBCXX_OPTIONS) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBCXX_OPTIONS}") - endif() - - if(CXXABI_LIBRARY) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${LIBCXX_OPTIONS} ${CXXABI_LIBRARY} -L${LIBCXX_LIB_PATH}") - endif() - if(LIBCXX_INCLUDE_DIR) - set(CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES} ${LIBCXX_INCLUDE_DIR}") - endif() + SetClangOptions() endif() endif() -if(STDOPT) - target_compile_options(meta-definitions INTERFACE ${STDOPT}) -endif() - target_include_directories(meta-definitions SYSTEM INTERFACE ${ZLIB_INCLUDE_DIRS}) -if(LIBDL_LIBRARY) +if (LIBDL_LIBRARY) target_link_libraries(meta-definitions INTERFACE ${LIBDL_LIBRARY}) endif() -if(CXXABI_LIBRARY) +if (CXXABI_LIBRARY) target_link_libraries(meta-definitions INTERFACE ${CXXABI_LIBRARY}) endif() -if(LIBCXX_LIBRARY) +if (LIBCXX_FOUND) target_include_directories(meta-definitions SYSTEM INTERFACE ${LIBCXX_INCLUDE_DIR}) target_compile_options(meta-definitions INTERFACE ${LIBCXX_OPTIONS}) target_link_libraries(meta-definitions INTERFACE -L${LIBCXX_LIB_PATH}) target_link_libraries(meta-definitions INTERFACE ${LIBCXX_LIBRARY}) endif() -if(ENABLE_PROFILING) +if (ENABLE_PROFILING) find_library(GPERFTOOLS_PROFILER NAMES profiler REQUIRED) message("-- Found profiler: ${GPERFTOOLS_PROFILER}") target_link_libraries(meta-definitions INTERFACE ${GPERFTOOLS_PROFILER}) endif() find_library(JEMALLOC_LIB NAMES jemalloc) -if(JEMALLOC_LIB AND ENABLE_JEMALLOC) +if (JEMALLOC_LIB AND ENABLE_JEMALLOC) message("-- Using jemalloc: ${JEMALLOC_LIB}") target_link_libraries(meta-definitions INTERFACE ${JEMALLOC_LIB}) else() message("-- Using regular malloc; consider installing jemalloc") endif() -if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") target_compile_definitions(meta-definitions INTERFACE -D_DARWIN_USE_64_BIT_INODE=1) target_compile_definitions(meta-definitions INTERFACE -DMETA_IS_DARWIN=1) endif() -check_cxx_source_compiles(" -#include -#include -int main () { - auto sp = std::make_shared(1); - auto sp2 = std::atomic_load(&sp); - return 0; -}" META_HAS_STD_SHARED_PTR_ATOMICS) - -if(META_HAS_STD_SHARED_PTR_ATOMICS) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_STD_SHARED_PTR_ATOMICS=1) -endif() - -check_cxx_source_compiles(" -#include -int main() { - std::ofstream ofs{\"path\"}; - std::ofstream ofs2{std::move(ofs)}; - return 0; -}" META_HAS_STREAM_MOVE) - -if(META_HAS_STREAM_MOVE) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_STREAM_MOVE=1) -endif() - -check_cxx_source_compiles(" -#include -int main() { - auto i = std::make_unique(1); - return 0; -}" META_HAS_STD_MAKE_UNIQUE) - -if(META_HAS_STD_MAKE_UNIQUE) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_STD_MAKE_UNIQUE) -endif() - -check_cxx_source_compiles(" -#include -int main() { - std::experimental::optional x; - return 0; -}" META_HAS_EXPERIMENTAL_OPTIONAL) - -if (META_HAS_EXPERIMENTAL_OPTIONAL) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_EXPERIMENTAL_OPTIONAL) -endif() - -check_cxx_source_compiles(" -#include -int main() { - const std::experimental::string_view sv = \"hello world\"; - // test that string_view has to_string() const method - // Xcode 6.4 
appears to have shipped a string_view without it - auto str = sv.to_string(); - return 0; -}" META_HAS_EXPERIMENTAL_STRING_VIEW) - -if (META_HAS_EXPERIMENTAL_STRING_VIEW) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_EXPERIMENTAL_STRING_VIEW) -endif() - -check_cxx_source_compiles(" -#include - -int main() -{ - std::experimental::filesystem::path p1 = \"/usr\"; - return 0; -}" META_HAS_EXPERIMENTAL_FILESYSTEM) - -if (META_HAS_EXPERIMENTAL_FILESYSTEM) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_EXPERIMENTAL_FILESYSTEM) -endif() +# set a bunch of preprocessor variables to work around various compiler and +# standard library bugs +CompilerKludges() if(NOT META_HAS_EXPERIMENTAL_FILESYSTEM) target_include_directories(meta-definitions SYSTEM INTERFACE ${PROJECT_SOURCE_DIR}/deps/meta-stlsoft/include) endif() -set(META_FOUND_ALIGNED_ALLOC_IMPL 0) - -check_cxx_source_compiles(" -#include - -int main() -{ - ::aligned_alloc(64, 128); - return 0; -}" META_HAS_ALIGNED_ALLOC) - -if (META_HAS_ALIGNED_ALLOC) - set(META_FOUND_ALIGNED_ALLOC_IMPL 1) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_ALIGNED_ALLOC) -endif() - -if (NOT META_FOUND_ALIGNED_ALLOC_IMPL AND UNIX) - check_cxx_source_compiles(" - #include - - int main() - { - void* ptr; - ::posix_memalign(&ptr, 64, 128); - return 0; - }" META_HAS_POSIX_MEMALIGN) - - if (META_HAS_POSIX_MEMALIGN) - set(META_FOUND_ALIGNED_ALLOC_IMPL 1) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_POSIX_MEMALIGN) - endif() -endif() - -if (NOT META_FOUND_ALIGNED_ALLOC_IMPL AND WIN32) - check_cxx_source_compiles(" - #include - - int main() - { - ::_aligned_malloc(128, 64); - return 0; - }" META_HAS_ALIGNED_MALLOC) - - if (META_HAS_ALIGNED_MALLOC) - set(META_FOUND_ALIGNED_ALLOC_IMPL 1) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_ALIGNED_MALLOC) - endif() -endif() - -if (NOT META_FOUND_ALIGNED_ALLOC_IMPL) - message(FATAL_ERROR "Failed to find a suitable aligned allocation routine") -endif() - -check_cxx_source_compiles(" -int main() -{ - long x = 1; - if (__builtin_expect(x == 1, 0)) - return 1; - return 0; -}" META_HAS_BUILTIN_EXPECT) - -if (META_HAS_BUILTIN_EXPECT) - target_compile_definitions(meta-definitions INTERFACE - -DMETA_HAS_BUILTIN_EXPECT) -endif() +target_link_libraries(meta-definitions INTERFACE compiler-kludges) cmake_pop_check_state() -if(BIICODE) - include(contrib/biicode/CMakeLists.txt) - return() -ENDIF() - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) add_custom_target(tidy diff --git a/README.md b/README.md index 91fb8827c..cd8f7738f 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ about MeTA! - [Ubuntu](#ubuntu-build-guide) - [Arch Linux](#arch-linux-build-guide) - [Fedora](#fedora-build-guide) + - [CentOS](#centos-build-guide) - [EWS/EngrIT](#ewsengrit-build-guide) (this is UIUC-specific) - [Windows](#windows-build-guide) - [Generic Setup Notes](#generic-setup-notes) @@ -108,13 +109,15 @@ check what version you are on, run the following command: cat /etc/issue ``` -If it reads "Ubuntu 12.04 LTS" or something of that nature, see the -[Ubuntu 12.04 LTS Build Guide](#ubuntu-1204-lts-build-guide). If it reads -"Ubuntu 14.04 LTS" (or 14.10), see the -[Ubuntu 14.04 LTS Build Guide](#ubuntu-1404-lts-build-guide). If your -version is less than 12.04 LTS, your operating system is not supported -(even by your vendor) and you should upgrade to at least 12.04 LTS (or -14.04 LTS, if possible). 
+Based on what you see, you should proceed with one of the following guides:
+
+- [Ubuntu 12.04 LTS Build Guide](#ubuntu-1204-lts-build-guide)
+- [Ubuntu 14.04 LTS Build Guide](#ubuntu-1404-lts-build-guide)
+- [Ubuntu 15.10 Build Guide](#ubuntu-1510-build-guide)
+
+If your version is less than 12.04 LTS, your operating system is not
+supported (even by your vendor!) and you should upgrade to at least 12.04
+LTS (or 14.04 LTS, if possible).
 
 ### Ubuntu 12.04 LTS Build Guide
 Building on Ubuntu 12.04 LTS requires more work than its more up-to-date
@@ -216,7 +219,7 @@
 sudo add-apt-repository ppa:george-edison55/cmake-3.x
 sudo apt-get update
 
 # install dependencies
-sudo apt-get install cmake libicu-dev git libjemalloc-dev zlib1g-dev
+sudo apt-get install g++ cmake libicu-dev git libjemalloc-dev zlib1g-dev
 ```
 
 Once the dependencies are all installed, you should double check your
@@ -275,6 +278,46 @@
 You can now test the system by running the following command:
 
 If everything passes, congratulations! MeTA seems to be working on your
 system.
 
+## Ubuntu 15.10 Build Guide
+Ubuntu's non-LTS desktop offering in 15.10 has enough modern software in
+its repositories to build MeTA without much trouble. To install the
+dependencies, run the following commands.
+
+```bash
+apt update
+apt install g++ git cmake make libjemalloc-dev zlib1g-dev
+```
+
+Once the dependencies are all installed, you should be ready to build. Run
+the following commands to get started:
+
+```bash
+# clone the project
+git clone https://github.com/meta-toolkit/meta.git
+cd meta/
+
+# set up submodules
+git submodule update --init --recursive
+
+# set up a build directory
+mkdir build
+cd build
+cp ../config.toml .
+
+# configure and build the project
+cmake ../ -DCMAKE_BUILD_TYPE=Release
+make
+```
+
+You can now test the system by running the following command:
+
+```bash
+./unit-test --reporter=spec
+```
+
+If everything passes, congratulations! MeTA seems to be working on your
+system.
+
 ## Arch Linux Build Guide
 Arch Linux consistently has the most up to date packages due to its rolling
 release setup, so it's often the easiest platform to get set up on.
@@ -384,6 +427,77 @@
 ./unit-test --reporter=spec
 ```
 
+## CentOS Build Guide
+MeTA can be built on CentOS 7 and above. CentOS 7 comes with a recent
+enough compiler (GCC 4.8.5), but too old a version of CMake. We'll thus
+install the compiler and related libraries from the package manager and
+install a more recent `cmake` ourselves.
+
+```bash
+# install build dependencies (this will probably take a while)
+sudo yum install gcc gcc-c++ git make wget zlib-devel epel-release
+sudo yum install jemalloc-devel
+
+wget http://www.cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.sh
+sudo sh cmake-3.2.0-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir
+```
+
+You should be able to run the following commands and see the following
+output:
+
+```bash
+g++ --version
+```
+
+should print
+
+    g++ (GCC) 4.8.5 20150623 (Red Hat 4.8.5-4)
+    Copyright (C) 2015 Free Software Foundation, Inc.
+    This is free software; see the source for copying conditions. There is NO
+    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+and
+
+```bash
+/usr/local/bin/cmake --version
+```
+
+should print
+
+    cmake version 3.2.0
+
+    CMake suite maintained and supported by Kitware (kitware.com/cmake).
+
+Once the dependencies are all installed, you should be ready to build.
Run +the following commands to get started: + +```bash +# clone the project +git clone https://github.com/meta-toolkit/meta.git +cd meta/ + +# set up submodules +git submodule update --init --recursive + +# set up a build directory +mkdir build +cd build +cp ../config.toml . + +# configure and build the project +/usr/local/bin/cmake ../ -DCMAKE_BUILD_TYPE=Release +make +``` + +You can now test the system by running the following command: + +```bash +./unit-test --reporter=spec +``` + +If everything passes, congratulations! MeTA seems to be working on your +system. + ## EWS/EngrIT Build Guide If you are on a machine managed by Engineering IT at UIUC, you should follow this guide. These systems have software that is much too old for @@ -396,7 +510,7 @@ back in to the system**), run the following commands: ```bash module load gcc -module load cmake/3.4.0 +module load cmake/3.5.0 ``` Once you have done this, double check your versions by running the @@ -421,7 +535,7 @@ cmake --version should output - cmake version 3.4.0 + cmake version 3.5.0 CMake suite maintained and supported by Kitware (kitware.com/cmake). @@ -442,7 +556,7 @@ cd build cp ../config.toml . # configure and build the project -CXX="/software/gcc-4.8.2/bin/g++" cmake ../ -DCMAKE_BUILD_TYPE=Release +CXX=`which g++` CC=`which gcc` cmake ../ -DICU_ROOT=/class/cs225/builds/icu make ``` diff --git a/cmake/FindOrBuildICU.cmake b/cmake/FindOrBuildICU.cmake deleted file mode 100644 index 4308b4bf3..000000000 --- a/cmake/FindOrBuildICU.cmake +++ /dev/null @@ -1,118 +0,0 @@ -include(CMakeParseArguments) -include(ExternalProject) - -# Searches the system using find_package for an ICU version that is greater -# or equal to the minimum version specified via the VERSION argument to the -# function. If find_package does not find a suitable version, ICU is added -# as an external project to be downloaded form the specified URL and -# validated with the specified URL_HASH. Currently, building ICU is only -# supported on Linux and OS X; on Windows precompiled binaries must be -# used, either via -DICU_ROOT=C:/path/to/icu or with MSYS2's icu package. -# -# The function creates an interface library "icu" that should be linked -# against by code that wishes to use ICU headers or ICU library functions. -# The library target added should ensure that transitive dependencies are -# satisfied. 
-# -# This function requires at least CMake version 3.2.0 for the -# BUILD_BYPRODUCTS argument to ExternalProject_Add -function(FindOrBuildICU) - cmake_parse_arguments(FindOrBuildICU "" "VERSION;URL;URL_HASH" "" ${ARGN}) - - if (NOT FindOrBuildICU_VERSION) - message(FATAL_ERROR "You must provide a minimum version") - endif() - - if (NOT FindOrBuildICU_URL) - message(FATAL_ERROR "You must provide a download url to the ICU sources") - endif() - - message("-- Searching for ICU ${FindOrBuildICU_VERSION}") - - find_package(ICU ${FindOrBuildICU_VERSION} COMPONENTS data i18n uc) - - if (NOT ICU_FOUND OR NOT ICU_VERSION VERSION_EQUAL "${FindOrBuildICU_VERSION}") - if (NOT ICU_FOUND) - message("-- ICU not found; attempting to build it...") - else() - message("-- ICU version found is ${ICU_VERSION}, expected ${FindOrBuildICU_VERSION}; attempting to build ICU from scratch...") - endif() - if (WIN32) - # not going to attempt to build ICU if we're on Windows for now - # probably could, but it's more trouble than it's worth I think - message("-- ICU building not supported on Windows.") - if (MINGW) - message(FATAL_ERROR " -- If on MSYS2; please install the icu package via pacman") - else() - message(FATAL_ERROR " -- Please download the latest ICU binaries from http://site.icu-project.org/download") - endif() - elseif(UNIX) - - # determine platform for runConfigureICU - if (APPLE) - set(ICU_PLATFORM "MacOSX") - else() - set(ICU_PLATFORM "Linux") - endif() - - # determine a reasonable number of threads to build ICU with - include(ProcessorCount) - ProcessorCount(CORES) - if (NOT CORES EQUAL 0) - # limit the number of cores to 4 on travis - if (CORES GREATER 4) - if ($ENV{TRAVIS}) - set(CORES 4) - endif() - endif() - set(ICU_MAKE_EXTRA_FLAGS "-j${CORES}") - endif() - - set(ICU_EP_PREFIX ${PROJECT_SOURCE_DIR}/deps/icu) - - ExternalProject_Add(ExternalICU - PREFIX ${ICU_EP_PREFIX} - URL ${FindOrBuildICU_URL} - URL_HASH ${FindOrBuildICU_URL_HASH} - CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${ICU_EP_PREFIX}/src/ExternalICU/source/runConfigureICU ${ICU_PLATFORM} - --disable-shared --enable-static --disable-dyload --disable-extras - --disable-tests --disable-samples - --prefix= - BUILD_COMMAND make ${ICU_MAKE_EXTRA_FLAGS} - INSTALL_COMMAND make install - BUILD_BYPRODUCTS ${ICU_EP_PREFIX}/lib/libicudata.a;${ICU_EP_PREFIX}/lib/libicui18n.a;${ICU_EP_PREFIX}/lib/libicuuc.a - ) - set(ICU_INCLUDE_DIRS ${ICU_EP_PREFIX}/include) - - add_library(icudata IMPORTED STATIC) - set_target_properties(icudata PROPERTIES IMPORTED_LOCATION - ${ICU_EP_PREFIX}/lib/libicudata.a) - add_dependencies(icudata ExternalICU) - - add_library(icui18n IMPORTED STATIC) - set_target_properties(icui18n PROPERTIES IMPORTED_LOCATION - ${ICU_EP_PREFIX}/lib/libicui18n.a) - add_dependencies(icui18n ExternalICU) - - add_library(icuuc IMPORTED STATIC) - set_target_properties(icuuc PROPERTIES IMPORTED_LOCATION - ${ICU_EP_PREFIX}/lib/libicuuc.a) - add_dependencies(icuuc ExternalICU) - - set(ICU_LIBRARIES icui18n icuuc icudata) - set(ICU_IS_EXTERNAL TRUE) - else() - message(FATAL_ERROR "-- ICU building not supported for this platform") - endif() - endif() - - message("-- ICU include dirs: ${ICU_INCLUDE_DIRS}") - message("-- ICU libraries: ${ICU_LIBRARIES}") - - add_library(icu INTERFACE) - if (ICU_IS_EXTERNAL) - file(MAKE_DIRECTORY ${ICU_INCLUDE_DIRS}) - endif() - target_link_libraries(icu INTERFACE ${ICU_LIBRARIES}) - target_include_directories(icu SYSTEM INTERFACE ${ICU_INCLUDE_DIRS}) -endfunction() diff --git a/config.toml 
b/config.toml
index 52fb184ae..82b6d1b57 100644
--- a/config.toml
+++ b/config.toml
@@ -8,17 +8,18 @@
 end-exceptions = "../data/sentence-boundaries/sentence-end-exceptions.txt"
 
 query-judgements = "../data/ceeaus-qrels.txt"
 query-path = "../queries.txt" # create this file
-dataset = "20newsgroups"
+dataset = "ceeaus"
 corpus = "line.toml" # located inside dataset folder
-forward-index = "20news-fwd"
-inverted-index = "20news-inv"
+forward-index = "ceeaus-fwd"
+inverted-index = "ceeaus-inv"
 indexer-ram-budget = 1024 # **estimated** RAM budget for indexing in MB
                           # always set this lower than your physical RAM!
+# indexer-num-threads = 8 # default value is system thread concurrency
 
 [[analyzers]]
 method = "ngram-word"
 ngram = 1
-filter = "default-chain"
+filter = "default-unigram-chain"
 
 [ranker]
 method = "bm25"
@@ -87,3 +88,11 @@
 test-sections = [23, 23]
 
 [language-model]
 arpa-file = "../data/english-sentences.arpa"
 binary-file-prefix = "english-sentences-"
+
+[embeddings]
+prefix = "word-embeddings"
+filter = [{type = "icu-tokenizer"}, {type = "lowercase"}]
+vector-size = 50
+[embeddings.vocab]
+min-count = 10
+max-size = 500000
diff --git a/deps/cpptoml b/deps/cpptoml
index 3eb45d727..589273870 160000
--- a/deps/cpptoml
+++ b/deps/cpptoml
@@ -1 +1 @@
-Subproject commit 3eb45d7278167f1032b8d0a4aa1add2965d4272f
+Subproject commit 58927387022e149eef4f443599ee7ac35565777b
diff --git a/deps/meta-cmake b/deps/meta-cmake
new file mode 160000
index 000000000..fa6cdb474
--- /dev/null
+++ b/deps/meta-cmake
@@ -0,0 +1 @@
+Subproject commit fa6cdb474edeae25d0ffafc44458583e39412d39
diff --git a/include/meta/classify/batch_training.h b/include/meta/classify/batch_training.h
index d215bd647..a2537bc5e 100644
--- a/include/meta/classify/batch_training.h
+++ b/include/meta/classify/batch_training.h
@@ -43,21 +43,18 @@ void batch_train(Index& idx, Classifier& cls,
 {
     using diff_type = decltype(training_set.begin())::difference_type;
 
-    auto docs = training_set;
-    std::mt19937 gen(std::random_device{}());
-    std::shuffle(docs.begin(), docs.end(), gen);
-
-    // integer-math ceil(docs.size() / batch_size)
-    auto num_batches = (docs.size() + batch_size - 1) / batch_size;
+    // integer-math ceil(training_set.size() / batch_size)
+    auto num_batches = (training_set.size() + batch_size - 1) / batch_size;
     for (uint64_t i = 0; i < num_batches; ++i)
     {
         LOG(progress) << "Training batch " << i + 1 << "/" << num_batches
                       << '\n' << ENDLG;
-        auto end = std::min((i + 1) * batch_size, docs.size());
+        auto end
+            = std::min((i + 1) * batch_size, training_set.size());
         classify::multiclass_dataset batch{
-            idx, docs.begin() + static_cast<diff_type>(i * batch_size),
-            docs.begin() + static_cast<diff_type>(end)};
+            idx, training_set.begin() + static_cast<diff_type>(i * batch_size),
+            training_set.begin() + static_cast<diff_type>(end)};
         cls.train(batch);
     }
     LOG(progress) << '\n' << ENDLG;
diff --git a/include/meta/corpus/metadata_parser.h b/include/meta/corpus/metadata_parser.h
index d76a34dcc..2fd4ed3e9 100644
--- a/include/meta/corpus/metadata_parser.h
+++ b/include/meta/corpus/metadata_parser.h
@@ -14,6 +14,7 @@
 
 #include "meta/corpus/metadata.h"
 #include "meta/util/optional.h"
+#include "meta/io/moveable_stream.h"
 
 namespace meta
 {
@@ -45,8 +46,7 @@ class metadata_parser
 
   private:
     /// the parser used to extract metadata
-    /// unique_ptr because GCC<5.0 can't move streams
-    std::unique_ptr<std::ifstream> infile_;
+    io::mifstream infile_;
 
     /// the schema for the metadata being extracted
     metadata::schema schema_;
diff --git a/include/meta/embeddings/coocur_iterator.h b/include/meta/embeddings/coocur_iterator.h
index 37135829f..811bc4fb6 100644
--- a/include/meta/embeddings/coocur_iterator.h
+++ b/include/meta/embeddings/coocur_iterator.h
@@ -14,7 +14,7 @@
 
 #include "meta/embeddings/coocur_record.h"
 #include "meta/io/filesystem.h"
-#include "meta/util/shim.h"
+#include "meta/io/moveable_stream.h"
 
 namespace meta
 {
@@ -32,7 +32,7 @@ class coocur_iterator
 
     coocur_iterator(const std::string& filename)
         : path_{filename},
-          input_{make_unique<std::ifstream>(filename, std::ios::binary)},
+          input_{filename, std::ios::binary},
           total_bytes_{filesystem::file_size(filename)},
           bytes_read_{0}
     {
@@ -44,10 +44,13 @@
     coocur_iterator& operator++()
     {
-        if (input_->peek() == EOF)
+        if (input_.stream().peek() == EOF)
+        {
+            input_.stream().close();
             return *this;
+        }
 
-        bytes_read_ += record_.read(*input_);
+        bytes_read_ += record_.read(input_.stream());
         return *this;
     }
@@ -63,9 +66,9 @@
     bool operator==(const coocur_iterator& other) const
     {
-        if (!other.input_)
+        if (!other.input_.stream().is_open())
         {
-            return !input_ || !static_cast<bool>(*input_);
+            return !input_.stream().is_open();
         }
         else
         {
@@ -86,7 +89,7 @@
   private:
     std::string path_;
-    std::unique_ptr<std::ifstream> input_;
+    io::mifstream input_;
     coocur_record record_;
     uint64_t total_bytes_;
     uint64_t bytes_read_;
diff --git a/include/meta/graph/algorithms/centrality.h b/include/meta/graph/algorithms/centrality.h
index d0f9c8057..5dfcac70f 100644
--- a/include/meta/graph/algorithms/centrality.h
+++ b/include/meta/graph/algorithms/centrality.h
@@ -11,10 +11,11 @@
 #define META_GRAPH_ALGORITHMS_CENTRALITY_H_
 
 #include
+#include
+
 #include "meta/graph/undirected_graph.h"
 #include "meta/graph/directed_graph.h"
-
-#include
+#include "meta/stats/multinomial.h"
 
 namespace meta
 {
@@ -33,6 +34,23 @@
 using centrality_result = std::vector<std::pair<node_id, double>>;
 
 template <class Graph>
 centrality_result degree_centrality(const Graph& g);
 
+/**
+ * Find the PageRank centrality of each node in the graph via power iteration.
+ * @see http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf
+ * @param g
+ * @param damp The dampening (smoothing) factor
+ * @param max_iters The maximum number of iterations to run the power iteration
+ * @param jump_dist A personalization vector that indicates the
+ * jumping probability for nodes in the graph. By default, this is empty
+ * which signifies a uniform distribution.
+ * @return a collection of (id, centrality) pairs
+ */
+template <class DirectedGraph>
+centrality_result
+page_rank_centrality(const DirectedGraph& g, double damp = 0.85,
+                     const stats::multinomial<node_id>& jump_dist = {},
+                     uint64_t max_iters = 100);
+
 /**
  * Find the betweenness centrality of each node in the graph using the algorithm
  * from Ulrik Brandes, 2001.
 * This function is parallelized as it takes some time
diff --git a/include/meta/graph/algorithms/centrality.tcc b/include/meta/graph/algorithms/centrality.tcc
index 0089f65d9..35982412c 100644
--- a/include/meta/graph/algorithms/centrality.tcc
+++ b/include/meta/graph/algorithms/centrality.tcc
@@ -27,8 +27,8 @@ centrality_result degree_centrality(const Graph& g)
 
     using pair_t = std::pair<node_id, double>;
     std::sort(res.begin(), res.end(), [&](const pair_t& a, const pair_t& b)
               {
-        return a.second > b.second;
-    });
+                  return a.second > b.second;
+              });
     return res;
 }
 
@@ -47,20 +47,80 @@
     size_t done = 0;
     parallel::parallel_for(g.begin(), g.end(), [&](decltype(*g.begin()) n)
                           {
-        internal::betweenness_step(g, cb, n.id, calc_mut);
-        std::lock_guard<std::mutex> lock{print_mut};
-        prog(++done);
-    });
+                              internal::betweenness_step(g, cb, n.id,
+                                                         calc_mut);
+                              std::lock_guard<std::mutex> lock{print_mut};
+                              prog(++done);
+                          });
     prog.end();
 
     using pair_t = std::pair<node_id, double>;
     std::sort(cb.begin(), cb.end(), [&](const pair_t& a, const pair_t& b)
               {
-        return a.second > b.second;
-    });
+                  return a.second > b.second;
+              });
 
     return cb;
 }
 
+template <class DirectedGraph>
+centrality_result
+page_rank_centrality(const DirectedGraph& g, double damp /* = 0.85 */,
+                     const stats::multinomial<node_id>& jump_dist /* = {} */,
+                     uint64_t max_iters /* = 100 */)
+{
+    if (damp < 0.0 || damp > 1.0)
+        throw graph_exception{"PageRank dampening factor must be on [0, 1]"};
+
+    std::vector<double> v(g.size(), 1.0 / g.size());
+    std::vector<double> w(g.size(), 0.0);
+
+    parallel::thread_pool pool;
+
+    printing::progress prog{" > Calculating PageRank centrality ", max_iters};
+    for (uint64_t iter = 0; iter < max_iters; ++iter)
+    {
+        prog(iter);
+        w.assign(w.size(), 0.0);
+
+        using node = typename DirectedGraph::node_type;
+        parallel::parallel_for(
+            g.begin(), g.end(), pool, [&](const node& curr)
+            {
+                double sum = 0.0;
+                for (const auto& n : g.incoming(curr.id))
+                {
+                    auto adj_size = g.adjacent(n).size();
+                    if (adj_size != 0)
+                        sum += v[n] / adj_size;
+                }
+                if (jump_dist.counts() == 0.0)
+                {
+                    w[curr.id] = (1.0 - damp) / g.size() + damp * sum;
+                }
+                else
+                {
+                    w[curr.id] = (1.0 - damp) * jump_dist.probability(curr.id)
+                                 + damp * sum;
+                }
+            });
+        v.swap(w);
+    }
+    prog.end();
+
+    centrality_result evc;
+    evc.reserve(g.size());
+    node_id id{0};
+    for (auto& n : v)
+        evc.emplace_back(id++, n);
+
+    using pair_t = std::pair<node_id, double>;
+    std::sort(evc.begin(), evc.end(), [&](const pair_t& a, const pair_t& b)
+              {
+                  return a.second > b.second;
+              });
+    return evc;
+}
+
 template <class Graph>
 centrality_result eigenvector_centrality(const Graph& g,
                                          uint64_t max_iters /* = 100 */)
@@ -90,8 +150,8 @@
     using pair_t = std::pair<node_id, double>;
     std::sort(evc.begin(), evc.end(), [&](const pair_t& a, const pair_t& b)
               {
-        return a.second > b.second;
-    });
+                  return a.second > b.second;
+              });
     return evc;
 }
diff --git a/include/meta/graph/directed_graph.tcc b/include/meta/graph/directed_graph.tcc
index 90dc26d21..ada5ee8b0 100644
--- a/include/meta/graph/directed_graph.tcc
+++ b/include/meta/graph/directed_graph.tcc
@@ -57,7 +57,7 @@ void directed_graph<Node, Edge>::add_edge(Edge edge, node_id source,
         ++num_edges_;
         list.emplace_back(dest, edge); // add outgoing edge from source to dest
-        incoming_[source].push_back(dest); // add incoming edge to source
+        incoming_[dest].push_back(source); // add incoming edge to dest
     }
 }
 }
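The incoming-edge fix above is what makes the new PageRank implementation correct, since `page_rank_centrality` iterates over `g.incoming(curr.id)`. A minimal sketch of how the two pieces fit together (not part of this patch; the aggregate `algorithms.h` header and the `default_node` label constructor are assumptions about the existing graph API):

```cpp
#include "meta/graph/algorithms/algorithms.h" // assumed aggregate header
#include "meta/graph/directed_graph.h"

using namespace meta;

int main()
{
    graph::directed_graph<> g;
    auto a = g.insert(graph::default_node{"a"});
    auto b = g.insert(graph::default_node{"b"});
    auto c = g.insert(graph::default_node{"c"});

    // add_edge now records source as an incoming edge of dest
    g.add_edge(a, b);
    g.add_edge(b, c);
    g.add_edge(c, a);

    // damp = 0.85, uniform jump distribution, up to 100 iterations
    auto ranks = graph::algorithms::page_rank_centrality(g, 0.85);
    // ranks holds (node_id, score) pairs sorted by descending score
    return ranks.empty() ? 1 : 0;
}
```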
diff --git a/include/meta/graph/graph.h b/include/meta/graph/graph.h
index f638f6787..8dc1bcef4 100644
--- a/include/meta/graph/graph.h
+++ b/include/meta/graph/graph.h
@@ -22,6 +22,8 @@ class graph
 {
   public:
     using adjacency_list = std::vector<std::pair<node_id, Edge>>;
+    using node_type = Node;
+    using edge_type = Edge;
 
     virtual ~graph() = default;
diff --git a/include/meta/hashing/perfect_hash.h b/include/meta/hashing/perfect_hash.h
new file mode 100644
index 000000000..76940a284
--- /dev/null
+++ b/include/meta/hashing/perfect_hash.h
@@ -0,0 +1,68 @@
+/**
+ * @file perfect_hash.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#include "meta/succinct/compressed_vector.h"
+#include "meta/succinct/sarray.h"
+
+namespace meta
+{
+namespace hashing
+{
+
+/**
+ * Query class for the minimal perfect hash functions created by
+ * perfect_hash_builder. Always returns a number \f$\in [0, N)\f$, even for
+ * keys that were not used to construct the minimal perfect hash. It is up
+ * to the user to perform any collision resolution for unknown keys; this
+ * class simply represents the hash function itself, not a map/table.
+ */
+template <class K>
+class perfect_hash
+{
+  public:
+    perfect_hash(const std::string& prefix)
+        : seeds_{prefix + "/seeds"},
+          sarray_{prefix + "/sarray"},
+          empty_rank_{prefix + "/sarray", sarray_}
+    {
+        std::ifstream metadata{prefix + "/hash-metadata.bin", std::ios::binary};
+        io::packed::read(metadata, bucket_seed_);
+        io::packed::read(metadata, num_bins_);
+        // nothing
+    }
+
+    uint64_t operator()(const K& key) const
+    {
+        using meta::hashing::hash_append;
+        farm_hash_seeded hasher{bucket_seed_};
+        hash_append(hasher, key);
+        auto hash = static_cast<std::size_t>(hasher);
+        auto bucket_id = hash % seeds_.size();
+        auto seed = seeds_[bucket_id];
+        auto pos = farm::hash_len_16(hash, seed) % num_bins_;
+        // the final position is the hash function's position shifted to
+        // the left by the number of empty bins that came before it.
+        return pos - empty_rank_.rank(pos);
+    }
+
+  private:
+    /// The seed to use for the bucket hash function
+    uint64_t bucket_seed_;
+    /// The number of bins for the perfect hash function
+    uint64_t num_bins_;
+    /// The seeds to use for each bucket
+    succinct::compressed_vector seeds_;
+    /// The sarray that backs the rank data structure
+    succinct::sarray sarray_;
+    /// The ranking data structure that counts the number of empty slots
+    succinct::sarray_rank empty_rank_;
+};
+}
+}
diff --git a/include/meta/hashing/perfect_hash_builder.h b/include/meta/hashing/perfect_hash_builder.h
new file mode 100644
index 000000000..55ce51113
--- /dev/null
+++ b/include/meta/hashing/perfect_hash_builder.h
@@ -0,0 +1,135 @@
+/**
+ * @file perfect_hash_builder.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_HASHING_PERFECT_HASH_BUILDER_H_
+#define META_HASHING_PERFECT_HASH_BUILDER_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace meta
+{
+namespace hashing
+{
+
+/**
+ * Constructs a minimal perfect hash using a streaming variant of the
+ * hash, displace, and compress algorithm. Each key is first hashed to a
+ * bucket (where the number of buckets is < the number of total keys). The
+ * buckets are then sorted by size, and then each bucket's keys are hashed
+ * into the range [0, N) by iterating through possible seed values for the
+ * hash function until there are no collisions with keys that have already
+ * been hashed.
+ * For performance reasons, N is bigger than the number of
+ * keys (defaulting to N = keys / 0.99), so the algorithm results in a
+ * perfect, but not minimal, hash function. The perfect hash function is
+ * then reduced to a minimal perfect hash function via the use of a
+ * succinct rank structure at the cost of some additional space.
+ *
+ * This is accomplished for data that is larger than available RAM with the
+ * following steps:
+ *
+ * 1. Hash keys to buckets using a fixed size buffer, sorting by bucket id
+ *    and spilling chunks to disk when full
+ * 2. Merge chunks and records by the bucket id
+ * 3. Sort each bucket by size, writing chunks to disk when buffer is full
+ * 4. Merge chunks (no record merging)
+ * 5. Find appropriate seed values for each bucket
+ * 6. Compress the seed values
+ * 7. Compress the perfect hash function found into a minimal perfect hash
+ *
+ * Empirically, our algorithm approaches somewhere in the neighborhood of
+ * ~2.7 bits per key with our default settings.
+ *
+ * @see http://cmph.sourceforge.net/papers/esa09.pdf
+ */
+template <class K>
+class perfect_hash_builder
+{
+  public:
+    struct options
+    {
+        std::string prefix;
+        uint64_t max_ram = 1024 * 1024 * 1024; // 1 GB
+        uint64_t num_keys;
+        uint64_t num_per_bucket = 4;
+        float load_factor = 0.99f;
+
+        options() = default;
+        options(const options&) = default;
+        options(options&&) = default;
+        options& operator=(const options&) = default;
+        options& operator=(options&&) = default;
+    };
+
+    /**
+     * @param opts The options for the builder
+     */
+    perfect_hash_builder(options opts);
+
+    /**
+     * Records observed keys. Should be called *once* per unique key.
+     * @param key The key to record
+     */
+    void operator()(const K& key);
+
+    /**
+     * Writes the perfect_hash to disk.
+     */
+    void write();
+
+  private:
+    void flush_chunk();
+
+    void merge_chunks_by_bucket_id();
+
+    template <class Iterator>
+    void flush_bucket_chunk(Iterator begin, Iterator end);
+
+    void merge_chunks_by_bucket_size();
+    void sort_buckets_by_size();
+    void construct_perfect_hash();
+
+    /// The options used during building
+    options opts_;
+
+    /// The seed for the bucket hash function
+    uint64_t bucket_seed_;
+
+    /// The number of buckets to use during building
+    uint64_t num_buckets_;
+
+    /// The current number of chunks that have been flushed to disk
+    uint64_t num_chunks_;
+
+    struct hashed_key
+    {
+        std::size_t idx;
+        K key;
+
+        hashed_key(std::size_t index, const K& akey)
+            : idx{index}, key{akey}
+        {
+            // nothing
+        }
+
+        bool operator<(const hashed_key& other) const
+        {
+            return idx < other.idx;
+        }
+    };
+
+    /// The buffer used for performing the bucket partitioning
+    std::vector<hashed_key> buffer_;
+};
+}
+}
+
+#include "perfect_hash_builder.tcc"
+#endif
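Since the two headers above define the complete builder/query API, a rough end-to-end sketch may help; this is not part of the patch, and the prefix directory and key count are illustrative only:

```cpp
#include <string>
#include "meta/hashing/perfect_hash.h"
#include "meta/hashing/perfect_hash_builder.h"

using namespace meta;

int main()
{
    using mph_builder = hashing::perfect_hash_builder<std::string>;

    mph_builder::options opts;
    opts.prefix = "keys-mph"; // directory for chunks and the final hash
    opts.num_keys = 3;        // total number of unique keys to be recorded

    mph_builder builder{opts};
    for (const std::string& key : {"red", "green", "blue"})
        builder(key); // record each unique key exactly once
    builder.write();  // merge chunks, find seeds, minimize, write to disk

    // query side: every recorded key maps to a distinct value in [0, 3)
    hashing::perfect_hash<std::string> hash{"keys-mph"};
    return static_cast<int>(hash("green"));
}
```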
diff --git a/include/meta/hashing/perfect_hash_builder.tcc b/include/meta/hashing/perfect_hash_builder.tcc
new file mode 100644
index 000000000..af01c10ae
--- /dev/null
+++ b/include/meta/hashing/perfect_hash_builder.tcc
@@ -0,0 +1,502 @@
+/**
+ * @file perfect_hash_builder.tcc
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <random>
+#include "meta/hashing/hash.h"
+#include "meta/hashing/probe_set.h"
+#include "meta/io/filesystem.h"
+#include "meta/io/moveable_stream.h"
+#include "meta/io/packed.h"
+#include "meta/logging/logger.h"
+#include "meta/succinct/compressed_vector.h"
+#include "meta/succinct/sarray.h"
+#include "meta/util/array_view.h"
+#include "meta/util/disk_vector.h"
+#include "meta/util/multiway_merge.h"
+#include "meta/util/printing.h"
+#include "perfect_hash_builder.h"
+
+namespace meta
+{
+namespace hashing
+{
+
+namespace mph
+{
+template <class K>
+struct bucket_record
+{
+    std::size_t idx;
+    std::vector<K> keys;
+
+    void merge_with(bucket_record&& other)
+    {
+        std::move(other.keys.begin(), other.keys.end(),
+                  std::back_inserter(keys));
+        std::vector<K>{}.swap(other.keys);
+    }
+
+    uint64_t read(std::istream& stream)
+    {
+        std::size_t len;
+        auto bytes = io::packed::read(stream, len);
+        bytes += io::packed::read(stream, idx);
+
+        keys.resize(len);
+        for (std::size_t i = 0; i < len; ++i)
+            bytes += io::packed::read(stream, keys[i]);
+        return bytes;
+    }
+
+    uint64_t write(std::ostream& stream) const
+    {
+        auto bytes = io::packed::write(stream, keys.size());
+        bytes += io::packed::write(stream, idx);
+        for (const auto& key : keys)
+            bytes += io::packed::write(stream, key);
+        return bytes;
+    }
+};
+
+template <class K>
+bool operator==(const bucket_record<K>& a, const bucket_record<K>& b)
+{
+    return a.idx == b.idx;
+}
+
+template <class K>
+bool operator<(const bucket_record<K>& a, const bucket_record<K>& b)
+{
+    return a.idx < b.idx;
+}
+
+template <class K>
+class chunk_iterator
+{
+  public:
+    chunk_iterator() = default;
+
+    chunk_iterator(const std::string& filename)
+        : input_{filename, std::ios::binary},
+          bytes_read_{0},
+          total_bytes_{filesystem::file_size(filename)}
+    {
+        ++(*this);
+    }
+
+    chunk_iterator& operator++()
+    {
+        if (input_.stream().peek() == EOF)
+        {
+            input_.stream().close();
+
+            assert(*this == chunk_iterator{});
+            return *this;
+        }
+
+        bytes_read_ += record_.read(input_);
+        return *this;
+    }
+
+    bucket_record<K>& operator*()
+    {
+        return record_;
+    }
+
+    const bucket_record<K>& operator*() const
+    {
+        return record_;
+    }
+
+    uint64_t total_bytes() const
+    {
+        return total_bytes_;
+    }
+
+    uint64_t bytes_read() const
+    {
+        return bytes_read_;
+    }
+
+    bool operator==(const chunk_iterator& other) const
+    {
+        return !input_.stream().is_open() && !other.input_.stream().is_open();
+    }
+
+  private:
+    io::mifstream input_;
+    bucket_record<K> record_;
+    uint64_t bytes_read_;
+    uint64_t total_bytes_;
+};
+
+template <class K>
+bool operator!=(const chunk_iterator<K>& a, const chunk_iterator<K>& b)
+{
+    return !(a == b);
+}
+
+template <class K>
+std::size_t hash(const K& key, uint64_t seed)
+{
+    using meta::hashing::hash_append;
+    farm_hash_seeded hasher{seed};
+    hash_append(hasher, key);
+    return static_cast<std::size_t>(hasher);
+}
+}
+
+template <class K>
+perfect_hash_builder<K>::perfect_hash_builder(options opts)
+    : opts_(opts), // parens to force bad compilers to locate cctor
+      num_buckets_{opts.num_keys / opts.num_per_bucket + 1},
+      num_chunks_{0}
+{
+    filesystem::make_directory(opts_.prefix);
+    bucket_seed_ = std::random_device{}();
+
+    buffer_.reserve(opts_.max_ram / sizeof(hashed_key));
+}
+
+template <class K>
+void perfect_hash_builder<K>::flush_chunk()
+{
+    if (buffer_.empty())
+        return;
+
+    auto filename
+        = opts_.prefix + "/chunk-" + std::to_string(num_chunks_) + ".bin";
+
+    std::sort(buffer_.begin(), buffer_.end());
+    std::ofstream output{filename, std::ios::binary};
+    for (auto it = buffer_.begin(); it != buffer_.end();)
+    {
+        auto range = std::equal_range(it, buffer_.end(), *it);
+
+        io::packed::write(output, static_cast<std::size_t>(std::distance(
+                                      range.first, range.second)));
+        io::packed::write(output, range.first->idx);
+        for (; range.first != range.second; ++range.first)
+            io::packed::write(output, range.first->key);
+
+        it = range.second;
+    }
+
+    buffer_.clear();
+    ++num_chunks_;
+}
+
+template <class K>
+void perfect_hash_builder<K>::operator()(const K& key)
+{
+    if (buffer_.size() == buffer_.capacity())
+        flush_chunk();
+    buffer_.emplace_back(mph::hash(key, bucket_seed_) % num_buckets_, key);
+}
+
+template <class K>
+void perfect_hash_builder<K>::write()
+{
+    if (!buffer_.empty())
+        flush_chunk();
+
+    // free the buffer memory
+    std::vector<hashed_key>{}.swap(buffer_);
+
+    merge_chunks_by_bucket_id();
+    sort_buckets_by_size();
+    merge_chunks_by_bucket_size();
+    construct_perfect_hash();
+
+    filesystem::delete_file(opts_.prefix + "/buckets.bin");
+}
+
+template <class K>
+void perfect_hash_builder<K>::merge_chunks_by_bucket_id()
+{
+    {
+        std::vector<mph::chunk_iterator<K>> iterators;
+        for (uint64_t i = 0; i < num_chunks_; ++i)
+            iterators.emplace_back(opts_.prefix + "/chunk-" + std::to_string(i)
+                                   + ".bin");
+
+        std::ofstream output{opts_.prefix + "/buckets.bin", std::ios::binary};
+        util::multiway_merge(iterators.begin(), iterators.end(),
+                             [&](mph::bucket_record<K>&& bucket)
+                             {
+                                 bucket.write(output);
+                             });
+    }
+
+    // delete temporary files
+    for (uint64_t i = 0; i < num_chunks_; ++i)
+        filesystem::delete_file(opts_.prefix + "/chunk-" + std::to_string(i)
+                                + ".bin");
+    num_chunks_ = 0;
+}
+
+template <class K>
+void perfect_hash_builder<K>::sort_buckets_by_size()
+{
+    // compute the number of buffered keys we can have
+    //
+    // storage for this step is broken into two pieces: one huge vector for
+    // holding keys that we've read out of the buckets file, and one
+    // smaller vector that holds the starting and ending position for the
+    // whole buckets that we've read from the buckets file
+    //
+    // the total RAM usage is approximately
+    // num_buf_keys * sizeof(K)
+    //   + num_buckets / num_keys * num_buf_keys * sizeof(array_view<K>)
+    // which is solved to get the number of buffered keys we should have
+
+    auto num_buf_keys = static_cast<std::size_t>(
+        opts_.max_ram
+        / (sizeof(K)
+           + sizeof(util::array_view<K>) * static_cast<double>(num_buckets_)
+                 / opts_.num_keys));
+
+    auto num_buf_buckets = static_cast<std::size_t>(
+        num_buf_keys * static_cast<double>(num_buckets_) / opts_.num_keys);
+
+    std::vector<K> buffered_keys(num_buf_keys);
+    std::vector<util::array_view<K>> buckets(num_buf_buckets);
+
+    auto insert_it = buffered_keys.begin();
+    auto bucket_it = buckets.begin();
+
+    mph::chunk_iterator<K> it{opts_.prefix + "/buckets.bin"};
+    printing::progress progress{" > Sorting buckets by size: ",
+                                it.total_bytes()};
+    for (; it != mph::chunk_iterator<K>{}; ++it)
+    {
+        progress(it.bytes_read());
+        auto& bucket = *it;
+
+        if (bucket.keys.size() > static_cast<std::size_t>(std::distance(
+                                     insert_it, buffered_keys.end()))
+            || bucket_it == buckets.end())
+        {
+            // we can't fit this bucket into our buffer, so we need to sort
+            // and flush
+            flush_bucket_chunk(buckets.begin(), bucket_it);
+
+            insert_it = buffered_keys.begin();
+            bucket_it = buckets.begin();
+        }
+
+        auto bucket_end
+            = std::move(bucket.keys.begin(), bucket.keys.end(), insert_it);
+        *bucket_it++ = util::array_view<K>{&*insert_it, bucket.keys.size()};
+        insert_it = bucket_end;
+    }
+
+    if (bucket_it != buckets.begin())
+        flush_bucket_chunk(buckets.begin(), bucket_it);
+}
+
+template <class K>
+template <class Iterator>
+void perfect_hash_builder<K>::flush_bucket_chunk(Iterator begin, Iterator end)
+{
+    std::sort(begin, end,
+              [](const util::array_view<K>& a, const util::array_view<K>& b)
+              {
+                  return a.size() > b.size();
+              });
+
+    std::ofstream chunk{opts_.prefix + "/chunk-" + std::to_string(num_chunks_)
+                            + ".bin",
+                        std::ios::binary};
+    std::for_each(begin, end, [&](const util::array_view<K>& bucket)
+                  {
+                      io::packed::write(chunk, bucket.size());
+                      io::packed::write(chunk,
+                                        mph::hash(bucket[0], bucket_seed_)
+                                            % num_buckets_);
+                      for (const auto& key : bucket)
+                          io::packed::write(chunk, key);
+                  });
+    ++num_chunks_;
+}
+
+template <class K>
+void perfect_hash_builder<K>::merge_chunks_by_bucket_size()
+{
+    std::vector<mph::chunk_iterator<K>> iterators;
+    for (uint64_t i = 0; i < num_chunks_; ++i)
+    {
+        iterators.emplace_back(opts_.prefix + "/chunk-" + std::to_string(i)
+                               + ".bin");
+    }
+
+    std::ofstream output{opts_.prefix + "/buckets.bin", std::ios::binary};
+    util::multiway_merge(
+        iterators.begin(), iterators.end(),
+        // sort records at head of chunks by their size
+        // (descending) rather than their bucket index
+        [](const mph::bucket_record<K>& a, const mph::bucket_record<K>& b)
+        {
+            return a.keys.size() > b.keys.size();
+        },
+        // never merge two records together
+        [](const mph::bucket_record<K>&, const mph::bucket_record<K>&)
+        {
+            return false;
+        },
+        [&](mph::bucket_record<K>&& bucket)
+        {
+            bucket.write(output);
+        });
+
+    // delete temporary files
+    for (uint64_t i = 0; i < num_chunks_; ++i)
+    {
+        filesystem::delete_file(opts_.prefix + "/chunk-" + std::to_string(i)
+                                + ".bin");
+    }
+    num_chunks_ = 0;
+}
+
+namespace mph
+{
+template <class K>
+std::vector<std::size_t> hashes_for_bucket(const mph::bucket_record<K>& bucket,
+                                           std::size_t seed)
+{
+    std::vector<std::size_t> hashes(bucket.keys.size());
+    std::transform(bucket.keys.begin(), bucket.keys.end(), hashes.begin(),
+                   [&](const K& key)
+                   {
+                       return mph::hash(key, seed);
+                   });
+    std::sort(hashes.begin(), hashes.end());
+    if (std::adjacent_find(hashes.begin(), hashes.end()) != hashes.end())
+        throw std::runtime_error{"hash collision within bucket"};
+    return hashes;
+}
+
+template <class ForwardIterator, class OutputIterator>
+void hashes_to_indices(ForwardIterator begin, ForwardIterator end,
+                       OutputIterator output, std::size_t seed, std::size_t mod)
+{
+    std::transform(begin, end, output, [&](const std::size_t& key)
+                   {
+                       return farm::hash_len_16(key, seed) % mod;
+                   });
+}
+
+bool insert_bucket(std::vector<std::size_t>& indices,
+                   std::vector<bool>& occupied_slots, std::size_t idx,
+                   uint16_t seed, util::disk_vector<uint16_t>& seeds)
+{
+    auto iit = indices.begin();
+    for (; iit != indices.end(); ++iit)
+    {
+        if (occupied_slots[*iit])
+            break;
+        occupied_slots[*iit] = true;
+    }
+
+    // if we failed to place everything without collisions, unset
+    // the bits and try the next seed
+    if (iit != indices.end())
+    {
+        for (auto iit2 = indices.begin(); iit2 != iit; ++iit2)
+            occupied_slots[*iit2] = false;
+        return false;
+    }
+    // otherwise, this seed worked, so store it and move on
+    else
+    {
+        seeds[idx] = seed;
+        return true;
+    }
+}
+}
+
+template <class K>
+void perfect_hash_builder<K>::construct_perfect_hash()
+{
+    auto num_bins = static_cast<uint64_t>(
+        std::ceil(opts_.num_keys / opts_.load_factor));
+    std::vector<bool> occupied_slots(num_bins, false);
+
+    {
+        util::disk_vector<uint16_t> seeds{opts_.prefix + "/seeds.tmp.bin",
+                                          num_buckets_};
+
+        {
+            mph::chunk_iterator<K> it{opts_.prefix + "/buckets.bin"};
+            printing::progress progress{" > Constructing hash: ",
+                                        it.total_bytes()};
+            for (; it != mph::chunk_iterator<K>{}; ++it)
+            {
+                progress(it.bytes_read());
+                const auto& bucket = *it;
+
+                auto hashes = mph::hashes_for_bucket(bucket, bucket_seed_);
+
+                std::vector<std::size_t> indices(bucket.keys.size());
+                bool success = false;
+                const uint16_t max_probes
+                    = std::numeric_limits<uint16_t>::max();
+                for (uint16_t i = 0; i < max_probes && !success; ++i)
+                {
+                    auto seed = static_cast<std::size_t>(i);
+
+                    mph::hashes_to_indices(hashes.begin(), hashes.end(),
+                                           indices.begin(), seed, num_bins);
+
+                    success = mph::insert_bucket(indices, occupied_slots,
+                                                 bucket.idx, i, seeds);
+                }
+                if (!success)
+                    throw std::runtime_error{
+                        "could not find a seed for a bucket in "
+                        "minimal perfect hash generation"};
+            }
+        }
+
+        LOG(progress) << "> Compressing seeds...\n" << ENDLG;
+
+        // compress the seed vector
+        succinct::make_compressed_vector(opts_.prefix + "/seeds", seeds.begin(),
+                                         seeds.end());
+    }
+
+    filesystem::remove_all(opts_.prefix + "/seeds.tmp.bin");
+
+    LOG(progress) << "> Minimizing hash...\n" << ENDLG;
+
+    // minify the hash using a succinct::sarray + sarray_rank to compress
+    // the range via rank() queries
+    std::vector<uint64_t> positions;
+    positions.reserve(occupied_slots.size() - opts_.num_keys);
+    for (std::size_t i = 0; i < occupied_slots.size(); ++i)
+    {
+        if (!occupied_slots[i])
+            positions.push_back(i);
+    }
+    std::vector<bool>{}.swap(occupied_slots);
+    auto storage = succinct::make_sarray(
+        opts_.prefix + "/sarray", positions.begin(), positions.end(), num_bins);
+    succinct::sarray_rank{opts_.prefix + "/sarray", storage};
+
+    std::ofstream metadata{opts_.prefix + "/hash-metadata.bin",
+                           std::ios::binary};
+    io::packed::write(metadata, bucket_seed_);
+    io::packed::write(metadata, num_bins);
+
+    LOG(progress) << "> Minimum perfect hash constructed\n" << ENDLG;
+}
+}
+}
diff --git a/include/meta/index/chunk_reader.h b/include/meta/index/chunk_reader.h
index 0aa615599..0dfb0a6da 100644
--- a/include/meta/index/chunk_reader.h
+++ b/include/meta/index/chunk_reader.h
@@ -17,8 +17,8 @@
 #include
 
 #include "meta/io/filesystem.h"
+#include "meta/io/moveable_stream.h"
 #include "meta/util/progress.h"
-#include "meta/util/shim.h"
 #include "meta/util/multiway_merge.h"
 
 namespace meta
 {
@@ -93,8 +93,8 @@
 template <class PostingsData>
 class chunk_reader
 {
  private:
-    /// the file we're reading from currently, or null if there is none
-    std::unique_ptr<std::ifstream> file_;
+    /// the file we're reading from
+    io::mifstream file_;
     /// the path to the file we're reading from
     std::string path_;
     /// the current buffered postings data
@@ -112,7 +112,7 @@
      * @param filename The path to the chunk to be read
      */
     chunk_reader(const std::string& filename)
-        : file_{make_unique<std::ifstream>(filename, std::ios::binary)},
+        : file_{filename, std::ios::binary},
          path_{filename},
          total_bytes_{filesystem::file_size(path_)},
          bytes_read_{0}
@@ -136,10 +136,9 @@
    ~chunk_reader()
    {
        if (file_)
-        {
-            file_ = nullptr;
-            filesystem::delete_file(path_);
-        }
+            file_.stream().close();
+
+        filesystem::delete_file(path_);
    }
 
    /**
@@ -147,7 +146,14 @@
     */
    void operator++()
    {
-        bytes_read_ += postings_.read(*file_);
+        if (file_.stream().peek() == EOF)
+        {
+            file_.stream().close();
+        }
+        else
+        {
+            bytes_read_ += postings_.read(file_);
+        }
    }
 
    /**
@@ -189,9 +195,9 @@
     */
    bool operator==(const chunk_reader& other) const
    {
-        if (!other.file_)
+        if (!other.file_.stream().is_open())
        {
-            return !file_ || !static_cast<bool>(*file_);
+            return !file_.stream().is_open();
        }
        else
        {
diff --git a/include/meta/index/disk_index.h b/include/meta/index/disk_index.h
index e3b50ffef..7fcf38b84 100644
--- a/include/meta/index/disk_index.h
+++ b/include/meta/index/disk_index.h
@@ -103,7 +103,7 @@ class disk_index
     class_label label(doc_id d_id) const;
 
     /**
-     * @param d_id THe doc id to find the label_id for
+     * @param d_id The doc id to find the label_id for
      * @return the label_id of the class that the document
      * belongs to
      */
     label_id lbl_id(doc_id d_id) const;
diff --git a/include/meta/index/forward_index.h b/include/meta/index/forward_index.h
index fae047aed..3b9046668 100644
--- a/include/meta/index/forward_index.h
+++ b/include/meta/index/forward_index.h
@@ -15,6 +15,7 @@
 #include "meta/index/disk_index.h"
 #include "meta/index/make_index.h"
 #include "meta/index/postings_stream.h"
+#include "meta/learn/instance.h"
 #include "meta/util/disk_vector.h"
 #include "meta/util/optional.h"
 #include "meta/meta.h"
@@ -142,6 +143,12 @@
      */
     virtual uint64_t unique_terms() const override;
 
+    /**
+     * @param doc The document to tokenize
+     * @return the analyzed version of the document as a feature vector
+     */
+    learn::feature_vector tokenize(const corpus::document& doc);
+
  private:
     /**
      * Loads a forward index from its filesystem representation.
diff --git a/include/meta/index/make_index.h b/include/meta/index/make_index.h
index 5619d5ee0..8a67d8fbe 100644
--- a/include/meta/index/make_index.h
+++ b/include/meta/index/make_index.h
@@ -92,10 +92,16 @@ std::shared_ptr<Index> make_index(const cpptoml::table& config,
         config, std::forward<Args>(args)...);
 
     // if index has already been made, load it
-    if (!filesystem::make_directory(idx->index_name()) && idx->valid())
+    if (filesystem::exists(idx->index_name()) && idx->valid())
+    {
         idx->load_index();
+    }
     else
+    {
+        if (!filesystem::exists(idx->index_name()))
+            filesystem::make_directory(idx->index_name());
         idx->create_index(config, docs);
+    }
 
     return idx;
 }
@@ -106,8 +112,52 @@
 template <class Index, class... Args>
 std::shared_ptr<Index> make_index(const cpptoml::table& config, Args&&... args)
 {
-    auto docs = corpus::make_corpus(config);
-    return make_index<Index>(config, *docs, std::forward<Args>(args)...);
+
+    // check if we have paths specified for either kind of index
+    if (!(config.contains("forward-index")
+          && config.contains("inverted-index")))
+    {
+        throw typename Index::exception{
+            "forward-index or inverted-index missing from configuration file"};
+    }
+
+    // make sure that the index names are different!
+    auto fwd_name = config.get_as<std::string>("forward-index");
+    auto inv_name = config.get_as<std::string>("inverted-index");
+
+    if (*fwd_name == *inv_name)
+    {
+        throw typename Index::exception{
+            "forward and inverted index names must be different!"};
+    }
+
+    // below is needed so that make_shared can find a public ctor to invoke
+    struct make_shared_enabler : public Index
+    {
+        make_shared_enabler(const cpptoml::table& config, Args&&... args)
+            : Index(config, std::forward<Args>(args)...)
+        {
+            // nothing
+        }
+    };
+    auto idx = std::make_shared<make_shared_enabler>(
+        config, std::forward<Args>(args)...);
+
+    // if index has already been made, load it
+    if (filesystem::exists(idx->index_name()) && idx->valid())
+    {
+        idx->load_index();
+    }
+    else
+    {
+        if (!filesystem::exists(idx->index_name()))
+            filesystem::make_directory(idx->index_name());
+
+        auto docs = corpus::make_corpus(config);
+        idx->create_index(config, *docs);
+    }
+
+    return idx;
 }
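With the reworked `make_index` and the new `forward_index::tokenize`, typical call sites look roughly like the following (a sketch, not patch content; the config file contents and document text are placeholders):

```cpp
#include "cpptoml.h"
#include "meta/corpus/document.h"
#include "meta/index/forward_index.h"
#include "meta/index/inverted_index.h"

using namespace meta;

int main()
{
    auto config = cpptoml::parse_file("config.toml");

    // throws Index::exception if forward-index/inverted-index are
    // missing from the configuration or name the same directory
    auto inv = index::make_index<index::inverted_index>(*config);
    auto fwd = index::make_index<index::forward_index>(*config);

    // tokenize an ad-hoc document into the index's feature space
    // (requires an index generated with a non-LIBSVM analyzer)
    corpus::document doc;
    doc.content("an example document to tokenize");
    auto fv = fwd->tokenize(doc);
    (void)fv; // e.g., feed this to a classifier or ranker
    return 0;
}
```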
diff --git a/include/meta/index/ranker/absolute_discount.h b/include/meta/index/ranker/absolute_discount.h
index aa086a131..bb7935db3 100644
--- a/include/meta/index/ranker/absolute_discount.h
+++ b/include/meta/index/ranker/absolute_discount.h
@@ -35,15 +35,16 @@
 class absolute_discount : public language_model_ranker
 {
   public:
-    /**
-     * The identifier of this ranker.
-     */
+    /// The identifier of this ranker.
     const static util::string_view id;
 
+    /// Default value of delta
+    const static constexpr float default_delta = 0.7f;
+
     /**
      * @param delta
      */
-    absolute_discount(float delta = 0.7f);
+    absolute_discount(float delta = default_delta);
 
     /**
      * Loads an absolute_discount ranker from a stream.
diff --git a/include/meta/index/ranker/ranker.h b/include/meta/index/ranker/ranker.h
index 2dca3951e..fa2f4fea2 100644
--- a/include/meta/index/ranker/ranker.h
+++ b/include/meta/index/ranker/ranker.h
@@ -122,6 +122,15 @@ struct ranker_context
 };
 }
 
+/**
+ * Exception class for ranker interactions.
+ */
+class ranker_exception : public std::runtime_error
+{
+  public:
+    using std::runtime_error::runtime_error;
+};
+
 /**
  * A ranker scores a query against all the documents in an inverted index,
  * returning a list of documents sorted by relevance.
diff --git a/include/meta/index/string_list_writer.h b/include/meta/index/string_list_writer.h
index 6b98bbd92..2874ae95e 100644
--- a/include/meta/index/string_list_writer.h
+++ b/include/meta/index/string_list_writer.h
@@ -14,11 +14,7 @@
 #include
 #include
 
-#if !META_HAS_STREAM_MOVE
-#include <memory>
-#include "meta/util/shim.h"
-#endif
-
+#include "meta/io/moveable_stream.h"
 #include "meta/util/disk_vector.h"
 
 namespace meta
@@ -61,41 +57,11 @@
     void insert(uint64_t idx, const std::string& elem);
 
   private:
-#if META_HAS_STREAM_MOVE
-    using ofstream = std::ofstream;
-    std::ofstream& file()
-    {
-        return string_file_;
-    }
-    ofstream make_file(const std::string& path)
-    {
-        return std::ofstream{path};
-    }
-#else
-    /// workaround for lack of move operators for gcc 4.8
-    using ofstream = std::unique_ptr<std::ofstream>;
-    /**
-     * @return a reference to the file stream
-     */
-    std::ofstream& file()
-    {
-        return *string_file_;
-    }
-    /**
-     * @param path The path to the file
-     * @return a std::ofstream created from the file
-     */
-    ofstream make_file(const std::string& path)
-    {
-        return make_unique<std::ofstream>(path);
-    }
-#endif
-
     /// Writes are internally synchronized
     std::mutex mutex_;
 
     /// The file containing the strings
-    ofstream string_file_;
+    io::mofstream string_file_;
 
     /// Keeps track of the write position
     uint64_t write_pos_;
diff --git a/include/meta/io/filesystem.h b/include/meta/io/filesystem.h
index baf410a0e..ef5e6704d 100644
--- a/include/meta/io/filesystem.h
+++ b/include/meta/io/filesystem.h
@@ -14,6 +14,10 @@
 #include
 #include
 
+#if META_HAS_EXPERIMENTAL_FILESYSTEM
+#include <experimental/filesystem>
+#endif
+
 namespace meta
 {
 namespace filesystem
@@ -55,6 +59,12 @@ bool make_directory(const std::string& dir_name);
  */
 bool file_exists(const std::string& filename);
 
+/**
+ * @param path The path to check
+ * @return true if the path (file or folder) exists
+ */
+bool exists(const std::string& path);
+
 /**
  * Calculates a file's size in bytes with support for files over 4GB.
  * @param filename The path for the file
diff --git a/include/meta/io/moveable_stream.h b/include/meta/io/moveable_stream.h
new file mode 100644
index 000000000..db6b8544c
--- /dev/null
+++ b/include/meta/io/moveable_stream.h
@@ -0,0 +1,152 @@
+/**
+ * @file moveable_stream.h
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */ + +#ifndef META_IO_MOVEABLE_STREAM_H_ +#define META_IO_MOVEABLE_STREAM_H_ + +#include +#include "meta/util/shim.h" + +namespace meta +{ +namespace io +{ + +namespace detail +{ +template +T& get_stream(std::unique_ptr& stream) +{ + return *stream; +} + +template +const T& get_stream(const std::unique_ptr& stream) +{ + return *stream; +} + +inline std::ifstream& get_stream(std::ifstream& stream) +{ + return stream; +} + +inline const std::ifstream& get_stream(const std::ifstream& stream) +{ + return stream; +} + +inline std::ofstream& get_stream(std::ofstream& stream) +{ + return stream; +} + +inline const std::ofstream& get_stream(const std::ofstream& stream) +{ + return stream; +} + +#if META_HAS_STREAM_MOVE +template +T make_stream(Args&&... args) +{ + return T{std::forward(args)...}; +} + +#else +template +std::unique_ptr make_stream(Args&&... args) +{ + return make_unique(std::forward(args)...); +} +#endif + +template +struct default_openmode; + +template <> +struct default_openmode +{ + const static constexpr std::ios_base::openmode value = std::ios_base::in; +}; + +template <> +struct default_openmode +{ + const static constexpr std::ios_base::openmode value = std::ios_base::out; +}; +} + +/** + * A stupid wrapper around a std::fstream to work around GCC's libstdc++ + * lacking move constructors for std::fstream until GCC 5. + */ +template +class mfstream +{ + public: + mfstream() : stream_{detail::make_stream()} + { + // nothing + } + + explicit mfstream(const char* filename, + std::ios_base::openmode mode + = detail::default_openmode::value) + : stream_{detail::make_stream(filename, mode)} + { + // nothing + } + + explicit mfstream(const std::string& filename, + std::ios_base::openmode mode + = detail::default_openmode::value) + : mfstream{filename.c_str(), mode} + { + // nothing + } + + operator Stream&() + { + return detail::get_stream(stream_); + } + + operator const Stream&() const + { + return detail::get_stream(stream_); + } + + Stream& stream() + { + return detail::get_stream(stream_); + } + + const Stream& stream() const + { + return detail::get_stream(stream_); + } + + explicit operator bool() const + { + return static_cast(detail::get_stream(stream_)); + } + + private: +#if META_HAS_STREAM_MOVE + Stream stream_; +#else + std::unique_ptr stream_; +#endif +}; + +using mifstream = mfstream; +using mofstream = mfstream; +} +} +#endif diff --git a/include/meta/learn/dataset.h b/include/meta/learn/dataset.h index dd1bec3e6..f04561030 100644 --- a/include/meta/learn/dataset.h +++ b/include/meta/learn/dataset.h @@ -10,66 +10,19 @@ #define META_LEARN_DATASET_H_ #include + #include "meta/corpus/metadata.h" #include "meta/index/forward_index.h" #include "meta/index/inverted_index.h" #include "meta/index/postings_data.h" +#include "meta/learn/instance.h" #include "meta/util/progress.h" #include "meta/util/range.h" -#include "meta/util/sparse_vector.h" -#include "meta/util/identifiers.h" namespace meta { namespace learn { - -using feature_id = term_id; -using feature_vector = util::sparse_vector; - -MAKE_NUMERIC_IDENTIFIER_UDL(instance_id, uint64_t, _inst_id) - -inline void print_liblinear(std::ostream& os, const feature_vector& weights) -{ - for (const auto& count : weights) - os << ' ' << (count.first + 1) << ':' << count.second; -} - -/** - * Represents an instance in the dataset, consisting of its id and - * feature_vector. 
- */ -struct instance -{ - template - instance(instance_id inst_id, ForwardIterator begin, ForwardIterator end) - : id{inst_id}, weights{begin, end} - { - // nothing - } - - instance(instance_id inst_id, feature_vector wv) - : id{inst_id}, weights{std::move(wv)} - { - // nothing - } - - instance(instance_id inst_id) : id{inst_id}, weights{} - { - // nothing - } - - void print_liblinear(std::ostream& os) const - { - learn::print_liblinear(os, weights); - } - - /// the id within the dataset that contains this instance - instance_id id; - /// the weights of the features in this instance - const feature_vector weights; -}; - /** * Represents an in-memory view of a set of documents for running learning * algorithms over. @@ -92,6 +45,10 @@ class dataset : total_features_{idx->unique_terms()} { auto size = static_cast(std::distance(begin, end)); + + if (!size) + return; + instances_.reserve(size); printing::progress progress{" > Loading instances into memory: ", size}; diff --git a/include/meta/learn/instance.h b/include/meta/learn/instance.h new file mode 100644 index 000000000..777ee57fe --- /dev/null +++ b/include/meta/learn/instance.h @@ -0,0 +1,66 @@ +/** + * @file instance.h + * @author Chase Geigle + * + * All files in META are released under the MIT license. For more details, + * consult the file LICENSE in the root of the project. + */ + +#ifndef META_LEARN_INSTANCE_H_ +#define META_LEARN_INSTANCE_H_ + +#include "meta/util/sparse_vector.h" +#include "meta/util/identifiers.h" + +namespace meta +{ +namespace learn +{ +using feature_id = term_id; +using feature_vector = util::sparse_vector; + +MAKE_NUMERIC_IDENTIFIER_UDL(instance_id, uint64_t, _inst_id) + +inline void print_liblinear(std::ostream& os, const feature_vector& weights) +{ + for (const auto& count : weights) + os << ' ' << (count.first + 1) << ':' << count.second; +} + +/** + * Represents an instance in the dataset, consisting of its id and + * feature_vector. + */ +struct instance +{ + template + instance(instance_id inst_id, ForwardIterator begin, ForwardIterator end) + : id{inst_id}, weights{begin, end} + { + // nothing + } + + instance(instance_id inst_id, feature_vector wv) + : id{inst_id}, weights{std::move(wv)} + { + // nothing + } + + instance(instance_id inst_id) : id{inst_id}, weights{} + { + // nothing + } + + void print_liblinear(std::ostream& os) const + { + learn::print_liblinear(os, weights); + } + + /// the id within the dataset that contains this instance + instance_id id; + /// the weights of the features in this instance + const feature_vector weights; +}; +} +} +#endif diff --git a/include/meta/math/integer.h b/include/meta/math/integer.h new file mode 100644 index 000000000..a50b9b3d4 --- /dev/null +++ b/include/meta/math/integer.h @@ -0,0 +1,35 @@ +/** + * @file integer.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. 
+ */ + +#ifndef META_MATH_INTEGER_H_ +#define META_MATH_INTEGER_H_ + +namespace meta +{ +namespace math +{ +namespace integer +{ +/** + * @param num The numerator + * @param denom The denominator + * @return \f$\lceil \frac{num}{denom} \rceil\f$ + */ +template +auto div_ceil(IntType1 num, IntType2 denom) -> decltype(num / denom) +{ + auto denominator = static_cast(denom); + // this should be 1 instruction on most architectures since the div + // instruction also returns the remainder + return (num / denominator) + (num % denominator != 0); +} +} +} +} +#endif diff --git a/include/meta/succinct/bit_vector.h b/include/meta/succinct/bit_vector.h new file mode 100644 index 000000000..41d791ce5 --- /dev/null +++ b/include/meta/succinct/bit_vector.h @@ -0,0 +1,184 @@ +/** + * @file bit_vector.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_SUCCINCT_BIT_VECTOR_H_ +#define META_SUCCINCT_BIT_VECTOR_H_ + +#include +#include +#include + +#include "meta/util/array_view.h" +#include "meta/util/likely.h" + +namespace meta +{ +namespace succinct +{ + +/** + * Represents a collection of bits packed into a word (uint64_t) to be + * written into a bit_vector. + */ +class packed_bits +{ + public: + packed_bits(uint64_t word, uint8_t len) : word_{word}, len_{len} + { + if (META_UNLIKELY(len > 64)) + throw std::invalid_argument{"bit length longer than word"}; + + auto mask = len_ == 64 ? static_cast(-1) : (1ull << len_) - 1; + word_ &= mask; + } + + inline uint64_t word() const + { + return word_; + } + + inline uint8_t size() const + { + return len_; + } + + private: + uint64_t word_; + uint8_t len_; +}; + +/** + * Writes a word-aligned bit vector to a file to be mapped in later. 
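A couple of spot checks pin down `math::integer::div_ceil`'s rounding behavior (a standalone sketch):

```cpp
#include <cassert>
#include <cstdint>

#include "meta/math/integer.h"

int main()
{
    using meta::math::integer::div_ceil;
    assert(div_ceil(10, 4) == 3); // 10/4 = 2 remainder 2, so round up
    assert(div_ceil(12, 4) == 3); // exact division: nothing to round
    assert(div_ceil(uint64_t{1}, uint64_t{64}) == 1);
    return 0;
}
```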
+ */ +template +class bit_vector_builder +{ + public: + bit_vector_builder(WordWriter&& writer) + : cur_word_{0}, + bit_in_word_{0}, + total_bits_{0}, + writer_{std::forward(writer)} + { + // nothing + } + + void write_bits(packed_bits bits) + { + if (64 - bit_in_word_ >= bits.size()) + { + // we can fit these bits in the current word + cur_word_ |= (bits.word() << bit_in_word_); + bit_in_word_ += bits.size(); + + if (bit_in_word_ == 64) + flush_word(); + } + else + { + // we don't have enough room, so we need to append what we can, + // flush the word, and then set the current word to the + // remaining bits we didn't write + auto num_written = static_cast(64 - bit_in_word_); + cur_word_ |= (bits.word() << bit_in_word_); + flush_word(); + cur_word_ = (bits.word() >> num_written); + bit_in_word_ = static_cast(bits.size() - num_written); + } + total_bits_ += bits.size(); + } + + uint64_t total_bits() const + { + return total_bits_; + } + + ~bit_vector_builder() + { + if (bit_in_word_) + flush_word(); + } + + private: + void flush_word() + { + writer_(cur_word_); + bit_in_word_ = 0; + cur_word_ = 0; + } + + uint64_t cur_word_; + uint8_t bit_in_word_; + uint64_t total_bits_; + WordWriter writer_; +}; + +namespace detail +{ +template +struct is_ostream_reference +{ + const static constexpr bool value + = std::is_convertible::value; +}; + +struct ostream_word_writer +{ + ostream_word_writer(std::ostream& out) : out_(out) + { + // nothing + } + + void operator()(uint64_t word) + { + out_.write(reinterpret_cast(&word), sizeof(uint64_t)); + } + + std::ostream& out_; +}; +} + +template ::value>::type> +bit_vector_builder make_bit_vector_builder(WordWriter&& writer) +{ + return bit_vector_builder{std::forward(writer)}; +} + +inline bit_vector_builder +make_bit_vector_builder(std::ostream& os) +{ + detail::ostream_word_writer writer{os}; + return {std::move(writer)}; +} + +/** + * Conceptually views a contiguous chunk of words as a (const) bit vector. + */ +class bit_vector_view +{ + public: + bit_vector_view(util::array_view data, uint64_t num_bits); + + bool operator[](uint64_t bit_idx) const; + + uint64_t extract(uint64_t bit_idx, uint8_t len) const; + + util::array_view data() const; + + uint64_t size() const; + + private: + util::array_view data_; + const uint64_t num_bits_; +}; +} +} +#endif diff --git a/include/meta/succinct/broadword.h b/include/meta/succinct/broadword.h new file mode 100644 index 000000000..4a4735e52 --- /dev/null +++ b/include/meta/succinct/broadword.h @@ -0,0 +1,176 @@ +/** + * @file broadword.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_SUCCINCT_BROADWORD_H_ +#define META_SUCCINCT_BROADWORD_H_ + +#include + +namespace meta +{ +namespace succinct +{ +/** + * A collection of functions for "broadword" bit operations like + * selection, msb and lsb finding, etc. + * + * Most of the code here is taken from code by Giuseppe Ottaviano and + * released under the Apache 2.0 license. 
+ * + * @see https://github.com/ot/succinct/blob/master/broadword.hpp + * @see https://github.com/ot/succinct/blob/master/tables.hpp + * @see https://github.com/ot/succinct/blob/master/LICENSE + */ +namespace broadword +{ + +const uint8_t select_in_byte[2048] = { + 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, + 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, + 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, + 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, + 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, + 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, + 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, + 1, 0, 2, 0, 1, 0, 8, 8, 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 4, 4, + 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, + 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, + 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, + 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, + 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, 1, 7, 2, 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, + 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, + 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, + 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, + 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, + 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 8, 8, 8, 8, 8, 8, 2, 8, 8, 8, 3, 8, + 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 5, 8, 5, + 5, 2, 8, 5, 5, 3, 5, 3, 3, 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, + 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, + 6, 4, 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, + 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, 7, 7, 2, 8, 7, + 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, + 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, + 4, 3, 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, + 4, 4, 2, 6, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, + 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, + 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, 8, 8, 5, 8, 5, 5, 4, 8, + 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, + 8, 6, 8, 6, 6, 4, 8, 6, 6, 4, 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, + 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, + 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, + 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 3, 8, 7, 7, 5, 7, 5, + 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, + 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, + 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 5, 8, 8, 8, + 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 4, 8, 8, 8, 8, 8, + 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, + 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, + 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, + 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, + 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 4, 8, 8, + 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, + 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, + 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, + 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, + 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7}; + +const static constexpr uint64_t ones_step_4 = 0x1111111111111111ULL; +const static constexpr uint64_t ones_step_8 = 0x0101010101010101ULL; +const static constexpr uint64_t msbs_step_8 = 0x80ULL * ones_step_8; + +inline uint64_t byte_counts(uint64_t word) +{ + word = word - ((word & 0xa * ones_step_4) >> 1); + word = (word & 3 * ones_step_4) + ((word >> 2) & 3 * ones_step_4); + word = (word + (word >> 4)) & 0x0f * ones_step_8; + return word; +} + +inline uint64_t bytes_sum(uint64_t word) +{ + return word * ones_step_8 >> 56; +} 
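`byte_counts` and `bytes_sum` together form the classic SWAR population count: the first leaves each byte's popcount in that byte, the second adds the eight per-byte counts. A standalone check of the arithmetic (constants duplicated locally so the sketch compiles on its own):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t ones_step_4 = 0x1111111111111111ULL;
    const uint64_t ones_step_8 = 0x0101010101010101ULL;

    // 0xFF in the top byte (8 ones) plus a single low bit: popcount 9
    uint64_t word = 0xFF00000000000001ULL;

    // same steps as byte_counts(): pair counts, nibble counts, byte counts
    uint64_t w = word - ((word & 0xa * ones_step_4) >> 1);
    w = (w & 3 * ones_step_4) + ((w >> 2) & 3 * ones_step_4);
    w = (w + (w >> 4)) & 0x0f * ones_step_8;

    // same as bytes_sum(): the multiply accumulates every byte into the
    // top byte, which the shift then extracts
    assert((w * ones_step_8 >> 56) == 9);
    return 0;
}
```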
+ +inline uint64_t popcount(uint64_t word) +{ +#if META_BROADWORD_USE_POPCOUNT + return static_cast(__builtin_popcountll(word)); +#else + return bytes_sum(byte_counts(word)); +#endif +} + +inline uint64_t lsb(uint64_t word) +{ + // TODO: how good is the builtin here when it's not a single instruction? + return static_cast(__builtin_ctzll(word)); +} + +inline uint64_t msb(uint64_t word) +{ + // TODO: how good is the builtin here when it's not a single instruction? + return 64 - static_cast(__builtin_clzll(word)); +} + +inline uint64_t select_in_word(uint64_t word, uint64_t k) +{ + uint64_t byte_sums = byte_counts(word) * ones_step_8; + + const uint64_t k_step_8 = k * ones_step_8; + const uint64_t geq_k_step_8 + = (((k_step_8 | msbs_step_8) - byte_sums) & msbs_step_8); +#if META_BROADWORD_USE_POPCOUNT + const uint64_t place = intrinsics::popcount(geq_k_step_8) * 8; +#else + const uint64_t place + = ((geq_k_step_8 >> 7) * ones_step_8 >> 53) & ~uint64_t(0x7); +#endif + const uint64_t byte_rank + = k - (((byte_sums << 8) >> place) & uint64_t(0xFF)); + return place + + select_in_byte[((word >> place) & 0xFF) | (byte_rank << 8)]; +} +} +} +} +#endif diff --git a/include/meta/succinct/compressed_vector.h b/include/meta/succinct/compressed_vector.h new file mode 100644 index 000000000..24d4916a2 --- /dev/null +++ b/include/meta/succinct/compressed_vector.h @@ -0,0 +1,77 @@ +/** + * @file compressed_vector.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_SUCCINCT_COMPRESSED_VECTOR_H_ +#define META_SUCCINCT_COMPRESSED_VECTOR_H_ + +#include + +#include "meta/succinct/bit_vector.h" +#include "meta/succinct/broadword.h" +#include "meta/succinct/sarray.h" +#include "meta/util/disk_vector.h" + +namespace meta +{ +namespace succinct +{ + +/** + * Compressed, \f$O(1)\f$ time random-access sequences of unsigned 64-bit + * numbers. In order for this to work, the total sum of the minimal binary + * representation length for each integer (excluding leading zeroes) must + * fit in a 64-bit integer. Otherwise, the behavior is not defined. + */ +class compressed_vector +{ + public: + compressed_vector(const std::string& prefix); + + uint64_t operator[](uint64_t i) const; + + uint64_t size() const; + + private: + util::disk_vector numbers_; + sarray positions_; + sarray_select select_; +}; + +template +void make_compressed_vector(const std::string& prefix, ForwardIterator begin, + ForwardIterator end) +{ + filesystem::make_directory(prefix); + std::ofstream bv_stream{prefix + "/compressed-vec.bin", std::ios::binary}; + auto bv_builder = make_bit_vector_builder(bv_stream); + + uint64_t num_elems = 0; + uint64_t num_bits = 0; + for (auto it = begin; it != end; ++it) + { + uint64_t word = *it; + num_bits += (word) ? broadword::msb(word) : 1; + ++num_elems; + } + + filesystem::make_directory(prefix + "/sarray"); + sarray_builder s_builder{prefix + "/sarray", num_elems + 1, num_bits}; + s_builder(bv_builder.total_bits()); + for (auto it = begin; it != end; ++it) + { + uint64_t word = *it; + uint64_t len = (word) ? 
broadword::msb(word) : 1; + bv_builder.write_bits({word, static_cast(len)}); + + s_builder(bv_builder.total_bits()); + } +} +} +} +#endif diff --git a/include/meta/succinct/darray.h b/include/meta/succinct/darray.h new file mode 100644 index 000000000..3e1a1cb59 --- /dev/null +++ b/include/meta/succinct/darray.h @@ -0,0 +1,367 @@ +/** + * @file darray.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_SUCCINCT_DARRAY_H_ +#define META_SUCCINCT_DARRAY_H_ + +#include +#include +#include "meta/io/binary.h" +#include "meta/io/filesystem.h" +#include "meta/io/packed.h" +#include "meta/succinct/bit_vector.h" +#include "meta/succinct/broadword.h" +#include "meta/util/disk_vector.h" +#include "meta/util/pimpl.h" + +namespace meta +{ +namespace succinct +{ + +struct word_identity +{ + uint64_t operator()(uint64_t word) const + { + return word; + } +}; + +struct word_inverse +{ + uint64_t operator()(uint64_t word) const + { + return ~word; + } +}; + +namespace darray_detail +{ +inline std::string blocks_file(const std::string& prefix) +{ + return prefix + "/darray.blocks.bin"; +} + +inline std::string sub_blocks_file(const std::string& prefix) +{ + return prefix + "/darray.subblocks.bin"; +} + +inline std::string explicit_positions_file(const std::string& prefix) +{ + return prefix + "/darray.explicit.bin"; +} + +inline std::string num_ones_file(const std::string& prefix) +{ + return prefix + "/darray.num_ones.bin"; +} + +/** + * \f$L\f$ from the paper, the number of ones within each block + */ +const static constexpr uint64_t ones_per_block = 1 << 10; +/** + * \f$L_2\f$ from the paper, the maximum bit span a block can have + * before all of its ones' positions are recorded explicitly. + */ +const static constexpr uint64_t max_distance = 1 << 16; +/** + * \f$L_3\f$ from the paper: within each block with distance below + * \f$L_3\f$, store the position of every \f$L_3\f$-th on. + */ +const static constexpr uint64_t sub_block_stride = 32; +} + +/** + * A builder for the darray succinct data structure from Okanohara and + * Sadakane for answering select queries on dense bit arrays of length + * \f$n\f$ where the number of ones \f$m\f$ is about \f$n/2\f$. + * + * @see http://arxiv.org/abs/cs/0610001 + */ +template +class darray_builder +{ + public: + /** + * Constructs a darray over the given bit vector, writing output into + * the folder denoted by prefix. + * + * Adapted from code by Giuseppe Ottaviano and released under the + * Apache 2.0 license. + * + * @see https://github.com/ot/succinct/blob/master/darray.hpp + * @see https://github.com/ot/succinct/blob/master/LICENSE + */ + darray_builder(const std::string& prefix, bit_vector_view bvv) + { + using namespace darray_detail; + filesystem::make_directory(prefix); + + /** + * Output stream for an array that stores the positions of the + * \f$iL + 1\f$-th one if the block size was less than \f$L_2\f$, and + * a negative number indicating the index into the explicit array + * otherwise. + */ + std::ofstream blocks{blocks_file(prefix), std::ios::binary}; + /** + * Output stream for an array that stores every \f$L_3\f$-th one + * within blocks of size less than \f$L_2\f$ (and some undefined + * number for every \f$L_3\f$-th one within larger blocks). 
+ */ + std::ofstream sub_blocks{sub_blocks_file(prefix), std::ios::binary}; + /** + * Output stream for storing the explicit positions of ones for the + * blocks that were larger than \f$L_2\f$. + */ + std::ofstream explicit_positions{explicit_positions_file(prefix), + std::ios::binary}; + + uint64_t num_ones = 0; + std::vector current_block; + current_block.reserve(ones_per_block); + auto data = bvv.data(); + for (uint64_t word_pos = 0; word_pos < data.size(); ++word_pos) + { + // reverse the word if needed + auto word = WordReader{}(data[word_pos]); + uint64_t bit_pos = word_pos * 64; + + // until we've read every bit in the bit vector, or we've run + // out of ones in the current word + while (bit_pos < bvv.size() && word) + { + // find the position of the next 1 + auto one_pos = broadword::lsb(word); + bit_pos += one_pos; + word >>= one_pos; + + // record the position of the one and flush the block if + // needed + current_block.push_back(bit_pos); + if (current_block.size() == ones_per_block) + { + flush_block(current_block, blocks, sub_blocks, + explicit_positions); + } + + // move everything forward past the one + ++bit_pos; + word >>= 1; + + // record that we've seen another one + ++num_ones; + } + } + + if (!current_block.empty()) + { + flush_block(current_block, blocks, sub_blocks, explicit_positions); + } + + // force the file to have data + if (num_explicit_ones_ == 0) + io::write_binary(explicit_positions, static_cast(-1)); + + std::ofstream num_ones_file{darray_detail::num_ones_file(prefix), + std::ios::binary}; + io::packed::write(num_ones_file, num_ones); + } + + private: + /** + * Flushes a completed block of ones to disk. + */ + void flush_block(std::vector& current_block, std::ostream& blocks, + std::ostream& sub_blocks, std::ostream& explicit_positions) + { + using namespace darray_detail; + + // if the block is larger than L_2, store every one position + // explicitly in S_l + if (current_block.back() - current_block.front() > max_distance) + { + io::write_binary(blocks, -num_explicit_ones_ - 1); + num_explicit_ones_ += static_cast(current_block.size()); + + for (const auto& pos : current_block) + { + io::write_binary(explicit_positions, pos); + } + + for (std::size_t i = 0; i < current_block.size(); + i += sub_block_stride) + { + io::write_binary(sub_blocks, static_cast(-1)); + } + } + // otherwise, store every L_3-th one in the block in S_s + else + { + io::write_binary(blocks, static_cast(current_block[0])); + + for (std::size_t i = 0; i < current_block.size(); + i += sub_block_stride) + { + auto offset = static_cast(current_block[i] + - current_block[0]); + io::write_binary(sub_blocks, offset); + } + } + current_block.clear(); + } + + /** + * The total number of one positions that have been written to the + * explicit positions array. + */ + int64_t num_explicit_ones_ = 0; +}; + +template +class darray +{ + public: + /** + * Loads or creates a darray, stored in files in the given prefix + * (folder). + * + * @param prefix The folder containing the output from a + * darray_builder + * @param bvv The bit vector that this darray should index over + */ + darray(const std::string& prefix, bit_vector_view bvv) + { + if (!is_valid(prefix)) + { + darray_builder{prefix, bvv}; + } + impl_ = make_unique(prefix, bvv); + } + + /** + * Determines the position of the \f$i\f$-th one in the bit vector. 
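For intuition, `select(i)` agrees with a naive linear scan over the bits; a reference sketch of that contract (illustration only, not the constant-time darray lookup itself):

```cpp
#include <cstdint>
#include <vector>

// position of the i-th set bit (0-indexed), scanning low bit first
uint64_t naive_select(const std::vector<uint64_t>& words, uint64_t i)
{
    uint64_t seen = 0;
    for (uint64_t pos = 0; pos < words.size() * 64; ++pos)
    {
        if ((words[pos / 64] >> (pos % 64)) & 1)
        {
            if (seen == i)
                return pos;
            ++seen;
        }
    }
    return static_cast<uint64_t>(-1); // fewer than i + 1 ones present
}
```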
+ */ + uint64_t select(uint64_t i) const + { + return impl_->select(i); + } + + /** + * @return the number of indexed positions in the vector + */ + uint64_t num_positions() const + { + return impl_->num_ones; + } + + private: + static bool is_valid(const std::string& prefix) + { + return filesystem::file_exists(darray_detail::blocks_file(prefix)) + && filesystem::file_exists( + darray_detail::sub_blocks_file(prefix)) + && filesystem::file_exists( + darray_detail::explicit_positions_file(prefix)) + && filesystem::file_exists(darray_detail::num_ones_file(prefix)); + } + + struct impl + { + impl(const std::string& prefix, bit_vector_view bv) + : bvv{bv}, + blocks{darray_detail::blocks_file(prefix)}, + sub_blocks{darray_detail::sub_blocks_file(prefix)}, + explicit_positions{darray_detail::explicit_positions_file(prefix)} + { + std::ifstream num_ones_file{darray_detail::num_ones_file(prefix), + std::ios::binary}; + io::packed::read(num_ones_file, num_ones); + } + + uint64_t select(uint64_t i) const + { + using namespace darray_detail; + + if (META_UNLIKELY(i > num_ones)) + throw std::out_of_range{"index out of range in select query"}; + + auto block_idx = i / ones_per_block; + if (blocks[block_idx] < 0) + { + // this was one of the blocks that was stored explicitly + auto block_start + = static_cast(-blocks[block_idx] - 1); + return explicit_positions[block_start + i % ones_per_block]; + } + + // otherwise, look up the closest L_3-th one and do a + // sequential scan from there + auto subblock_idx = i / sub_block_stride; + auto one_count = i % sub_block_stride; + auto start_pos = static_cast(blocks[block_idx]) + + sub_blocks[subblock_idx]; + + auto words = bvv.data(); + if (one_count == 0) + return start_pos; + + auto word_idx = start_pos / 64; + auto word_pos = start_pos % 64; + auto word = WordReader{}(words[word_idx]) + & (static_cast(-1) << word_pos); + + while (true) + { + auto popcount = broadword::popcount(word); + if (one_count < popcount) + break; + one_count -= popcount; + word = WordReader{}(words[++word_idx]); + } + + return 64 * word_idx + broadword::select_in_word(word, one_count); + } + + /** + * The bit vector view the darray indexes over. + */ + bit_vector_view bvv; + /** + * An array that stores the positions of the \f$iL + 1\f$-th one if + * the block size was less than \f$L_2\f$, and a negative number + * indicating the index into the explicit array otherwise. + */ + util::disk_vector blocks; + /** + * An array that stores every \f$L_3\f$-th one within blocks of + * size less than \f$L_2\f$ (and some undefined number for every + * \f$L_3\f$-th one within larger blocks). + */ + util::disk_vector sub_blocks; + /** + * An array storing the explicit positions of ones for the blocks + * that were larger than \f$L_2\f$. + */ + util::disk_vector explicit_positions; + /** + * The total number of ones found during construction. + */ + uint64_t num_ones; + }; + std::unique_ptr impl_; +}; + +using darray1 = darray<>; +using darray0 = darray; +} +} +#endif diff --git a/include/meta/succinct/sarray.h b/include/meta/succinct/sarray.h new file mode 100644 index 000000000..3fbe9140d --- /dev/null +++ b/include/meta/succinct/sarray.h @@ -0,0 +1,158 @@ +/** + * @file sarray.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. 
+ */ + +#ifndef META_SUCCINCT_SARRAY_H_ +#define META_SUCCINCT_SARRAY_H_ + +#include "meta/succinct/broadword.h" +#include "meta/io/filesystem.h" +#include "meta/succinct/darray.h" + +namespace meta +{ +namespace succinct +{ + +namespace sarray_detail +{ +inline std::string low_file(const std::string& prefix) +{ + return prefix + "/sarray.low.bin"; +} + +inline std::string high_file(const std::string& prefix) +{ + return prefix + "/sarray.high.bin"; +} + +inline std::string num_bits_file(const std::string& prefix) +{ + return prefix + "/sarray.high.num_bits.bin"; +} +} + +/** + * Builder for the high and low bits storage for the sarray succinct data + * structure. + */ +class sarray_builder +{ + public: + sarray_builder(const std::string& prefix, uint64_t num_ones, + uint64_t num_bits); + + void operator()(uint64_t one_pos); + + ~sarray_builder(); + + private: + using builder_type = bit_vector_builder; + + std::ofstream low_stream_; + std::ofstream high_stream_; + std::ofstream nb_stream_; + std::unique_ptr low_builder_; + std::unique_ptr high_builder_; + uint8_t low_bits_; + uint64_t low_mask_; + const uint64_t num_ones_; + uint64_t num_calls_ = 0; + uint64_t curr_high_word_ = 0; + uint64_t high_word_idx_ = 0; + uint64_t high_word_pos_ = 0; +}; + +/** + * Storage class for the high and low bits of the sarray structure. To + * query, you need to construct/load the corresponding sarray_rank or + * sarray_select objects. + */ +class sarray +{ + public: + sarray(const std::string& prefix); + + bit_vector_view high_bits() const; + + bit_vector_view low_bits() const; + + /** + * @return the number of low bits that were stored per number + */ + uint8_t num_low_bits() const; + + private: + util::disk_vector high_bits_; + util::disk_vector low_bits_; + uint64_t high_bit_count_; + uint8_t num_low_bits_; // not the total count, but per number! +}; + +/** + * Query class for rank queries on an sarray succinct data structure. + */ +class sarray_rank +{ + public: + sarray_rank(const std::string& prefix, const sarray& sarr); + + uint64_t rank(uint64_t i) const; + + uint64_t size() const; + + private: + const sarray* sarray_; + const darray0 high_bit_zeroes_; +}; + +/** + * Query class for select queries on an sarray succinct data structure. + */ +class sarray_select +{ + public: + sarray_select(const std::string& prefix, const sarray& sarr); + + uint64_t select(uint64_t i) const; + + uint64_t size() const; + + private: + const sarray* sarray_; + const darray1 high_bit_ones_; +}; + +/** + * + * A builder for the sarray succinct data structure from Okanohara and + * Sadakane for answering rank queries on sparse bit arrays. + * + * @see http://arxiv.org/abs/cs/0610001 + * + * Constructs an sarray over the given positions, writing files out to + * the folder denoted by prefix. The positions must be sorted and must + * be <= total_bits. 
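The sarray is essentially an Elias-Fano encoding: each sorted position is split into low bits stored verbatim and high bits stored in unary (the darray then answers select queries over the high part). A sketch of the split for a single value (the names here are illustrative, not the library's internals):

```cpp
#include <cstdint>
#include <utility>

// split one position into (high, low): the low `low_bits` bits are stored
// verbatim; the high part is unary-coded into the high bit vector.
// Assumes 0 < low_bits < 64.
std::pair<uint64_t, uint64_t> ef_split(uint64_t position, uint8_t low_bits)
{
    const uint64_t low_mask = (uint64_t{1} << low_bits) - 1;
    return {position >> low_bits, position & low_mask};
}
```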
+ */ +template +sarray make_sarray(const std::string& prefix, ForwardIterator begin, + ForwardIterator end, uint64_t total_bits) +{ + { + filesystem::make_directory(prefix); + auto num_ones = static_cast(std::distance(begin, end)); + sarray_builder builder{prefix, num_ones, total_bits}; + + for (; begin != end; ++begin) + builder(*begin); + } + return {prefix}; +} +} +} +#endif diff --git a/include/meta/util/aligned_allocator.h b/include/meta/util/aligned_allocator.h index 7c2cfe5c9..3df18612a 100644 --- a/include/meta/util/aligned_allocator.h +++ b/include/meta/util/aligned_allocator.h @@ -21,23 +21,13 @@ #elif META_HAS_ALIGNED_MALLOC #include "meta/util/aligned_alloc_msvc.h" #endif +#include "meta/math/integer.h" namespace meta { namespace util { -namespace detail -{ -template -Integer idiv_ceil(Integer num, Integer denom) -{ - // this should be 1 instruction on most architectures since the div - // instruction also returns the remainder - return (num / denom) + (num % denom != 0); -} -} - template struct aligned_allocator { @@ -64,8 +54,8 @@ struct aligned_allocator // determine adjusted size // ::aligned_alloc requires the size to be an integer multiple of // the requested alignment - auto size - = alignment_size * detail::idiv_ceil(n * sizeof(T), alignment_size); + auto size = alignment_size + * math::integer::div_ceil(n * sizeof(T), alignment_size); auto ptr = static_cast(detail::aligned_alloc(alignment_size, size)); if (!ptr && n > 0) diff --git a/include/meta/util/array_view.h b/include/meta/util/array_view.h index 7b906035b..9f4a11b69 100644 --- a/include/meta/util/array_view.h +++ b/include/meta/util/array_view.h @@ -26,6 +26,14 @@ template class array_view { public: + /** + * Constructs an empty array view. + */ + array_view() : start_{nullptr}, end_{nullptr} + { + // nothing + } + /** * Constructs an array view starting at the given starting point of * the specified length. @@ -120,7 +128,7 @@ class array_view */ std::size_t size() const { - return end_ - start_; + return static_cast(end_ - start_); } private: diff --git a/include/meta/util/disk_vector.h b/include/meta/util/disk_vector.h index 5dec9aa10..2a7747705 100644 --- a/include/meta/util/disk_vector.h +++ b/include/meta/util/disk_vector.h @@ -104,139 +104,29 @@ class disk_vector */ uint64_t size() const; + using iterator = T*; + using const_iterator = const T*; + /** - * Provides iterator functionality for the disk_vector class. + * @return an iterator to the beginning of this container */ - class iterator : public std::iterator - { - /// Need to access disk_vector representation - friend disk_vector; - - private: - /// The current index this iterator is at - uint64_t idx_; - - /// The current element this iterator is at - T* data_; - - /** - * Constructor for disk_vector to use. - * @param idx The index to start out at - * @param data The data element to initially contain - */ - iterator(uint64_t idx, T* data) : idx_{idx}, data_{data} - { - /* nothing */ - } - - public: - /// Constructor. - iterator() : idx_{0}, data_{nullptr} - { - /* nothing */ - } - - /// Copy constructor. - iterator(const iterator& other) : idx_{other.idx_}, data_{other.data_} - { - /* nothing */ - } - - /// assignment operator. - iterator& operator=(iterator other) - { - std::swap(*this, other); - return *this; - } - - /// Pre-increment. - iterator& operator++() - { - ++idx_; - return *this; - } - - /// Post-increment. - iterator operator++(int) - { - iterator save{*this}; - ++idx_; - return save; - } - - /// Pre-decrement. 
- iterator& operator--() - { - --idx_; - return *this; - } - - /// Post-decrement. - iterator operator--(int) - { - iterator save{*this}; - --idx_; - return *this; - } - - /// Equality. - bool operator==(const iterator& other) - { - return other.idx_ == idx_ && other.data_ == data_; - } - - /// Inequality. - bool operator!=(const iterator& other) - { - return !(*this == other); - } - - /// Dereference operator. - T& operator*() - { - return data_[idx_]; - } - - /// Arrow operator. - const T* operator->() - { - return &data_[idx_]; - } - - /// Operator<. - bool operator<(const iterator& other) const - { - return idx_ < other.idx_; - } - - /// Operator>. - bool operator>(const iterator& other) const - { - return idx_ > other.idx_; - } - - /// Operator<=. - bool operator<=(const iterator& other) const - { - return idx_ <= other.idx_; - } - - /// Operator>=. - bool operator>=(const iterator& other) const - { - return idx_ >= other.idx_; - } - }; + iterator begin(); /** - * @return an iterator to the beginning of this container + * @return an iterator to the beginning of this container (const + * version) */ - iterator begin() const; + const_iterator begin() const; /** * @return an iterator to the end of this container */ - iterator end() const; + iterator end(); + + /** + * @return an iterator to the end of this container (const version) + */ + const_iterator end() const; private: /// the path to the file this disk_vector uses for storage diff --git a/include/meta/util/disk_vector.tcc b/include/meta/util/disk_vector.tcc index a58ceaba9..656e01e68 100644 --- a/include/meta/util/disk_vector.tcc +++ b/include/meta/util/disk_vector.tcc @@ -129,15 +129,27 @@ uint64_t disk_vector::size() const } template -typename disk_vector::iterator disk_vector::begin() const +auto disk_vector::begin() -> iterator { - return iterator{0, start_}; + return start_; } template -typename disk_vector::iterator disk_vector::end() const +auto disk_vector::begin() const -> const_iterator { - return iterator{size_, start_}; + return start_; +} + +template +auto disk_vector::end() const -> const_iterator +{ + return start_ + size_; +} + +template +auto disk_vector::end() -> iterator +{ + return start_ + size_; } } } diff --git a/include/meta/util/multiway_merge.h b/include/meta/util/multiway_merge.h index 20905879c..f9c58ff31 100644 --- a/include/meta/util/multiway_merge.h +++ b/include/meta/util/multiway_merge.h @@ -31,11 +31,11 @@ namespace util * * - Record: * A Record must represent the atomic items that are to be merged. They - * are comparable via operator< and operator==, and must have a member - * function merge_with(Record&&). During the merging process, Records - * will be read from the individual chunks (via + * must have a member function merge_with(Record&&). During the merging + * process, Records will be read from the individual chunks (via * ChunkIterator::operator++), merge_with will be called across all - * Records across all chunks that compare equal, and the final merged + * Records across all chunks that should merge according to the + * predicate specified (defaulting to operator==), and the final merged * Record will be passed to the write callback. * * - ForwardIterator: @@ -67,12 +67,26 @@ namespace util * iterator shall compare equal to the default-constructed * ChunkIterator. * + * - Compare: + * A simple comparison function to be used for sorting the records. + * Defaults to operator<. 
+ * + * - ShouldMerge: + * A binary function that returns true if the two records given to it + * as arguments should be merged together via Record::merge_with(). + * Defaults to operator==. + * + * - RecordHandler: + * A unary function that is called once per every unique Record after + * merging. + * * @return the total number of unique Records that were written to the * OutputStream */ - -template +template uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, + Compare&& record_comp, ShouldMerge&& should_merge, RecordHandler&& output) { using ChunkIterator = typename ForwardIterator::value_type; @@ -96,9 +110,9 @@ uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, for (; begin != end; ++begin) to_merge.emplace_back(*begin); - auto chunk_iter_comp = [](const ChunkIterator& a, const ChunkIterator& b) + auto chunk_iter_comp = [&](const ChunkIterator& a, const ChunkIterator& b) { - return *a < *b; + return record_comp(*a, *b); }; uint64_t unique_records = 0; @@ -121,10 +135,13 @@ uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, ++range.first; std::for_each(range.first, range.second, [&](ChunkIterator& iter) { - merged.merge_with(std::move(*iter)); - auto before = iter.bytes_read(); - ++iter; - total_read += (iter.bytes_read() - before); + if (should_merge(merged, *iter)) + { + merged.merge_with(std::move(*iter)); + auto before = iter.bytes_read(); + ++iter; + total_read += (iter.bytes_read() - before); + } }); // write out merged record @@ -141,6 +158,28 @@ uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, return unique_records; } + +/** + * A simplified wrapper for multiway_merge that uses the default comparison + * (operator<) and merge criteria (operator==). + */ +template +uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, + RecordHandler&& output) +{ + using Record = typename std::remove_reference::type; + + auto record_comp = [](const Record& a, const Record& b) + { + return a < b; + }; + auto record_equal = [](const Record& a, const Record& b) + { + return a == b; + }; + return multiway_merge(begin, end, record_comp, record_equal, + std::forward(output)); +} } } #endif diff --git a/include/meta/util/string_view.h b/include/meta/util/string_view.h index 5f352a784..8058bf72b 100644 --- a/include/meta/util/string_view.h +++ b/include/meta/util/string_view.h @@ -288,7 +288,7 @@ class basic_string_view = std::search(begin() + pos, end(), s.begin(), s.end(), Traits::eq); if (it == end()) return npos; - return std::distance(begin(), it); + return static_cast(std::distance(begin(), it)); } constexpr size_type find(Char c, size_type pos = 0) const noexcept @@ -350,7 +350,7 @@ class basic_string_view Traits::eq); if (it == end()) return npos; - return std::distance(begin(), it); + return static_cast(std::distance(begin(), it)); } constexpr size_type find_first_of(Char c, size_type pos = 0) const noexcept @@ -407,14 +407,19 @@ class basic_string_view if (pos >= size()) return npos; - auto it = std::find_if( - begin(), end(), [&](const_reference c) - { - return std::find(s.begin(), s.end(), c, Traits::eq) == s.end(); - }); + auto it + = std::find_if(begin(), end(), [&](const_reference c) + { + return std::find_if(s.begin(), s.end(), + [&](const_reference sc) + { + return Traits::eq(c, sc); + }) + == s.end(); + }); if (it == end()) return npos; - return std::distance(begin(), it); + return static_cast(std::distance(begin(), it)); } constexpr size_type find_first_not_of(Char c, size_type pos = 0) const @@ -442,11 
+447,16 @@ class basic_string_view return npos; auto diff = size() - std::min(size(), pos); - auto it = std::find_if( - rbegin() + diff, rend(), [&](const_reference c) - { - return std::find(s.begin(), s.end(), c, Traits::eq) == s.end(); - }); + auto it + = std::find_if(rbegin() + diff, rend(), [&](const_reference c) + { + return std::find_if(s.begin(), s.end(), + [&](const_reference sc) + { + return Traits::eq(c, sc); + }) + == s.end(); + }); if (it == rend()) return npos; return size() - 1 - std::distance(rbegin(), it); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 374cbbaf7..4a5d73b8e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,7 @@ add_subdirectory(parser) add_subdirectory(regression) add_subdirectory(sequence) add_subdirectory(stats) +add_subdirectory(succinct) add_subdirectory(tools) add_subdirectory(topics) add_subdirectory(utf) diff --git a/src/corpus/metadata_parser.cpp b/src/corpus/metadata_parser.cpp index f19a9ceff..e10e64772 100644 --- a/src/corpus/metadata_parser.cpp +++ b/src/corpus/metadata_parser.cpp @@ -17,7 +17,7 @@ namespace corpus metadata_parser::metadata_parser(const std::string& filename, metadata::schema schema) - : infile_{make_unique(filename)}, schema_{std::move(schema)} + : infile_{filename}, schema_{std::move(schema)} { // nothing } @@ -26,14 +26,14 @@ std::vector metadata_parser::next() { std::vector mdata; std::string str; - if (*infile_) + if (infile_) { - std::getline(*infile_, str); + std::getline(infile_.stream(), str); util::string_view line{str}; mdata.reserve(schema_.size()); for (const auto& finfo : schema_) { - if (!*infile_ || line.empty()) + if (!infile_ || line.empty()) throw metadata_exception{ "metadata input file ended prematurely"}; diff --git a/src/graph/tools/CMakeLists.txt b/src/graph/tools/CMakeLists.txt index 2ae4d7df6..208de12ac 100644 --- a/src/graph/tools/CMakeLists.txt +++ b/src/graph/tools/CMakeLists.txt @@ -1,2 +1,6 @@ add_executable(graph-test graph_test.cpp) target_link_libraries(graph-test meta-util ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(wiki-page-rank wiki_page_rank.cpp) +target_link_libraries(wiki-page-rank cpptoml meta-io meta-util + ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/graph/tools/wiki_page_rank.cpp b/src/graph/tools/wiki_page_rank.cpp new file mode 100644 index 000000000..c886b690f --- /dev/null +++ b/src/graph/tools/wiki_page_rank.cpp @@ -0,0 +1,112 @@ +/** + * @file wiki_page_rank.cpp + * @author Sean Massung + * + * Demo for PageRank and Personalized PageRank. + * For input files and format, @see http://haselgrove.id.au/wikipedia.htm + */ + +#include "cpptoml.h" +#include "meta/graph/algorithms/algorithms.h" +#include "meta/logging/logger.h" +#include "meta/io/filesystem.h" + +using namespace meta; + +template +void print_results(const DirectedGraph& g, const ResultList& res, + uint64_t top_k) +{ + for (uint64_t idx = 0; idx < top_k && idx < g.size(); ++idx) + { + std::cout << " " << (idx + 1) << ". " << g.node(res[idx].first).label + << " " << res[idx].second << std::endl; + } +} + +/** + * Parses the Wikipedia links files and creates a directed graph with nodes + * labeled as Wikipedia article titles. 
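The links file pairs each 1-based source id with its outgoing neighbors, one line per source (e.g. a line like "42: 7 19 23"; the sample is made up). A sketch of parsing one such line the same way create_network does:

```cpp
#include <sstream>
#include <string>
#include <utility>
#include <vector>

std::pair<unsigned long, std::vector<unsigned long>>
parse_links_line(const std::string& line)
{
    std::stringstream ss{line};
    std::string tok;
    ss >> tok;
    tok.pop_back();                 // drop the trailing ':' on the source id
    auto src = std::stoul(tok) - 1; // file ids are 1-based, node_ids 0-based
    std::vector<unsigned long> dests;
    while (ss >> tok)
        dests.push_back(std::stoul(tok) - 1);
    return {src, dests};
}
```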
+ * For input files and format, @see http://haselgrove.id.au/wikipedia.htm + */ +graph::directed_graph<> create_network(const cpptoml::table& config) +{ + auto titles_path = config.get_as("wiki-titles"); + if (!titles_path) + throw std::runtime_error{"wiki-titles param needed in config"}; + + auto links_path = config.get_as("wiki-links"); + if (!links_path) + throw std::runtime_error{"wiki-links param needed in config"}; + + auto num_nodes = filesystem::num_lines(*titles_path); + if (num_nodes == 0) + throw std::runtime_error{"wiki-titles file was empty"}; + + graph::directed_graph<> network; + printing::progress prog{" > Creating graph ", num_nodes}; + std::string line; + std::ifstream titles_in{*titles_path}; + while (std::getline(titles_in, line)) + network.insert(graph::default_node{line}); + + uint64_t idx = 0; + std::ifstream links_in{*links_path}; + while (std::getline(links_in, line)) + { + std::stringstream ss{line}; + std::string node_str; + ss >> node_str; + node_str.pop_back(); + auto src = node_id{std::stoul(node_str) - 1}; + while (ss >> node_str) + { + auto dest = node_id{std::stoul(node_str) - 1}; + network.add_edge(src, dest); + } + prog(++idx); + } + prog.end(); + + return network; +} + +int main(int argc, char* argv[]) +{ + if (argc < 2) + { + std::cerr << "Usage: " << argv[0] << " config.toml" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + auto network = create_network(*config); + uint64_t top_k = 25; + + // First, run regular PageRank + auto ranks = graph::algorithms::page_rank_centrality(network, 0.85, {}, 50); + print_results(network, ranks, top_k); + + // Some example queries, where the id is the titles line # starting from 0 + const auto centers = { + node_id{1153141}, // Computer_science + node_id{679246}, // Bill_Gates + node_id{5315048}, // University_of_Illinois_at_Urbana-Champaign + node_id{3975552}, // Pizza + node_id{623970} // Beer + }; + + // Then, run a Personalized PageRank simulation for some pages + for (const auto& center : centers) + { + std::cout << "Personalized PageRank for \"" + << network.node(center).label << "\"" << std::endl; + stats::multinomial dist; + dist.increment(center, 1); + auto ranks + = graph::algorithms::page_rank_centrality(network, 0.85, dist, 50); + print_results(network, ranks, top_k); + } +} diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp index 2de8cd1d9..d59f34cb0 100644 --- a/src/index/disk_index.cpp +++ b/src/index/disk_index.cpp @@ -4,6 +4,7 @@ */ #include +#include #include "meta/index/disk_index.h" #include "meta/index/disk_index_impl.h" @@ -55,11 +56,15 @@ label_id disk_index::lbl_id(doc_id d_id) const label_id disk_index::id(class_label label) const { + if (!impl_->label_ids_.contains_key(label)) + throw std::out_of_range{"Invalid class_label: " + std::string(label)}; return impl_->label_ids_.get_value(label); } class_label disk_index::class_label_from_id(label_id l_id) const { + if (!impl_->label_ids_.contains_value(l_id)) + throw std::out_of_range{"Invalid label_id: " + std::to_string(l_id)}; return impl_->label_ids_.get_key(l_id); } diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 80cbddf1f..fe49619be 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -45,16 +45,15 @@ class forward_index::impl /** * Constructs an implementation based on a forward_index. 
*/ - impl(forward_index* idx); + impl(forward_index* idx, const cpptoml::table& config); /** * Tokenizes the documents in the corpus in parallel, yielding * num_threads number of forward_index chunks that then need to be * merged. */ - void tokenize_docs(corpus::corpus& corpus, - const analyzers::analyzer& analyzer, - metadata_writer& mdata_writer, uint64_t ram_budget); + void tokenize_docs(corpus::corpus& corpus, metadata_writer& mdata_writer, + uint64_t ram_budget); /** * Merges together num_chunks number of intermediate chunks, using the @@ -90,7 +89,7 @@ class forward_index::impl * @return whether this index will be based off of a single * libsvm-formatted corpus file */ - bool is_libsvm_format(const cpptoml::table& config) const; + bool is_libsvm_analyzer(const cpptoml::table& config) const; /** * Compresses the postings file created by uninverting. @@ -105,6 +104,9 @@ class forward_index::impl */ void load_postings(); + /// The analyzer used to tokenize documents (nullptr if libsvm). + std::unique_ptr analyzer_; + /// the total number of unique terms if term_id_mapping_ is unused uint64_t total_unique_terms_; @@ -120,14 +122,16 @@ class forward_index::impl forward_index::forward_index(const cpptoml::table& config) : disk_index{config, *config.get_as("forward-index")}, - fwd_impl_{this} + fwd_impl_{this, config} { /* nothing */ } -forward_index::impl::impl(forward_index* idx) : idx_{idx} +forward_index::impl::impl(forward_index* idx, const cpptoml::table& config) + : idx_{idx} { - /* nothing */ + if (!is_libsvm_analyzer(config)) + analyzer_ = analyzers::load(config); } forward_index::forward_index(forward_index&&) = default; @@ -181,7 +185,7 @@ void forward_index::load_index() impl_->load_labels(); auto config = cpptoml::parse_file(index_name() + "/config.toml"); - if (!fwd_impl_->is_libsvm_format(*config)) + if (!fwd_impl_->is_libsvm_analyzer(*config)) impl_->load_term_id_mapping(); impl_->load_label_id_mapping(); @@ -201,8 +205,14 @@ void forward_index::create_index(const cpptoml::table& config, // if the corpus is a single libsvm formatted file, then we are done; // otherwise, we will create an inverted index and the uninvert it - if (fwd_impl_->is_libsvm_format(config)) + if (fwd_impl_->is_libsvm_analyzer(config)) { + // double check that the corpus is libsvm-corpus + if (!dynamic_cast(&docs)) + throw forward_index_exception{"both analyzer and corpus type must " + "be libsvm in order to use libsvm " + "formatted data"}; + LOG(info) << "Creating index from libsvm data: " << index_name() << ENDLG; @@ -236,15 +246,13 @@ void forward_index::create_index(const cpptoml::table& config, { LOG(info) << "Creating forward index: " << index_name() << ENDLG; - auto analyzer = analyzers::load(config); - metadata_writer mdata_writer{index_name(), docs.size(), docs.schema()}; impl_->load_labels(docs.size()); // RAM budget is given in MB - fwd_impl_->tokenize_docs(docs, *analyzer, mdata_writer, + fwd_impl_->tokenize_docs(docs, mdata_writer, ram_budget * 1024 * 1024); impl_->load_term_id_mapping(); impl_->save_label_id_mapping(); @@ -270,7 +278,6 @@ void forward_index::create_index(const cpptoml::table& config, } void forward_index::impl::tokenize_docs(corpus::corpus& docs, - const analyzers::analyzer& ana, metadata_writer& mdata_writer, uint64_t ram_budget) { @@ -286,7 +293,7 @@ void forward_index::impl::tokenize_docs(corpus::corpus& docs, std::ofstream chunk{idx_->index_name() + "/chunk-" + std::to_string(chunk_id), std::ios::binary}; - auto analyzer = ana.clone(); + auto analyzer = 
analyzer_->clone(); while (true) { util::optional doc; @@ -501,32 +508,8 @@ void forward_index::impl::create_uninverted_metadata(const std::string& name) idx_->index_name() + idx_->impl_->files[file]); } -bool forward_index::impl::is_libsvm_format(const cpptoml::table& config) const +bool forward_index::impl::is_libsvm_analyzer(const cpptoml::table& config) const { - auto prefix = config.get_as("prefix"); - auto dset = config.get_as("dataset"); - auto corp = config.get_as("corpus"); - - if (!prefix || !dset || !corp) - throw forward_index_exception{"failed to determine corpus type"}; - - auto corp_filename = *prefix + "/" + *dset + "/" + *corp; - if (!filesystem::file_exists(corp_filename)) - { - throw forward_index_exception{"corpus configuration file (" - + corp_filename + ") not present"}; - } - - auto corpus_config = cpptoml::parse_file(corp_filename); - auto type = corpus_config->get_as("type"); - - if (!type) - { - throw forward_index_exception{ - "'type' key not present in corpus configuration file " - + corp_filename}; - } - auto analyzers = config.get_table_array("analyzers")->get(); if (analyzers.size() != 1) return false; @@ -535,15 +518,24 @@ bool forward_index::impl::is_libsvm_format(const cpptoml::table& config) const if (!method) throw forward_index_exception{"failed to find analyzer method"}; - if (*method == "libsvm" && *type == corpus::libsvm_corpus::id) - return true; + return *method == "libsvm"; +} - if (*method == "libsvm" || *type == corpus::libsvm_corpus::id) - throw forward_index_exception{"both analyzer and corpus type must be " - "libsvm in order to use libsvm formatted " - "data"}; +learn::feature_vector forward_index::tokenize(const corpus::document& doc) +{ + if (!fwd_impl_->analyzer_) + throw exception{"this forward index type can't analyze docs"}; + + learn::feature_vector f_vec; + auto map = fwd_impl_->analyzer_->analyze(doc); + for (auto& pr : map) + { + auto t_id = get_term_id(pr.key()); + if (t_id != unique_terms()) // if known feature, add it + f_vec[t_id] = pr.value(); + } - return false; + return f_vec; } uint64_t forward_index::unique_terms() const diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 5352c3539..0d1f8bd16 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -52,11 +52,13 @@ class inverted_index::impl * @param mdata_parser The parser for reading metadata * @param mdata_writer The writer for metadata * @param ram_budget The total **estimated** RAM budget + * @param num_threads The number of threads to tokenize and index docs with * @return the number of chunks created */ void tokenize_docs(corpus::corpus& docs, postings_inverter& inverter, - metadata_writer& mdata_writer, uint64_t ram_budget); + metadata_writer& mdata_writer, uint64_t ram_budget, + uint64_t num_threads); /** * Compresses the large postings file. 
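The new `forward_index::tokenize` member above makes it possible to featurize ad-hoc documents against an existing index. A usage sketch (the config path and document text are illustrative; the index must have been built with a non-libsvm analyzer):

```cpp
#include "cpptoml.h"
#include "meta/corpus/document.h"
#include "meta/index/forward_index.h"
#include "meta/index/make_index.h"

int main()
{
    using namespace meta;
    auto config = cpptoml::parse_file("config.toml");
    auto fidx = index::make_index<index::forward_index>(*config);

    corpus::document doc;
    doc.content("an unseen document to featurize");

    // terms the index has never seen are dropped, per the
    // get_term_id(...) != unique_terms() check above
    auto weights = fidx->tokenize(doc);
    return 0;
}
```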
@@ -126,6 +128,17 @@ void inverted_index::create_index(const cpptoml::table& config,
     auto max_writers = static_cast<unsigned>(
         config.get_as<int64_t>("indexer-max-writers").value_or(8));

+    auto max_threads = std::thread::hardware_concurrency();
+    auto num_threads = static_cast<uint64_t>(
+        config.get_as<int64_t>("indexer-num-threads").value_or(max_threads));
+    if (num_threads > max_threads)
+    {
+        num_threads = max_threads;
+        LOG(warning) << "Reducing indexer-num-threads to the hardware "
+                        "concurrency level of "
+                     << max_threads << ENDLG;
+    }
+
     postings_inverter<inverted_index> inverter{index_name(), max_writers};
     {
         metadata_writer mdata_writer{index_name(), docs.size(), docs.schema()};
@@ -134,7 +147,7 @@ void inverted_index::create_index(const cpptoml::table& config,

         // RAM budget is given in megabytes
         inv_impl_->tokenize_docs(docs, inverter, mdata_writer,
-                                 ram_budget * 1024 * 1024);
+                                 ram_budget * 1024 * 1024, num_threads);
     }

     inverter.merge_chunks();
@@ -173,7 +186,7 @@ void inverted_index::load_index()

 void inverted_index::impl::tokenize_docs(
     corpus::corpus& docs, postings_inverter<inverted_index>& inverter,
-    metadata_writer& mdata_writer, uint64_t ram_budget)
+    metadata_writer& mdata_writer, uint64_t ram_budget, uint64_t num_threads)
 {
     std::mutex mutex;
     printing::progress progress{" > Tokenizing Docs: ", docs.size()};
@@ -221,9 +234,8 @@ void inverted_index::impl::tokenize_docs(
         }
     };

-    parallel::thread_pool pool;
+    parallel::thread_pool pool{num_threads};
     std::vector<std::future<void>> futures;
-    auto num_threads = pool.thread_ids().size();
     for (size_t i = 0; i < num_threads; ++i)
     {
         futures.emplace_back(
diff --git a/src/index/ranker/absolute_discount.cpp b/src/index/ranker/absolute_discount.cpp
index 99f0f00a8..d48ea4426 100644
--- a/src/index/ranker/absolute_discount.cpp
+++ b/src/index/ranker/absolute_discount.cpp
@@ -49,11 +49,13 @@ float absolute_discount::doc_constant(const score_data& sd) const

 template <>
 std::unique_ptr<ranker>
-    make_ranker<absolute_discount>(const cpptoml::table& config)
+make_ranker<absolute_discount>(const cpptoml::table& config)
 {
-    if (auto delta = config.get_as<double>("delta"))
-        return make_unique<absolute_discount>(*delta);
-    return make_unique<absolute_discount>();
+    auto delta = config.get_as<double>("delta")
+                     .value_or(absolute_discount::default_delta);
+    if (delta < 0 || delta > 1)
+        throw ranker_exception{"absolute-discount delta must be on [0,1]"};
+    return make_unique<absolute_discount>(delta);
 }
 }
 }
diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp
index 02f3cfc09..43a4852c3 100644
--- a/src/index/ranker/dirichlet_prior.cpp
+++ b/src/index/ranker/dirichlet_prior.cpp
@@ -16,7 +16,7 @@ const util::string_view dirichlet_prior::id = "dirichlet-prior";

 dirichlet_prior::dirichlet_prior(float mu) : mu_{mu}
 {
-    /* nothing */
+    // nothing
 }

 dirichlet_prior::dirichlet_prior(std::istream& in)
@@ -49,9 +49,10 @@ template <>
 std::unique_ptr<ranker>
 make_ranker<dirichlet_prior>(const cpptoml::table& config)
 {
-    if (auto mu = config.get_as<double>("mu"))
-        return make_unique<dirichlet_prior>(*mu);
-    return make_unique<dirichlet_prior>();
+    auto mu = config.get_as<double>("mu").value_or(dirichlet_prior::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-prior mu must be >= 0"};
+    return make_unique<dirichlet_prior>(mu);
 }
 }
 }
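The ranker factories touched by this patch all converge on one shape: read the parameter with `value_or(default)`, range-check it, then construct. A toy, self-contained sketch of that validate-with-default pattern (the types below are illustrative stand-ins, not MeTA's cpptoml or ranker API):

```cpp
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

// illustrative stand-in for a cpptoml-style table
struct toy_table
{
    std::map<std::string, double> values;

    // returns the configured value, or dflt if the key is absent
    double get_or(const std::string& key, double dflt) const
    {
        auto it = values.find(key);
        return it == values.end() ? dflt : it->second;
    }
};

struct toy_ranker
{
    explicit toy_ranker(double param) : param_{param} {}
    double param_;
};

std::unique_ptr<toy_ranker> make_toy_ranker(const toy_table& config)
{
    // read with a default, then range-check before constructing
    auto lambda = config.get_or("lambda", 0.7);
    if (lambda < 0 || lambda > 1)
        throw std::invalid_argument{"lambda must be on [0,1]"};
    return std::make_unique<toy_ranker>(lambda);
}
```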
diff --git a/src/index/ranker/jelinek_mercer.cpp b/src/index/ranker/jelinek_mercer.cpp
index 51aa2d72b..200752d5f 100644
--- a/src/index/ranker/jelinek_mercer.cpp
+++ b/src/index/ranker/jelinek_mercer.cpp
@@ -49,9 +49,11 @@ template <>
 std::unique_ptr<ranker>
 make_ranker<jelinek_mercer>(const cpptoml::table& config)
 {
-    if (auto lambda = config.get_as<double>("lambda"))
-        return make_unique<jelinek_mercer>(*lambda);
-    return make_unique<jelinek_mercer>();
+    auto lambda = config.get_as<double>("lambda")
+                      .value_or(jelinek_mercer::default_lambda);
+    if (lambda < 0 || lambda > 1)
+        throw ranker_exception{"jelinek-mercer lambda must be on [0,1]"};
+    return make_unique<jelinek_mercer>(lambda);
 }
 }
 }
diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp
index 6bd61ebda..cf2b127c2 100644
--- a/src/index/ranker/okapi_bm25.cpp
+++ b/src/index/ranker/okapi_bm25.cpp
@@ -44,7 +44,7 @@ void okapi_bm25::save(std::ostream& out) const

 float okapi_bm25::score_one(const score_data& sd)
 {
-    float doc_len = sd.idx.doc_size(sd.d_id);
+    float doc_len = sd.doc_size;

     // add 1.0 to the IDF to ensure that the result is positive
     float IDF = fastapprox::fastlog(
@@ -67,6 +67,15 @@ std::unique_ptr<ranker> make_ranker<okapi_bm25>(const cpptoml::table& config)
     auto b = config.get_as<double>("b").value_or(okapi_bm25::default_b);
     auto k3 = config.get_as<double>("k3").value_or(okapi_bm25::default_k3);

+    if (k1 < 0)
+        throw ranker_exception{"bm25 k1 must be >= 0"};
+
+    if (k3 < 0)
+        throw ranker_exception{"bm25 k3 must be >= 0"};
+
+    if (b < 0 || b > 1)
+        throw ranker_exception{"bm25 b must be on [0,1]"};
+
     return make_unique<okapi_bm25>(k1, b, k3);
 }
 }
diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp
index 521630b7a..134a59740 100644
--- a/src/index/ranker/pivoted_length.cpp
+++ b/src/index/ranker/pivoted_length.cpp
@@ -36,7 +36,7 @@ void pivoted_length::save(std::ostream& out) const

 float pivoted_length::score_one(const score_data& sd)
 {
-    float doc_len = sd.idx.doc_size(sd.d_id);
+    float doc_len = sd.doc_size;
     float TF = 1.0f + fastapprox::fastlog(
                           1.0f + fastapprox::fastlog(sd.doc_term_count));
     float norm = (1.0f - s_) + s_ * (doc_len / sd.avg_dl);
@@ -50,6 +50,8 @@ std::unique_ptr<ranker> make_ranker<pivoted_length>(const cpptoml::table& config)
 {
     auto s = config.get_as<double>("s").value_or(pivoted_length::default_s);
+    if (s < 0 || s > 1)
+        throw ranker_exception{"pivoted-length s must be on [0,1]"};
     return make_unique<pivoted_length>(s);
 }
 }
diff --git a/src/index/string_list_writer.cpp b/src/index/string_list_writer.cpp
index 36518c275..d51aec87c 100644
--- a/src/index/string_list_writer.cpp
+++ b/src/index/string_list_writer.cpp
@@ -5,9 +5,6 @@

 #include "meta/io/binary.h"
 #include "meta/index/string_list_writer.h"
-#if !META_HAS_STREAM_MOVE
-#include "meta/util/shim.h"
-#endif

 namespace meta
 {
@@ -15,7 +12,7 @@ namespace index
 {

 string_list_writer::string_list_writer(const std::string& path, uint64_t size)
-    : string_file_{make_file(path)},
+    : string_file_{path},
       write_pos_{0},
       index_{path + "_index", size}
 {
@@ -45,7 +42,7 @@ void string_list_writer::insert(uint64_t idx, const std::string& elem)
 {
     std::lock_guard<std::mutex> lock{mutex_};
     index_[idx] = write_pos_;
-    io::write_binary(file(), elem);
+    io::write_binary(string_file_, elem);
     write_pos_ += elem.length() + 1;
 }
 }
diff --git a/src/io/filesystem.cpp b/src/io/filesystem.cpp
index 1a0f26e20..8458b5fb2 100644
--- a/src/io/filesystem.cpp
+++ b/src/io/filesystem.cpp
@@ -25,8 +25,7 @@ namespace meta
 namespace filesystem
 {

-#if !defined META_HAS_EXPERIMENTAL_FILESYSTEM \
-    && !defined META_HAS_TR2_SYS_FILESYSTEM
+#ifndef META_HAS_EXPERIMENTAL_FILESYSTEM
 namespace
 {
 using traits = platformstl::filesystem_traits<char>;
@@ -53,6 +52,11 @@ bool file_exists(const std::string& filename)
     return traits::file_exists(filename.c_str());
 }

+bool exists(const std::string& filename)
+{
+    return file_exists(filename);
+}
+
 uint64_t file_size(const std::string& filename)
 {
     if (!file_exists(filename))
@@ -111,16 +115,12 @@ std::uintmax_t remove_all(const std::string& path)
 {
     return remove_all(path_type{path.c_str()});
 }
-#else // filesystem
namespace exists, somewhere -#if META_HAS_EXPERIMENTAL_FILESYSTEM -using fs = std::experimental::filesystem; -#elif META_HAS_TR2_SYS_FILESYSTEM -using fs = std::tr2::sys::filesystem; -#endif +#else +namespace fs = std::experimental::filesystem; bool delete_file(const std::string& filename) { - return fs::remove(filename); + return fs::exists(filename) && fs::remove(filename); } void rename_file(const std::string& old_name, const std::string& new_name) @@ -138,6 +138,11 @@ bool file_exists(const std::string& filename) return fs::exists(filename); } +bool exists(const std::string& filename) +{ + return fs::exists(filename); +} + uint64_t file_size(const std::string& filename) { if (!file_exists(filename)) @@ -147,7 +152,22 @@ uint64_t file_size(const std::string& filename) std::uintmax_t remove_all(const std::string& path) { + if (!fs::exists(path)) + return 0; +#if META_HAS_EXPERIMENTAL_FILESYSTEM_REMOVE_ALL return fs::remove_all(path); +#else + // fs::remove_all doesn't properly recurse on directories, so we get + // to... + std::uintmax_t count = 1; + if (fs::is_directory(path)) + { + for (fs::directory_iterator d{path}, end; d != end; ++d) + count += meta::filesystem::remove_all(d->path()); + } + fs::remove(path); + return count; +#endif } #endif diff --git a/src/io/libsvm_parser.cpp b/src/io/libsvm_parser.cpp index b979f1653..a27d5f4c5 100644 --- a/src/io/libsvm_parser.cpp +++ b/src/io/libsvm_parser.cpp @@ -3,9 +3,10 @@ * @author Sean Massung */ -#include +#include #include "meta/io/libsvm_parser.h" +#include "meta/util/string_view.h" namespace meta { @@ -25,45 +26,66 @@ class_label label(const std::string& text) return class_label{text.substr(0, space)}; } +void throw_exception(const std::string& text) +{ + throw libsvm_parser_exception{"incorrectly formatted libsvm data: " + text}; +} + counts_t counts(const std::string& text, bool contains_label /* = true */) { - std::stringstream stream{text}; - std::string token; + util::string_view sv{text}; if (contains_label) { - if (!(stream >> token)) // ignore class label, but check that it's there - throw libsvm_parser_exception{ - "incorrectly formatted libsvm data: " + text}; + auto pos = sv.find_first_of(" \t"); + if (pos == std::string::npos || pos == 0) + throw_exception(text); + sv = sv.substr(pos); } + auto consume_whitespace = [&]() { + auto pos = sv.find_first_not_of(" \t"); + if (pos != sv.npos) + sv = sv.substr(pos); + else + sv = util::string_view{}; // .clear() doesn't exist for GCC... 
+    };
+
+    consume_whitespace();
+
     std::vector<std::pair<term_id, double>> counts;
-    term_id term;
-    double count;
-    while (stream >> token)
+    while (!sv.empty())
     {
-        size_t colon = token.find_first_of(':');
-        if (colon == std::string::npos || colon == 0 || colon == token.size()
-                                                                 - 1)
-            throw libsvm_parser_exception{"incorrectly formatted libsvm data: "
-                                          + text};
-
-        std::istringstream term_stream{token.substr(0, colon)};
-        term_stream >> term;
-        std::istringstream double_stream{token.substr(colon + 1)};
-        double_stream >> count;
-
-        // make sure double conversion worked and used the entire string
-        if (double_stream.fail() || !double_stream.eof())
-            throw libsvm_parser_exception{"incorrectly formatted libsvm data: "
-                                          + text};
+        auto whitespace = sv.find_first_of(" \t");
+        auto token = sv.substr(0, whitespace);
+
+        if (token.empty())
+            throw_exception("empty token: " + token.to_string());
+
+        auto colon = token.find_first_of(":");
+        if (colon == std::string::npos || colon == 0 || colon == token.size() - 1)
+            throw_exception("no colon in token: " + token.to_string());
+
+        char* end = nullptr;
+        auto term = std::strtoul(token.data(), nullptr, 0);
+        double count = std::strtod(token.substr(colon + 1).data(), &end);
+
+        if (end != token.data() + token.size())
+            throw_exception("full token not consumed: " + token.to_string());

         if (term == 0)
             throw libsvm_parser_exception{"term id was 0 from libsvm format"};

         // liblinear has term_ids start at 1 instead of 0 like MeTA and libsvm
-        term_id minus_term{static_cast<uint64_t>(term) - 1};
+        term_id minus_term{term - 1};
         counts.emplace_back(minus_term, count);
+
+        if (whitespace == std::string::npos)
+            break;
+
+        sv = sv.substr(whitespace);
+        consume_whitespace();
     }

     return counts;
diff --git a/src/succinct/CMakeLists.txt b/src/succinct/CMakeLists.txt
new file mode 100644
index 000000000..029981c2f
--- /dev/null
+++ b/src/succinct/CMakeLists.txt
@@ -0,0 +1,4 @@
+project(meta-succinct)
+
+add_library(meta-succinct compressed_vector.cpp bit_vector.cpp sarray.cpp)
+target_link_libraries(meta-succinct meta-io)
diff --git a/src/succinct/bit_vector.cpp b/src/succinct/bit_vector.cpp
new file mode 100644
index 000000000..6aa4f6a28
--- /dev/null
+++ b/src/succinct/bit_vector.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file bit_vector.cpp
+ * @author Chase Geigle
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */ + +#include "meta/math/integer.h" +#include "meta/succinct/bit_vector.h" +#include "meta/util/likely.h" + +namespace meta +{ +namespace succinct +{ + +bit_vector_view::bit_vector_view(util::array_view data, + uint64_t num_bits) + : data_{data}, num_bits_{num_bits} +{ + if (META_UNLIKELY(math::integer::div_ceil(num_bits_, 64) > data.size())) + throw std::out_of_range{"not enough bits in array view to " + "construct a bit vector view of the " + "requested length"}; +} + +bool bit_vector_view::operator[](uint64_t bit_idx) const +{ + auto word_pos = bit_idx / 64; + auto bit_pos = bit_idx % 64; + + return (data_[word_pos] >> bit_pos) & 1ull; +} + +uint64_t bit_vector_view::extract(uint64_t bit_idx, uint8_t len) const +{ + if (META_UNLIKELY(len > 64)) + throw std::invalid_argument{"bit length longer than word"}; + + auto word_pos = bit_idx / 64; + auto bit_pos = bit_idx % 64; + + uint64_t bits = 0; + if (64 - bit_pos >= len) + { + // one word contains all we need + bits |= (data_[word_pos] >> bit_pos); + } + else + { + // combine the high bits of the current word with the low bits + // of the next word + bits |= (data_[word_pos] >> bit_pos); + bits |= (data_[word_pos + 1] << (64 - bit_pos)); + } + + // mask off only the bits we need + auto mask = len == 64 ? static_cast(-1) : (1ull << len) - 1; + bits &= mask; + + return bits; +} + +util::array_view bit_vector_view::data() const +{ + return data_; +} + +uint64_t bit_vector_view::size() const +{ + return num_bits_; +} +} +} diff --git a/src/succinct/compressed_vector.cpp b/src/succinct/compressed_vector.cpp new file mode 100644 index 000000000..5f755477c --- /dev/null +++ b/src/succinct/compressed_vector.cpp @@ -0,0 +1,39 @@ +/** + * @file compressed_vector.cpp + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#include "meta/succinct/compressed_vector.h" + +namespace meta +{ +namespace succinct +{ + +compressed_vector::compressed_vector(const std::string& prefix) + : numbers_{prefix + "/compressed-vec.bin"}, + positions_{prefix + "/sarray"}, + select_{prefix + "/sarray", positions_} +{ + // nothing +} + +uint64_t compressed_vector::operator[](uint64_t i) const +{ + bit_vector_view num_bvv{{numbers_.begin(), numbers_.end()}, + 64 * numbers_.size()}; + auto start = select_.select(i); + auto end = select_.select(i + 1); + return num_bvv.extract(start, static_cast(end - start)); +} + +uint64_t compressed_vector::size() const +{ + return select_.size() - 1; +} +} +} diff --git a/src/succinct/sarray.cpp b/src/succinct/sarray.cpp new file mode 100644 index 000000000..725e8ae3c --- /dev/null +++ b/src/succinct/sarray.cpp @@ -0,0 +1,158 @@ +/** + * @file sarray.cpp + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#include "meta/succinct/sarray.h" +#include "meta/util/likely.h" + +namespace meta +{ +namespace succinct +{ + +using namespace sarray_detail; + +sarray_builder::sarray_builder(const std::string& prefix, uint64_t num_ones, + uint64_t num_bits) + : low_stream_{low_file(prefix), std::ios::binary}, + high_stream_{high_file(prefix), std::ios::binary}, + nb_stream_{num_bits_file(prefix), std::ios::binary}, + num_ones_{num_ones} +{ + auto ratio = num_bits / num_ones; + low_bits_ = ratio ? 
static_cast(broadword::msb(ratio)) : 0; + low_mask_ = (1ull << low_bits_) - 1; + + low_builder_ + = make_unique(make_bit_vector_builder(low_stream_)); + high_builder_ + = make_unique(make_bit_vector_builder(high_stream_)); +} + +void sarray_builder::operator()(uint64_t one_pos) +{ + if (low_bits_) + { + low_builder_->write_bits({one_pos & low_mask_, low_bits_}); + } + + // determine the next bit position to set in the upper bit + // array + uint64_t upper_bit_pos = (one_pos >> low_bits_) + num_calls_; + uint64_t word_idx = upper_bit_pos / 64; + uint64_t word_pos = upper_bit_pos % 64; + + // write full words until we're at the correct word index + for (; high_word_idx_ < word_idx; ++high_word_idx_) + { + high_builder_->write_bits({curr_high_word_, 64}); + curr_high_word_ = 0; + } + + // set the correct bit in the current word + curr_high_word_ |= 1ull << word_pos; + high_word_pos_ = word_pos + 1; + + ++num_calls_; + if (META_UNLIKELY(num_calls_ > num_ones_)) + throw std::out_of_range{ + "more positions given than bits in sarray building"}; +} + +sarray_builder::~sarray_builder() +{ + high_builder_->write_bits( + {curr_high_word_, static_cast(high_word_pos_)}); + + high_builder_ = nullptr; + low_builder_ = nullptr; + + if (!low_bits_) + io::write_binary(low_stream_, uint64_t{0}); + + io::packed::write(nb_stream_, 64 * high_word_idx_ + high_word_pos_); + io::packed::write(nb_stream_, low_bits_); +} + +sarray::sarray(const std::string& prefix) + : high_bits_{high_file(prefix)}, low_bits_{low_file(prefix)} +{ + std::ifstream num_bits{num_bits_file(prefix), std::ios::binary}; + io::packed::read(num_bits, high_bit_count_); + io::packed::read(num_bits, num_low_bits_); +} + +bit_vector_view sarray::high_bits() const +{ + return {{high_bits_.begin(), high_bits_.end()}, high_bit_count_}; +} + +bit_vector_view sarray::low_bits() const +{ + return {{low_bits_.begin(), low_bits_.end()}, 64 * low_bits_.size()}; +} + +uint8_t sarray::num_low_bits() const +{ + return num_low_bits_; +} + +sarray_rank::sarray_rank(const std::string& prefix, const sarray& sarr) + : sarray_{&sarr}, high_bit_zeroes_{prefix + "/rank", sarr.high_bits()} +{ + // nothing +} + +/// @see https://github.com/ot/succinct/blob/master/elias_fano.hpp +uint64_t sarray_rank::rank(uint64_t i) const +{ + auto num_low_bits = sarray_->num_low_bits(); + uint64_t high_query + = std::min(i >> num_low_bits, high_bit_zeroes_.num_positions()); + + uint64_t high_pos = high_bit_zeroes_.select(high_query); + uint64_t rank = high_pos - high_query; + + auto high_bvv = sarray_->high_bits(); + auto low_bvv = sarray_->low_bits(); + uint64_t low_val = i & ((1ull << num_low_bits) - 1); + while (high_pos > 0 && high_bvv[high_pos - 1] + && low_bvv.extract((rank - 1) * num_low_bits, num_low_bits) + >= low_val) + { + --rank; + --high_pos; + } + + return rank; +} + +uint64_t sarray_rank::size() const +{ + return sarray_->high_bits().size() - high_bit_zeroes_.num_positions() + 1; +} + +sarray_select::sarray_select(const std::string& prefix, const sarray& sarr) + : sarray_{&sarr}, high_bit_ones_{prefix + "/select", sarr.high_bits()} +{ + // nothing +} + +uint64_t sarray_select::select(uint64_t i) const +{ + uint8_t num_low_bits = sarray_->num_low_bits(); + return (high_bit_ones_.select(i) - i) << num_low_bits + | sarray_->low_bits().extract(i * num_low_bits, num_low_bits); +} + +uint64_t sarray_select::size() const +{ + return high_bit_ones_.num_positions(); +} +} +} diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index cda480512..d59ae8448 100644 --- 
a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -6,3 +6,6 @@ target_link_libraries(profile meta-index add_executable(top-k top_k.cpp) target_link_libraries(top-k meta-index) + +add_executable(mph-vocab mph_vocab.cpp) +target_link_libraries(mph-vocab meta-io meta-util meta-succinct) diff --git a/src/tools/mph_vocab.cpp b/src/tools/mph_vocab.cpp new file mode 100644 index 000000000..15936494d --- /dev/null +++ b/src/tools/mph_vocab.cpp @@ -0,0 +1,68 @@ +/** + * @file mph_vocab.cpp + * @author Chase Geigle + */ + +#include "meta/hashing/perfect_hash.h" +#include "meta/hashing/perfect_hash_builder.h" + +int main(int argc, char** argv) +{ + using namespace meta; + + logging::set_cerr_logging(); + + if (argc != 2) + { + std::cerr << "Usage: " << argv[0] << " file.txt" << std::endl; + return 1; + } + + using mph_builder = hashing::perfect_hash_builder; + using options_type = mph_builder::options; + + options_type options; + options.prefix = "hashed-vocab"; + options.num_keys = filesystem::num_lines(argv[1]); + + mph_builder builder{options}; + + { + std::ifstream input{argv[1]}; + std::string line; + + while (std::getline(input, line)) + builder(line); + } + + builder.write(); + + hashing::perfect_hash mph{"hashed-vocab"}; + std::ifstream input{argv[1]}; + std::string line; + + std::vector vocab(options.num_keys); + while (std::getline(input, line)) + { + auto id = mph(line); + if (!vocab[id].empty()) + { + std::cerr << "Collision: " << line << " and " << vocab[id] + << std::endl; + return 1; + } + vocab[id] = line; + std::cout << line << " -> " << id << "\n"; + } + + for (std::size_t id = 0; id < vocab.size(); ++id) + { + if (vocab[id].empty()) + { + std::cerr << "Unused term id: " << id << std::endl; + return 1; + } + } + + return 0; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bb17ae871..c340954c0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -39,4 +39,5 @@ target_link_libraries(unit-test meta-index meta-parser meta-features meta-language-model + meta-succinct meta-topics) diff --git a/tests/bit_vector_test.cpp b/tests/bit_vector_test.cpp new file mode 100644 index 000000000..2751da492 --- /dev/null +++ b/tests/bit_vector_test.cpp @@ -0,0 +1,123 @@ +/** + * @file bit_vector_test.cpp + * @author Chase Geigle + */ + +#include + +#include "bandit/bandit.h" +#include "meta/io/filesystem.h" +#include "meta/math/integer.h" +#include "meta/succinct/bit_vector.h" +#include "meta/util/disk_vector.h" + +using namespace bandit; + +go_bandit([]() { + using namespace meta; + using namespace succinct; + + describe("[bit vector]", []() { + std::string filename = "bit-vector-test.bin"; + it("should build files of the correct size", [&]() { + { + std::ofstream output{filename, std::ios::binary}; + + auto builder = make_bit_vector_builder(output); + + auto all_ones = static_cast(-1); + builder.write_bits({all_ones, 16}); + builder.write_bits({all_ones, 64}); + + AssertThat(builder.total_bits(), + Equals(static_cast(16 + 64))); + } + + AssertThat(filesystem::file_size(filename), + Equals(sizeof(uint64_t) * 2)); + filesystem::delete_file(filename); + }); + + it("should correctly extract single bits", [&]() { + auto alternating_ones + = static_cast(0xaaaaaaaaaaaaaaaaULL); + + auto sizes = {16, 8, 64, 2, 16, 32, 4}; + uint64_t total_size + = std::accumulate(sizes.begin(), sizes.end(), 0ull); + { + std::ofstream output{filename, std::ios::binary}; + + auto builder = make_bit_vector_builder(output); + + for (const auto& size : sizes) + builder.write_bits( + 
{alternating_ones, static_cast(size)}); + + AssertThat(builder.total_bits(), Equals(total_size)); + } + + AssertThat(filesystem::file_size(filename), + Equals(sizeof(uint64_t) + * math::integer::div_ceil(total_size, 64))); + + { + util::disk_vector storage{filename}; + util::array_view av{storage.begin(), + storage.end()}; + bit_vector_view bvv{av, total_size}; + + for (std::size_t i = 0; i < total_size; ++i) { + auto bit = bvv[i]; + if (i % 2 == 0) { + AssertThat(bit, Equals(0)); + } else { + AssertThat(bit, Equals(1)); + } + } + } + + filesystem::delete_file(filename); + }); + + it("should correctly extract multi-bit patterns", [&]() { + uint64_t deadbeef = 0xdeadbeefULL; + auto sizes = {32, 16, 64, 38, 32, 64, 8, 1, 2, 3, 7, 9}; + uint64_t total_size + = std::accumulate(sizes.begin(), sizes.end(), 0ull); + { + std::ofstream output{filename, std::ios::binary}; + + auto builder = make_bit_vector_builder(output); + for (const auto& size : sizes) + builder.write_bits({deadbeef, static_cast(size)}); + + AssertThat(builder.total_bits(), Equals(total_size)); + } + + AssertThat(filesystem::file_size(filename), + Equals(sizeof(uint64_t) + * math::integer::div_ceil(total_size, 64))); + { + util::disk_vector storage{filename}; + util::array_view av{storage.begin(), + storage.end()}; + + bit_vector_view bvv{av, total_size}; + + uint64_t pos = 0; + for (const auto& sze : sizes) { + auto size = static_cast(sze); + auto result = bvv.extract(pos, size); + + auto mask = size == 64 ? static_cast(-1) + : (1ull << size) - 1; + AssertThat(result, Equals(deadbeef & mask)); + + pos += size; + } + } + filesystem::delete_file(filename); + }); + }); +}); diff --git a/tests/classifier_test.cpp b/tests/classifier_test.cpp index 7fafab0cc..9d3db31df 100644 --- a/tests/classifier_test.cpp +++ b/tests/classifier_test.cpp @@ -169,6 +169,11 @@ void run_tests(const std::string& index_type) { cfg->insert("method", winnow::id.to_string()); check_split(f_idx, *cfg, 0.86); }); + + it("should run CV using an even class split", [&]() { + check_cv(f_idx, *hinge_sgd_cfg, 0.70, true); + check_cv(f_idx, *perc_sgd_cfg, 0.70, true); + }); }); describe("[classifier] SVM wrapper", [&]() { diff --git a/tests/classifier_test_helper.h b/tests/classifier_test_helper.h index 14da72c08..d8262e0e0 100644 --- a/tests/classifier_test_helper.h +++ b/tests/classifier_test_helper.h @@ -25,24 +25,26 @@ namespace tests { template ::type, cpptoml::table>::value>::type> -inline void check_cv(Index& idx, Creator&& creator, double min_accuracy) { +inline void check_cv(Index& idx, Creator&& creator, double min_accuracy, + bool even_split = false) { using namespace classify; multiclass_dataset dataset{idx}; multiclass_dataset_view mcdv{dataset, std::mt19937_64{47}}; - auto mtx = cross_validate(std::forward(creator), mcdv, 5); + auto mtx + = cross_validate(std::forward(creator), mcdv, 5, even_split); AssertThat(mtx.accuracy(), Is().GreaterThan(min_accuracy).And().LessThan(100.0)); } template inline void check_cv(Index& idx, const cpptoml::table& config, - double min_accuracy) { + double min_accuracy, bool even_split = false) { using namespace classify; check_cv(idx, [&](multiclass_dataset_view docs) { return make_classifier(config, std::move(docs)); - }, min_accuracy); + }, min_accuracy, even_split); } using creation_fn = std::function( @@ -124,8 +126,6 @@ inline void run_save_load_single(std::shared_ptr idx, } filesystem::remove_all("save-load-model"); } - - } } #endif diff --git a/tests/compressed_vector_test.cpp b/tests/compressed_vector_test.cpp new 
file mode 100644 index 000000000..efe63b87d --- /dev/null +++ b/tests/compressed_vector_test.cpp @@ -0,0 +1,41 @@ +/** + * @file compressed_vector_test.cpp + * @author Chase Geigle + */ + +#include + +#include "bandit/bandit.h" +#include "meta/io/filesystem.h" +#include "meta/util/random.h" +#include "meta/succinct/compressed_vector.h" + +using namespace bandit; + +go_bandit([]() { + using namespace meta; + using namespace succinct; + + describe("[compressed vector]", []() { + std::mt19937 rng{47}; + std::vector values(1000000); + std::generate(values.begin(), values.end(), [&]() { + return random::bounded_rand(rng, 65537); + }); + + filesystem::remove_all("compressed-vector-unit-test"); + + succinct::make_compressed_vector("compressed-vector-unit-test", + values.begin(), values.end()); + + compressed_vector cv{"compressed-vector-unit-test"}; + it("should report the correct size", [&]() { + AssertThat(cv.size(), Equals(values.size())); + }); + + it("should retrieve correct values", [&]() { + for (std::size_t i = 0; i < values.size(); ++i) + AssertThat(cv[i], Equals(values[i])); + }); + }); +}); diff --git a/tests/darray_test.cpp b/tests/darray_test.cpp new file mode 100644 index 000000000..290592e27 --- /dev/null +++ b/tests/darray_test.cpp @@ -0,0 +1,198 @@ +/** + * @file darray_test.cpp + * @author Chase Geigle + */ + +#include "bandit/bandit.h" +#include "meta/io/filesystem.h" +#include "meta/math/integer.h" +#include "meta/succinct/darray.h" + +using namespace bandit; + +go_bandit([]() { + using namespace meta; + using namespace succinct; + + describe("[darray]", []() { + filesystem::remove_all("darray-unit-test"); + + it("should correctly locate one bits in small blocks", []() { + auto alternating_ones + = static_cast(0xaaaaaaaaaaaaaaaaULL); + + std::vector sizes(128000, 64); + uint64_t total_size + = std::accumulate(sizes.begin(), sizes.end(), 0ull); + + std::vector storage; + storage.reserve(math::integer::div_ceil(total_size, 64)); + + { + auto builder = make_bit_vector_builder( + [&](uint64_t word) { storage.push_back(word); }); + + for (const auto& size : sizes) + builder.write_bits( + {alternating_ones, static_cast(size)}); + } + AssertThat(storage.size(), + Equals(math::integer::div_ceil(total_size, 64))); + + bit_vector_view bvv{{storage}, total_size}; + + darray1 ones{"darray-unit-test", bvv}; + + AssertThat(ones.num_positions(), Equals(total_size / 2)); + + // there is a one in every other position + for (std::size_t i = 0; i < total_size / 2; ++i) { + AssertThat(ones.select(i), Equals(i * 2 + 1)); + } + }); + + it("should correctly locate one bits in oddly-sized vector", []() { + uint64_t deadbeef = 0xdeadbeefULL; + auto sizes = {32, 16, 64, 38, 32, 64, 8, 1, 2, 3, 7, 9}; + uint64_t total_size + = std::accumulate(sizes.begin(), sizes.end(), 0ull); + + std::vector storage; + { + auto builder = make_bit_vector_builder( + [&](uint64_t word) { storage.push_back(word); }); + + for (const auto& size : sizes) + builder.write_bits({deadbeef, static_cast(size)}); + } + + bit_vector_view bvv{{storage}, total_size}; + + { + filesystem::remove_all("darray-unit-test"); + darray1 ones{"darray-unit-test", bvv}; + + uint64_t rank_pos = 0; + for (uint64_t i = 0; i < total_size; ++i) { + if (bvv[i]) { + AssertThat(ones.select(rank_pos), Equals(i)); + ++rank_pos; + } + } + } + }); + + it("should correctly locate zero bits in oddly-sized vector", []() { + uint64_t deadbeef = 0xdeadbeefULL; + auto sizes = {32, 16, 64, 38, 32, 64, 8, 1, 2, 3, 7, 9}; + uint64_t total_size + = 
std::accumulate(sizes.begin(), sizes.end(), 0ull); + + std::vector storage; + { + auto builder = make_bit_vector_builder( + [&](uint64_t word) { storage.push_back(word); }); + + for (const auto& size : sizes) + builder.write_bits({deadbeef, static_cast(size)}); + } + + bit_vector_view bvv{{storage}, total_size}; + + { + filesystem::remove_all("darray-unit-test"); + darray0 zeroes{"darray-unit-test", bvv}; + + uint64_t rank_pos = 0; + for (uint64_t i = 0; i < total_size; ++i) { + if (!bvv[i]) { + AssertThat(zeroes.select(rank_pos), Equals(i)); + ++rank_pos; + } + } + } + }); + + it("should correctly locate one bits in large blocks", []() { + auto sparse_pattern = 1ULL; + auto zero_pattern = 0ULL; + + std::vector sizes(128000, 64); + uint64_t total_size + = std::accumulate(sizes.begin(), sizes.end(), 0ull); + + std::vector storage; + storage.reserve(math::integer::div_ceil(total_size, 64)); + + { + auto builder = make_bit_vector_builder( + [&](uint64_t word) { storage.push_back(word); }); + + for (std::size_t i = 0; i < sizes.size(); ++i) { + if (i % 2 == 0) { + builder.write_bits( + {sparse_pattern, static_cast(sizes[i])}); + } else { + builder.write_bits( + {zero_pattern, static_cast(sizes[i])}); + } + } + } + AssertThat(storage.size(), + Equals(math::integer::div_ceil(total_size, 64))); + + bit_vector_view bvv{{storage}, total_size}; + + filesystem::remove_all("darray-unit-test"); + darray1 ones{"darray-unit-test", bvv}; + AssertThat(ones.num_positions(), Equals(uint64_t{64000})); + for (uint64_t i = 0; i < 64000; ++i) { + AssertThat(ones.select(i), Equals(128 * i)); + } + }); + + it("should locate one bits in a random bit vector", []() { + std::vector storage(128000); + + std::mt19937_64 rng{47}; + std::generate(storage.begin(), storage.end(), + [&]() { return rng(); }); + + bit_vector_view bvv{{storage}, 128000 * 64}; + + filesystem::remove_all("darray-unit-test"); + darray1 ones{"darray-unit-test", bvv}; + + uint64_t rank_idx = 0; + for (uint64_t i = 0; i < 128000 * 64; ++i) { + if (bvv[i]) { + auto pos = ones.select(rank_idx); + AssertThat(pos, Equals(i)); + ++rank_idx; + } + } + }); + + it("should locate zero bits in a random bit vector", []() { + std::vector storage(128000); + + std::mt19937_64 rng{47}; + std::generate(storage.begin(), storage.end(), + [&]() { return rng(); }); + + bit_vector_view bvv{{storage}, 128000 * 64}; + + filesystem::remove_all("darray-unit-test"); + darray0 zeroes{"darray-unit-test", bvv}; + + uint64_t rank_idx = 0; + for (uint64_t i = 0; i < 128000 * 64; ++i) { + if (!bvv[i]) { + auto pos = zeroes.select(rank_idx); + AssertThat(pos, Equals(i)); + ++rank_idx; + } + } + }); + }); +}); diff --git a/tests/forward_index_test.cpp b/tests/forward_index_test.cpp index 2a20fc8cf..8a5954e0a 100644 --- a/tests/forward_index_test.cpp +++ b/tests/forward_index_test.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "bandit/bandit.h" #include "create_config.h" @@ -70,6 +71,33 @@ void check_ceeaus_expected_fwd(Index& idx) { // make sure there's exactly the correct amount AssertThat(id, Equals(idx.num_docs())); + + // make sure we have all the class label info + std::unordered_set label_ids; + label_ids.insert(idx.id(class_label{"japanese"})); + label_ids.insert(idx.id(class_label{"chinese"})); + label_ids.insert(idx.id(class_label{"english"})); + AssertThat(label_ids.find(label_id{1}), + Is().Not().EqualTo(label_ids.end())); + AssertThat(label_ids.find(label_id{2}), + Is().Not().EqualTo(label_ids.end())); + AssertThat(label_ids.find(label_id{3}), + 
Is().Not().EqualTo(label_ids.end())); + + std::unordered_set labels; + labels.insert(idx.class_label_from_id(label_id{1})); + labels.insert(idx.class_label_from_id(label_id{2})); + labels.insert(idx.class_label_from_id(label_id{3})); + AssertThat(labels.find(class_label{"japanese"}), + Is().Not().EqualTo(labels.end())); + AssertThat(labels.find(class_label{"chinese"}), + Is().Not().EqualTo(labels.end())); + AssertThat(labels.find(class_label{"english"}), + Is().Not().EqualTo(labels.end())); + + AssertThrows(std::out_of_range, idx.id(class_label{"bogus"})); + AssertThrows(std::out_of_range, idx.class_label_from_id(label_id{0})); + AssertThrows(std::out_of_range, idx.class_label_from_id(label_id{4})); } template @@ -155,6 +183,30 @@ go_bandit([]() { line_cfg->insert("uninvert", true); ceeaus_forward_test(*line_cfg); }); + + it("should analyze a new document with the current analyzer", [&]() { + auto cfg = tests::create_config("line"); + auto idx = index::make_index(*cfg); + std::string text{"I think smoking smoking bad."}; + corpus::document doc; + doc.content(text); + auto fvector = idx->tokenize(doc); + + auto begin_sent = idx->get_term_id(""); + auto end_sent = idx->get_term_id(""); + auto bad = idx->get_term_id("bad"); + auto smoke = idx->get_term_id("smoke"); + auto think = idx->get_term_id("think"); + + AssertThat(fvector.at(begin_sent), Equals(1)); + AssertThat(fvector.at(end_sent), Equals(1)); + AssertThat(fvector.at(bad), Equals(1)); + AssertThat(fvector.at(smoke), Equals(2)); + AssertThat(fvector.at(think), Equals(1)); + + auto oov = idx->get_term_id("somelongrandomword"); + AssertThat(fvector.at(oov), Equals(0)); + }); }); describe("[forward-index] from svm config", []() { @@ -166,6 +218,15 @@ go_bandit([]() { }); it("should load the index", [&]() { bcancer_forward_test(*svm_cfg); }); + + it("should not tokenize new docs", [&](){ + auto cfg = create_libsvm_config(); + auto idx = index::make_index(*cfg); + std::string text{"This should fail"}; + corpus::document doc; + doc.content(text); + AssertThrows(index::forward_index_exception, idx->tokenize(doc)); + }); }); describe("[forward-index] with zlib", []() { diff --git a/tests/graph_test.cpp b/tests/graph_test.cpp index 00ff06035..65553da6b 100644 --- a/tests/graph_test.cpp +++ b/tests/graph_test.cpp @@ -58,12 +58,18 @@ go_bandit([]() { g.add_edge(a, c); g.add_edge(a, d); check_sizes(g, 4, 3); + AssertThat(g.adjacent(a).size(), Equals(3ul)); + AssertThat(g.adjacent(b).size(), Equals(1ul)); + AssertThat(g.adjacent(c).size(), Equals(1ul)); + AssertThat(g.adjacent(d).size(), Equals(1ul)); AssertThat(algorithms::clustering_coefficient(g, a), EqualsWithDelta(0.0, delta)); AssertThat(algorithms::neighborhood_overlap(g, a, b), EqualsWithDelta(0.0, delta)); g.add_edge(c, d); + AssertThat(g.adjacent(c).size(), Equals(2ul)); + AssertThat(g.adjacent(d).size(), Equals(2ul)); check_sizes(g, 4, 4); AssertThat(algorithms::clustering_coefficient(g, a), EqualsWithDelta(1.0 / 3, delta)); @@ -103,9 +109,20 @@ go_bandit([]() { g.add_edge(a, c); g.add_edge(a, d); check_sizes(g, 4, 3); + AssertThat(g.adjacent(a).size(), Equals(3ul)); + AssertThat(g.adjacent(b).size(), Equals(0ul)); + AssertThat(g.adjacent(c).size(), Equals(0ul)); + AssertThat(g.adjacent(d).size(), Equals(0ul)); + AssertThat(g.incoming(a).size(), Equals(0ul)); + AssertThat(g.incoming(b).size(), Equals(1ul)); + AssertThat(g.incoming(c).size(), Equals(1ul)); + AssertThat(g.incoming(d).size(), Equals(1ul)); g.add_edge(c, d); check_sizes(g, 4, 4); + AssertThat(g.adjacent(c).size(), 
Equals(1ul)); + AssertThat(g.adjacent(d).size(), Equals(0ul)); + AssertThat(g.incoming(d).size(), Equals(2ul)); g.add_edge(d, c); // directed, so a different edge than (c, d) check_sizes(g, 4, 5); diff --git a/tests/perfect_hash_test.cpp b/tests/perfect_hash_test.cpp new file mode 100644 index 000000000..71e64a544 --- /dev/null +++ b/tests/perfect_hash_test.cpp @@ -0,0 +1,67 @@ +/** + * @file perfect_hash_test.cpp + * @author Chase Geigle + */ + +#include + +#include "bandit/bandit.h" +#include "meta/hashing/perfect_hash.h" +#include "meta/hashing/perfect_hash_builder.h" +#include "meta/io/filesystem.h" + +using namespace bandit; + +go_bandit([]() { + using namespace meta; + using namespace hashing; + + describe("[perfect hash]", []() { + + using mph_builder = hashing::perfect_hash_builder; + using options_type = mph_builder::options; + + it("should generate minimum perfect hash functions on strings", []() { + filesystem::remove_all("perfect-hash-unit-test"); + + options_type options; + options.prefix = "perfect-hash-unit-test"; + options.num_keys + = filesystem::num_lines("../data/lemur-stopwords.txt"); + options.max_ram = 1024 * 1024; // 1MB + + { + + mph_builder builder{options}; + + std::ifstream input{"../data/lemur-stopwords.txt"}; + std::string line; + while (std::getline(input, line)) + builder(line); + + builder.write(); + } + + { + hashing::perfect_hash mph{ + "perfect-hash-unit-test"}; + + std::vector vocab(options.num_keys); + + std::ifstream input{"../data/lemur-stopwords.txt"}; + std::string line; + while (std::getline(input, line)) { + auto id = mph(line); + AssertThat(vocab[id].empty(), Is().True()); + vocab[id] = line; + } + + for (std::size_t id = 0; id < vocab.size(); ++id) { + AssertThat(vocab[id].empty(), Is().False()); + } + } + + filesystem::remove_all("perfect-hash-unit-test"); + }); + }); +}); diff --git a/tests/sarray_test.cpp b/tests/sarray_test.cpp new file mode 100644 index 000000000..f21524e91 --- /dev/null +++ b/tests/sarray_test.cpp @@ -0,0 +1,105 @@ +/** + * @file sarray_test.cpp + * @author Chase Geigle + */ + +#include "bandit/bandit.h" +#include "meta/io/filesystem.h" +#include "meta/succinct/sarray.h" + +using namespace bandit; + +go_bandit([]() { + describe("[sarray]", []() { + using namespace meta; + using namespace succinct; + + it("should give correct rank results (very sparse)", []() { + + std::vector positions{100, 200, 222, + 1024, 10000, 1331337}; + uint64_t num_bits = 2000000; + + filesystem::remove_all("sarray-unit-test"); + auto storage = make_sarray("sarray-unit-test", positions.begin(), + positions.end(), num_bits); + sarray_rank ranks{"sarray-unit-test", storage}; + + AssertThat(ranks.size(), Equals(positions.size())); + + uint64_t start = 0; + for (uint64_t rank = 0; rank < positions.size(); ++rank) { + for (; start <= positions[rank]; ++start) { + AssertThat(ranks.rank(start), Equals(rank)); + } + } + + for (; start < num_bits; ++start) + AssertThat(ranks.rank(start), Equals(positions.size())); + }); + + it("should give correct select results (very sparse)", []() { + + std::vector positions{100, 200, 222, + 1024, 10000, 1331337}; + uint64_t num_bits = 2000000; + + filesystem::remove_all("sarray-unit-test"); + auto storage = make_sarray("sarray-unit-test", positions.begin(), + positions.end(), num_bits); + sarray_select select{"sarray-unit-test", storage}; + + AssertThat(select.size(), Equals(positions.size())); + + uint64_t i = 0; + for (const auto& pos : positions) + AssertThat(select.select(i++), Equals(pos)); + }); + + it("should give 
correct rank results (less sparse)", []() { + uint64_t num_bits = 2000000; + uint64_t stride = 100000; + std::vector positions; + positions.reserve(num_bits / stride); + for (uint64_t b = 0; b < num_bits; b += stride) + positions.push_back(b); + + filesystem::remove_all("sarray-unit-test"); + auto storage = make_sarray("sarray-unit-test", positions.begin(), + positions.end(), num_bits); + sarray_rank ranks{"sarray-unit-test", storage}; + + AssertThat(ranks.size(), Equals(positions.size())); + + uint64_t start = 0; + for (uint64_t rank = 0; rank < positions.size(); ++rank) { + for (; start <= positions[rank]; ++start) { + AssertThat(ranks.rank(start), Equals(rank)); + } + } + + for (; start < num_bits; ++start) + AssertThat(ranks.rank(start), Equals(positions.size())); + }); + + it("should give correct select results (less sparse)", []() { + uint64_t num_bits = 2000000; + uint64_t stride = 100000; + std::vector positions; + positions.reserve(num_bits / stride); + for (uint64_t b = 0; b < num_bits; b += stride) + positions.push_back(b); + + filesystem::remove_all("sarray-unit-test"); + auto storage = make_sarray("sarray-unit-test", positions.begin(), + positions.end(), num_bits); + sarray_select select{"sarray-unit-test", storage}; + + AssertThat(select.size(), Equals(positions.size())); + + uint64_t i = 0; + for (const auto& pos : positions) + AssertThat(select.select(i++), Equals(pos)); + }); + }); +}); diff --git a/travis/HandleOutOfTreeLLVM.patch b/travis/HandleOutOfTreeLLVM.patch deleted file mode 100644 index 5d597bfb4..000000000 --- a/travis/HandleOutOfTreeLLVM.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- HandleOutOfTreeLLVM.cmake.orig 2015-08-01 20:53:23.716932808 -0500 -+++ HandleOutOfTreeLLVM.cmake 2015-08-01 20:52:34.760265353 -0500 -@@ -35,7 +35,7 @@ - set(LLVM_INCLUDE_DIR ${INCLUDE_DIR} CACHE PATH "Path to llvm/include") - set(LLVM_BINARY_DIR ${LLVM_OBJ_ROOT} CACHE PATH "Path to LLVM build tree") - set(LLVM_MAIN_SRC_DIR ${MAIN_SRC_DIR} CACHE PATH "Path to LLVM source tree") -- set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm") -+ set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm" CACHE PATH "Path to LLVM cmake modules") - else() - set(LLVM_FOUND OFF) - return() diff --git a/travis/install_libcxx.sh b/travis/install_libcxx.sh index 4e922cbc6..4edd3c619 100755 --- a/travis/install_libcxx.sh +++ b/travis/install_libcxx.sh @@ -1,35 +1,15 @@ #!/bin/bash set -v cwd=$(pwd) +svn co --quiet http://llvm.org/svn/llvm-project/llvm/trunk llvm +cd llvm/projects svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx -git clone https://github.com/pathscale/libcxxrt.git libcxxrt -cd libcxxrt +svn co --quiet http://llvm.org/svn/llvm-project/libcxxabi/trunk libcxxabi +cd ../ mkdir build cd build -cmake -DCMAKE_BUILD_TYPE=Release .. 
-make -cp lib/libcxxrt.so $HOME/lib -ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1 -ln -sf $HOME/lib/libcxxrt.so $HOME/lib/libcxxrt.so.1.0 -cd $cwd -cd libcxx -cd cmake/Modules -# HORRIBLE TERRIBLE NO GOOD VERY BAD -# hack the HandleOutOfTreeLLVM.cmake module file to allow us to actually -# specify a cmake path -patch -u HandleOutOfTreeLLVM.cmake $cwd/travis/HandleOutOfTreeLLVM.patch -cd ../../ -mkdir build -cd build -cmake -DLIBCXX_CXX_ABI=libcxxrt \ - -DLIBCXX_CXX_ABI_INCLUDE_PATHS="../../libcxxrt/src" \ - -DLIBCXX_CXX_ABI_LIBRARY_PATH=$HOME/lib \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$HOME \ - -DLLVM_CONFIG=/usr/bin/llvm-config-3.6 \ - -DLLVM_CMAKE_PATH=/usr/share/llvm-3.6/cmake \ - .. -make -make install +cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME ../ +make cxx +make install-libcxx install-libcxxabi cd $cwd set +v