diff --git a/.appveyor.yml b/.appveyor.yml index b280f1d86..35251ffc6 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -5,16 +5,20 @@ os: Visual Studio 2015 install: - set PATH=C:\msys64\usr\bin;%PATH% + - set MSYSTEM=MINGW64 - bash -lc "" - bash -lc "pacman --noconfirm --needed -Sy bash pacman pacman-mirrors msys2-runtime msys2-runtime-devel" # we don't actually need ada, fortran, libgfortran, or objc, but in # order to update gcc we need to also update those packages as well... - bash -lc "pacman --noconfirm -S mingw-w64-x86_64-{gcc,gcc-ada,gcc-fortran,gcc-libgfortran,gcc-objc,cmake,make,icu,jemalloc,zlib}" before_build: + - set MSYSTEM=MINGW64 - cd C:\projects\meta - git submodule update --init --recursive - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER && mkdir build && cd build && cmake .. -G \"MSYS Makefiles\"" build_script: - - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && make" + - set MSYSTEM=MINGW64 + - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && make -j2" test_script: + - set MSYSTEM=MINGW64 - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && cp ../config.toml . && ./unit-test --reporter=spec" diff --git a/.travis.yml b/.travis.yml index 071c06c40..412cff95b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,10 @@ language: cpp sudo: false +cache: + directories: + deps/icu + addons: apt: packages: &default-packages @@ -49,6 +53,18 @@ matrix: - gcc-5 - g++-5 + # Linux/GCC 6 + - os: linux + env: COMPILER=gcc GCC_VERSION=6 + addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - *default-packages + - gcc-6 + - g++-6 + # Linux/Clang 3.6 - os: linux env: COMPILER=clang CLANG_VERSION=3.6 @@ -81,7 +97,7 @@ matrix: osx_image: xcode7.2 env: COMPILER=clang - # OS X/GCC 5 + # OS X/GCC 6 - os: osx env: COMPILER=gcc diff --git a/CHANGELOG.md b/CHANGELOG.md index 13ae6fa26..4779eaf59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,57 @@ +# [v2.3.0][2.3.0] +## New features +- Forward and inverted indexes are now stored in one directory. **To make + use of your existing indexes, you will need to move their + directories.** For example, a configuration that used to look like the + following + + ```toml + dataset = "20newsgroups" + corpus = "line.toml" + forward-index = "20news-fwd" + inverted-index = "20news-inv" + ``` + + will now look like the following + + ```toml + dataset = "20newsgroups" + corpus = "line.toml" + index = "20news-index" + ``` + + and your folder structure should now look like + + ``` + 20news-index + ├── fwd + └── inv + ``` + + You can do this by simply moving the old folders around like so: + + ```bash + mkdir 20news-index + mv 20news-fwd 20news-index/fwd + mv 20news-inv 20news-index/inv + ``` +- `stats::multinomial` now can report the number of unique event types + counted (`unique_events()`) +- `std::vector` can now be hashed via `hash_append`. + +## Bug fixes +- Fix rounding bug in language model-based rankers. This bug caused + severely degraded performance for these rankers with short queries. The + unit tests have been improved to prevent such a regression in the + future. + +## Enhancements +- The bundled ICU version has been bumped to ICU 57.1. +- MeTA will now attempt to build its own version of ICU on Windows if it + fails to find a suitable ICU installed. +- CI support for GCC 6.x was added for all three major platforms. +- CI support also uses a fixed version of LLVM/libc++ instead of trunk. + # [v2.2.0][2.2.0] ## New features - Parallelized versions of PageRank and Personalized PageRank have been @@ -381,7 +435,8 @@ # [v1.0][1.0] - Initial release. -[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.2.0...develop +[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.3.0...develop +[2.3.0]: https://github.com/meta-toolkit/meta/compare/v2.2.0...v2.3.0 [2.2.0]: https://github.com/meta-toolkit/meta/compare/v2.1.0...v2.2.0 [2.1.0]: https://github.com/meta-toolkit/meta/compare/v2.0.1...v2.1.0 [2.0.1]: https://github.com/meta-toolkit/meta/compare/v2.0.0...v2.0.1 diff --git a/CMakeLists.txt b/CMakeLists.txt index e9d799c3a..e1cedc627 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,9 +41,9 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/deps/meta-cmake/) # We require Unicode 8 for the unit tests, which was added in ICU 56.1 FindOrBuildICU( - VERSION 56.1 - URL http://download.icu-project.org/files/icu4c/56.1/icu4c-56_1-src.tgz - URL_HASH MD5=c4a2d71ff56aec5ebfab2a3f059be99d + VERSION 57.1 + URL http://download.icu-project.org/files/icu4c/57.1/icu4c-57_1-src.tgz + URL_HASH MD5=976734806026a4ef8bdd17937c8898b9 ) add_library(meta-definitions INTERFACE) @@ -54,7 +54,7 @@ if(UNIX OR MINGW) target_compile_options(meta-definitions INTERFACE -Wall -Wextra -pedantic) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - SetClangOptions() + SetClangOptions(meta-definitions) endif() endif() diff --git a/STYLE.md b/STYLE.md index c429a91d8..38da6d985 100644 --- a/STYLE.md +++ b/STYLE.md @@ -30,7 +30,7 @@ void myclass::member(const type&); - Prefer `enum class` (strongly typed `enum`s). - Prefer no pointer over `unique_ptr` over `shared_ptr`. - Do not use `rand()` [deprecated in - C++14](www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3841.pdf). + C++14](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n3841.pdf). - Use `#ifndef META_FILE_NAME_H_` for double inclusion guards. - `#define` kept to a minimum, and ALL_CAPS_SNAKE if used. - Lines should be no longer than 80 characters diff --git a/config.toml b/config.toml index 82b6d1b57..1b3f5987b 100644 --- a/config.toml +++ b/config.toml @@ -10,8 +10,7 @@ query-path = "../queries.txt" # create this file dataset = "ceeaus" corpus = "line.toml" # located inside dataset folder -forward-index = "ceeaus-fwd" -inverted-index = "ceeaus-inv" +index = "ceeaus" indexer-ram-budget = 1024 # **estimated** RAM budget for indexing in MB # always set this lower than your physical RAM! # indexer-num-threads = 8 # default value is system thread concurrency @@ -32,7 +31,6 @@ method = "one-vs-all" [classifier.base] method = "sgd" loss = "hinge" -prefix = "sgd-model" [lda] inference = "gibbs" diff --git a/deps/meta-cmake b/deps/meta-cmake index fa6cdb474..bafbe8112 160000 --- a/deps/meta-cmake +++ b/deps/meta-cmake @@ -1 +1 @@ -Subproject commit fa6cdb474edeae25d0ffafc44458583e39412d39 +Subproject commit bafbe81127faab9679fb7bfdb9f81c2d60d3f74a diff --git a/deps/meta-stlsoft b/deps/meta-stlsoft index 2fe7ee921..586c4fe84 160000 --- a/deps/meta-stlsoft +++ b/deps/meta-stlsoft @@ -1 +1 @@ -Subproject commit 2fe7ee9211bfd1b6b4d15adc99e13aacacc5f5a2 +Subproject commit 586c4fe84e8d822d1581f04d006f9b86ae75b8bb diff --git a/include/meta/corpus/corpus.h b/include/meta/corpus/corpus.h index d14565405..3ea7c309a 100644 --- a/include/meta/corpus/corpus.h +++ b/include/meta/corpus/corpus.h @@ -78,7 +78,7 @@ class corpus /** * @return the corpus' metadata schema */ - virtual metadata::schema schema() const; + virtual metadata::schema_type schema() const; /** * Destructor. diff --git a/include/meta/corpus/file_corpus.h b/include/meta/corpus/file_corpus.h index f3b3ca9d4..d56943e44 100644 --- a/include/meta/corpus/file_corpus.h +++ b/include/meta/corpus/file_corpus.h @@ -58,7 +58,7 @@ class file_corpus : public corpus /** * @return the metadata schema for this corpus */ - metadata::schema schema() const override; + metadata::schema_type schema() const override; private: /// the current document we are on diff --git a/include/meta/corpus/libsvm_corpus.h b/include/meta/corpus/libsvm_corpus.h index f4fd77924..3cfb8f285 100644 --- a/include/meta/corpus/libsvm_corpus.h +++ b/include/meta/corpus/libsvm_corpus.h @@ -55,7 +55,7 @@ class libsvm_corpus : public corpus uint64_t size() const override; - metadata::schema schema() const override; + metadata::schema_type schema() const override; private: /// The current document we are on diff --git a/include/meta/corpus/metadata.h b/include/meta/corpus/metadata.h index 38476ca3d..8f2e4c4e5 100644 --- a/include/meta/corpus/metadata.h +++ b/include/meta/corpus/metadata.h @@ -62,9 +62,9 @@ class metadata // I want the below to be a const field_info, but g++ gives a cryptic // compiler error in that case... clang++ accepts it just fine. -sigh- - using schema = std::vector; + using schema_type = std::vector; - metadata(const char* start, const schema& sch) + metadata(const char* start, const schema_type& sch) : schema_{&sch}, start_{start} { // nothing @@ -124,6 +124,14 @@ class metadata return util::nullopt; } + /** + * Returns the schema for this metadata object. + */ + const schema_type& schema() const + { + return *schema_; + } + /** * Tagged union to represent a single metadata field. */ @@ -303,7 +311,7 @@ class metadata }; /// pointer to the metadata_file's schema - const schema* schema_; + const schema_type* schema_; /// the start of the metadata within the metadata_file const char* start_; @@ -314,7 +322,7 @@ class metadata * @param config The configuration group that specifies the metadata * @return the corresponding metadata::schema object. */ -metadata::schema metadata_schema(const cpptoml::table& config); +metadata::schema_type metadata_schema(const cpptoml::table& config); /** * Exception class for metadata operations. diff --git a/include/meta/corpus/metadata_parser.h b/include/meta/corpus/metadata_parser.h index 2fd4ed3e9..3f139362d 100644 --- a/include/meta/corpus/metadata_parser.h +++ b/include/meta/corpus/metadata_parser.h @@ -32,7 +32,7 @@ class metadata_parser * @param filename The name of the file to parse * @param schema The schema to parse the file with */ - metadata_parser(const std::string& filename, metadata::schema schema); + metadata_parser(const std::string& filename, metadata::schema_type schema); /** * @return the metadata vector for the next document in the file @@ -42,14 +42,14 @@ class metadata_parser /** * @return the schema for the metadata in this file */ - const metadata::schema& schema() const; + const metadata::schema_type& schema() const; private: /// the parser used to extract metadata io::mifstream infile_; /// the schema for the metadata being extracted - metadata::schema schema_; + metadata::schema_type schema_; }; } } diff --git a/include/meta/hashing/hash.h b/include/meta/hashing/hash.h index 9b5d44aff..030423f9f 100644 --- a/include/meta/hashing/hash.h +++ b/include/meta/hashing/hash.h @@ -181,6 +181,14 @@ template void hash_append(HashAlgorithm& h, const T1& first, const T2& second, const Ts&... ts); +template +typename std::enable_if::value>::type +hash_append(HashAlgorithm& h, const std::vector& v); + +template +typename std::enable_if::value>::type +hash_append(HashAlgorithm& h, const std::vector& v); + // begin implementations for hash_append template @@ -258,6 +266,23 @@ hash_append(HashAlgorithm& h, const std::basic_string& s) hash_append(h, s.size()); } +template +typename std::enable_if::value>::type +hash_append(HashAlgorithm& h, const std::vector& v) +{ + h(v.data(), v.size() * sizeof(T)); + hash_append(h, v.size()); +} + +template +typename std::enable_if::value>::type +hash_append(HashAlgorithm& h, const std::vector& v) +{ + for (const auto& val : v) + hash_append(h, val); + hash_append(h, v.size()); +} + template void hash_append(HashAlgorithm& h, const T1& first, const T2& second, const Ts&... ts) diff --git a/include/meta/index/disk_index.h b/include/meta/index/disk_index.h index 7fcf38b84..bece0e476 100644 --- a/include/meta/index/disk_index.h +++ b/include/meta/index/disk_index.h @@ -31,11 +31,6 @@ class string_list; class vocabulary_map; } -namespace tokenizers -{ -class tokenizer; -} - namespace util { template diff --git a/include/meta/index/make_index.h b/include/meta/index/make_index.h index 8a67d8fbe..394fe89dd 100644 --- a/include/meta/index/make_index.h +++ b/include/meta/index/make_index.h @@ -61,22 +61,10 @@ template std::shared_ptr make_index(const cpptoml::table& config, corpus::corpus& docs, Args&&... args) { - // check if we have paths specified for either kind of index - if (!(config.contains("forward-index") - && config.contains("inverted-index"))) + if (!config.contains("index")) { throw typename Index::exception{ - "forward-index or inverted-index missing from configuration file"}; - } - - // make sure that the index names are different! - auto fwd_name = config.get_as("forward-index"); - auto inv_name = config.get_as("inverted-index"); - - if (*fwd_name == *inv_name) - { - throw typename Index::exception{ - "forward and inverted index names must be different!"}; + "index name missing from configuration file"}; } // below is needed so that make_shared can find a public ctor to invoke @@ -88,6 +76,7 @@ std::shared_ptr make_index(const cpptoml::table& config, // nothing } }; + auto idx = std::make_shared( config, std::forward(args)...); @@ -98,8 +87,7 @@ std::shared_ptr make_index(const cpptoml::table& config, } else { - if (!filesystem::exists(idx->index_name())) - filesystem::make_directory(idx->index_name()); + filesystem::remove_all(idx->index_name()); idx->create_index(config, docs); } @@ -112,23 +100,10 @@ std::shared_ptr make_index(const cpptoml::table& config, template std::shared_ptr make_index(const cpptoml::table& config, Args&&... args) { - - // check if we have paths specified for either kind of index - if (!(config.contains("forward-index") - && config.contains("inverted-index"))) + if (!config.contains("index")) { throw typename Index::exception{ - "forward-index or inverted-index missing from configuration file"}; - } - - // make sure that the index names are different! - auto fwd_name = config.get_as("forward-index"); - auto inv_name = config.get_as("inverted-index"); - - if (*fwd_name == *inv_name) - { - throw typename Index::exception{ - "forward and inverted index names must be different!"}; + "index name missing from configuration file"}; } // below is needed so that make_shared can find a public ctor to invoke @@ -140,6 +115,7 @@ std::shared_ptr make_index(const cpptoml::table& config, Args&&... args) // nothing } }; + auto idx = std::make_shared( config, std::forward(args)...); @@ -150,9 +126,7 @@ std::shared_ptr make_index(const cpptoml::table& config, Args&&... args) } else { - if (!filesystem::exists(idx->index_name())) - filesystem::make_directory(idx->index_name()); - + filesystem::remove_all(idx->index_name()); auto docs = corpus::make_corpus(config); idx->create_index(config, *docs); } diff --git a/include/meta/index/metadata_file.h b/include/meta/index/metadata_file.h index 87f1bb40d..350fbeede 100644 --- a/include/meta/index/metadata_file.h +++ b/include/meta/index/metadata_file.h @@ -72,7 +72,7 @@ class metadata_file private: /// the schema for this file - corpus::metadata::schema schema_; + corpus::metadata::schema_type schema_; /// the seek positions for every document in this file util::disk_vector index_; diff --git a/include/meta/index/metadata_writer.h b/include/meta/index/metadata_writer.h index e0d596252..416ff38c8 100644 --- a/include/meta/index/metadata_writer.h +++ b/include/meta/index/metadata_writer.h @@ -33,7 +33,7 @@ class metadata_writer * @param schema The schema for the metadata we will store */ metadata_writer(const std::string& prefix, uint64_t num_docs, - corpus::metadata::schema schema); + corpus::metadata::schema_type schema); /** * Writes a document's metadata to the database and index. @@ -59,7 +59,7 @@ class metadata_writer std::ofstream db_file_; /// the schema of the metadata we are writing - corpus::metadata::schema schema_; + corpus::metadata::schema_type schema_; }; } } diff --git a/include/meta/io/filesystem.h b/include/meta/io/filesystem.h index ef5e6704d..1a04719e4 100644 --- a/include/meta/io/filesystem.h +++ b/include/meta/io/filesystem.h @@ -53,6 +53,13 @@ void rename_file(const std::string& old_name, const std::string& new_name); */ bool make_directory(const std::string& dir_name); +/** + * Attempts to create the directory and any other directories in the path + * @param path The path to the new directory + * @return whether a new directory was created + */ +bool make_directories(const std::string& path); + /** * @param filename The file to check * @return true if the file exists diff --git a/include/meta/stats/multinomial.h b/include/meta/stats/multinomial.h index aa3fd7a7a..d4c8200a3 100644 --- a/include/meta/stats/multinomial.h +++ b/include/meta/stats/multinomial.h @@ -75,6 +75,11 @@ class multinomial */ double counts() const; + /** + * @return the number of unique event values that have been observed + */ + uint64_t unique_events() const; + /** * Runs a function for each observed event for this distribution. Note * that this does **not** include the prior, only events that have been diff --git a/include/meta/stats/multinomial.tcc b/include/meta/stats/multinomial.tcc index c9fc0347a..5cd7a298d 100644 --- a/include/meta/stats/multinomial.tcc +++ b/include/meta/stats/multinomial.tcc @@ -54,6 +54,12 @@ double multinomial::counts() const return total_counts_ + prior_.pseudo_counts(); } +template +uint64_t multinomial::unique_events() const +{ + return counts_.size(); +} + template template void multinomial::each_seen_event(Fun&& fun) const diff --git a/src/classify/CMakeLists.txt b/src/classify/CMakeLists.txt index 2ff9a4680..685e094a0 100644 --- a/src/classify/CMakeLists.txt +++ b/src/classify/CMakeLists.txt @@ -7,12 +7,16 @@ ExternalProject_Add(liblinear SOURCE_DIR ${meta_SOURCE_DIR}/../deps/libsvm-modules/liblinear BINARY_DIR ${meta_SOURCE_DIR}/../deps/libsvm-modules/liblinear/build CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS "-DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER}" INSTALL_COMMAND "") ExternalProject_Add(libsvm SOURCE_DIR ${meta_SOURCE_DIR}/../deps/libsvm-modules/libsvm BINARY_DIR ${meta_SOURCE_DIR}/../deps/libsvm-modules/libsvm/build CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS "-DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER}" INSTALL_COMMAND "") add_library(meta-classify binary_classifier_factory.cpp diff --git a/src/classify/confusion_matrix.cpp b/src/classify/confusion_matrix.cpp index ff0bd4e58..78f81c79d 100644 --- a/src/classify/confusion_matrix.cpp +++ b/src/classify/confusion_matrix.cpp @@ -187,19 +187,22 @@ void confusion_matrix::print_stats(std::ostream& out) const = std::setw(static_cast(width + printing::make_bold("").size())); auto w2 = std::setw(static_cast(12 + printing::make_bold("").size())); out.precision(3); - out << std::string(width + 12 * 3, '-') << std::endl + out << std::string(width + 12 * 4, '-') << std::endl << std::left << w1 << printing::make_bold("Class") << std::left << w2 << printing::make_bold("F1 Score") << std::left << w2 << printing::make_bold("Precision") << std::left << w2 - << printing::make_bold("Recall") << std::endl - << std::string(width + 12 * 3, '-') << std::endl; + << printing::make_bold("Recall") << std::left << w2 + << printing::make_bold("Class Dist") << std::endl + << std::string(width + 12 * 4, '-') << std::endl; for (auto& cls : classes_) { auto w3 = std::setw(12); // different width for non-bold out << std::left << std::setw(static_cast(width)) << cls << std::left << w3 << f1_score(cls) << std::left << w3 - << precision(cls) << std::left << w3 << recall(cls) << std::endl; + << precision(cls) << std::left << w3 << recall(cls) + << std::left << w3 << static_cast(counts_.at(cls)) / total_ + << std::endl; } auto limit = [](double val) @@ -210,12 +213,12 @@ void confusion_matrix::print_stats(std::ostream& out) const return ss.str(); }; - out << std::string(width + 12 * 3, '-') << std::endl + out << std::string(width + 12 * 4, '-') << std::endl << w1 << printing::make_bold("Total") << w2 << printing::make_bold(limit(f1_score())) << w2 << printing::make_bold(limit(precision())) << w2 << printing::make_bold(limit(recall())) << std::endl - << std::string(width + 12 * 3, '-') << std::endl + << std::string(width + 12 * 4, '-') << std::endl << total_ << " predictions attempted, overall accuracy: " << accuracy() << std::endl; diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index 0073a7f7e..915545e1b 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -25,7 +25,7 @@ std::vector corpus::next_metadata() return mdata_parser_->next(); } -metadata::schema corpus::schema() const +metadata::schema_type corpus::schema() const { auto schema = mdata_parser_->schema(); if (store_full_text()) diff --git a/src/corpus/file_corpus.cpp b/src/corpus/file_corpus.cpp index 87c4c1750..0238fa742 100644 --- a/src/corpus/file_corpus.cpp +++ b/src/corpus/file_corpus.cpp @@ -78,7 +78,7 @@ uint64_t file_corpus::size() const return docs_.size(); } -metadata::schema file_corpus::schema() const +metadata::schema_type file_corpus::schema() const { auto schema = corpus::schema(); schema.insert(schema.begin(), diff --git a/src/corpus/libsvm_corpus.cpp b/src/corpus/libsvm_corpus.cpp index a6bca8604..1b4b53630 100644 --- a/src/corpus/libsvm_corpus.cpp +++ b/src/corpus/libsvm_corpus.cpp @@ -65,7 +65,7 @@ document libsvm_corpus::next() return doc; } -metadata::schema libsvm_corpus::schema() const +metadata::schema_type libsvm_corpus::schema() const { auto schema = corpus::schema(); if (lbl_type_ == label_type::REGRESSION) diff --git a/src/corpus/metadata.cpp b/src/corpus/metadata.cpp index 08e2fd938..5479970ea 100644 --- a/src/corpus/metadata.cpp +++ b/src/corpus/metadata.cpp @@ -10,9 +10,9 @@ namespace meta namespace corpus { -metadata::schema metadata_schema(const cpptoml::table& config) +metadata::schema_type metadata_schema(const cpptoml::table& config) { - metadata::schema schema; + metadata::schema_type schema; if (auto metadata = config.get_table_array("metadata")) { const auto& arr = metadata->get(); diff --git a/src/corpus/metadata_parser.cpp b/src/corpus/metadata_parser.cpp index e10e64772..1d0e0f60f 100644 --- a/src/corpus/metadata_parser.cpp +++ b/src/corpus/metadata_parser.cpp @@ -16,7 +16,7 @@ namespace corpus { metadata_parser::metadata_parser(const std::string& filename, - metadata::schema schema) + metadata::schema_type schema) : infile_{filename}, schema_{std::move(schema)} { // nothing @@ -77,7 +77,7 @@ std::vector metadata_parser::next() return mdata; } -const metadata::schema& metadata_parser::schema() const +const metadata::schema_type& metadata_parser::schema() const { return schema_; } diff --git a/src/features/tools/CMakeLists.txt b/src/features/tools/CMakeLists.txt index 8032b0e95..a323bec71 100644 --- a/src/features/tools/CMakeLists.txt +++ b/src/features/tools/CMakeLists.txt @@ -1,2 +1,3 @@ add_executable(feature-summary feature_summary.cpp) -target_link_libraries(feature-summary meta-features meta-index) +target_link_libraries(feature-summary meta-features meta-index + meta-sequence-analyzers meta-parser-analyzers) diff --git a/src/features/tools/feature_summary.cpp b/src/features/tools/feature_summary.cpp index f9c9f40ba..86d7c3270 100644 --- a/src/features/tools/feature_summary.cpp +++ b/src/features/tools/feature_summary.cpp @@ -13,6 +13,8 @@ #include "meta/features/selector_factory.h" #include "meta/index/forward_index.h" #include "meta/logging/logger.h" +#include "meta/parser/analyzers/tree_analyzer.h" +#include "meta/sequence/analyzers/ngram_pos_analyzer.h" #include "meta/util/shim.h" using namespace meta; @@ -27,6 +29,10 @@ int main(int argc, char* argv[]) logging::set_cerr_logging(); + // Register additional analyzers + parser::register_analyzers(); + sequence::register_analyzers(); + auto config = cpptoml::parse_file(argv[1]); auto feature_config = config->get_table("features"); if (!feature_config) diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index fe49619be..6ffdef756 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -121,7 +121,7 @@ class forward_index::impl }; forward_index::forward_index(const cpptoml::table& config) - : disk_index{config, *config.get_as("forward-index")}, + : disk_index{config, *config.get_as("index") + "/fwd"}, fwd_impl_{this, config} { /* nothing */ @@ -198,6 +198,9 @@ void forward_index::load_index() void forward_index::create_index(const cpptoml::table& config, corpus::corpus& docs) { + if (!filesystem::make_directories(index_name())) + throw exception{"Unable to create index directory: " + index_name()}; + { std::ofstream config_file{index_name() + "/config.toml"}; config_file << config; diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 0d1f8bd16..d5b60d1cb 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -87,7 +87,7 @@ inverted_index::impl::impl(inverted_index* idx, const cpptoml::table& config) } inverted_index::inverted_index(const cpptoml::table& config) - : disk_index{config, *config.get_as("inverted-index")}, + : disk_index{config, *config.get_as("index") + "/inv"}, inv_impl_{this, config} { // nothing @@ -115,6 +115,9 @@ bool inverted_index::valid() const void inverted_index::create_index(const cpptoml::table& config, corpus::corpus& docs) { + if (!filesystem::make_directories(index_name())) + throw exception{"Unable to create index directory: " + index_name()}; + // save the config file so we can recreate the analyzer { std::ofstream config_file{index_name() + "/config.toml"}; diff --git a/src/index/metadata_writer.cpp b/src/index/metadata_writer.cpp index 44ba66b32..4310ad6b8 100644 --- a/src/index/metadata_writer.cpp +++ b/src/index/metadata_writer.cpp @@ -12,7 +12,7 @@ namespace index { metadata_writer::metadata_writer(const std::string& prefix, uint64_t num_docs, - corpus::metadata::schema schema) + corpus::metadata::schema_type schema) : seek_pos_{prefix + "/metadata.index", num_docs}, byte_pos_{0}, db_file_{prefix + "/metadata.db", std::ios::binary}, diff --git a/src/index/ranker/absolute_discount.cpp b/src/index/ranker/absolute_discount.cpp index d48ea4426..4bb2caddd 100644 --- a/src/index/ranker/absolute_discount.cpp +++ b/src/index/ranker/absolute_discount.cpp @@ -14,6 +14,7 @@ namespace index { const util::string_view absolute_discount::id = "absolute-discount"; +const constexpr float absolute_discount::default_delta; absolute_discount::absolute_discount(float delta) : delta_{delta} { diff --git a/src/index/ranker/dirichlet_prior.cpp b/src/index/ranker/dirichlet_prior.cpp index 43a4852c3..07536afbe 100644 --- a/src/index/ranker/dirichlet_prior.cpp +++ b/src/index/ranker/dirichlet_prior.cpp @@ -13,6 +13,7 @@ namespace index { const util::string_view dirichlet_prior::id = "dirichlet-prior"; +const constexpr float dirichlet_prior::default_mu; dirichlet_prior::dirichlet_prior(float mu) : mu_{mu} { diff --git a/src/index/ranker/jelinek_mercer.cpp b/src/index/ranker/jelinek_mercer.cpp index 200752d5f..46ff9440e 100644 --- a/src/index/ranker/jelinek_mercer.cpp +++ b/src/index/ranker/jelinek_mercer.cpp @@ -13,6 +13,7 @@ namespace index { const util::string_view jelinek_mercer::id = "jelinek-mercer"; +const constexpr float jelinek_mercer::default_lambda; jelinek_mercer::jelinek_mercer(float lambda) : lambda_{lambda} { diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index c5460ff29..dc62fed79 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -19,7 +19,7 @@ const util::string_view language_model_ranker::id = "language-model"; float language_model_ranker::score_one(const score_data& sd) { float ps = smoothed_prob(sd); - float pc = sd.corpus_term_count / sd.total_terms; + float pc = static_cast(sd.corpus_term_count) / sd.total_terms; return sd.query_term_weight * fastapprox::fastlog(ps / (doc_constant(sd) * pc)); } diff --git a/src/io/filesystem.cpp b/src/io/filesystem.cpp index 8458b5fb2..f0d121d38 100644 --- a/src/io/filesystem.cpp +++ b/src/io/filesystem.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #else #include #endif @@ -47,6 +48,11 @@ bool make_directory(const std::string& dir_name) return traits::create_directory(dir_name.c_str()); } +bool make_directories(const std::string& path) +{ + return stlsoft::platformstl_project::create_directory_recurse(path); +} + bool file_exists(const std::string& filename) { return traits::file_exists(filename.c_str()); @@ -133,6 +139,11 @@ bool make_directory(const std::string& dir_name) return fs::create_directory(dir_name); } +bool make_directories(const std::string& path) +{ + return fs::create_directories(path); +} + bool file_exists(const std::string& filename) { return fs::exists(filename); diff --git a/tests/classifier_test.cpp b/tests/classifier_test.cpp index 9d3db31df..48905d62b 100644 --- a/tests/classifier_test.cpp +++ b/tests/classifier_test.cpp @@ -194,12 +194,10 @@ void run_tests(const std::string& index_type) { go_bandit([]() { - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); run_tests("line"); - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); run_tests("file"); describe("[classifier] saving and loading model files", [&]() { @@ -276,8 +274,7 @@ go_bandit([]() { }); }); - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); describe("[classifier] confusion matrix", [&]() { diff --git a/tests/create_config.h b/tests/create_config.h index d0053f7bd..a78fbdb3c 100644 --- a/tests/create_config.h +++ b/tests/create_config.h @@ -57,8 +57,7 @@ create_config(const std::string& corpus_type, bool multi = false) { table->insert("dataset", "ceeaus"); table->insert("corpus", corpus_type + ".toml"); table->insert("encoding", "shift_jis"); - table->insert("forward-index", "ceeaus-fwd"); - table->insert("inverted-index", "ceeaus-inv"); + table->insert("index", "ceeaus"); auto anas = cpptoml::make_table_array(); auto ana = cpptoml::make_table(); diff --git a/tests/feature_selection_test.cpp b/tests/feature_selection_test.cpp index 734a74241..beda82c7b 100644 --- a/tests/feature_selection_test.cpp +++ b/tests/feature_selection_test.cpp @@ -69,8 +69,7 @@ go_bandit([]() { }); f_idx = nullptr; - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); for (const std::string& id : {"chi-square", "info-gain", "corr-coef", "odds-ratio"}) { for (const std::string& suffix : {"1", "2", "3", "selected"}) { diff --git a/tests/forward_index_test.cpp b/tests/forward_index_test.cpp index 8a5954e0a..de785e208 100644 --- a/tests/forward_index_test.cpp +++ b/tests/forward_index_test.cpp @@ -27,8 +27,7 @@ std::shared_ptr create_libsvm_config() { config->insert("prefix", *orig_config->get_as("prefix")); config->insert("corpus", "libsvm.toml"); config->insert("dataset", "breast-cancer"); - config->insert("forward-index", "bcancer-fwd"); - config->insert("inverted-index", "bcancer-inv"); + config->insert("index", "bcancer"); auto anas = cpptoml::make_table_array(); auto ana = cpptoml::make_table(); @@ -151,16 +150,14 @@ go_bandit([]() { auto file_cfg = tests::create_config("file"); it("should create the index", [&]() { - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); ceeaus_forward_test(*file_cfg); }); it("should load the index", [&]() { ceeaus_forward_test(*file_cfg); }); it("should uninvert if specified", [&]() { - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); file_cfg->insert("uninvert", true); ceeaus_forward_test(*file_cfg); }); @@ -170,16 +167,14 @@ go_bandit([]() { auto line_cfg = tests::create_config("line"); it("should create the index", [&]() { - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); ceeaus_forward_test(*line_cfg); }); it("should load the index", [&]() { ceeaus_forward_test(*line_cfg); }); it("should uninvert if specified", [&]() { - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); line_cfg->insert("uninvert", true); ceeaus_forward_test(*line_cfg); }); @@ -213,7 +208,7 @@ go_bandit([]() { auto svm_cfg = create_libsvm_config(); it("should create the index", [&]() { - filesystem::remove_all("bcancer-fwd"); + filesystem::remove_all("bcancer"); bcancer_forward_test(*svm_cfg); }); @@ -231,12 +226,11 @@ go_bandit([]() { describe("[forward-index] with zlib", []() { - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); auto gz_cfg = tests::create_config("gz"); it("should create the index", [&]() { - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); ceeaus_forward_test(*gz_cfg); }); @@ -244,7 +238,6 @@ go_bandit([]() { }); - filesystem::remove_all("ceeaus-inv"); - filesystem::remove_all("ceeaus-fwd"); - filesystem::remove_all("bcancer-fwd"); + filesystem::remove_all("ceeaus"); + filesystem::remove_all("bcancer"); }); diff --git a/tests/inverted_index_test.cpp b/tests/inverted_index_test.cpp index d62db6fe2..88e3edc54 100644 --- a/tests/inverted_index_test.cpp +++ b/tests/inverted_index_test.cpp @@ -76,7 +76,7 @@ go_bandit([]() { auto file_cfg = tests::create_config("file"); it("should create the index", [&]() { - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); auto idx = index::make_index(*file_cfg); check_ceeaus_expected(*idx); }); @@ -87,7 +87,7 @@ go_bandit([]() { check_term_id(*idx); }); - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); it("should be able to store full text metadata", [&]() { auto docs = corpus::make_corpus(*file_cfg); check_full_text(*docs, *file_cfg); @@ -96,7 +96,7 @@ go_bandit([]() { describe("[inverted-index] from line config", []() { - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); auto line_cfg = tests::create_config("line"); it("should create the index", [&]() { @@ -113,7 +113,7 @@ go_bandit([]() { check_term_id(*idx); // twice to check splay_caching }); - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); it("should be able to store full text metadata", [&]() { auto docs = corpus::make_corpus(*line_cfg); check_full_text(*docs, *line_cfg); @@ -150,7 +150,7 @@ go_bandit([]() { describe("[inverted-index] with zlib", []() { - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); auto gz_cfg = tests::create_config("gz"); it("should create the index", [&]() { @@ -164,12 +164,12 @@ go_bandit([]() { check_term_id(*idx); }); - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); it("should be able to store full text metadata", [&]() { auto docs = corpus::make_corpus(*gz_cfg); check_full_text(*docs, *gz_cfg); }); }); - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); }); diff --git a/tests/ir_eval_test.cpp b/tests/ir_eval_test.cpp index 8844fc43d..3d2c55cef 100644 --- a/tests/ir_eval_test.cpp +++ b/tests/ir_eval_test.cpp @@ -40,7 +40,7 @@ go_bandit([]() { describe("[ir-eval] retrieval metrics", []() { it("should give results on [0, 1] for all measures", []() { - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); auto file_cfg = tests::create_config("file"); auto idx = index::make_index(*file_cfg); index::okapi_bm25 ranker; @@ -156,6 +156,8 @@ go_bandit([]() { AssertThat(eval.map(), Is().GreaterThanOrEqualTo(0).And().LessThanOrEqualTo(1)); AssertThat(eval.gmap(), EqualsWithDelta(0.0, delta)); + + filesystem::remove_all("ceeaus"); }); }); diff --git a/tests/ranker_test.cpp b/tests/ranker_test.cpp index 6463db988..1ac39511e 100644 --- a/tests/ranker_test.cpp +++ b/tests/ranker_test.cpp @@ -4,8 +4,8 @@ */ #include "bandit/bandit.h" -#include "meta/corpus/document.h" #include "create_config.h" +#include "meta/corpus/document.h" #include "meta/index/ranker/all.h" using namespace bandit; @@ -15,6 +15,7 @@ namespace { template void test_rank(Ranker& r, Index& idx, const std::string& encoding) { + // exhaustive search for each document for (size_t i = 0; i < idx.num_docs(); ++i) { auto d_id = idx.docs()[i]; auto path = idx.doc_path(d_id); @@ -33,6 +34,20 @@ void test_rank(Ranker& r, Index& idx, const std::string& encoding) { EqualsWithDelta(ranking[1].score, 0.0001)); } } + + // sanity checks for simple query + corpus::document query; + query.content("character"); + + auto ranking = r.score(idx, query); + // ensure there is diversity in the top 10 documents + AssertThat(ranking[0].score, Is().GreaterThan(ranking.back().score)); + + // check for sorted-ness of ranking + for (uint64_t i = 1; i < ranking.size(); ++i) { + AssertThat(ranking[i - 1].score, + Is().GreaterThanOrEqualTo(ranking[i].score)); + } } } @@ -41,7 +56,7 @@ go_bandit([]() { describe("[rankers]", []() { auto config = tests::create_config("file"); - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); auto idx = index::make_index(*config); std::string encoding = "utf-8"; if (auto enc = config->get_as("encoding")) @@ -73,6 +88,6 @@ go_bandit([]() { }); idx = nullptr; - filesystem::remove_all("ceeaus-inv"); + filesystem::remove_all("ceeaus"); }); }); diff --git a/tests/regression_test.cpp b/tests/regression_test.cpp index 4f7940603..26e8f01e2 100644 --- a/tests/regression_test.cpp +++ b/tests/regression_test.cpp @@ -53,10 +53,9 @@ go_bandit([]() { config->insert("dataset", "housing"); config->insert("corpus", "libsvm.toml"); - config->insert("forward-index", "housing-fwd"); - config->insert("inverted-index", "housing-inv"); + config->insert("index", "housing"); - filesystem::remove_all("housing-fwd"); + filesystem::remove_all("housing"); auto f_idx = index::make_index(*config); regression::regression_dataset dataset{ @@ -101,5 +100,5 @@ go_bandit([]() { }); f_idx = nullptr; - filesystem::remove_all("housing-fwd"); + filesystem::remove_all("housing"); }); diff --git a/tests/topics_test.cpp b/tests/topics_test.cpp index a63309324..508f5be19 100644 --- a/tests/topics_test.cpp +++ b/tests/topics_test.cpp @@ -76,5 +76,5 @@ go_bandit([]() { [&]() { run_model(idx, prefix); }); }); - filesystem::remove_all("ceeaus-fwd"); + filesystem::remove_all("ceeaus"); }); diff --git a/travis/install_libcxx.sh b/travis/install_libcxx.sh index 4edd3c619..779035b20 100755 --- a/travis/install_libcxx.sh +++ b/travis/install_libcxx.sh @@ -1,15 +1,21 @@ #!/bin/bash set -v cwd=$(pwd) -svn co --quiet http://llvm.org/svn/llvm-project/llvm/trunk llvm + +LLVM_TAG="${LLVM_TAG:-RELEASE_381}" + +svn co --quiet http://llvm.org/svn/llvm-project/llvm/tags/$LLVM_TAG/final llvm + cd llvm/projects -svn co --quiet http://llvm.org/svn/llvm-project/libcxx/trunk libcxx -svn co --quiet http://llvm.org/svn/llvm-project/libcxxabi/trunk libcxxabi +svn co --quiet http://llvm.org/svn/llvm-project/libcxx/tags/$LLVM_TAG/final libcxx +svn co --quiet http://llvm.org/svn/llvm-project/libcxxabi/tags/$LLVM_TAG/final libcxxabi cd ../ + mkdir build cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME ../ make cxx make install-libcxx install-libcxxabi + cd $cwd set +v diff --git a/travis/install_osx.sh b/travis/install_osx.sh index 99dd5651e..72e22de67 100755 --- a/travis/install_osx.sh +++ b/travis/install_osx.sh @@ -11,9 +11,9 @@ fi if [ "$COMPILER" == "gcc" ]; then brew tap homebrew/versions - brew install homebrew/versions/gcc5 - export CC=gcc-5 - export CXX=g++-5 + brew install homebrew/versions/gcc6 + export CC=gcc-6 + export CXX=g++-6 fi if [ "$COMPILER" == "clang" ]; then