From bb3eaadb9a70c6eafc472b606225833dc4f8d8c9 Mon Sep 17 00:00:00 2001 From: Ch0p1k3 Date: Sat, 1 Jul 2023 14:34:14 +0000 Subject: [PATCH] Tutorial to apply CatBoost model from C++ --- apply_model/cpp/.clang-format | 12 + apply_model/cpp/.clang-tidy | 49 +++ apply_model/cpp/.gitignore | 98 +++++ apply_model/cpp/CMakeLists.txt | 10 + apply_model/cpp/README.md | 21 + apply_model/cpp/bin/CMakeLists.txt | 10 + apply_model/cpp/bin/main.cpp | 153 +++++++ apply_model/cpp/build/.gitignore | 2 + apply_model/cpp/cmake/Ccache.cmake | 6 + apply_model/cpp/cmake/CheckCompiler.cmake | 16 + apply_model/cpp/cmake/CompileOptions.cmake | 20 + apply_model/cpp/cmake/Development.cmake | 2 + apply_model/cpp/model/.gitignore | 2 + apply_model/cpp/model/train_model.ipynb | 389 ++++++++++++++++++ apply_model/cpp/third_party/CMakeLists.txt | 2 + .../cpp/third_party/argparse/CMakeLists.txt | 9 + .../cpp/third_party/catboost/CMakeLists.txt | 33 ++ 17 files changed, 834 insertions(+) create mode 100644 apply_model/cpp/.clang-format create mode 100644 apply_model/cpp/.clang-tidy create mode 100644 apply_model/cpp/.gitignore create mode 100644 apply_model/cpp/CMakeLists.txt create mode 100644 apply_model/cpp/README.md create mode 100644 apply_model/cpp/bin/CMakeLists.txt create mode 100644 apply_model/cpp/bin/main.cpp create mode 100644 apply_model/cpp/build/.gitignore create mode 100644 apply_model/cpp/cmake/Ccache.cmake create mode 100644 apply_model/cpp/cmake/CheckCompiler.cmake create mode 100644 apply_model/cpp/cmake/CompileOptions.cmake create mode 100644 apply_model/cpp/cmake/Development.cmake create mode 100644 apply_model/cpp/model/.gitignore create mode 100644 apply_model/cpp/model/train_model.ipynb create mode 100644 apply_model/cpp/third_party/CMakeLists.txt create mode 100644 apply_model/cpp/third_party/argparse/CMakeLists.txt create mode 100644 apply_model/cpp/third_party/catboost/CMakeLists.txt diff --git a/apply_model/cpp/.clang-format b/apply_model/cpp/.clang-format new file mode 
100644 index 0000000..589a571 --- /dev/null +++ b/apply_model/cpp/.clang-format @@ -0,0 +1,12 @@ +BasedOnStyle: Google +--- +Language: Cpp +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLambdasOnASingleLine: Empty +DerivePointerAlignment: false +PointerAlignment: Left +SortIncludes: false +PackConstructorInitializers: Never +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: AfterComma diff --git a/apply_model/cpp/.clang-tidy b/apply_model/cpp/.clang-tidy new file mode 100644 index 0000000..f264464 --- /dev/null +++ b/apply_model/cpp/.clang-tidy @@ -0,0 +1,49 @@ +--- + +Checks: '-*,cppcoreguidelines-avoid-goto,cppcoreguidelines-pro-type-const-cast, google-runtime-int, modernize-use-nullptr, readability-braces-around-statements, readability-container-size-empty, readability-redundant-control-flow, readability-identifier-naming, readability-simplify-boolean-expr, google-build-using-namespace, readability-implicit-bool-conversion, google-explicit-constructor' + +HeaderFilterRegex: '\.hpp$' + +WarningsAsErrors: '*' + +CheckOptions: + - key: readability-identifier-naming.NamespaceCase + value: lower_case + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.StructCase + value: CamelCase + - key: readability-identifier-naming.TypedefCase + value: CamelCase + - key: readability-identifier-naming.TypeAliasCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: CamelCase + - key: readability-identifier-naming.ParameterCase + value: lower_case + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberSuffix + value: '_' + - key: readability-identifier-naming.GlobalConstantCase + value: CamelCase + - key: readability-identifier-naming.GlobalConstantPrefix + value: k + - key: 
readability-identifier-naming.StaticConstantCase + value: CamelCase + - key: readability-identifier-naming.StaticConstantPrefix + value: k + - key: readability-identifier-naming.ConstexprVariableCase + value: CamelCase + - key: readability-identifier-naming.ConstexprVariablePrefix + value: k + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-simplify-boolean-expr.ChainedConditionalReturn + value: '1' + - key: readability-simplify-boolean-expr.ChainedConditionalAssignment + value: '1' + - key: readability-identifier-naming.TypeTemplateParameterIgnoredRegexp + value: expr-type diff --git a/apply_model/cpp/.gitignore b/apply_model/cpp/.gitignore new file mode 100644 index 0000000..6aa8435 --- /dev/null +++ b/apply_model/cpp/.gitignore @@ -0,0 +1,98 @@ +# Created by .ignore support plugin (hsz.mobi) +### C++ template +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.app +### CMake template +CMakeCache.txt +CMakeFiles +CMakeScripts +Makefile +cmake_install.cmake +install_manifest.txt +CTestTestfile.cmake +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/workspace.xml +.idea/tasks.xml +.idea/dictionaries +.idea/vcs.xml +.idea/jsLibraryMappings.xml + +# Sensitive or high-churn files: +.idea/dataSources.ids +.idea/dataSources.xml +.idea/dataSources.local.xml +.idea/sqlDataSources.xml +.idea/dynamic.xml +.idea/uiDesigner.xml + +# Gradle: +.idea/gradle.xml +.idea/libraries + +# Mongo Explorer plugin: +.idea/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + 
+# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Idea +.idea/ +cmake-build-debug/ +cmake-build-release/ +cmake-* + +# VS code +.vscode/ + +# YouCompleteMe VIM plugin +**/.ycm_extra_conf.py + +# Python +__pycache__ + +# Clangd +.cache/ + +# CTest +Testing/ diff --git a/apply_model/cpp/CMakeLists.txt b/apply_model/cpp/CMakeLists.txt new file mode 100644 index 0000000..b839e24 --- /dev/null +++ b/apply_model/cpp/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) +project(apply-model) + +include(cmake/Ccache.cmake) +include(cmake/CheckCompiler.cmake) +include(cmake/CompileOptions.cmake) +include(cmake/Development.cmake) + +add_subdirectory(bin) +add_subdirectory(third_party) diff --git a/apply_model/cpp/README.md b/apply_model/cpp/README.md new file mode 100644 index 0000000..94af647 --- /dev/null +++ b/apply_model/cpp/README.md @@ -0,0 +1,21 @@ +# Apply CatBoost model from C++ +This tutorial consists of two parts: +- first part where we preprocess dataset and train the classifier model. + This part can be found in [train_model.ipynb](model/train_model.ipynb). +- second part where we load model into C++ application and then apply it. + This part is presented as a small CMake project. 
+ + To configure CMake, execute: + ```bash + cmake -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_C_COMPILER:FILEPATH= -DCMAKE_CXX_COMPILER:FILEPATH= -Bbuild -G "Unix Makefiles" + ``` + + Build target `apply_model`: + ```bash + cmake --build build --config Release --target apply_model + ``` + + Run binary: + ```bash + build/bin/apply_model -m model/adult.cbm + ``` diff --git a/apply_model/cpp/bin/CMakeLists.txt b/apply_model/cpp/bin/CMakeLists.txt new file mode 100644 index 0000000..f969882 --- /dev/null +++ b/apply_model/cpp/bin/CMakeLists.txt @@ -0,0 +1,10 @@ +add_executable( + apply_model + main.cpp +) + +target_link_libraries( + apply_model + argparse + catboost +) diff --git a/apply_model/cpp/bin/main.cpp b/apply_model/cpp/bin/main.cpp new file mode 100644 index 0000000..9ee75bc --- /dev/null +++ b/apply_model/cpp/bin/main.cpp @@ -0,0 +1,153 @@ +#include +#include +#include +#include + +#include +#include +#include + +float Sigmoid(const float x) { + return 1. / (1. 
+ std::exp(-x)); +} + +std::string Answer(const bool makes_over_50k_a_year) { + if (makes_over_50k_a_year) { + return "makes over 50K a year"; + } + return "doesn't make over 50K a year"; +} + +int main(const int argc, const char* argv[]) { + argparse::ArgumentParser program("apply-model", "", + argparse::default_arguments::help); + program.add_argument("-m", "--model") + .help("path to model") + .metavar("model") + .action([](const auto& path) { + return std::filesystem::path(path); + }); + program.parse_args(argc, argv); + const auto model_path = program.get("-m"); + + // Load model that we trained within Jupyter Notebook + ModelCalcerWrapper model(model_path); + std::cout << "Adult dataset model metainformation" << std::endl; + std::cout << "Tree count: " << model.GetTreeCount() << std::endl; + + // In our case we were solving a binary classification problem (whether a person makes over 50K a year), so the + // dimension of the prediction will be 1, it will return probability of the object to belong to the positive + // class; in our case we had two classes encoded as "<=50K" and ">50K", during data preprocessing (see + // `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became a positive + // class. Probability of the negative class ("<=50K") can be easily deduced as (1-p) where p is a probability of + // positive class. + // + // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of + // multiclassification, where N is a number of classes. 
+ std::cout << "numeric feature count: " << model.GetFloatFeaturesCount() + << std::endl; + std::cout << "categoric feature count: " << model.GetCatFeaturesCount() + << std::endl; + + std::cout << std::endl; + + // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5, + // this means that our formula is optimized for this threshold, though we may change threshold to optimize some + // other metric on a different dataset, but we won't do it in this tutorial. + static constexpr auto kClassificationThreshold = 0.5; + + // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need + // to download it [1] from UCI repository. Look for "adult.test", "adult.name" will also be useful because it + // contains human-readable description of the dataset. + // + // So the first line of test part of the dataset is: + // + // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K." + // + // Based on "adult.name" we can recover its vectors of numeric and categoric features (in our case all + // "continuous" features are numeric and all other features are categoric): + // + // numericFeatures: {25, 226802, 7, 0, 0, 40} + // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"} + // + // And he doesn't make 50K per year. Also note that order of numeric and categoric features in source data and + // in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we + // can, but result of prediction will be garbage). + // + // Now lets run it! And let's call this person "person A", to make variable names unique. 
+ // + // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ + const std::vector person_a_numeric_features( + {25., 226'802., 7., 0., 0., 40.}); + const std::vector person_a_categoric_features( + {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", + "Black", "Male", "United-States"}); + const auto person_a_prediction = + model.Calc(person_a_numeric_features, person_a_categoric_features); + const auto person_a_makes_over_50k_probability = Sigmoid(person_a_prediction); + + // Since we made prediction only for one person and prediction dimension is 1, probability of person A make + // over 50K will have index 0 in `person_a_prediction`. + // + // CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply + // sigmoid function. + const auto person_a_makes_over_50k = + person_a_makes_over_50k_probability > kClassificationThreshold; + std::cout << "Person A make over 50K a year with probability " + << person_a_makes_over_50k_probability << std::endl; + std::cout << "Person A " << Answer(person_a_makes_over_50k) << std::endl; + std::cout << std::endl; + + // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test" + // we can find following line: + // + // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K." + // + // Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-country" + // feature for Person B. When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric + // features with string "nan", now, when we apply trained model we must also use "nan" for missing features. 
+ // Lets write out feature vectors for Person B: + // + // numericFeatures = {40, 85019, 16, 0, 0, 45}; + // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"}; + // + // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this + // example. + const std::vector person_b_numeric_features( + {40., 85019., 16., 0., 0., 45.}); + const std::vector person_b_categoric_features( + {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", + "Husband", "Asian-Pac-Islander", "Male", "nan"}); + const auto person_b_prediction = + model.Calc(person_b_numeric_features, person_b_categoric_features); + const auto person_b_makes_over_50k_probability = Sigmoid(person_b_prediction); + + const auto person_b_makes_over_50k = + person_b_makes_over_50k_probability > kClassificationThreshold; + std::cout << "Person B make over 50K a year with probability " + << person_b_makes_over_50k_probability << std::endl; + std::cout << "Person B " << Answer(person_b_makes_over_50k) << std::endl; + std::cout << std::endl; + + // Let's try to apply the model to Person A and Person B in one call. 
+ const std::vector> persons_ab_numeric_features = + {person_a_numeric_features, person_b_numeric_features}; + const std::vector> persons_ab_categoric_features = + {person_a_categoric_features, person_b_categoric_features}; + const auto persons_ab_predictions = model.Calc( + persons_ab_numeric_features, persons_ab_categoric_features); + const std::vector persons_ab_make_over_50k_probabilities = { + Sigmoid(persons_ab_predictions[0]), Sigmoid(persons_ab_predictions[1])}; + const std::vector persons_ab_make_over_50k = { + persons_ab_make_over_50k_probabilities[0] > kClassificationThreshold, + persons_ab_make_over_50k_probabilities[1] > kClassificationThreshold}; + + // Predictions should be same as above + std::cout << "Using batch interface" << std::endl; + std::cout << "Person A make over 50K a year with probability " + << persons_ab_make_over_50k_probabilities[0] << std::endl; + std::cout << "Person A " << Answer(persons_ab_make_over_50k[0]) << std::endl; + std::cout << "Person B make over 50K a year with probability " + << persons_ab_make_over_50k_probabilities[1] << std::endl; + std::cout << "Person B " << Answer(persons_ab_make_over_50k[1]) << std::endl; +} diff --git a/apply_model/cpp/build/.gitignore b/apply_model/cpp/build/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/apply_model/cpp/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/apply_model/cpp/cmake/Ccache.cmake b/apply_model/cpp/cmake/Ccache.cmake new file mode 100644 index 0000000..9eee24d --- /dev/null +++ b/apply_model/cpp/cmake/Ccache.cmake @@ -0,0 +1,6 @@ +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + message(STATUS "Using ccache") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) diff --git a/apply_model/cpp/cmake/CheckCompiler.cmake b/apply_model/cpp/cmake/CheckCompiler.cmake new file mode 100644 index 0000000..96cbfb0 --- /dev/null +++ 
b/apply_model/cpp/cmake/CheckCompiler.cmake @@ -0,0 +1,16 @@ +set(REQUIRED_CXX_COMPILER "Clang") +set(CXX_COMPILER_MIN_VERSION 14.0) + +message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER}") + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL REQUIRED_CXX_COMPILER) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS CXX_COMPILER_MIN_VERSION) + message(FATAL_ERROR + "Old version of ${REQUIRED_CXX_COMPILER} compiler: ${CMAKE_CXX_COMPILER_VERSION}, required ${CXX_COMPILER_MIN_VERSION}." + ) + endif() +else() + message(FATAL_ERROR + "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}. Use ${REQUIRED_CXX_COMPILER}, version >= ${CXX_COMPILER_MIN_VERSION}." + ) +endif() diff --git a/apply_model/cpp/cmake/CompileOptions.cmake b/apply_model/cpp/cmake/CompileOptions.cmake new file mode 100644 index 0000000..ae5d757 --- /dev/null +++ b/apply_model/cpp/cmake/CompileOptions.cmake @@ -0,0 +1,20 @@ +# Common compile options for C++ + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/src) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/third_party) + +# https://clang.llvm.org/docs/DiagnosticsReference.html +add_compile_options(-Wall -Wextra -Wpedantic -fno-omit-frame-pointer) + +# Turn warnings into errors +add_compile_options(-Werror -Wno-language-extension-token) + +add_compile_options(-Wno-error=unused-command-line-argument) +add_compile_options(-Wno-error=unused-but-set-variable) + +message(STATUS "C++ standard: ${CMAKE_CXX_STANDARD}") diff --git a/apply_model/cpp/cmake/Development.cmake b/apply_model/cpp/cmake/Development.cmake new file mode 100644 index 0000000..8916b56 --- /dev/null +++ b/apply_model/cpp/cmake/Development.cmake @@ -0,0 +1,2 @@ +# https://cmake.org/cmake/help/v3.14/variable/CMAKE_EXPORT_COMPILE_COMMANDS.html +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/apply_model/cpp/model/.gitignore b/apply_model/cpp/model/.gitignore new 
file mode 100644 index 0000000..8d0f665 --- /dev/null +++ b/apply_model/cpp/model/.gitignore @@ -0,0 +1,2 @@ +adult.cbm +catboost_info/ diff --git a/apply_model/cpp/model/train_model.ipynb b/apply_model/cpp/model/train_model.ipynb new file mode 100644 index 0000000..285a873 --- /dev/null +++ b/apply_model/cpp/model/train_model.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# catboost for cpp tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q numpy==1.23.4 pandas catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import absolute_import, division, print_function, unicode_literals" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CatBoost version 1.2\n", + "NumPy version 1.23.4\n", + "Pandas version 2.0.3\n" + ] + } + ], + "source": [ + "import catboost as cb\n", + "import catboost.datasets as cbd\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# print module versions for reproducibility\n", + "print('CatBoost version {}'.format(cb.__version__))\n", + "print('NumPy version {}'.format(np.__version__))\n", + "print('Pandas version {}'.format(pd.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n", + "\n", + " Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n", + " (adult.test) of the dataset.\n", + 
"\n", + " [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n", + " \n" + ] + } + ], + "source": [ + "# We are going to use UCI Adult Data Set because it has both numerical and categorical \n", + "# features and also has missing features.\n", + "print(cbd.adult.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fixed_adult():\n", + " train, test = cbd.adult()\n", + " \n", + " # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n", + " # of the box (see issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n", + " # we have to replace them with some designated string manually. \n", + " for dataset in (train, test, ):\n", + " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n", + " dataset[name].fillna('nan', inplace=True)\n", + " \n", + " X_train, y_train = train.drop('income', axis=1), train.income\n", + " X_test, y_test = test.drop('income', axis=1), test.income\n", + " return X_train, y_train, X_test, y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2431430/2445291177.py:8: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. 
\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n" + ] + } + ], + "source": [ + "X_train, y_train, _, _ = get_fixed_adult()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
039.0State-gov77516.0Bachelors13.0Never-marriedAdm-clericalNot-in-familyWhiteMale2174.00.040.0United-States
150.0Self-emp-not-inc83311.0Bachelors13.0Married-civ-spouseExec-managerialHusbandWhiteMale0.00.013.0United-States
238.0Private215646.0HS-grad9.0DivorcedHandlers-cleanersNot-in-familyWhiteMale0.00.040.0United-States
353.0Private234721.011th7.0Married-civ-spouseHandlers-cleanersHusbandBlackMale0.00.040.0United-States
428.0Private338409.0Bachelors13.0Married-civ-spouseProf-specialtyWifeBlackFemale0.00.040.0Cuba
\n", + "
" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39.0 State-gov 77516.0 Bachelors 13.0 \n", + "1 50.0 Self-emp-not-inc 83311.0 Bachelors 13.0 \n", + "2 38.0 Private 215646.0 HS-grad 9.0 \n", + "3 53.0 Private 234721.0 11th 7.0 \n", + "4 28.0 Private 338409.0 Bachelors 13.0 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174.0 0.0 40.0 United-States \n", + "1 0.0 0.0 13.0 United-States \n", + "2 0.0 0.0 40.0 United-States \n", + "3 0.0 0.0 40.0 United-States \n", + "4 0.0 0.0 40.0 Cuba " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2431430/982828455.py:15: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If you want to find out how we found these parameters check \"Simple classification \n", + "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n", + "# subdirectory of tutorials\n", + "model = cb.CatBoostClassifier(\n", + " class_names=('<=50K', '>50K'),\n", + " loss_function='Logloss',\n", + " eval_metric='AUC', \n", + " custom_metric=['AUC'],\n", + " iterations=100,\n", + " random_seed=20181224,\n", + " learning_rate=0.4234185321620083, \n", + " depth=5, \n", + " l2_leaf_reg=9.464266235679002)\n", + "model.fit(\n", + " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n", + " verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('adult.cbm')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "72K\tadult.cbm\n" + ] + } + ], + "source": [ + "!du -sh adult.cbm" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/apply_model/cpp/third_party/CMakeLists.txt 
b/apply_model/cpp/third_party/CMakeLists.txt new file mode 100644 index 0000000..2330eae --- /dev/null +++ b/apply_model/cpp/third_party/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(argparse) +add_subdirectory(catboost) diff --git a/apply_model/cpp/third_party/argparse/CMakeLists.txt b/apply_model/cpp/third_party/argparse/CMakeLists.txt new file mode 100644 index 0000000..9153f1b --- /dev/null +++ b/apply_model/cpp/third_party/argparse/CMakeLists.txt @@ -0,0 +1,9 @@ +include(FetchContent) + +FetchContent_Declare( + argparse + GIT_REPOSITORY https://github.com/p-ranav/argparse.git + GIT_TAG v2.9 + GIT_PROGRESS TRUE +) +FetchContent_MakeAvailable(argparse) diff --git a/apply_model/cpp/third_party/catboost/CMakeLists.txt b/apply_model/cpp/third_party/catboost/CMakeLists.txt new file mode 100644 index 0000000..22e6c1e --- /dev/null +++ b/apply_model/cpp/third_party/catboost/CMakeLists.txt @@ -0,0 +1,33 @@ +set(CATBOOST_TAG v1.2) +set(CATBOOST_BASE_PATH ${CMAKE_BINARY_DIR}/third_party/catboost) + +file( + DOWNLOAD + https://github.com/catboost/catboost/releases/download/${CATBOOST_TAG}/libcatboostmodel.so + ${CATBOOST_BASE_PATH}/libcatboostmodel.so + SHOW_PROGRESS +) + +file( + DOWNLOAD + https://raw.githubusercontent.com/catboost/catboost/${CATBOOST_TAG}/catboost/libs/model_interface/c_api.cpp + ${CATBOOST_BASE_PATH}/c_api.cpp + SHOW_PROGRESS +) + +file( + DOWNLOAD + https://raw.githubusercontent.com/catboost/catboost/${CATBOOST_TAG}/catboost/libs/model_interface/c_api.h + ${CATBOOST_BASE_PATH}/c_api.h + SHOW_PROGRESS +) + +file( + DOWNLOAD + https://raw.githubusercontent.com/catboost/catboost/${CATBOOST_TAG}/catboost/libs/model_interface/wrapped_calcer.h + ${CATBOOST_BASE_PATH}/wrapped_calcer.h + SHOW_PROGRESS +) + +add_library(catboost SHARED IMPORTED GLOBAL) +set_property(TARGET catboost PROPERTY IMPORTED_LOCATION ${CATBOOST_BASE_PATH}/libcatboostmodel.so)