From 03065ee935f90f31fc6757b79b0b4030cd2c8423 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 07:39:50 +0900 Subject: [PATCH 01/31] Add test file --- tests/train.R | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/train.R diff --git a/tests/train.R b/tests/train.R new file mode 100644 index 0000000..030fc2b --- /dev/null +++ b/tests/train.R @@ -0,0 +1,30 @@ +library(quanteda) +library(word2vec) + +corp <- data_corpus_inaugural %>% + corpus_reshape() +toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) +lis <- as.list(toks) +txt <- stringi::stri_c_list(lis, " ") + +mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, + verbose = TRUE, threads = 4) +emb_lis <- as.matrix(mod_lis) +dim(emb_lis) +predict(mod_lis, c("people", "American"), type = "nearest") + +mod_txt <- word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, + verbose = TRUE, threads = 4) +emb_txt <- as.matrix(mod_txt) +dim(emb_txt) +predict(mod_txt, c("people", "American"), type = "nearest") + + +microbenchmark::microbenchmark( + "lis" = word2vec(lis, dim = 50, iter = 5, min_count = 5, + verbose = FALSE, threads = 10), + "txt" = word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, + verbose = FALSE, threads = 10), + times = 10 +) + From ad51374f0a6041c5138bbfb109f47ddea8d93cfc Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 07:40:19 +0900 Subject: [PATCH 02/31] Change to serialized tokens --- src/rcpp_word2vec.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 8be203d..5161f7a 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -75,7 +75,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, std::size_t vocWords; std::size_t trainWords; std::size_t totalWords; - if (verbose) { + if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = 
model->train(trainSettings, corpus, trainFile, stopWordsFile, // NOTE: remove @@ -112,7 +112,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, << _percent << "%" << std::flush; */ - p.update(50+(_percent/2)); + p.update(50 + (_percent / 2)); } ); //std::cout << std::endl; From 11dfd0c7346cfb37eb189029c8239bb010260a31 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 07:46:22 +0900 Subject: [PATCH 03/31] Move header file --- src/word2vec/{include => lib}/word2vec.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) rename src/word2vec/{include => lib}/word2vec.hpp (97%) diff --git a/src/word2vec/include/word2vec.hpp b/src/word2vec/lib/word2vec.hpp similarity index 97% rename from src/word2vec/include/word2vec.hpp rename to src/word2vec/lib/word2vec.hpp index fd9f1d2..ef2961e 100644 --- a/src/word2vec/include/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,8 +19,8 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; +typedef std::vector words_t; +typedef std::vector text_t; typedef std::vector texts_t; namespace w2v { @@ -31,11 +31,15 @@ namespace w2v { class corpus_t final { public: texts_t texts; + words_t types; words_t stopWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, words_t _stopWords): texts(_texts), stopWords(_stopWords) {} + // corpus_t(texts_t _texts, words_t _types, words_t _stopWords): + // texts(_texts), types(_types), stopWords(_stopWords) {} + corpus_t(texts_t _texts, words_t _stopWords): + texts(_texts), stopWords(_stopWords) {} }; From 232317053515178985cc9826cff23cd29c431ec0 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 18:26:57 +0900 Subject: [PATCH 04/31] Disable parameters for file inputs --- R/RcppExports.R | 4 ++-- src/RcppExports.cpp | 17 ++++++++------ src/rcpp_word2vec.cpp | 14 +++++------ src/word2vec/lib/trainer.cpp | 15 +++++++----- src/word2vec/lib/trainer.hpp | 2 +- src/word2vec/lib/word2vec.cpp | 44 
+++++++++++++++++------------------ src/word2vec/lib/word2vec.hpp | 10 ++++---- 7 files changed, 56 insertions(+), 50 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 279c425..c7b440a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) +w2v_train <- function(texts_, stopWords_, modelFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) } w2v_load_model <- function(file, normalize = TRUE) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 9b55f46..b34d216 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,17 +5,20 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM 
+Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string trainFile, std::string modelFile, std::string stopWordsFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP trainFileSEXP, SEXP modelFileSEXP, SEXP stopWordsFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::IntegerVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); 
- Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); - Rcpp::traits::input_parameter< std::string >::type trainFile(trainFileSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type stopWords_(stopWords_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); - Rcpp::traits::input_parameter< std::string >::type stopWordsFile(stopWordsFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); Rcpp::traits::input_parameter< uint8_t >::type window(windowSEXP); @@ -32,7 +35,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::string >::type endOfSentenceChars(endOfSentenceCharsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -153,7 +156,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 21}, + {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 19}, {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 5161f7a..115070b 100644 --- a/src/rcpp_word2vec.cpp +++ 
b/src/rcpp_word2vec.cpp @@ -11,10 +11,8 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, - Rcpp::CharacterVector stopWords_, - std::string trainFile, // NOTE: remove + Rcpp::IntegerVector stopWords_, std::string modelFile, - std::string stopWordsFile, // NOTE: remove uint16_t minWordFreq = 5, uint16_t size = 100, uint8_t window = 5, @@ -78,9 +76,9 @@ Rcpp::List w2v_train(Rcpp::List texts_, if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = model->train(trainSettings, corpus, - trainFile, stopWordsFile, // NOTE: remove + //trainFile, stopWordsFile, // NOTE: remove [&p] (float _percent) { - p.update(_percent/2); + p.update(_percent / 2); /* std::cout << "\rParsing train data... " << std::fixed << std::setprecision(2) @@ -118,7 +116,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, //std::cout << std::endl; } else { trained = model->train(trainSettings, corpus, - trainFile, stopWordsFile, // NOTE: remove + //trainFile, stopWordsFile, // NOTE: remove nullptr, [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { /* @@ -153,8 +151,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::List out = Rcpp::List::create( Rcpp::Named("model") = model, Rcpp::Named("data") = Rcpp::List::create( - Rcpp::Named("file") = trainFile, - Rcpp::Named("stopwords") = stopWordsFile, + //Rcpp::Named("file") = trainFile, + //Rcpp::Named("stopwords") = stopWordsFile, Rcpp::Named("n") = totalWords, Rcpp::Named("n_vocabulary") = trainWords ), diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 22f6216..d875bf1 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -13,7 +13,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - const std::shared_ptr &_fileMapper, // NOTE: remove + //const std::shared_ptr 
&_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { trainThread_t::sharedData_t sharedData; @@ -26,12 +26,15 @@ namespace w2v { throw std::runtime_error("vocabulary object is not initialized"); } sharedData.vocabulary = _vocabulary; - - if (!_corpus && !_fileMapper) { - throw std::runtime_error("corpus and file mapper objects are not initialized"); + + if (!_corpus) { + throw std::runtime_error("corpus is objects is not initialized"); } - sharedData.corpus = _corpus; - sharedData.fileMapper = _fileMapper; + // if (!_corpus && !_fileMapper) { + // throw std::runtime_error("corpus and file mapper objects are not initialized"); + // } + // sharedData.corpus = _corpus; + // sharedData.fileMapper = _fileMapper; sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _vocabulary->size(), 0.0f)); sharedData.expTable.reset(new std::vector(_trainSettings->expTableSize)); diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index 19acd0b..1506cc7 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -42,7 +42,7 @@ namespace w2v { trainer_t(const std::shared_ptr &_trainSettings, const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - const std::shared_ptr &_fileMapper, // NOTE: remove + //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback); /** diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index ea717a3..dbd8caa 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -14,8 +14,8 @@ namespace w2v { bool w2vModel_t::train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - const std::string &_trainFile, // NOTE: remove - const std::string &_stopWordsFile, // NOTE: remove + //const std::string &_trainFile, // NOTE: remove + //const std::string &_stopWordsFile, // NOTE: remove vocabularyProgressCallback_t _vocabularyProgressCallback, vocabularyStatsCallback_t _vocabularyStatsCallback, 
trainProgressCallback_t _trainProgressCallback) noexcept { @@ -23,32 +23,32 @@ namespace w2v { // store tokens std::shared_ptr corpus(new corpus_t(_corpus)); // map train data set file to memory - std::shared_ptr trainWordsMapper; - if (!_trainFile.empty()) { - trainWordsMapper.reset(new fileMapper_t(_trainFile)); - } - // map stop-words file to memory - std::shared_ptr stopWordsMapper; - if (!_stopWordsFile.empty()) { - stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); - } + // std::shared_ptr trainWordsMapper; + // if (!_trainFile.empty()) { + // trainWordsMapper.reset(new fileMapper_t(_trainFile)); + // } + // // map stop-words file to memory + // std::shared_ptr stopWordsMapper; + // if (!_stopWordsFile.empty()) { + // stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); + // } // build vocabulary, skip stop-words and words with frequency < minWordFreq std::shared_ptr vocabulary; - if (!_trainFile.empty()) { - vocabulary.reset(new vocabulary_t(trainWordsMapper, - stopWordsMapper, - _trainSettings.wordDelimiterChars, - _trainSettings.endOfSentenceChars, - _trainSettings.minWordFreq, - _vocabularyProgressCallback, - _vocabularyStatsCallback)); - } else { + // if (!_trainFile.empty()) { + // vocabulary.reset(new vocabulary_t(trainWordsMapper, + // stopWordsMapper, + // _trainSettings.wordDelimiterChars, + // _trainSettings.endOfSentenceChars, + // _trainSettings.minWordFreq, + // _vocabularyProgressCallback, + // _vocabularyStatsCallback)); + // } else { vocabulary.reset(new vocabulary_t(corpus, _trainSettings.minWordFreq, _vocabularyProgressCallback, _vocabularyStatsCallback)); - } + //} // key words descending ordered by their indexes std::vector words; @@ -61,7 +61,7 @@ namespace w2v { trainer_t(std::make_shared(_trainSettings), vocabulary, corpus, - trainWordsMapper, // NOTE: remove + //trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; diff --git 
a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index ef2961e..fba9f8e 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,8 +19,10 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; +typedef std::vector words_t; +typedef std::vector text_t; +// typedef std::vector words_t; +// typedef std::vector text_t; typedef std::vector texts_t; namespace w2v { @@ -295,8 +297,8 @@ namespace w2v { */ bool train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - const std::string &_trainFile, // NOTE: remove - const std::string &_stopWordsFile, // NOTE: remove + //const std::string &_trainFile, // NOTE: remove + //const std::string &_stopWordsFile, // NOTE: remove vocabularyProgressCallback_t _vocabularyProgressCallback, vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept; From ca8c06214a698adac9ab10678f38d097ad2466cc Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 24 Nov 2023 17:47:38 +0900 Subject: [PATCH 05/31] Remove code for file inputs --- src/rcpp_word2vec.cpp | 19 +++--- src/word2vec/lib/trainThread.cpp | 110 ++++++++++--------------------- src/word2vec/lib/trainThread.hpp | 5 +- src/word2vec/lib/trainer.cpp | 4 +- src/word2vec/lib/trainer.hpp | 2 +- src/word2vec/lib/vocabulary.cpp | 105 +---------------------------- src/word2vec/lib/vocabulary.hpp | 16 ++--- 7 files changed, 60 insertions(+), 201 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 115070b..2eb5021 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -11,8 +11,8 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, - Rcpp::IntegerVector stopWords_, - std::string modelFile, + Rcpp::CharacterVector stopWords_, + std::string modelFile = "", uint16_t minWordFreq = 5, uint16_t size = 100, uint8_t window = 5, @@ -25,8 +25,6 @@ Rcpp::List w2v_train(Rcpp::List texts_, 
uint8_t iterations = 5, float alpha = 0.05, bool withSG = false, - std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", - std::string endOfSentenceChars = ".\n?!", bool verbose = false, bool normalize = true) { @@ -65,8 +63,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, trainSettings.iterations = iterations; trainSettings.alpha = alpha; trainSettings.withSG = withSG; - trainSettings.wordDelimiterChars = wordDelimiterChars; - trainSettings.endOfSentenceChars = endOfSentenceChars; + //trainSettings.wordDelimiterChars = wordDelimiterChars; + //trainSettings.endOfSentenceChars = endOfSentenceChars; Rcpp::XPtr model(new w2v::w2vModel_t(), true); bool trained; @@ -170,9 +168,9 @@ Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::Named("negative") = negative, Rcpp::Named("sample") = sample, Rcpp::Named("expTableSize") = expTableSize, - Rcpp::Named("expValueMax") = expValueMax, - Rcpp::Named("split_words") = wordDelimiterChars, - Rcpp::Named("split_sents") = endOfSentenceChars + Rcpp::Named("expValueMax") = expValueMax + //Rcpp::Named("split_words") = wordDelimiterChars, + //Rcpp::Named("split_sents") = endOfSentenceChars ) ); out.attr("class") = "word2vec_trained"; @@ -407,6 +405,7 @@ Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, return embedding_default; } +/* NOTE: temporarily disabled // [[Rcpp::export]] @@ -463,3 +462,5 @@ Rcpp::DataFrame d2vec_nearest(SEXP ptr_w2v, SEXP ptr_d2v, Rcpp::StringVector x, ); return out; } + + */ diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index c89e222..9edacbb 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -13,7 +13,7 @@ namespace w2v { m_sharedData(_sharedData), m_randomDevice(), m_randomGenerator(m_randomDevice()), m_rndWindowShift(0, static_cast((m_sharedData.trainSettings->window - 1))), m_downSampling(), m_nsDistribution(), m_hiddenLayerVals(), m_hiddenLayerErrors(), - m_wordReader(), m_thread() { + 
m_thread() { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); @@ -42,25 +42,16 @@ namespace w2v { m_hiddenLayerVals.reset(new std::vector(m_sharedData.trainSettings->size)); } - if (!m_sharedData.corpus && !m_sharedData.fileMapper) { - throw std::runtime_error("corpus and file mapper objects are not initialized"); - } - if (m_sharedData.fileMapper) { - auto shift = m_sharedData.fileMapper->size() / m_sharedData.trainSettings->threads; - auto startFrom = shift * _id; - auto stopAt = (_id == m_sharedData.trainSettings->threads - 1) - ? (m_sharedData.fileMapper->size() - 1) : (shift * (_id + 1)); - m_wordReader.reset(new wordReader_t(*m_sharedData.fileMapper, - m_sharedData.trainSettings->wordDelimiterChars, - m_sharedData.trainSettings->endOfSentenceChars, - startFrom, stopAt)); - } else { - // NOTE: specify range for workers - auto n = m_sharedData.corpus->texts.size(); - auto threads = m_sharedData.trainSettings->threads; - range = std::make_pair(floor((n / threads) * _id), - floor((n / threads) * (_id + 1)) - 1); + if (!m_sharedData.corpus) { + throw std::runtime_error("corpus object is not initialized"); } + + // NOTE: specify range for workers + auto n = m_sharedData.corpus->texts.size(); + auto threads = m_sharedData.trainSettings->threads; + range = std::make_pair(floor((n / threads) * _id), + floor((n / threads) * (_id + 1)) - 1); + } void trainThread_t::worker(std::vector &_trainMatrix) noexcept { @@ -71,9 +62,6 @@ namespace w2v { std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - if (m_sharedData.fileMapper) - m_wordReader->reset(); - std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations * m_sharedData.vocabulary->trainWords(); @@ -100,65 +88,37 @@ namespace w2v { // read sentence std::vector sentence; - if (m_sharedData.fileMapper) { - while (true) { - std::string word; - if (!m_wordReader->nextWord(word)) { 
- exitFlag = true; // EOF or end of requested region - break; - } - if (word.empty()) { - break; // end of sentence - } - - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } - - threadProcessedWords++; - - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - continue; // skip this word - } - } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); + + // Rcpp::Rcout << "h: " << h << "\n"; + if (h > range.second) { + exitFlag = true; // EOF or end of requested region + break; + } + text_t text = m_sharedData.corpus->texts[h]; + + for (size_t i = 0; i < text.size(); i++) { + + std::string word = text[i]; + if (word.empty()) { + continue; // padding } - - } else { - // Rcpp::Rcout << "h: " << h << "\n"; - if (h > range.second) { - exitFlag = true; // EOF or end of requested region - break; + auto wordData = m_sharedData.vocabulary->data(word); + if (wordData == nullptr) { + continue; // no such word } - text_t text = m_sharedData.corpus->texts[h]; - for (size_t i = 0; i < text.size(); i++) { - - std::string word = text[i]; - if (word.empty()) { - continue; // padding - } - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } - - threadProcessedWords++; - - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - continue; // skip this word - } + threadProcessedWords++; + + if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... 
+ if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { + continue; // skip this word } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); } + //if (h == 1) + // Rcpp::Rcout << word << ": " << wordData->index << "\n"; + sentence.push_back(wordData); } + if (m_sharedData.trainSettings->withSG) { skipGram(sentence, _trainMatrix); } else { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 3e3b0ac..4cafc36 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -18,7 +18,7 @@ #include #include "word2vec.hpp" -#include "wordReader.hpp" +//#include "wordReader.hpp" #include "vocabulary.hpp" #include "huffmanTree.hpp" #include "nsDistribution.hpp" @@ -43,7 +43,7 @@ namespace w2v { std::shared_ptr trainSettings; ///< trainSettings structure std::shared_ptr vocabulary; ///< words data std::shared_ptr corpus; ///< train data - std::shared_ptr fileMapper; /// NOTE: remove + //std::shared_ptr fileMapper; /// NOTE: remove std::shared_ptr> bpWeights; ///< back propagation weights std::shared_ptr> expTable; ///< exp(x) / (exp(x) + 1) values lookup table std::shared_ptr huffmanTree; ///< Huffman tree used by hierarchical softmax @@ -65,7 +65,6 @@ namespace w2v { std::unique_ptr m_nsDistribution; std::unique_ptr> m_hiddenLayerVals; std::unique_ptr> m_hiddenLayerErrors; - std::unique_ptr> m_wordReader; std::unique_ptr m_thread; public: diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index d875bf1..5836c55 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -28,8 +28,10 @@ namespace w2v { sharedData.vocabulary = _vocabulary; if (!_corpus) { - throw std::runtime_error("corpus is objects is not initialized"); + throw std::runtime_error("corpus is object is not initialized"); } + sharedData.corpus = _corpus; + // if (!_corpus && !_fileMapper) { // throw std::runtime_error("corpus and file mapper 
objects are not initialized"); // } diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index 1506cc7..fcfba3b 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -15,7 +15,7 @@ #include #include "word2vec.hpp" -#include "wordReader.hpp" +//#include "wordReader.hpp" #include "vocabulary.hpp" #include "trainThread.hpp" diff --git a/src/word2vec/lib/vocabulary.cpp b/src/word2vec/lib/vocabulary.cpp index 7c4a471..a7d61b4 100644 --- a/src/word2vec/lib/vocabulary.cpp +++ b/src/word2vec/lib/vocabulary.cpp @@ -7,112 +7,9 @@ */ #include "vocabulary.hpp" -#include "wordReader.hpp" +//#include "wordReader.hpp" namespace w2v { - vocabulary_t::vocabulary_t(std::shared_ptr &_trainWordsMapper, - std::shared_ptr &_stopWordsMapper, - const std::string &_wordDelimiterChars, - const std::string &_endOfSentenceChars, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept: m_words() { - // load stop-words - std::vector stopWords; - if (_stopWordsMapper) { - wordReader_t wordReader(*_stopWordsMapper, _wordDelimiterChars, _endOfSentenceChars); - std::string word; - while (wordReader.nextWord(word)) { - stopWords.push_back(word); - } - } - - // load words and calculate their frequencies - struct tmpWordData_t { - std::size_t frequency = 0; - std::string word; - }; - std::unordered_map tmpWords; - off_t progressOffset = 0; - if (_trainWordsMapper) { - wordReader_t wordReader(*_trainWordsMapper, _wordDelimiterChars, _endOfSentenceChars); - std::string word; - while (wordReader.nextWord(word)) { - if (word.empty()) { - word = ""; - } - auto &i = tmpWords[word]; - if (i.frequency == 0) { - i.word = word; - } - i.frequency++; - m_totalWords++; - - if (_progressCallback != nullptr) { - if (wordReader.offset() - progressOffset >= _trainWordsMapper->size() / 10000 - 1) { - _progressCallback(static_cast(wordReader.offset()) - / _trainWordsMapper->size() * 
100.0f); - progressOffset = wordReader.offset(); - } - } - } - } - - // remove stop words from the words set - for (auto &i:stopWords) { - tmpWords.erase(i); - } - - // remove sentence delimiter from the words set - { - std::string word = ""; - auto i = tmpWords.find(word); - if (i != tmpWords.end()) { - m_totalWords -= i->second.frequency; - tmpWords.erase(i); - } - } - - // prepare vector sorted by word frequencies - std::vector> wordsFreq; - // delimiter is the first word - wordsFreq.emplace_back(std::pair("", 0LU)); - for (auto const &i:tmpWords) { - //Rcpp::Rcout << i.first << ": " << i.second.frequency << "\n"; - if (i.second.frequency >= _minFreq) { - wordsFreq.emplace_back(std::pair(i.first, i.second.frequency)); - m_trainWords += i.second.frequency; - } - } - - // sorting, from more frequent to less frequent, skip delimiter (first word) - if (wordsFreq.size() > 1) { - std::sort(wordsFreq.begin() + 1, wordsFreq.end(), [](const std::pair &_what, - const std::pair &_with) { - if(_what.second == _with.second){ - return _what.first > _with.first; - }else{ - return _what.second > _with.second; - } - }); - // make delimiter frequency more then the most frequent word - wordsFreq[0].second = wordsFreq[1].second + 1; - // restore sentence delimiter - auto &i = tmpWords[""]; - i.word = ""; - i.frequency = wordsFreq[0].second; - } - // fill index values - for (std::size_t i = 0; i < wordsFreq.size(); ++i) { - auto &w = tmpWords[wordsFreq[i].first]; - m_words[wordsFreq[i].first] = wordData_t(i, w.frequency); - //Rcpp::Rcout << i << " " << wordsFreq[i].first << ": " << wordsFreq[i].second << "\n"; - } - - if (_statsCallback != nullptr) { - _statsCallback(m_words.size(), m_trainWords, m_totalWords); - } - } vocabulary_t::vocabulary_t(std::shared_ptr &_corpus, uint16_t _minFreq, diff --git a/src/word2vec/lib/vocabulary.hpp b/src/word2vec/lib/vocabulary.hpp index 05793fe..28b0a99 100644 --- a/src/word2vec/lib/vocabulary.hpp +++ b/src/word2vec/lib/vocabulary.hpp @@ -16,7 
+16,7 @@ #include #include "word2vec.hpp" -#include "mapper.hpp" +//#include "mapper.hpp" namespace w2v { /** @@ -62,13 +62,13 @@ namespace w2v { * @param _statsCallback callback function to be called on train data loaded event to pass vocabulary size, * train words and total words amounts. */ - vocabulary_t(std::shared_ptr &_trainWordsMapper, - std::shared_ptr &_stopWordsMapper, - const std::string &_wordDelimiterChars, - const std::string &_endOfSentenceChars, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; + // vocabulary_t(std::shared_ptr &_trainWordsMapper, + // std::shared_ptr &_stopWordsMapper, + // const std::string &_wordDelimiterChars, + // const std::string &_endOfSentenceChars, + // uint16_t _minFreq, + // w2vModel_t::vocabularyProgressCallback_t _progressCallback, + // w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; vocabulary_t(std::shared_ptr &_corpus, uint16_t _minFreq, From b1867425b5ef5f205898d24f57b356723f08eaa2 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Thu, 30 Nov 2023 13:58:03 +0900 Subject: [PATCH 06/31] Remove method for character --- NAMESPACE | 1 - R/RcppExports.R | 12 +-- R/word2vec.R | 79 ++---------------- man/word2vec.character.Rd | 167 -------------------------------------- man/word2vec.list.Rd | 2 +- src/RcppExports.cpp | 41 ++-------- 6 files changed, 13 insertions(+), 289 deletions(-) delete mode 100644 man/word2vec.character.Rd diff --git a/NAMESPACE b/NAMESPACE index 816efe4..e2f823c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ S3method(predict,word2vec) S3method(predict,word2vec_trained) S3method(summary,word2vec) S3method(summary,word2vec_trained) -S3method(word2vec,character) S3method(word2vec,list) export(doc2vec) export(read.word2vec) diff --git a/R/RcppExports.R b/R/RcppExports.R index c7b440a..d1533a5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using 
Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, modelFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) +w2v_train <- function(texts_, stopWords_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize) } w2v_load_model <- function(file, normalize = TRUE) { @@ -33,11 +33,3 @@ w2v_read_binary <- function(modelFile, normalize, n) { .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n) } -d2vec <- function(ptr, x, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { - .Call('_word2vec_d2vec', PACKAGE = 'word2vec', ptr, x, wordDelimiterChars) -} - -d2vec_nearest <- function(ptr_w2v, ptr_d2v, x, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { - .Call('_word2vec_d2vec_nearest', PACKAGE = 'word2vec', ptr_w2v, ptr_d2v, x, wordDelimiterChars) -} - diff --git a/R/word2vec.R b/R/word2vec.R index 781b9d7..2eb2433 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -126,75 +126,6 
@@ word2vec <- function(x, UseMethod("word2vec") } -#' @inherit word2vec title description params details seealso return references examples -#' @param split a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x} -#' @param encoding the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'. -#' Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument -#' is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector. -#' @param useBytes logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}. -#' @export -word2vec.character <- function(x, - type = c("cbow", "skip-gram"), - dim = 50, window = ifelse(type == "cbow", 5L, 10L), - iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), - threads = 1L, - split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", - ".\n?!"), - encoding = "UTF-8", - useBytes = TRUE, - ...){ - type <- match.arg(type) - stopw <- stopwords - model <- file.path(tempdir(), "w2v.bin") - if(length(stopw) == 0){ - stopw <- "" - } - file_stopwords <- tempfile() - filehandle_stopwords <- file(file_stopwords, open = "wt", encoding = encoding) - writeLines(stopw, con = filehandle_stopwords, useBytes = useBytes) - close(filehandle_stopwords) - on.exit({ - if (file.exists(file_stopwords)) file.remove(file_stopwords) - }) - if(length(x) == 1){ - file_train <- x - }else{ - file_train <- tempfile(pattern = "textspace_", fileext = ".txt") - on.exit({ - if (file.exists(file_stopwords)) file.remove(file_stopwords) - if (file.exists(file_train)) file.remove(file_train) - }) - filehandle_train <- file(file_train, open = "wt", encoding = encoding) - writeLines(text = x, con = filehandle_train, useBytes = useBytes) - 
close(filehandle_train) - } - #expTableSize <- 1000L - #expValueMax <- 6L - #expTableSize <- as.integer(expTableSize) - #expValueMax <- as.integer(expValueMax) - min_count <- as.integer(min_count) - dim <- as.integer(dim) - window <- as.integer(window) - iter <- as.integer(iter) - sample <- as.numeric(sample) - hs <- as.logical(hs) - negative <- as.integer(negative) - threads <- as.integer(threads) - iter <- as.integer(iter) - lr <- as.numeric(lr) - skipgram <- as.logical(type %in% "skip-gram") - split <- as.character(split) - model <- w2v_train(list(), character(), - trainFile = file_train, modelFile = model, stopWordsFile = file_stopwords, - minWordFreq = min_count, - size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, - sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, wordDelimiterChars = split[1], endOfSentenceChars = split[2], ...) - model$data$stopwords <- stopwords - model -} - #' @inherit word2vec title description params details seealso return references #' @export #' @examples @@ -229,12 +160,12 @@ word2vec.list <- function(x, type = c("cbow", "skip-gram"), dim = 50, window = ifelse(type == "cbow", 5L, 10L), iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), + stopwords = integer(), threads = 1L, ...){ - x <- lapply(x, as.character) + #x <- lapply(x, as.character) type <- match.arg(type) - stopwords <- as.character(stopwords) + stopwords <- as.integer(stopwords) model <- file.path(tempdir(), "w2v.bin") #expTableSize <- 1000L #expValueMax <- 6L @@ -253,11 +184,11 @@ word2vec.list <- function(x, skipgram <- as.logical(type %in% "skip-gram") encoding <- "UTF-8" model <- w2v_train(x, stopwords, - trainFile = "", modelFile = model, stopWordsFile = "", + modelFile = model, minWordFreq = min_count, size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, sample = sample, 
withHS = hs, negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, wordDelimiterChars = "", endOfSentenceChars = "", ...) + alpha = lr, withSG = skipgram, ...) model$data$stopwords <- stopwords model } diff --git a/man/word2vec.character.Rd b/man/word2vec.character.Rd deleted file mode 100644 index 6a4aaa9..0000000 --- a/man/word2vec.character.Rd +++ /dev/null @@ -1,167 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/word2vec.R -\name{word2vec.character} -\alias{word2vec.character} -\title{Train a word2vec model on text} -\usage{ -\method{word2vec}{character}( - x, - type = c("cbow", "skip-gram"), - dim = 50, - window = ifelse(type == "cbow", 5L, 10L), - iter = 5L, - lr = 0.05, - hs = FALSE, - negative = 5L, - sample = 0.001, - min_count = 5L, - stopwords = character(), - threads = 1L, - split = c(" \\n,.-!?:;/\\"#$\%&'()*+<=>@[]\\\\^_`{|}~\\t\\v\\f\\r", ".\\n?!"), - encoding = "UTF-8", - useBytes = TRUE, - ... -) -} -\arguments{ -\item{x}{a character vector with text or the path to the file on disk containing training data or a list of tokens. See the examples.} - -\item{type}{the type of algorithm to use, either 'cbow' or 'skip-gram'. Defaults to 'cbow'} - -\item{dim}{dimension of the word vectors. Defaults to 50.} - -\item{window}{skip length between words. Defaults to 5.} - -\item{iter}{number of training iterations. Defaults to 5.} - -\item{lr}{initial learning rate also known as alpha. Defaults to 0.05} - -\item{hs}{logical indicating to use hierarchical softmax instead of negative sampling. Defaults to FALSE indicating to do negative sampling.} - -\item{negative}{integer with the number of negative samples. Only used in case hs is set to FALSE} - -\item{sample}{threshold for occurrence of words. Defaults to 0.001} - -\item{min_count}{integer indicating the number of time a word should occur to be considered as part of the training vocabulary. 
Defaults to 5.} - -\item{stopwords}{a character vector of stopwords to exclude from training} - -\item{threads}{number of CPU threads to use. Defaults to 1.} - -\item{split}{a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x}} - -\item{encoding}{the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'. -Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument -is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.} - -\item{useBytes}{logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}.} - -\item{...}{further arguments passed on to the methods \code{\link{word2vec.character}}, \code{\link{word2vec.list}} as well as the C++ function \code{w2v_train} - for expert use only} -} -\value{ -an object of class \code{w2v_trained} which is a list with elements -\itemize{ -\item{model: a Rcpp pointer to the model} -\item{data: a list with elements file: the training data used, stopwords: the character vector of stopwords, n} -\item{vocabulary: the number of words in the vocabulary} -\item{success: logical indicating if training succeeded} -\item{error_log: the error log in case training failed} -\item{control: as list of the training arguments used, namely min_count, dim, window, iter, lr, skipgram, hs, negative, sample, split_words, split_sents, expTableSize and expValueMax} -} -} -\description{ -Construct a word2vec model on text. The algorithm is explained at \url{https://arxiv.org/pdf/1310.4546.pdf} -} -\details{ -Some advice on the optimal set of parameters to use for training as defined by Mikolov et al. 
-\itemize{ -\item{argument type: skip-gram (slower, better for infrequent words) vs cbow (fast)} -\item{argument hs: the training algorithm: hierarchical softmax (better for infrequent words) vs negative sampling (better for frequent words, better with low dimensional vectors)} -\item{argument dim: dimensionality of the word vectors: usually more is better, but not always} -\item{argument window: for skip-gram usually around 10, for cbow around 5} -\item{argument sample: sub-sampling of frequent words: can improve both accuracy and speed for large data sets (useful values are in range 0.001 to 0.00001)} -} -} -\examples{ -\dontshow{if(require(udpipe))\{} -library(udpipe) -## Take data and standardise it a bit -data(brussels_reviews, package = "udpipe") -x <- subset(brussels_reviews, language == "nl") -x <- tolower(x$feedback) - -## Build the model get word embeddings and nearest neighbours -model <- word2vec(x = x, dim = 15, iter = 20) -emb <- as.matrix(model) -head(emb) -emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding") -emb -nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5) -nn - -## Get vocabulary -vocab <- summary(model, type = "vocabulary") - -# Do some calculations with the vectors and find similar terms to these -emb <- as.matrix(model) -vector <- emb["buurt", ] - emb["rustige", ] + emb["restaurants", ] -predict(model, vector, type = "nearest", top_n = 10) - -vector <- emb["gastvrouw", ] - emb["gastvrij", ] -predict(model, vector, type = "nearest", top_n = 5) - -vectors <- emb[c("gastheer", "gastvrouw"), ] -vectors <- rbind(vectors, avg = colMeans(vectors)) -predict(model, vectors, type = "nearest", top_n = 10) - -## Save the model to hard disk -path <- "mymodel.bin" -\dontshow{ -path <- tempfile(pattern = "w2v", fileext = ".bin") -} -write.word2vec(model, file = path) -model <- read.word2vec(path) - -\dontshow{ -file.remove(path) -} -## -## Example of word2vec with a list of tokens -## -toks <- strsplit(x, 
split = "[[:space:][:punct:]]+") -model <- word2vec(x = toks, dim = 15, iter = 20) -emb <- as.matrix(model) -emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding") -emb -nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5) -nn - -## -## Example getting word embeddings -## which are different depending on the parts of speech tag -## Look to the help of the udpipe R package -## to get parts of speech tags on text -## -library(udpipe) -data(brussels_reviews_anno, package = "udpipe") -x <- subset(brussels_reviews_anno, language == "fr") -x <- subset(x, grepl(xpos, pattern = paste(LETTERS, collapse = "|"))) -x$text <- sprintf("\%s/\%s", x$lemma, x$xpos) -x <- subset(x, !is.na(lemma)) -x <- split(x$text, list(x$doc_id, x$sentence_id)) - -model <- word2vec(x = x, dim = 15, iter = 20) -emb <- as.matrix(model) -nn <- predict(model, c("cuisine/NN", "rencontrer/VB"), type = "nearest") -nn -nn <- predict(model, c("accueillir/VBN", "accueillir/VBG"), type = "nearest") -nn - -\dontshow{\} # End of main if statement running only if the required packages are installed} -} -\references{ -\url{https://github.com/maxoodf/word2vec}, \url{https://arxiv.org/pdf/1310.4546.pdf} -} -\seealso{ -\code{\link{predict.word2vec}}, \code{\link{as.matrix.word2vec}}, \code{\link{word2vec}}, \code{\link{word2vec.character}}, \code{\link{word2vec.list}} -} diff --git a/man/word2vec.list.Rd b/man/word2vec.list.Rd index c5d93e3..b92d8f8 100644 --- a/man/word2vec.list.Rd +++ b/man/word2vec.list.Rd @@ -15,7 +15,7 @@ negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), + stopwords = integer(), threads = 1L, ... 
) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b34d216..c8bfcda 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,13 +11,13 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::IntegerVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type stopWords_(stopWords_SEXP); + 
Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); @@ -31,11 +31,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< uint8_t >::type iterations(iterationsSEXP); Rcpp::traits::input_parameter< float >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< bool >::type withSG(withSGSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - Rcpp::traits::input_parameter< std::string >::type endOfSentenceChars(endOfSentenceCharsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -127,36 +125,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// d2vec -Rcpp::List d2vec(SEXP ptr, Rcpp::StringVector x, std::string wordDelimiterChars); -RcppExport SEXP _word2vec_d2vec(SEXP ptrSEXP, SEXP xSEXP, SEXP wordDelimiterCharsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); - Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - rcpp_result_gen = Rcpp::wrap(d2vec(ptr, x, wordDelimiterChars)); - return 
rcpp_result_gen; -END_RCPP -} -// d2vec_nearest -Rcpp::DataFrame d2vec_nearest(SEXP ptr_w2v, SEXP ptr_d2v, Rcpp::StringVector x, std::string wordDelimiterChars); -RcppExport SEXP _word2vec_d2vec_nearest(SEXP ptr_w2vSEXP, SEXP ptr_d2vSEXP, SEXP xSEXP, SEXP wordDelimiterCharsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr_w2v(ptr_w2vSEXP); - Rcpp::traits::input_parameter< SEXP >::type ptr_d2v(ptr_d2vSEXP); - Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - rcpp_result_gen = Rcpp::wrap(d2vec_nearest(ptr_w2v, ptr_d2v, x, wordDelimiterChars)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { - {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 19}, + {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, @@ -164,8 +135,6 @@ static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4}, {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3}, - {"_word2vec_d2vec", (DL_FUNC) &_word2vec_d2vec, 3}, - {"_word2vec_d2vec_nearest", (DL_FUNC) &_word2vec_d2vec_nearest, 4}, {NULL, NULL, 0} }; From ffd3b92dbf4151f1f5c2c0827bef58164b7c7179 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Thu, 30 Nov 2023 14:51:05 +0900 Subject: [PATCH 07/31] Change tokens from string to int --- src/rcpp_word2vec.cpp | 10 ++++++--- src/word2vec/lib/trainThread.cpp | 37 ++++++++++++++++---------------- src/word2vec/lib/trainThread.hpp | 4 ++-- src/word2vec/lib/word2vec.hpp | 32 
++++++++++++++++++--------- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 2eb5021..c89179b 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -11,6 +11,7 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, + Rcpp::CharacterVector types_, Rcpp::CharacterVector stopWords_, std::string modelFile = "", uint16_t minWordFreq = 5, @@ -47,9 +48,12 @@ Rcpp::List w2v_train(Rcpp::List texts_, */ texts_t texts = Rcpp::as(texts_); - words_t stopWords = Rcpp::as(stopWords_); - w2v::corpus_t corpus(texts, stopWords); - + types_t types = Rcpp::as(types_); + types_t stopWords = Rcpp::as(stopWords_); + + w2v::corpus_t corpus(texts, types, stopWords); + corpus.setWordFreq(); + w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; trainSettings.size = size; diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 9edacbb..1844ad4 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -87,7 +87,7 @@ namespace w2v { } // read sentence - std::vector sentence; + std::vector sentence; // Rcpp::Rcout << "h: " << h << "\n"; if (h > range.second) { @@ -98,25 +98,26 @@ namespace w2v { for (size_t i = 0; i < text.size(); i++) { - std::string word = text[i]; - if (word.empty()) { + unsigned int word = text[i]; + if (word == 0) { continue; // padding } - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } + // auto wordData = m_sharedData.vocabulary->data(word); + // if (wordData == nullptr) { + // continue; // no such word + // } threadProcessedWords++; if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... 
- if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { + //if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { + if ((*m_downSampling)(m_sharedData.corpus->frequency[word], m_randomGenerator)) { continue; // skip this word } } //if (h == 1) // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); + sentence.push_back(word); } if (m_sharedData.trainSettings->withSG) { @@ -129,7 +130,7 @@ namespace w2v { } } - inline void trainThread_t::cbow(const std::vector &_sentence, + inline void trainThread_t::cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { for (std::size_t i = 0; i < _sentence.size(); ++i) { // hidden layers initialized with 0 values @@ -148,7 +149,7 @@ namespace w2v { continue; } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { - (*m_hiddenLayerVals)[k] += _trainMatrix[k + _sentence[posRndWindow]->index + (*m_hiddenLayerVals)[k] += _trainMatrix[k + _sentence[posRndWindow] * m_sharedData.trainSettings->size]; } cw++; @@ -161,9 +162,9 @@ namespace w2v { } if (m_sharedData.trainSettings->withHS) { - hierarchicalSoftmax(_sentence[i]->index, *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); + hierarchicalSoftmax(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } else { - negativeSampling(_sentence[i]->index, *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); + negativeSampling(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } // hidden -> in @@ -177,14 +178,14 @@ namespace w2v { continue; } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { - _trainMatrix[k + _sentence[posRndWindow]->index * m_sharedData.trainSettings->size] + _trainMatrix[k + _sentence[posRndWindow] * m_sharedData.trainSettings->size] += (*m_hiddenLayerErrors)[k]; } } } } - inline void trainThread_t::skipGram(const std::vector &_sentence, + inline void trainThread_t::skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { for (std::size_t i = 
0; i < _sentence.size(); ++i) { auto rndShift = m_rndWindowShift(m_randomGenerator); @@ -198,15 +199,15 @@ namespace w2v { continue; } // shift to the selected word vector in the matrix - auto shift = _sentence[posRndWindow]->index * m_sharedData.trainSettings->size; + auto shift = _sentence[posRndWindow] * m_sharedData.trainSettings->size; // hidden layer initialized with 0 values std::memset(m_hiddenLayerErrors->data(), 0, m_hiddenLayerErrors->size() * sizeof(float)); if (m_sharedData.trainSettings->withHS) { - hierarchicalSoftmax(_sentence[i]->index, (*m_hiddenLayerErrors), _trainMatrix, shift); + hierarchicalSoftmax(_sentence[i], (*m_hiddenLayerErrors), _trainMatrix, shift); } else { - negativeSampling(_sentence[i]->index, (*m_hiddenLayerErrors), _trainMatrix, shift); + negativeSampling(_sentence[i], (*m_hiddenLayerErrors), _trainMatrix, shift); } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 4cafc36..63f5f17 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -90,9 +90,9 @@ namespace w2v { private: void worker(std::vector &_trainMatrix) noexcept; - inline void cbow(const std::vector &_sentence, + inline void cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept; - inline void skipGram(const std::vector &_sentence, + inline void skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept; inline void hierarchicalSoftmax(std::size_t _index, std::vector &_hiddenLayer, diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index fba9f8e..aa47de1 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,11 +19,12 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; -// typedef std::vector words_t; -// typedef std::vector text_t; +typedef std::vector types_t; +// typedef std::vector words_t; +// typedef std::vector 
text_t; +typedef std::vector text_t; typedef std::vector texts_t; +typedef std::vector frequency_t; namespace w2v { @@ -33,16 +34,27 @@ namespace w2v { class corpus_t final { public: texts_t texts; - words_t types; - words_t stopWords; + types_t types; + types_t stopWords; + frequency_t frequency; // Constructors corpus_t(): texts() {} - // corpus_t(texts_t _texts, words_t _types, words_t _stopWords): - // texts(_texts), types(_types), stopWords(_stopWords) {} - corpus_t(texts_t _texts, words_t _stopWords): - texts(_texts), stopWords(_stopWords) {} + corpus_t(texts_t _texts, types_t _types, types_t _stopWords): + texts(_texts), types(_types), stopWords(_stopWords) {} + //corpus_t(texts_t _texts, words_t _stopWords): + // texts(_texts), stopWords(_stopWords) {} + void setWordFreq() { + frequency = frequency_t(types.size(), 0); + for (size_t h = 0; h < texts.size(); h++) { + text_t text = texts[h]; + for (size_t i = 0; i < text.size(); i++) { + unsigned int word = text[i]; + frequency[word - 1]++; + } + } + } }; /** From 267eed91d40fbb78f92d33616efe1c3c9384f5b0 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Wed, 6 Dec 2023 15:01:00 +0900 Subject: [PATCH 08/31] Remove vocabulary --- src/Makevars | 1 - src/Makevars.win | 1 - src/word2vec/lib/vocabulary.cpp | 108 ------------------------ src/word2vec/lib/vocabulary.hpp | 140 -------------------------------- 4 files changed, 250 deletions(-) delete mode 100644 src/word2vec/lib/vocabulary.cpp delete mode 100644 src/word2vec/lib/vocabulary.hpp diff --git a/src/Makevars b/src/Makevars index 9f2426e..620ba58 100644 --- a/src/Makevars +++ b/src/Makevars @@ -6,7 +6,6 @@ SOURCES = word2vec/lib/huffmanTree.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ - word2vec/lib/vocabulary.cpp \ word2vec/lib/word2vec.cpp \ rcpp_word2vec.cpp \ RcppExports.cpp diff --git a/src/Makevars.win b/src/Makevars.win index 0affdf1..459c5a1 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ 
-6,7 +6,6 @@ SOURCES = word2vec/lib/huffmanTree.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ - word2vec/lib/vocabulary.cpp \ word2vec/lib/win/mman.cpp \ word2vec/lib/word2vec.cpp \ rcpp_word2vec.cpp \ diff --git a/src/word2vec/lib/vocabulary.cpp b/src/word2vec/lib/vocabulary.cpp deleted file mode 100644 index a7d61b4..0000000 --- a/src/word2vec/lib/vocabulary.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/** - * @file - * @brief vocabulary class containing word map, words frequencies and word indexes - * @author Max Fomichev - * @date 16.12.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#include "vocabulary.hpp" -//#include "wordReader.hpp" - -namespace w2v { - - vocabulary_t::vocabulary_t(std::shared_ptr &_corpus, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept: m_words() { - - // load words and calculate their frequencies - struct tmpWordData_t { - std::size_t frequency = 0; - std::string word; - }; - std::unordered_map tmpWords; - std::string word; - //off_t progressOffset = 0; - - for (auto &text:_corpus->texts) { - for (auto &word:text) { - // padding - if (word.empty()) { - continue; - } - auto &tmpWordData = tmpWords[word]; - if (tmpWordData.frequency == 0) { - tmpWordData.word = word; - } - tmpWordData.frequency++; - m_totalWords++; - - // if (_progressCallback != nullptr) { - // if (wordReader.offset() - progressOffset >= _trainWordsMapper->size() / 10000 - 1) { - // _progressCallback(static_cast(wordReader.offset()) - // / _trainWordsMapper->size() * 100.0f); - // progressOffset = wordReader.offset(); - // } - // } - } - } - - // remove stop words from the words set - for (auto &i:_corpus->stopWords) { - tmpWords.erase(i); - } - - // remove sentence delimiter from the words set - // { - // std::string word = ""; - // auto i = tmpWords.find(word); - // if (i != 
tmpWords.end()) { - // m_totalWords -= i->second.frequency; - // tmpWords.erase(i); - // } - // } - - // prepare vector sorted by word frequencies - std::vector> wordsFreq; - // delimiter is the first word - wordsFreq.emplace_back(std::pair("", 0LU)); - for (auto const &i:tmpWords) { - if (i.second.frequency >= _minFreq) { - wordsFreq.emplace_back(std::pair(i.first, i.second.frequency)); - m_trainWords += i.second.frequency; - } - } - - // sorting, from more frequent to less frequent, skip delimiter (first word) - if (wordsFreq.size() > 1) { - std::sort(wordsFreq.begin() + 1, wordsFreq.end(), [](const std::pair &_what, - const std::pair &_with) { - if(_what.second == _with.second){ - return _what.first > _with.first; - }else{ - return _what.second > _with.second; - } - - }); - // NOTE: should the index 0 be non word? - // make delimiter frequency more then the most frequent word - wordsFreq[0].second = wordsFreq[1].second + 1; - // restore sentence delimiter - auto &i = tmpWords[""]; - i.word = ""; - i.frequency = wordsFreq[0].second; - } - // fill index values - //wordsFreq.emplace(wordsFreq.begin(), 0, std::pair("", 0U)); // NOTE: insert dummy - for (std::size_t i = 0; i < wordsFreq.size(); ++i) { - auto &w = tmpWords[wordsFreq[i].first]; - m_words[wordsFreq[i].first] = wordData_t(i, w.frequency); - //Rcpp::Rcout << i << " " << wordsFreq[i].first << ": " << wordsFreq[i].second << "\n"; - } - - if (_statsCallback != nullptr) { - _statsCallback(m_words.size(), m_trainWords, m_totalWords); - } - } -} diff --git a/src/word2vec/lib/vocabulary.hpp b/src/word2vec/lib/vocabulary.hpp deleted file mode 100644 index 28b0a99..0000000 --- a/src/word2vec/lib/vocabulary.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/** - * @file - * @brief vocabulary class containing word map, words frequencies and word indexes - * @author Max Fomichev - * @date 16.12.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ -#ifndef WORD2VEC_VOCABULARY_H -#define 
WORD2VEC_VOCABULARY_H - -#include -#include -#include -#include -#include -#include - -#include "word2vec.hpp" -//#include "mapper.hpp" - -namespace w2v { - /** - * @brief vocabulary class - implements fast access to a words storage with their data - index and frequency. - * - * Vocabulary contains parsed words with minimum defined frequency, excluding stop words defined in a text file. - * Base word storage is the std::unordered_map object. - * - */ - class vocabulary_t final { - public: - /** - * @brief wordData structure is a stored word parameters - index and frequency - */ - struct wordData_t final { - std::size_t index; ///< word index (more frequent words have lower index value) - std::size_t frequency; ///< word frequency in a train data set - - /// Constructs an empty wordData object - wordData_t() noexcept: index(0), frequency(0) {} - /// Constructs a wordObject with the specified parameters - wordData_t(std::size_t _index, std::size_t _frequency) noexcept: - index(_index), frequency(_frequency) {} - }; - - private: - // word (key) with its index and frequency - using wordMap_t = std::unordered_map; - - std::size_t m_trainWords = 0; - std::size_t m_totalWords = 0; - - wordMap_t m_words; - - public: - /** - * Constructs a vocabulary object from the specified files and parameters - * @param _trainWordsMapper smart pointer to fileMapper object related to a train data set file - * @param _stopWordsMapper smart pointer to fileMapper object related to a file with stop-words. - * In case of unititialized pointer, _stopWordsMapper will be ignored. - * @param _minFreq minimum word frequency to include into vocabulary - * @param _progressCallback callback function to be called on each new 0.01% processed train data - * @param _statsCallback callback function to be called on train data loaded event to pass vocabulary size, - * train words and total words amounts. 
- */ - // vocabulary_t(std::shared_ptr &_trainWordsMapper, - // std::shared_ptr &_stopWordsMapper, - // const std::string &_wordDelimiterChars, - // const std::string &_endOfSentenceChars, - // uint16_t _minFreq, - // w2vModel_t::vocabularyProgressCallback_t _progressCallback, - // w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; - - vocabulary_t(std::shared_ptr &_corpus, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; - - /** - * Requests a data (index, frequency, word) associated with the _word - * @param[in] _word key value - * @return pointer to a wordData object or nullptr if the word is not a member of vocabulary - */ - inline const wordData_t *data(const std::string &_word) const noexcept { - auto i = m_words.find(_word); - if (i != m_words.end()) { - return &(i->second); - } else { - return nullptr; - } - } - - /// @retrns vocabulary size - inline std::size_t size() const noexcept { - return m_words.size(); - } - - /// @returns total words amount parsed from a train data set - inline std::size_t totalWords() const noexcept { - return m_totalWords; - } - - /// @returns train words amount (totalWords - amount(stop words) - amount(words with low frequency)) - inline std::size_t trainWords() const noexcept { - return m_trainWords; - } - - /** - * Requests word frequencies - * @param[out] _output - vector of word frequencies where vector indexes are word indexes and vector values - * are word frequencies - */ - inline void frequencies(std::vector &_output) const noexcept { - _output.resize(m_words.size()); - for (auto const &i:m_words) { - _output[i.second.index] = i.second.frequency; - } - } - - /** - * Requests words descending sorted by their frequencies - * @param[out] _words vector of word descending sorted by their frequencies - */ - inline void words(std::vector &_words) const noexcept { - _words.clear(); - std::vector> indexedWords; - for 
(auto const &i:m_words) { - indexedWords.emplace_back(std::pair(i.second.index, i.first)); - } - std::sort(indexedWords.begin(), indexedWords.end(), [](const std::pair &_what, - const std::pair &_with) { - return _what.first < _with.first; - }); - for (auto const &i:indexedWords) { - _words.push_back(i.second); - } - } - }; -} - -#endif // WORD2VEC_VOCABULARY_H From 2d463477e5da5b8cbcfc4dac5aa9b56c2a6ae5f8 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 07:20:14 +0900 Subject: [PATCH 09/31] Set frequency --- src/word2vec/lib/word2vec.hpp | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index aa47de1..dea4df2 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -22,9 +22,10 @@ typedef std::vector types_t; // typedef std::vector words_t; // typedef std::vector text_t; +typedef std::vector words_t; typedef std::vector text_t; typedef std::vector texts_t; -typedef std::vector frequency_t; +typedef std::vector frequency_t; namespace w2v { @@ -35,25 +36,52 @@ namespace w2v { public: texts_t texts; types_t types; - types_t stopWords; + words_t stopWords; frequency_t frequency; + size_t totalWords; + size_t trainWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, types_t _types, types_t _stopWords): + corpus_t(texts_t _texts, types_t _types, words_t _stopWords): texts(_texts), types(_types), stopWords(_stopWords) {} //corpus_t(texts_t _texts, words_t _stopWords): // texts(_texts), stopWords(_stopWords) {} void setWordFreq() { + Rcpp::Rcout << "here1\n"; + + std::unordered_set setStopWords; + for (size_t g = 0; g < stopWords.size(); g++) { + setStopWords.insert(stopWords[g]); + } + //Rcpp::Rcout << "here2\n"; + //return; + frequency = frequency_t(types.size(), 0); + totalWords = 0; + trainWords = 0; for (size_t h = 0; h < texts.size(); h++) { text_t text = texts[h]; for (size_t i = 0; i < 
text.size(); i++) { - unsigned int word = text[i]; + int word = text[i]; + //Rcpp::Rcout << i << ": " << word << "\n"; + if (word == 0) // padding + continue; + if (word < 0 || frequency.size() < word - 1) + throw std::range_error("setWordFreq: invalid types"); frequency[word - 1]++; + totalWords++; + auto it = setStopWords.find(word); + if (it != setStopWords.end()) { + trainWords++; + } else { + texts[h][i] = 0; + } } } + Rcpp::Rcout << "trainWords: " << trainWords << "\n"; + Rcpp::Rcout << "totalWords: " << totalWords << "\n"; } }; From edd824d468b375a5d9abaab34b017e1fae30a1d0 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 09:02:32 +0900 Subject: [PATCH 10/31] Remove stopwords where types == "" --- src/rcpp_word2vec.cpp | 10 +++++++--- src/word2vec/lib/word2vec.hpp | 29 ++++++++++++----------------- tests/{train.R => test.R} | 11 +++++++++-- 3 files changed, 28 insertions(+), 22 deletions(-) rename tests/{train.R => test.R} (78%) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index c89179b..61c9a27 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -12,7 +12,6 @@ // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector types_, - Rcpp::CharacterVector stopWords_, std::string modelFile = "", uint16_t minWordFreq = 5, uint16_t size = 100, @@ -49,11 +48,16 @@ Rcpp::List w2v_train(Rcpp::List texts_, texts_t texts = Rcpp::as(texts_); types_t types = Rcpp::as(types_); - types_t stopWords = Rcpp::as(stopWords_); - w2v::corpus_t corpus(texts, types, stopWords); + w2v::corpus_t corpus(texts, types); corpus.setWordFreq(); + Rcpp::List out2 = Rcpp::List::create( + Rcpp::Named("frequency") = corpus.frequency + ); + + //return out2; + w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; trainSettings.size = size; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index dea4df2..aec14cc 100644 --- a/src/word2vec/lib/word2vec.hpp +++ 
b/src/word2vec/lib/word2vec.hpp @@ -36,47 +36,42 @@ namespace w2v { public: texts_t texts; types_t types; - words_t stopWords; frequency_t frequency; size_t totalWords; size_t trainWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, types_t _types, words_t _stopWords): - texts(_texts), types(_types), stopWords(_stopWords) {} + corpus_t(texts_t _texts, types_t _types): + texts(_texts), types(_types) {} //corpus_t(texts_t _texts, words_t _stopWords): // texts(_texts), stopWords(_stopWords) {} void setWordFreq() { Rcpp::Rcout << "here1\n"; - std::unordered_set setStopWords; - for (size_t g = 0; g < stopWords.size(); g++) { - setStopWords.insert(stopWords[g]); - } - //Rcpp::Rcout << "here2\n"; - //return; - + std::unordered_set setStopWords; + frequency = frequency_t(types.size(), 0); totalWords = 0; trainWords = 0; for (size_t h = 0; h < texts.size(); h++) { text_t text = texts[h]; for (size_t i = 0; i < text.size(); i++) { - int word = text[i]; + totalWords++; + int &word = text[i]; //Rcpp::Rcout << i << ": " << word << "\n"; if (word == 0) // padding continue; - if (word < 0 || frequency.size() < word - 1) + if (word < 0 || types.size() < word) throw std::range_error("setWordFreq: invalid types"); frequency[word - 1]++; - totalWords++; - auto it = setStopWords.find(word); - if (it != setStopWords.end()) { - trainWords++; + if (types[word - 1].empty()) { + //Rcpp::Rcout << h << " " << i << " remove : " << word << "\n"; + word = 0; // remove and pad } else { - texts[h][i] = 0; + //Rcpp::Rcout << h << " " << i << " count : " << word << "\n"; + trainWords++; } } } diff --git a/tests/train.R b/tests/test.R similarity index 78% rename from tests/train.R rename to tests/test.R index 030fc2b..e232226 100644 --- a/tests/train.R +++ b/tests/test.R @@ -4,8 +4,15 @@ library(word2vec) corp <- data_corpus_inaugural %>% corpus_reshape() toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) -lis <- as.list(toks) -txt <- stringi::stri_c_list(lis, " ") 
+lis <- unclass(toks) + +type <- types(toks) +type[type %in% stopwords()] <- "" +mod <- word2vec:::w2v_train(toks, type, verbose = TRUE) +dim(as.matrix(mod)) + +mod2 <- word2vec:::w2v_train(unclass(toks)[1:10], types(toks), verbose = TRUE) +dim(as.matrix(mod2)) mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, verbose = TRUE, threads = 4) From 112bcc99828113f3b1faa39874baf46ca9e0f8be Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 10:18:36 +0900 Subject: [PATCH 11/31] Don't remove any words --- src/rcpp_word2vec.cpp | 10 +++++----- src/word2vec/lib/word2vec.hpp | 21 ++++++++------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 61c9a27..5982359 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -52,11 +52,11 @@ Rcpp::List w2v_train(Rcpp::List texts_, w2v::corpus_t corpus(texts, types); corpus.setWordFreq(); - Rcpp::List out2 = Rcpp::List::create( - Rcpp::Named("frequency") = corpus.frequency - ); - - //return out2; + // Rcpp::List out2 = Rcpp::List::create( + // Rcpp::Named("frequency") = corpus.frequency + // ); + // + // return out2; w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index aec14cc..9407bb9 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -48,10 +48,7 @@ namespace w2v { // texts(_texts), stopWords(_stopWords) {} void setWordFreq() { - Rcpp::Rcout << "here1\n"; - std::unordered_set setStopWords; - frequency = frequency_t(types.size(), 0); totalWords = 0; trainWords = 0; @@ -59,20 +56,18 @@ namespace w2v { text_t text = texts[h]; for (size_t i = 0; i < text.size(); i++) { totalWords++; - int &word = text[i]; + auto &word = text[i]; //Rcpp::Rcout << i << ": " << word << "\n"; - if (word == 0) // padding - continue; if (word < 0 || types.size() < word) throw std::range_error("setWordFreq: invalid 
types"); + if (word == 0) // padding + continue; + // if (types[word - 1].empty()) { + // word = 0; // remove and pad + // continue; + // } frequency[word - 1]++; - if (types[word - 1].empty()) { - //Rcpp::Rcout << h << " " << i << " remove : " << word << "\n"; - word = 0; // remove and pad - } else { - //Rcpp::Rcout << h << " " << i << " count : " << word << "\n"; - trainWords++; - } + trainWords++; } } Rcpp::Rcout << "trainWords: " << trainWords << "\n"; From e1de9dc25c0a6bafd790029aae962b9a543031da Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 10:32:08 +0900 Subject: [PATCH 12/31] Disable save() and load() --- src/rcpp_word2vec.cpp | 4 +++- src/word2vec/lib/word2vec.cpp | 25 +++++++++++++------------ src/word2vec/lib/word2vec.hpp | 12 ++++++------ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 5982359..a5714ae 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -140,6 +140,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, }, nullptr); } + //return Rcpp::List::create(); bool success = true; if (!trained) { Rcpp::Rcout << "Training failed: " << model->errMsg() << std::endl; @@ -185,7 +186,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, return out; } - +/* // [[Rcpp::export]] Rcpp::List w2v_load_model(std::string file, bool normalize = true) { bool normalise = normalize; @@ -209,6 +210,7 @@ bool w2v_save_model(SEXP ptr, std::string file) { bool success = model->save(file); return success; } +*/ // [[Rcpp::export]] std::vector w2v_dictionary(SEXP ptr) { diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index dbd8caa..51d6b21 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -8,7 +8,7 @@ #include #include "word2vec.hpp" #include "wordReader.hpp" -#include "vocabulary.hpp" +//#include "vocabulary.hpp" #include "trainer.hpp" namespace w2v { @@ -34,7 +34,7 @@ namespace w2v { // } // build vocabulary, skip stop-words 
and words with frequency < minWordFreq - std::shared_ptr vocabulary; + //std::shared_ptr vocabulary; // if (!_trainFile.empty()) { // vocabulary.reset(new vocabulary_t(trainWordsMapper, // stopWordsMapper, @@ -44,29 +44,29 @@ namespace w2v { // _vocabularyProgressCallback, // _vocabularyStatsCallback)); // } else { - vocabulary.reset(new vocabulary_t(corpus, - _trainSettings.minWordFreq, - _vocabularyProgressCallback, - _vocabularyStatsCallback)); + // vocabulary.reset(new vocabulary_t(corpus, + // _trainSettings.minWordFreq, + // _vocabularyProgressCallback, + // _vocabularyStatsCallback)); //} // key words descending ordered by their indexes - std::vector words; - vocabulary->words(words); + //std::vector words; + //vocabulary->words(words); m_vectorSize = _trainSettings.size; - m_mapSize = vocabulary->size(); + m_mapSize = corpus->types.size(); // train model std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), - vocabulary, + //vocabulary, corpus, //trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; std::size_t wordIndex = 0; - for (auto const &i:words) { + for (auto const &i : corpus->types) { auto &v = m_map[i]; v.resize(m_vectorSize); std::copy(&_trainMatrix[wordIndex * m_vectorSize], @@ -84,7 +84,7 @@ namespace w2v { return false; } - + /* bool w2vModel_t::save(const std::string &_modelFile) const noexcept { try { // save trained data in original word2vec format @@ -334,4 +334,5 @@ namespace w2v { i /= med; } } + */ } diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index 9407bb9..ae34896 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -206,9 +206,9 @@ namespace w2v { const map_t &map() {return m_map;} /// pure virtual method to save model of a derived class - virtual bool save(const std::string &_modelFile) const noexcept = 0; + //virtual bool save(const std::string &_modelFile) const noexcept 
= 0; /// pure virtual method to load model of a derived class - virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; + //virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; /** * Vector access by key value @@ -334,9 +334,9 @@ namespace w2v { trainProgressCallback_t _trainProgressCallback) noexcept; /// saves word vectors to file with _modelFile name - bool save(const std::string &_modelFile) const noexcept override; + //bool save(const std::string &_modelFile) const noexcept override; /// loads word vectors from file with _modelFile name - bool load(const std::string &_modelFile, bool normalize = true) noexcept override; + //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; /** * Normalise vectors */ @@ -390,9 +390,9 @@ namespace w2v { m_mapSize = m_map.size(); } /// saves document vectors to file with _modelFile name - bool save(const std::string &_modelFile) const noexcept override; + //bool save(const std::string &_modelFile) const noexcept override; /// loads document vectors from file with _modelFile name - bool load(const std::string &_modelFile, bool normalize = true) noexcept override; + //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; }; /** From a369ead3cea26bef0287ae760604a5325ed63578 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 09:34:28 +0900 Subject: [PATCH 13/31] Remove progress bar and callback for vocabulary --- src/rcpp_word2vec.cpp | 79 ++++++++++++++++++----------------- src/word2vec/lib/word2vec.cpp | 7 ++-- src/word2vec/lib/word2vec.hpp | 4 +- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index a5714ae..dc22480 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -83,28 +83,28 @@ Rcpp::List w2v_train(Rcpp::List texts_, Progress p(100, true); trained = model->train(trainSettings, corpus, //trainFile,
stopWordsFile, // NOTE: remove - [&p] (float _percent) { - p.update(_percent / 2); - /* - std::cout << "\rParsing train data... " - << std::fixed << std::setprecision(2) - << _percent << "%" << std::flush; - */ - }, - [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - /* - Rcpp::Rcerr << std::endl - << "Finished reading data: " << std::endl - << "Vocabulary size: " << _vocWords << std::endl - << "Train words: " << _trainWords << std::endl - << "Total words: " << _totalWords << std::endl - << "Start training" << std::endl - << std::endl; - */ - vocWords = _vocWords; - trainWords = _trainWords; - totalWords = _totalWords; - }, + // [&p] (float _percent) { + // p.update(_percent / 2); + // /* + // std::cout << "\rParsing train data... " + // << std::fixed << std::setprecision(2) + // << _percent << "%" << std::flush; + // */ + // }, + // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { + // /* + // Rcpp::Rcerr << std::endl + // << "Finished reading data: " << std::endl + // << "Vocabulary size: " << _vocWords << std::endl + // << "Train words: " << _trainWords << std::endl + // << "Total words: " << _totalWords << std::endl + // << "Start training" << std::endl + // << std::endl; + // */ + // vocWords = _vocWords; + // trainWords = _trainWords; + // totalWords = _totalWords; + // }, [&p] (float _alpha, float _percent) { /* std::cout << '\r' @@ -116,30 +116,31 @@ Rcpp::List w2v_train(Rcpp::List texts_, << _percent << "%" << std::flush; */ - p.update(50 + (_percent / 2)); + p.update(_percent); } ); //std::cout << std::endl; } else { trained = model->train(trainSettings, corpus, //trainFile, stopWordsFile, // NOTE: remove - nullptr, - [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - /* - Rcpp::Rcerr << std::endl - << "Finished reading data: " << std::endl - << "Vocabulary size: 
" << _vocWords << std::endl - << "Train words: " << _trainWords << std::endl - << "Total words: " << _totalWords << std::endl - << "Start training" << std::endl - << std::endl; - */ - vocWords = _vocWords; - trainWords = _trainWords; - totalWords = _totalWords; - }, + // nullptr, + // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { + // /* + // Rcpp::Rcerr << std::endl + // << "Finished reading data: " << std::endl + // << "Vocabulary size: " << _vocWords << std::endl + // << "Train words: " << _trainWords << std::endl + // << "Total words: " << _totalWords << std::endl + // << "Start training" << std::endl + // << std::endl; + // */ + // vocWords = _vocWords; + // trainWords = _trainWords; + // totalWords = _totalWords; + // }, nullptr); } + Rcpp::Rcout << "Training done\n"; //return Rcpp::List::create(); bool success = true; if (!trained) { @@ -149,7 +150,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, // NORMALISE UPFRONT - DIFFERENT THAN ORIGINAL CODE // - original code dumps data to disk, next imports it and during import normalisation happens after which we can do nearest calculations // - the R wrapper only writes to disk at request so we need to normalise upfront in order to do directly nearest calculations - if(normalize){ + if (normalize) { //Rcpp::Rcout << "Finished training: finalising with embedding normalisation" << std::endl; model->normalize(); } diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 51d6b21..41a1234 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -16,8 +16,8 @@ namespace w2v { const corpus_t &_corpus, //const std::string &_trainFile, // NOTE: remove //const std::string &_stopWordsFile, // NOTE: remove - vocabularyProgressCallback_t _vocabularyProgressCallback, - vocabularyStatsCallback_t _vocabularyStatsCallback, + //vocabularyProgressCallback_t _vocabularyProgressCallback, + //vocabularyStatsCallback_t 
_vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept { try { // store tokens @@ -57,7 +57,7 @@ namespace w2v { m_mapSize = corpus->types.size(); // train model - std::vector _trainMatrix; + std::vector _trainMatrix; // NOTE: consider directly making m_map trainer_t(std::make_shared(_trainSettings), //vocabulary, corpus, @@ -67,6 +67,7 @@ namespace w2v { std::size_t wordIndex = 0; for (auto const &i : corpus->types) { + //Rcpp::Rcout << i << "\n"; auto &v = m_map[i]; v.resize(m_vectorSize); std::copy(&_trainMatrix[wordIndex * m_vectorSize], diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index ae34896..e2f2dfe 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -329,8 +329,8 @@ namespace w2v { const corpus_t &_corpus, //const std::string &_trainFile, // NOTE: remove //const std::string &_stopWordsFile, // NOTE: remove - vocabularyProgressCallback_t _vocabularyProgressCallback, - vocabularyStatsCallback_t _vocabularyStatsCallback, + //vocabularyProgressCallback_t _vocabularyProgressCallback, + //vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept; /// saves word vectors to file with _modelFile name From a4898dbc03a0153a97e79ebda32ceca6dfcc9e3d Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 10:31:05 +0900 Subject: [PATCH 14/31] Improve handling of sentence lengths --- src/word2vec/lib/trainThread.cpp | 41 ++++++++++++++++---------------- src/word2vec/lib/trainThread.hpp | 4 ++-- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 1844ad4..6e15457 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -18,19 +18,19 @@ namespace w2v { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); } - if (!m_sharedData.vocabulary) { - throw
std::runtime_error("vocabulary object is not initialized"); - } + // if (!m_sharedData.vocabulary) { + // throw std::runtime_error("vocabulary object is not initialized"); + // } if (m_sharedData.trainSettings->sample > 0.0f) { m_downSampling.reset(new downSampling_t(m_sharedData.trainSettings->sample, - m_sharedData.vocabulary->trainWords())); + m_sharedData.corpus->trainWords)); } if (m_sharedData.trainSettings->negative > 0) { - std::vector frequencies; - m_sharedData.vocabulary->frequencies(frequencies); - m_nsDistribution.reset(new nsDistribution_t(frequencies)); + //std::vector frequencies; + //m_sharedData.vocabulary->frequencies(frequencies); + m_nsDistribution.reset(new nsDistribution_t(m_sharedData.corpus->frequency)); } if (m_sharedData.trainSettings->withHS && !m_sharedData.huffmanTree) { @@ -64,7 +64,7 @@ namespace w2v { std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations - * m_sharedData.vocabulary->trainWords(); + * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; while (!exitFlag) { // calc alpha @@ -86,32 +86,26 @@ namespace w2v { } } - // read sentence - std::vector sentence; - - // Rcpp::Rcout << "h: " << h << "\n"; if (h > range.second) { exitFlag = true; // EOF or end of requested region break; } text_t text = m_sharedData.corpus->texts[h]; + // read sentence + std::vector sentence; + sentence.reserve(text.size()); for (size_t i = 0; i < text.size(); i++) { - unsigned int word = text[i]; + auto &word = text[i]; if (word == 0) { continue; // padding } - // auto wordData = m_sharedData.vocabulary->data(word); - // if (wordData == nullptr) { - // continue; // no such word - // } - + threadProcessedWords++; - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... 
- //if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - if ((*m_downSampling)(m_sharedData.corpus->frequency[word], m_randomGenerator)) { + if (m_sharedData.trainSettings->sample > 0.0f) { + if ((*m_downSampling)(m_sharedData.corpus->frequency[word - 1], m_randomGenerator)) { continue; // skip this word } } @@ -132,6 +126,9 @@ namespace w2v { inline void trainThread_t::cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { + + if (_sentence.size() == 0) + return; for (std::size_t i = 0; i < _sentence.size(); ++i) { // hidden layers initialized with 0 values std::memset(m_hiddenLayerVals->data(), 0, m_hiddenLayerVals->size() * sizeof(float)); @@ -187,6 +184,8 @@ namespace w2v { inline void trainThread_t::skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { + if (_sentence.size() == 0) + return; for (std::size_t i = 0; i < _sentence.size(); ++i) { auto rndShift = m_rndWindowShift(m_randomGenerator); for (auto j = rndShift; j < m_sharedData.trainSettings->window * 2 + 1 - rndShift; ++j) { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 63f5f17..f7af72a 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -19,7 +19,7 @@ #include "word2vec.hpp" //#include "wordReader.hpp" -#include "vocabulary.hpp" +//#include "vocabulary.hpp" #include "huffmanTree.hpp" #include "nsDistribution.hpp" #include "downSampling.hpp" @@ -41,7 +41,7 @@ namespace w2v { */ struct sharedData_t final { std::shared_ptr trainSettings; ///< trainSettings structure - std::shared_ptr vocabulary; ///< words data + //std::shared_ptr vocabulary; ///< words data std::shared_ptr corpus; ///< train data //std::shared_ptr fileMapper; /// NOTE: remove std::shared_ptr> bpWeights; ///< back propagation weights From 621bee71efa1333ae210486e208569d7e932c621 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 11:17:25 +0900 Subject: [PATCH 15/31] Use for loop --- 
src/word2vec/lib/trainThread.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 6e15457..389a618 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -56,17 +56,21 @@ namespace w2v { void trainThread_t::worker(std::vector &_trainMatrix) noexcept { + Rcpp::Rcout << "Texts: " << range.first << " to " << range.second << "\n"; for (auto g = m_sharedData.trainSettings->iterations; g > 0; --g) { //Rcpp::Rcout << "g: " << (int)g << "\n"; - bool exitFlag = false; + //bool exitFlag = false; std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - std::size_t h = range.first; // NOTE: only used for corpus + //std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; - while (!exitFlag) { + //while (!exitFlag) { + //while (h <= range.second) { + for (std::size_t h = range.first; h <= range.second; h++) { + // calc alpha if (threadProcessedWords - prvThreadProcessedWords > wordsPerAlpha) { // next 0.01% processed *m_sharedData.processedWords += threadProcessedWords - prvThreadProcessedWords; @@ -86,10 +90,10 @@ namespace w2v { } } - if (h > range.second) { - exitFlag = true; // EOF or end of requested region - break; - } + // if (h > range.second) { + // exitFlag = true; // EOF or end of requested region + // break; + // } text_t text = m_sharedData.corpus->texts[h]; // read sentence @@ -119,7 +123,7 @@ namespace w2v { } else { cbow(sentence, _trainMatrix); } - h++; // move to next text + //h++; // move to next text } } } From 3b19210a41feb60e2da47a5c436b0dd72ad17ca1 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 12:27:04 +0900 Subject: [PATCH 16/31] Clean up the code --- src/word2vec/lib/trainer.cpp | 44 
++++++++++++------------------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 5836c55..8cecbaa 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -11,7 +11,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, - const std::shared_ptr &_vocabulary, + //const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { @@ -22,23 +22,17 @@ namespace w2v { } sharedData.trainSettings = _trainSettings; - if (!_vocabulary) { - throw std::runtime_error("vocabulary object is not initialized"); - } - sharedData.vocabulary = _vocabulary; + // if (!_vocabulary) { + // throw std::runtime_error("vocabulary object is not initialized"); + // } + // sharedData.vocabulary = _vocabulary; if (!_corpus) { throw std::runtime_error("corpus is object is not initialized"); } sharedData.corpus = _corpus; - // if (!_corpus && !_fileMapper) { - // throw std::runtime_error("corpus and file mapper objects are not initialized"); - // } - // sharedData.corpus = _corpus; - // sharedData.fileMapper = _fileMapper; - - sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _vocabulary->size(), 0.0f)); + sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _corpus->types.size(), 0.0f)); sharedData.expTable.reset(new std::vector(_trainSettings->expTableSize)); for (uint16_t i = 0; i < _trainSettings->expTableSize; ++i) { // Precompute the exp() table @@ -50,9 +44,7 @@ namespace w2v { } if (_trainSettings->withHS) { - std::vector frequencies; - _vocabulary->frequencies(frequencies); - sharedData.huffmanTree.reset(new huffmanTree_t(frequencies));; + sharedData.huffmanTree.reset(new huffmanTree_t(_corpus->frequency));; } if (_progressCallback != nullptr) { @@ -62,15 +54,9 @@ namespace w2v { sharedData.processedWords.reset(new 
std::atomic(0)); sharedData.alpha.reset(new std::atomic(_trainSettings->alpha)); - // if (_corpus) { - // // NOTE : corpus has no sentence delimiter - // m_matrixSize = sharedData.trainSettings->size * sharedData.vocabulary->size() + 100; - // } else { - m_matrixSize = sharedData.trainSettings->size * sharedData.vocabulary->size(); - //} - //Rcpp::Rcout << "corpus->texts.size(): " << sharedData.corpus->texts.size() << "\n"; - //Rcpp::Rcout << "vocabulary->size(): " << sharedData.vocabulary->size() << "\n"; - //Rcpp::Rcout << "_trainSettings->threads: " << (int)_trainSettings->threads << "\n"; + // NOTE: consider setting size elsewhere + m_matrixSize = sharedData.trainSettings->size * sharedData.corpus->types.size(); + for (uint8_t i = 0; i < _trainSettings->threads; ++i) { // trainThread_t t(i, sharedData); // Rcpp::Rcout << "thread: " << (int)i << " from " << t.range.first << " to " << t.range.second << "\n"; @@ -86,15 +72,15 @@ namespace w2v { std::uniform_real_distribution rndMatrixInitializer(-0.005f, 0.005f); _trainMatrix.resize(m_matrixSize); std::generate(_trainMatrix.begin(), _trainMatrix.end(), [&]() { - float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); - return v; - //return rndMatrixInitializer(randomGenerator); + //float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); + //return v; + return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? 
}); - + for (auto &i:m_threads) { i->launch(_trainMatrix); } - + for (auto &i:m_threads) { i->join(); } From 32a3f90d11af4c9116b146230683035e3d793730 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 12:27:50 +0900 Subject: [PATCH 17/31] Fix word and document index --- src/word2vec/lib/trainThread.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 389a618..76f6b16 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -18,9 +18,6 @@ namespace w2v { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); } - // if (!m_sharedData.vocabulary) { - // throw std::runtime_error("vocabulary object is not initialized"); - // } if (m_sharedData.trainSettings->sample > 0.0f) { m_downSampling.reset(new downSampling_t(m_sharedData.trainSettings->sample, @@ -28,8 +25,6 @@ namespace w2v { } if (m_sharedData.trainSettings->negative > 0) { - //std::vector frequencies; - //m_sharedData.vocabulary->frequencies(frequencies); m_nsDistribution.reset(new nsDistribution_t(m_sharedData.corpus->frequency)); } @@ -69,7 +64,7 @@ namespace w2v { auto wordsPerAlpha = wordsPerAllThreads / 10000; //while (!exitFlag) { //while (h <= range.second) { - for (std::size_t h = range.first; h <= range.second; h++) { + for (std::size_t h = range.first; h <= range.second; ++h) { // calc alpha if (threadProcessedWords - prvThreadProcessedWords > wordsPerAlpha) { // next 0.01% processed @@ -99,11 +94,11 @@ namespace w2v { // read sentence std::vector sentence; sentence.reserve(text.size()); - for (size_t i = 0; i < text.size(); i++) { + for (size_t i = 0; i < text.size(); ++i) { auto &word = text[i]; - if (word == 0) { - continue; // padding + if (word == 0) { // padding + continue; } threadProcessedWords++; @@ -115,7 +110,7 @@ namespace w2v { } //if (h == 1) // Rcpp::Rcout << word << ": " << 
wordData->index << "\n"; - sentence.push_back(word); + sentence.push_back(word - 1); // zero-based index of words } if (m_sharedData.trainSettings->withSG) { @@ -161,13 +156,13 @@ namespace w2v { for (std::size_t j = 0; j < m_sharedData.trainSettings->size; j++) { (*m_hiddenLayerVals)[j] /= cw; } - + if (m_sharedData.trainSettings->withHS) { hierarchicalSoftmax(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } else { negativeSampling(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } - + // hidden -> in for (auto j = rndShift; j < m_sharedData.trainSettings->window * 2 + 1 - rndShift; ++j) { if (j == m_sharedData.trainSettings->window) { From b249975faeb1df01ce4e5ff4ec0e0dda9d136f55 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:22:21 +0900 Subject: [PATCH 18/31] Tidy up --- src/word2vec/lib/trainThread.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 76f6b16..e7d85b4 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -108,8 +108,6 @@ namespace w2v { continue; // skip this word } } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; sentence.push_back(word - 1); // zero-based index of words } @@ -118,7 +116,6 @@ namespace w2v { } else { cbow(sentence, _trainMatrix); } - //h++; // move to next text } } } From 562d1a1ef36a4807f8d41c3aa3a8409156e64c1e Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:27:57 +0900 Subject: [PATCH 19/31] Tidy up --- src/word2vec/lib/word2vec.cpp | 46 +++++------------------------------ src/word2vec/lib/word2vec.hpp | 20 +++------------ 2 files changed, 10 insertions(+), 56 deletions(-) diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 41a1234..7cbed6b 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -14,57 +14,23 @@ namespace w2v { bool w2vModel_t::train(const 
trainSettings_t &_trainSettings, const corpus_t &_corpus, - //const std::string &_trainFile, // NOTE: remove - //const std::string &_stopWordsFile, // NOTE: remove - //vocabularyProgressCallback_t _vocabularyProgressCallback, - //vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept { try { // store tokens std::shared_ptr corpus(new corpus_t(_corpus)); - // map train data set file to memory - // std::shared_ptr trainWordsMapper; - // if (!_trainFile.empty()) { - // trainWordsMapper.reset(new fileMapper_t(_trainFile)); - // } - // // map stop-words file to memory - // std::shared_ptr stopWordsMapper; - // if (!_stopWordsFile.empty()) { - // stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); - // } - - // build vocabulary, skip stop-words and words with frequency < minWordFreq - //std::shared_ptr vocabulary; - // if (!_trainFile.empty()) { - // vocabulary.reset(new vocabulary_t(trainWordsMapper, - // stopWordsMapper, - // _trainSettings.wordDelimiterChars, - // _trainSettings.endOfSentenceChars, - // _trainSettings.minWordFreq, - // _vocabularyProgressCallback, - // _vocabularyStatsCallback)); - // } else { - // vocabulary.reset(new vocabulary_t(corpus, - // _trainSettings.minWordFreq, - // _vocabularyProgressCallback, - // _vocabularyStatsCallback)); - //} - // key words descending ordered by their indexes - //std::vector words; - //vocabulary->words(words); m_vectorSize = _trainSettings.size; m_mapSize = corpus->types.size(); - + + Rcpp::Rcout << "_trainSettings.size: " << _trainSettings.size << "\n"; + // train model - std::vector _trainMatrix; // NOTE: consider directly making m_map + std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), - //vocabulary, corpus, - //trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); - //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; - + + // NOTE: directly make matrix from _trainMatrix std::size_t wordIndex = 0; for 
(auto const &i : corpus->types) { //Rcpp::Rcout << i << "\n"; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index e2f2dfe..3485e3c 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -44,9 +44,7 @@ namespace w2v { corpus_t(): texts() {} corpus_t(texts_t _texts, types_t _types): texts(_texts), types(_types) {} - //corpus_t(texts_t _texts, words_t _stopWords): - // texts(_texts), stopWords(_stopWords) {} - + void setWordFreq() { frequency = frequency_t(types.size(), 0); @@ -72,6 +70,8 @@ namespace w2v { } Rcpp::Rcout << "trainWords: " << trainWords << "\n"; Rcpp::Rcout << "totalWords: " << totalWords << "\n"; + Rcpp::Rcout << "frequency.size(): " << frequency.size() << "\n"; + Rcpp::Rcout << "types.size(): " << types.size() << "\n"; } }; @@ -91,6 +91,7 @@ namespace w2v { uint8_t iterations = 5; ///< train iterations float alpha = 0.05f; ///< starting learn rate bool withSG = false; ///< use Skip-Gram instead of CBOW + // TODO: remove std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r"; std::string endOfSentenceChars = ".\n?!"; trainSettings_t() = default; @@ -205,11 +206,6 @@ namespace w2v { /// Direct access to the word-vector map const map_t &map() {return m_map;} - /// pure virtual method to save model of a derived class - //virtual bool save(const std::string &_modelFile) const noexcept = 0; - /// pure virtual method to load model of a derived class - //virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; - /** * Vector access by key value * @param _key key value uniquely identifying vector in model @@ -327,16 +323,8 @@ namespace w2v { */ bool train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - //const std::string &_trainFile, // NOTE: remove - //const std::string &_stopWordsFile, // NOTE: remove - //vocabularyProgressCallback_t _vocabularyProgressCallback, - //vocabularyStatsCallback_t _vocabularyStatsCallback, 
trainProgressCallback_t _trainProgressCallback) noexcept; - /// saves word vectors to file with _modelFile name - //bool save(const std::string &_modelFile) const noexcept override; - /// loads word vectors from file with _modelFile name - //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; /** * Normalise vectors */ From f6a3dc252d935c06eced75541ee44ffec023d3d1 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:47:51 +0900 Subject: [PATCH 20/31] Tidy up --- src/word2vec/lib/trainer.cpp | 16 +++------------- src/word2vec/lib/trainer.hpp | 4 ---- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 8cecbaa..550e729 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -11,9 +11,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, - //const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { trainThread_t::sharedData_t sharedData; @@ -22,11 +20,6 @@ namespace w2v { } sharedData.trainSettings = _trainSettings; - // if (!_vocabulary) { - // throw std::runtime_error("vocabulary object is not initialized"); - // } - // sharedData.vocabulary = _vocabulary; - if (!_corpus) { throw std::runtime_error("corpus is object is not initialized"); } @@ -58,11 +51,8 @@ namespace w2v { m_matrixSize = sharedData.trainSettings->size * sharedData.corpus->types.size(); for (uint8_t i = 0; i < _trainSettings->threads; ++i) { - // trainThread_t t(i, sharedData); - // Rcpp::Rcout << "thread: " << (int)i << " from " << t.range.first << " to " << t.range.second << "\n"; m_threads.emplace_back(new trainThread_t(i, sharedData)); } - //throw std::runtime_error("m_threads.emplace_back()"); } void trainer_t::operator()(std::vector &_trainMatrix) noexcept { @@ -72,9 +62,9 @@ namespace w2v { 
std::uniform_real_distribution rndMatrixInitializer(-0.005f, 0.005f); _trainMatrix.resize(m_matrixSize); std::generate(_trainMatrix.begin(), _trainMatrix.end(), [&]() { - //float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); - //return v; - return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? + float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); + return v; + //return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? }); for (auto &i:m_threads) { diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index fcfba3b..30bc26f 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -15,8 +15,6 @@ #include #include "word2vec.hpp" -//#include "wordReader.hpp" -#include "vocabulary.hpp" #include "trainThread.hpp" namespace w2v { @@ -40,9 +38,7 @@ namespace w2v { * @param _progressCallback callback function to be called on each new 0.01% processed train data */ trainer_t(const std::shared_ptr &_trainSettings, - const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback); /** From 1b6be1619a1cebfbb61212ef9407a0f1694c77ba Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:51:31 +0900 Subject: [PATCH 21/31] Tidy up --- src/word2vec/lib/word2vec.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 7cbed6b..0699812 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -22,8 +22,6 @@ namespace w2v { m_vectorSize = _trainSettings.size; m_mapSize = corpus->types.size(); - Rcpp::Rcout << "_trainSettings.size: " << _trainSettings.size << "\n"; - // train model std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), From 2045cdb80b29f779d33b5890ef7652d71aec64d4 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:55:28 +0900 Subject: [PATCH 
22/31] Tidy up --- src/word2vec/lib/trainThread.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index e7d85b4..ad52cc5 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -51,19 +51,15 @@ namespace w2v { void trainThread_t::worker(std::vector &_trainMatrix) noexcept { - Rcpp::Rcout << "Texts: " << range.first << " to " << range.second << "\n"; for (auto g = m_sharedData.trainSettings->iterations; g > 0; --g) { - //Rcpp::Rcout << "g: " << (int)g << "\n"; - //bool exitFlag = false; + std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - //std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; - //while (!exitFlag) { - //while (h <= range.second) { + for (std::size_t h = range.first; h <= range.second; ++h) { // calc alpha @@ -85,10 +81,6 @@ namespace w2v { } } - // if (h > range.second) { - // exitFlag = true; // EOF or end of requested region - // break; - // } text_t text = m_sharedData.corpus->texts[h]; // read sentence From 4a904835e703448b22cbe37258348ea60e54d6c2 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:00:41 +0900 Subject: [PATCH 23/31] Build --- R/RcppExports.R | 12 ++---------- src/RcppExports.cpp | 34 ++++----------------------------- src/word2vec/lib/CMakeLists.txt | 4 ++-- 3 files changed, 8 insertions(+), 42 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index d1533a5..d6c90b8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,16 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, 
expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize) -} - -w2v_load_model <- function(file, normalize = TRUE) { - .Call('_word2vec_w2v_load_model', PACKAGE = 'word2vec', file, normalize) -} - -w2v_save_model <- function(ptr, file) { - .Call('_word2vec_w2v_save_model', PACKAGE = 'word2vec', ptr, file) +w2v_train <- function(texts_, types_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, types_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize) } w2v_dictionary <- function(ptr) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c8bfcda..ebd9209 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,13 +11,13 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, 
SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector types_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP types_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); - Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type types_(types_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); @@ -33,31 +33,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type withSG(withSGSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); - return rcpp_result_gen; -END_RCPP -} -// w2v_load_model -Rcpp::List w2v_load_model(std::string file, bool normalize); -RcppExport SEXP _word2vec_w2v_load_model(SEXP fileSEXP, SEXP normalizeSEXP) { -BEGIN_RCPP - Rcpp::RObject 
rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); - Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_load_model(file, normalize)); - return rcpp_result_gen; -END_RCPP -} -// w2v_save_model -bool w2v_save_model(SEXP ptr, std::string file); -RcppExport SEXP _word2vec_w2v_save_model(SEXP ptrSEXP, SEXP fileSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); - Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_save_model(ptr, file)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, types_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -128,8 +104,6 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, - {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, - {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2}, {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, diff --git a/src/word2vec/lib/CMakeLists.txt b/src/word2vec/lib/CMakeLists.txt index c15279d..ad6c414 100644 --- a/src/word2vec/lib/CMakeLists.txt +++ b/src/word2vec/lib/CMakeLists.txt @@ -18,8 +18,8 @@ set(PRJ_SRCS # ${PROJECT_INCLUDE_DIR}/word2vec.h # ${PROJECT_SOURCE_DIR}/c_binding.cpp ${PROJECT_SOURCE_DIR}/mapper.cpp - ${PROJECT_SOURCE_DIR}/vocabulary.hpp - ${PROJECT_SOURCE_DIR}/vocabulary.cpp +# ${PROJECT_SOURCE_DIR}/vocabulary.hpp +# ${PROJECT_SOURCE_DIR}/vocabulary.cpp ${PROJECT_SOURCE_DIR}/huffmanTree.hpp ${PROJECT_SOURCE_DIR}/huffmanTree.cpp 
${PROJECT_SOURCE_DIR}/nsDistribution.hpp From b15313d78d5fec887d2655472330248e4f1a7fe9 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:01:06 +0900 Subject: [PATCH 24/31] Update --- tests/test.R | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test.R b/tests/test.R index e232226..8f524ca 100644 --- a/tests/test.R +++ b/tests/test.R @@ -3,29 +3,29 @@ library(word2vec) corp <- data_corpus_inaugural %>% corpus_reshape() -toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) -lis <- unclass(toks) +toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% + tokens_remove(stopwords(), padding = TRUE) %>% + tokens_tolower() +ndoc(toks) -type <- types(toks) -type[type %in% stopwords()] <- "" -mod <- word2vec:::w2v_train(toks, type, verbose = TRUE) +mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, + iterations = 5, minWordFreq = 5) dim(as.matrix(mod)) +predict(mod, c("people", "american"), type = "nearest") -mod2 <- word2vec:::w2v_train(unclass(toks)[1:10], types(toks), verbose = TRUE) -dim(as.matrix(mod2)) +require(LSX) +lss <- as.textmodel_lss(t(as.matrix(mod)), "good") +head(coef(lss)) +tail(coef(lss)) +lis <- as.list(toks) mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, verbose = TRUE, threads = 4) emb_lis <- as.matrix(mod_lis) dim(emb_lis) -predict(mod_lis, c("people", "American"), type = "nearest") - -mod_txt <- word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, - verbose = TRUE, threads = 4) -emb_txt <- as.matrix(mod_txt) -dim(emb_txt) -predict(mod_txt, c("people", "American"), type = "nearest") +pred_lis <- predict(mod_lis, c("people", "American"), type = "nearest") +#saveRDS(mod_lis, "tests/word2vec_v04.RDS") microbenchmark::microbenchmark( "lis" = word2vec(lis, dim = 50, iter = 5, min_count = 5, From 4c79ddc0948b5eeb72a93782958201c2be1cfcf6 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 
2024 09:16:00 +0900 Subject: [PATCH 25/31] Update --- tests/test.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test.R b/tests/test.R index 8f524ca..181791c 100644 --- a/tests/test.R +++ b/tests/test.R @@ -1,7 +1,9 @@ library(quanteda) library(word2vec) -corp <- data_corpus_inaugural %>% +data_corpus_guardian <- readRDS("/home/kohei/Dropbox/Public/data_corpus_guardian2016-10k.rds") +corp <- data_corpus_guardian %>% +#corp <- data_corpus_inaugural %>% corpus_reshape() toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% tokens_remove(stopwords(), padding = TRUE) %>% @@ -9,7 +11,7 @@ toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% ndoc(toks) mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, - iterations = 5, minWordFreq = 5) + iterations = 5, minWordFreq = 5, threads = 6) dim(as.matrix(mod)) predict(mod, c("people", "american"), type = "nearest") From 708fd05feec2299bbe0c62502e48063fc9e7d799 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:28:02 +0900 Subject: [PATCH 26/31] Update --- tests/test.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test.R b/tests/test.R index 181791c..e081c84 100644 --- a/tests/test.R +++ b/tests/test.R @@ -1,5 +1,6 @@ library(quanteda) library(word2vec) +library(LSX) data_corpus_guardian <- readRDS("/home/kohei/Dropbox/Public/data_corpus_guardian2016-10k.rds") corp <- data_corpus_guardian %>% @@ -15,11 +16,16 @@ mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, dim(as.matrix(mod)) predict(mod, c("people", "american"), type = "nearest") -require(LSX) -lss <- as.textmodel_lss(t(as.matrix(mod)), "good") +dfmt <- dfm(toks, remove_padding = TRUE) %>% + dfm_trim(min_termfreq = 5) +lss <- textmodel_lss(dfmt, c("good" = 1, "bad" = -1), cache = TRUE) head(coef(lss)) tail(coef(lss)) +lss2 <- as.textmodel_lss(t(as.matrix(mod)), c("good" = 1, "bad" = -1)) +head(coef(lss2)) 
+tail(coef(lss2)) + lis <- as.list(toks) mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, verbose = TRUE, threads = 4) From 276f3cb501d5b4bb1d14bd1ab1736a84395c8ca0 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 5 Mar 2024 08:06:32 +0900 Subject: [PATCH 27/31] Remove files for file IO --- R/RcppExports.R | 4 - src/Makevars | 1 - src/RcppExports.cpp | 14 --- src/rcpp_word2vec.cpp | 49 +---------- src/word2vec/include/mapper.hpp | 82 ----------------- src/word2vec/include/wordReader.hpp | 132 ---------------------------- src/word2vec/lib/CMakeLists.txt | 2 - src/word2vec/lib/mapper.cpp | 78 ---------------- src/word2vec/lib/trainThread.hpp | 2 - src/word2vec/lib/word2vec.cpp | 2 - 10 files changed, 3 insertions(+), 363 deletions(-) delete mode 100644 src/word2vec/include/mapper.hpp delete mode 100644 src/word2vec/include/wordReader.hpp delete mode 100644 src/word2vec/lib/mapper.cpp diff --git a/R/RcppExports.R b/R/RcppExports.R index d6c90b8..08c533a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,7 +21,3 @@ w2v_nearest_vector <- function(ptr, x, top_n = 10L, min_distance = 0.0) { .Call('_word2vec_w2v_nearest_vector', PACKAGE = 'word2vec', ptr, x, top_n, min_distance) } -w2v_read_binary <- function(modelFile, normalize, n) { - .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n) -} - diff --git a/src/Makevars b/src/Makevars index 620ba58..0008eff 100644 --- a/src/Makevars +++ b/src/Makevars @@ -2,7 +2,6 @@ PKG_LIBS = -pthread PKG_CPPFLAGS = -pthread -DSTRICT_R_HEADERS -I./word2vec/include -I./word2vec/lib SOURCES = word2vec/lib/huffmanTree.cpp \ - word2vec/lib/mapper.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ebd9209..e72dd60 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -88,19 +88,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// w2v_read_binary -Rcpp::NumericMatrix 
w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n); -RcppExport SEXP _word2vec_w2v_read_binary(SEXP modelFileSEXP, SEXP normalizeSEXP, SEXP nSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::string >::type modelFile(modelFileSEXP); - Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - Rcpp::traits::input_parameter< std::size_t >::type n(nSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_read_binary(modelFile, normalize, n)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, @@ -108,7 +95,6 @@ static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2}, {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4}, - {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3}, {NULL, NULL, 0} }; diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index dc22480..4853e42 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -5,7 +5,6 @@ #include #include #include "word2vec.hpp" -#include "wordReader.hpp" #include // [[Rcpp::depends(RcppProgress)]] @@ -82,29 +81,6 @@ Rcpp::List w2v_train(Rcpp::List texts_, if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = model->train(trainSettings, corpus, - //trainFile, stopWordsFile, // NOTE: remove - // [&p] (float _percent) { - // p.update(_percent / 2); - // /* - // std::cout << "\rParsing train data... 
" - // << std::fixed << std::setprecision(2) - // << _percent << "%" << std::flush; - // */ - // }, - // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - // /* - // Rcpp::Rcerr << std::endl - // << "Finished reading data: " << std::endl - // << "Vocabulary size: " << _vocWords << std::endl - // << "Train words: " << _trainWords << std::endl - // << "Total words: " << _totalWords << std::endl - // << "Start training" << std::endl - // << std::endl; - // */ - // vocWords = _vocWords; - // trainWords = _trainWords; - // totalWords = _totalWords; - // }, [&p] (float _alpha, float _percent) { /* std::cout << '\r' @@ -119,26 +95,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, p.update(_percent); } ); - //std::cout << std::endl; } else { - trained = model->train(trainSettings, corpus, - //trainFile, stopWordsFile, // NOTE: remove - // nullptr, - // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - // /* - // Rcpp::Rcerr << std::endl - // << "Finished reading data: " << std::endl - // << "Vocabulary size: " << _vocWords << std::endl - // << "Train words: " << _trainWords << std::endl - // << "Total words: " << _totalWords << std::endl - // << "Start training" << std::endl - // << std::endl; - // */ - // vocWords = _vocWords; - // trainWords = _trainWords; - // totalWords = _totalWords; - // }, - nullptr); + trained = model->train(trainSettings, corpus, nullptr); } Rcpp::Rcout << "Training done\n"; //return Rcpp::List::create(); @@ -313,6 +271,8 @@ Rcpp::List w2v_nearest_vector(SEXP ptr, return out; } +/* NOTE: temporarily disabled + // [[Rcpp::export]] Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n) { try { @@ -416,9 +376,6 @@ Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, return embedding_default; } -/* NOTE: temporarily disabled - - // [[Rcpp::export]] Rcpp::List 
d2vec(SEXP ptr, Rcpp::StringVector x, std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { Rcpp::XPtr model_w2v(ptr); diff --git a/src/word2vec/include/mapper.hpp b/src/word2vec/include/mapper.hpp deleted file mode 100644 index afa066c..0000000 --- a/src/word2vec/include/mapper.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/** - * @file - * @brief mapper classes - mapping wrappers - * @author Max Fomichev - * @date 19.04.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#ifndef WORD2VEC_MAPPER_H -#define WORD2VEC_MAPPER_H - -#include - -namespace w2v { - /// @brief base class for different data sources (file, std::string etc) to be mapped - class mapper_t { - protected: - union { // mapped memory - char *rwData; // read/write access - const char *roData; // read only access - } m_data; - off_t m_size = 0; // mapped memory size - - public: - mapper_t(): m_data() {} - mapper_t(char *_data, off_t _size): m_data(), m_size(_size) {m_data.rwData = _data;} - mapper_t(const char *_data, off_t _size): m_data(), m_size(_size) {m_data.roData = _data;} - virtual ~mapper_t() = default; - - /// @returns pointer to mapped data in read-only mode - inline const char *data() const noexcept {return m_data.roData;} - /// @returns pointer to mapped data in read/write mode - inline char *data() noexcept {return m_data.rwData;} - /// @returns mapped memory size - inline off_t size() const noexcept {return m_size;} - }; - - class stringMapper_t final: public mapper_t { - public: - /** - * Constructs a fileMapper object for reading or writing, depending on parameters - * @param _fileName file name to be opened for reading or created for writing - * @param _wrFlag create file for writing (default is false - open for reading) - * @param _size size of a new created file (_wrFlag == true) - * @throws std::runtime_error In case of failed file or mapping operations - */ - explicit stringMapper_t(const std::string &_source): - 
mapper_t(_source.c_str(), static_cast(_source.length())) {} - - // copying prohibited - stringMapper_t(const stringMapper_t &) = delete; - void operator=(const stringMapper_t &) = delete; - }; - /** - * @brief C++ wrapper on mmap() system call - * - * fileMapper class is a simple wrapper on mmap() system call. Both reading from and writing to file are supported. - */ - class fileMapper_t final: public mapper_t { - private: - const std::string m_fileName; // name of the file to be mapped - int m_fd = -1; // file descriptor - const bool m_wrFlag = false; // write mode - - public: - /** - * Constructs a fileMapper object for reading or writing, depending on parameters - * @param _fileName file name to be opened for reading or created for writing - * @param _wrFlag create file for writing (default is false, open for reading) - * @param _size size of a new created file (_wrFlag must be true) - * @throws std::runtime_error In case of failed file or mapping operations - */ - explicit fileMapper_t(const std::string &_fileName, bool _wrFlag = false, off_t _size = 0); - ~fileMapper_t() final; - - // copying prohibited - fileMapper_t(const fileMapper_t &) = delete; - void operator=(const fileMapper_t &) = delete; - }; -} - -#endif //WORD2VEC_MAPPER_H diff --git a/src/word2vec/include/wordReader.hpp b/src/word2vec/include/wordReader.hpp deleted file mode 100644 index 8522d75..0000000 --- a/src/word2vec/include/wordReader.hpp +++ /dev/null @@ -1,132 +0,0 @@ -/** - * @file - * @brief wordReader class - fast text parsing - * @author Max Fomichev - * @date 19.04.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#ifndef WORD2VEC_WORDREADER_H -#define WORD2VEC_WORDREADER_H - -#include -#include -#include - -#include "mapper.hpp" - -namespace w2v { - /** - * @brief Text parser (word by word) - * - * wordReader class is a word by word parser of a file mapped into memory by mapper_t derived class object. 
- * It makes easy to parse a file like memory allocated char array without any read/write calls etc) - */ - template - class wordReader_t final { - private: - const dataMapper_t &m_mapper; // reference to mapper_t derived class object - std::string m_wordDelimiterChars; - std::string m_endOfSentenceChars; - const uint16_t m_maxWordLen; // max word length - off_t m_offset; // current offset - const off_t m_startFrom; // start from position - const off_t m_stopAt; // stop at position - std::string m_word; // current word buffer - std::size_t m_wordPos = 0; // position in the current word buffer - bool m_prvEOS = false; // is the previous char a sentence delimiter char? - - public: - /** - * Constructs a wordReader of a memory mapped file (_mapper object) - * @param _mapper mapper_t derived class object that provides read access to a mapped memory - * @param _offset start parsing from this offset position - * @param _stopAt stop parsing at this position - * @param _maxWordLen max length of a parsing word - * @throws std::range_error In case of _offset or/and _stopAt are out of bounds - */ - wordReader_t(const dataMapper_t &_mapper, - std::string _wordDelimiterChars, - std::string _endOfSentenceChars, - off_t _offset = 0, off_t _stopAt = 0, uint16_t _maxWordLen = 100): - m_mapper(_mapper), - m_wordDelimiterChars(std::move(_wordDelimiterChars)), - m_endOfSentenceChars(std::move(_endOfSentenceChars)), - m_maxWordLen(_maxWordLen), m_offset(_offset), - m_startFrom(m_offset), m_stopAt((_stopAt == 0)?_mapper.size() - 1:_stopAt), - m_word(m_maxWordLen, 0) { - - if (m_stopAt >= m_mapper.size()) { - throw std::range_error("wordReader: bounds are out of the file size"); - } - if (m_offset > m_stopAt) { - throw std::range_error("wordReader: offset is out of the bounds"); - } - } - - // copying prohibited - wordReader_t(const wordReader_t &) = delete; - void operator=(const wordReader_t &) = delete; - - /// @returns current offset - inline off_t offset() const noexcept {return 
m_offset;} - - /// Resets parser state, start parsing from the begining - inline void reset() noexcept { - m_offset = m_startFrom; - m_wordPos = 0; - m_prvEOS = false; - } - - /** - * Reads next word - * @param[out] _word string where the next parsed word to be stored. Empty string means end of sentence. - * @returns true if word is succesfuly parsed, false in case of EOF or end of parsing block reached (_stopAt). - */ - inline bool nextWord(std::string &_word) noexcept { - while (m_offset <= m_stopAt) { - char ch = m_mapper.data()[m_offset++]; - if (m_wordDelimiterChars.find(ch) != std::string::npos) { // is it a word/sentence delimiter? - if (m_endOfSentenceChars.find(ch) != std::string::npos) { // is it the end of sentence (EOS)? - if (m_wordPos > 0) { // is here any buffered word? if yes - return this word and move back - m_offset--; - m_prvEOS = false; - break; - } else { - if (!m_prvEOS) { // Do not return repeated EOS, return only the first occurrence. - _word.clear(); - m_prvEOS = true; - return true; - } else { - continue; // skip this EOS - } - } - } - if (m_wordPos > 0) { // it is a word delimiter, is here any buffered word? - m_prvEOS = false; - break; - } else { - continue; // skip repeated word delimiters - } - } - if (m_wordPos < m_maxWordLen) { // check bounds - m_word[m_wordPos++] = ch; // it's next char of buffered word - } - } - if (m_wordPos > 0) { // return buffered word - try { - _word.resize(m_wordPos); - std::copy(m_word.data(), m_word.data() + m_wordPos, &_word[0]); - } catch (...) 
{ // bad_alloc - return false; - } - m_wordPos = 0; - return true; - } - - return false; // eof or end of the requested block - } - }; -} - -#endif // WORD2VEC_WORDREADER_H diff --git a/src/word2vec/lib/CMakeLists.txt b/src/word2vec/lib/CMakeLists.txt index ad6c414..00a6a6d 100644 --- a/src/word2vec/lib/CMakeLists.txt +++ b/src/word2vec/lib/CMakeLists.txt @@ -37,5 +37,3 @@ target_link_libraries(${PROJECT_NAME} ${LIBS}) install(TARGETS ${PROJECT_NAME} DESTINATION lib) install(FILES ${PROJECT_INCLUDE_DIR}/word2vec.hpp DESTINATION include) -install(FILES ${PROJECT_INCLUDE_DIR}/mapper.hpp DESTINATION include) -install(FILES ${PROJECT_INCLUDE_DIR}/wordReader.hpp DESTINATION include) diff --git a/src/word2vec/lib/mapper.cpp b/src/word2vec/lib/mapper.cpp deleted file mode 100644 index 16c0b9e..0000000 --- a/src/word2vec/lib/mapper.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/** - * @file - * @brief fileMapper & wordReader classes - fast text file/memory parsing - * @author Max Fomichev - * @date 19.04.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#include -#include -#ifdef WIN32 -#include "win/mman.h" -#else -#include -#endif -#include -#include -#include - -#include -#include - -#include "mapper.hpp" - -namespace w2v { - fileMapper_t::fileMapper_t(const std::string &_fileName, bool _wrFlag, off_t _size): - mapper_t(), m_fileName(_fileName), m_wrFlag(_wrFlag) { - - if (m_wrFlag) { - m_size = _size; - } - - // open file - m_fd = ::open(m_fileName.c_str(), m_wrFlag?(O_RDWR | O_CREAT):O_RDONLY, 0600); - if (m_fd < 0) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - - // get file size - struct stat fst{}; - if (fstat(m_fd, &fst) < 0) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - - if (!m_wrFlag) { - if (fst.st_size <= 0) { - throw 
std::runtime_error(std::string("fileMapper: file ") + _fileName + " is empty, nothing to read"); - } - - m_size = fst.st_size; - } else { - if (ftruncate(m_fd, m_size) == -1) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - } - - // map file to memory - m_data.rwData = static_cast(mmap(nullptr, static_cast(m_size), - m_wrFlag?(PROT_READ | PROT_WRITE):PROT_READ , MAP_SHARED, - m_fd, 0)); - if (m_data.rwData == static_cast(MAP_FAILED)) { - std::string err = std::string("fileMapper: ") + _fileName + " - " + std::strerror(errno); - throw std::runtime_error(err); - } - } - - fileMapper_t::~fileMapper_t() { -#if defined(sun) || defined(__sun) - munmap(m_data.rwData, static_cast(m_size)); -#else - munmap(reinterpret_cast(m_data.rwData), static_cast(m_size)); -#endif - close(m_fd); - } -} diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index f7af72a..a6c1b92 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -18,8 +18,6 @@ #include #include "word2vec.hpp" -//#include "wordReader.hpp" -//#include "vocabulary.hpp" #include "huffmanTree.hpp" #include "nsDistribution.hpp" #include "downSampling.hpp" diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 0699812..1a90521 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -7,8 +7,6 @@ */ #include #include "word2vec.hpp" -#include "wordReader.hpp" -//#include "vocabulary.hpp" #include "trainer.hpp" namespace w2v { From a3b499122d61f46b68d0cae2584efec7a22c17a2 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 5 Mar 2024 08:54:21 +0900 Subject: [PATCH 28/31] Convert character to integer --- DESCRIPTION | 4 ++-- R/word2vec.R | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d6399ba..6da01c0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,8 +13,8 @@ 
Description: Learn vector representations of words by continuous bag of words an URL: https://github.com/bnosac/word2vec License: Apache License (>= 2.0) Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.0 Depends: R (>= 2.10) -Imports: Rcpp (>= 0.11.5), stats +Imports: Rcpp (>= 0.11.5), stats, fastmatch LinkingTo: Rcpp, RcppProgress Suggests: udpipe diff --git a/R/word2vec.R b/R/word2vec.R index 2eb2433..18046fd 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -182,10 +182,15 @@ word2vec.list <- function(x, iter <- as.integer(iter) lr <- as.numeric(lr) skipgram <- as.logical(type %in% "skip-gram") - encoding <- "UTF-8" - model <- w2v_train(x, stopwords, - modelFile = model, - minWordFreq = min_count, + + vocaburary <- unique(unlist(x, use.names = FALSE)) + vocaburary <- setdiff(vocaburary, stopwords) + x <- lapply(x, function(x) { + v <- fastmatch::fmatch(x, vocaburary) + v[is.na(v)] <- 0L + return(v) + }) + model <- w2v_train(x, vocaburary, minWordFreq = min_count, size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, alpha = lr, withSG = skipgram, ...) 
From e05f8091f73be504c6df9485696753d2385029d1 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 2 Aug 2024 09:39:56 +0900 Subject: [PATCH 29/31] Remove mapper.cpp from Makevars --- src/Makevars.win | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Makevars.win b/src/Makevars.win index 459c5a1..948d8c7 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -2,7 +2,6 @@ PKG_LIBS = -pthread PKG_CPPFLAGS = -pthread -DSTRICT_R_HEADERS -I./word2vec/include -I./word2vec/lib SOURCES = word2vec/lib/huffmanTree.cpp \ - word2vec/lib/mapper.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ From b063d205fe813bcb1a9f71e9060ac3b97f7c2b8a Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 2 Aug 2024 09:40:11 +0900 Subject: [PATCH 30/31] Build --- R/RcppExports.R | 4 ---- src/RcppExports.cpp | 16 ---------------- 2 files changed, 20 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index d6c90b8..08c533a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,7 +21,3 @@ w2v_nearest_vector <- function(ptr, x, top_n = 10L, min_distance = 0.0) { .Call('_word2vec_w2v_nearest_vector', PACKAGE = 'word2vec', ptr, x, top_n, min_distance) } -w2v_read_binary <- function(modelFile, normalize, n) { - .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n) -} - diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 54f48c5..e72dd60 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -89,28 +89,12 @@ BEGIN_RCPP END_RCPP } -// w2v_read_binary -Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, std::size_t n); -RcppExport SEXP _word2vec_w2v_read_binary(SEXP modelFileSEXP, SEXP normalizeSEXP, SEXP nSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::string >::type modelFile(modelFileSEXP); - Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - 
Rcpp::traits::input_parameter< std::size_t >::type n(nSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_read_binary(modelFile, normalize, n)); - return rcpp_result_gen; -END_RCPP -} - - static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2}, {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4}, - {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3}, {NULL, NULL, 0} }; From 0151da5d5a7afe90daf7a6c98f00ae1c4cc1724e Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 2 Aug 2024 10:03:22 +0900 Subject: [PATCH 31/31] Add word2vec.tokens --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/word2vec.R | 30 ++++++++++++++------ man/{word2vec.list.Rd => word2vec.tokens.Rd} | 6 ++-- 4 files changed, 27 insertions(+), 12 deletions(-) rename man/{word2vec.list.Rd => word2vec.tokens.Rd} (98%) diff --git a/DESCRIPTION b/DESCRIPTION index c3c3b44..83694c6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: Learn vector representations of words by continuous bag of words an URL: https://github.com/bnosac/word2vec License: Apache License (>= 2.0) Encoding: UTF-8 -RoxygenNote: 7.3.0 +RoxygenNote: 7.3.1 Depends: R (>= 2.10) Imports: Rcpp (>= 0.11.5), stats, fastmatch LinkingTo: Rcpp, RcppProgress diff --git a/NAMESPACE b/NAMESPACE index e2f823c..520c183 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ S3method(predict,word2vec_trained) S3method(summary,word2vec) S3method(summary,word2vec_trained) S3method(word2vec,list) +S3method(word2vec,tokens) export(doc2vec) export(read.word2vec) export(read.wordvectors) diff --git a/R/word2vec.R b/R/word2vec.R index 01b92e0..8d85a58 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -156,13 +156,14 @@ word2vec <- function(x, #' modelb <- 
word2vec(x = txt, dim = 15, iter = 20, split = c(" \n\r", "\n\r")) #' all.equal(as.matrix(modela), as.matrix(modelb)) #' \dontshow{\} # End of main if statement running only if the required packages are installed} -word2vec.list <- function(x, +word2vec.tokens <- function(x, type = c("cbow", "skip-gram"), dim = 50, window = ifelse(type == "cbow", 5L, 10L), iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, stopwords = integer(), threads = 1L, ...){ + #x <- lapply(x, as.character) type <- match.arg(type) stopwords <- as.integer(stopwords) @@ -183,6 +184,24 @@ word2vec.list <- function(x, lr <- as.numeric(lr) skipgram <- as.logical(type %in% "skip-gram") + model <- w2v_train(x, attr(x, "types"), minWordFreq = min_count, + size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, + sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, + alpha = lr, withSG = skipgram, ...) + model$data$stopwords <- stopwords + model +} + +#' @export +word2vec.list <- function(x, ...){ + if (!is.character(attr(x, "types"))) { + x <- serialize(x, stopwords) + class(x) <- "tokens" + } + word2vec(x, ...) +} + +serialize <- function(x, stopwords) { vocaburary <- unique(unlist(x, use.names = FALSE)) vocaburary <- setdiff(vocaburary, stopwords) x <- lapply(x, function(x) { @@ -190,15 +209,10 @@ word2vec.list <- function(x, v[is.na(v)] <- 0L return(v) }) - model <- w2v_train(x, vocaburary, minWordFreq = min_count, - size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, - sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, ...) - model$data$stopwords <- stopwords - model + attr(x, "types") <- vocaburary + return(x) } - #' @title Get the word vectors of a word2vec model #' @description Get the word vectors of a word2vec model as a dense matrix. 
#' @param x a word2vec model as returned by \code{\link{word2vec}} or \code{\link{read.word2vec}} diff --git a/man/word2vec.list.Rd b/man/word2vec.tokens.Rd similarity index 98% rename from man/word2vec.list.Rd rename to man/word2vec.tokens.Rd index b92d8f8..af58632 100644 --- a/man/word2vec.list.Rd +++ b/man/word2vec.tokens.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/word2vec.R -\name{word2vec.list} -\alias{word2vec.list} +\name{word2vec.tokens} +\alias{word2vec.tokens} \title{Train a word2vec model on text} \usage{ -\method{word2vec}{list}( +\method{word2vec}{tokens}( x, type = c("cbow", "skip-gram"), dim = 50,