Commit 37cdaf9

documentation

jwijffels committed Oct 4, 2023
1 parent 30462c7 commit 37cdaf9
Showing 4 changed files with 431 additions and 43 deletions.
73 changes: 49 additions & 24 deletions R/word2vec.R
@@ -1,6 +1,6 @@
#' @title Train a word2vec model on text
#' @description Construct a word2vec model on text. The algorithm is explained at \url{https://arxiv.org/pdf/1310.4546.pdf}
#' @param x a character vector with text or the path to the file on disk containing training data
#' @param x a character vector with text, the path to a file on disk containing the training data, or a list of tokens. See the examples.
#' @param type the type of algorithm to use, either 'cbow' or 'skip-gram'. Defaults to 'cbow'
#' @param dim dimension of the word vectors. Defaults to 50.
#' @param iter number of training iterations. Defaults to 5.
@@ -10,12 +10,8 @@
#' @param negative integer with the number of negative samples. Only used in case \code{hs} is set to \code{FALSE}
#' @param sample threshold for occurrence of words. Defaults to 0.001
#' @param min_count integer indicating the number of times a word should occur to be considered as part of the training vocabulary. Defaults to 5.
#' @param split a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x}
#' @param stopwords a character vector of stopwords to exclude from training
#' @param threads number of CPU threads to use. Defaults to 1.
#' @param encoding the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'.
#' Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument
#' is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.
#' @param ... further arguments passed on to the C++ function \code{w2v_train} - for expert use only
#' @return an object of class \code{w2v_trained} which is a list with elements
#' \itemize{
@@ -36,7 +32,12 @@
#' \item{argument window: for skip-gram usually around 10, for cbow around 5}
#' \item{argument sample: sub-sampling of frequent words: can improve both accuracy and speed for large data sets (useful values are in the range 0.001 to 0.00001); see the sketch after this list}
#' }
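To make these recommendations concrete, here is a minimal sketch (assuming `x` already holds your training text as a character vector; the parameter values are illustrative, not prescriptive):

library(word2vec)
## skip-gram: wider window, aggressive sub-sampling of frequent words
model_sg <- word2vec(x = x, type = "skip-gram", window = 10, sample = 0.0001)
## cbow: narrower window, default sub-sampling
model_cb <- word2vec(x = x, type = "cbow", window = 5, sample = 0.001)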
#' @seealso \code{\link{predict.word2vec}}, \code{\link{as.matrix.word2vec}}
#' Some notes on the tokenisation:
#' \itemize{
#' \item{If you provide a list in \code{x}, each list element should correspond to a sentence (or whatever you consider a sentence) and should contain a character vector of tokens.}
#' \item{If you provide a character vector or the path to a file on disk in \code{x}, the tokenisation into words is governed by the first element of \code{split} and the tokenisation into sentences by the second element of \code{split}, as passed on to \code{\link{word2vec.character}}.}
#' }
#' @seealso \code{\link{predict.word2vec}}, \code{\link{as.matrix.word2vec}}, \code{\link{word2vec}}, \code{\link{word2vec.character}}, \code{\link{word2vec.list}}
#' @export
#' @examples
#' \dontshow{if(require(udpipe))\{}
@@ -50,20 +51,20 @@
#' model <- word2vec(x = x, dim = 15, iter = 20)
#' emb <- as.matrix(model)
#' head(emb)
#' emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding")
#' emb
#' nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5)
#' nn
#'
#' ## Get vocabulary
#' vocab <- summary(model, type = "vocabulary")
#'
#' # Do some calculations with the vectors and find similar terms to these
#' emb <- as.matrix(model)
#' vector <- emb["buurt", ] - emb["rustige", ] + emb["restaurants", ]
#' predict(model, vector, type = "nearest", top_n = 10)
#'
#' vector <- emb["gastvrouw", ] - emb["gastvrij", ]
#' vector <- emb["gastvrouw", ] - emb["gastvrij", ]
#' predict(model, vector, type = "nearest", top_n = 5)
#'
#' vectors <- emb[c("gastheer", "gastvrouw"), ]
@@ -81,7 +82,19 @@
#' \dontshow{
#' file.remove(path)
#' }
#'
#' ##
#' ## Example of word2vec with a list of tokens
#' ## which gives the same embeddings as with a similarly tokenised character vector of texts
#' ##
#' txt <- txt_clean_word2vec(x, ascii = TRUE, alpha = TRUE, tolower = TRUE, trim = TRUE)
#' table(unlist(strsplit(txt, "")))
#' set.seed(1234)
#' toks <- strsplit(txt, split = " ")
#' model <- word2vec(x = toks, dim = 15, iter = 20)
#' emb <- as.matrix(model)
#' set.seed(1234)
#' model <- word2vec(x = txt, dim = 15, iter = 20, split = c(" \n\r", "\n\r"))
#' all.equal(emb, as.matrix(model))
#'
#' ##
#' ## Example getting word embeddings
@@ -106,21 +119,32 @@
#' nn
#'
#' \dontshow{\} # End of main if statement running only if the required packages are installed}
word2vec <- function(x, ...) {
UseMethod("word2vec")
}

#' @export
word2vec.character <- function(x,
word2vec <- function(x,
type = c("cbow", "skip-gram"),
dim = 50, window = ifelse(type == "cbow", 5L, 10L),
iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L,
split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r",
".\n?!"),
stopwords = character(),
threads = 1L,
encoding = "UTF-8",
...){
...) {
UseMethod("word2vec")
}
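`word2vec` is now an S3 generic, so the class of `x` selects the method: a character vector or file path dispatches to `word2vec.character`, a list of tokens to `word2vec.list`. A minimal sketch of the dispatch (the toy sentences below are made up for illustration):

library(word2vec)
txt  <- c("the quick brown fox", "jumps over the lazy dog")
toks <- strsplit(txt, split = " ")
m_chr <- word2vec(x = txt,  dim = 10, iter = 5, min_count = 1)  ## dispatches to word2vec.character
m_lst <- word2vec(x = toks, dim = 10, iter = 5, min_count = 1)  ## dispatches to word2vec.list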

#' @inherit word2vec
#' @param split a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x}
#' @param encoding the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'.
#' Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument
#' is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.
#' @export
word2vec.character <- function(x,
type = c("cbow", "skip-gram"),
dim = 50, window = ifelse(type == "cbow", 5L, 10L),
iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L,
stopwords = character(),
threads = 1L,
split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r",
".\n?!"),
encoding = "UTF-8",
...){
type <- match.arg(type)
stopw <- stopwords
model <- file.path(tempdir(), "w2v.bin")
@@ -172,14 +196,15 @@ word2vec.character <- function(x,
model
}
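A hedged usage sketch of the character method, spelling out the `split` and `encoding` arguments it adds on top of the generic (the file path here is hypothetical):

library(word2vec)
path  <- "corpus.txt"  ## hypothetical UTF-8 text file containing the training data
model <- word2vec(x = path, type = "cbow", dim = 50, iter = 10, threads = 2,
                  split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", ".\n?!"),
                  encoding = "UTF-8")
emb   <- as.matrix(model)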

#' @inherit word2vec
#' @export
word2vec.list <- function(x,
type = c("cbow", "skip-gram"),
dim = 50, window = ifelse(type == "cbow", 5L, 10L),
iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L,
stopwords = character(),
threads = 1L,
...){
type <- match.arg(type)
stopwords <- as.character(stopwords)
model <- file.path(tempdir(), "w2v.bin")
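And a sketch of the list method: each list element is taken as an already tokenised sentence, and `stopwords` are presumably matched against those tokens as given, since no further splitting takes place for list input (toy tokens, illustrative only):

library(word2vec)
toks  <- list(c("i", "like", "the", "bus"),
              c("the", "toilet", "was", "clean"))
model <- word2vec(x = toks, dim = 10, iter = 5, min_count = 1, stopwords = c("the"))
emb   <- as.matrix(model)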
63 changes: 44 additions & 19 deletions man/word2vec.Rd


