% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sentencepiece.R
\name{sentencepiece}
\alias{sentencepiece}
\title{Construct a Sentencepiece model}
\usage{
sentencepiece(
  x,
  type = c("bpe", "char", "unigram", "word"),
  vocab_size = 8000,
  coverage = 0.9999,
  model_prefix = "sentencepiece",
  model_dir = tempdir(),
  threads = 1L,
  args,
  verbose = FALSE
)
}
\arguments{
\item{x}{a character vector of path(s) to the text files containing training data}

\item{type}{one of 'bpe', 'char', 'unigram' or 'word', for Byte Pair Encoding, character-level encoding,
unigram encoding or pretokenised word encoding, respectively. Defaults to 'bpe' (Byte Pair Encoding).}

\item{vocab_size}{integer indicating the number of tokens in the final vocabulary. Defaults to 8000.}

\item{coverage}{fraction of characters covered by the model. Must be in the range [0, 1]. Defaults to 0.9999, which is a good value to use.}

\item{model_prefix}{character string with the name of the model. Defaults to 'sentencepiece'.
When executing the function, 2 files will be created in the directory specified by \code{model_dir}.
With the default prefix these are sentencepiece.model, containing the model, and sentencepiece.vocab, containing the vocabulary of the model.
You can change the name of the model by providing the \code{model_prefix} argument.}

\item{model_dir}{directory where the model will be saved. Defaults to the temporary directory (\code{tempdir()}).}

\item{threads}{integer indicating the number of threads to use when building the model. Defaults to 1.}

\item{args}{character string with arguments passed on to \code{sentencepiece::SentencePieceTrainer::Train} (for expert use only)}

\item{verbose}{logical indicating to show progress of sentencepiece training. Defaults to \code{FALSE}.}
}
\value{
an object of class \code{sentencepiece}, which is documented at \code{\link{sentencepiece_load_model}}
}
\description{
Construct a Sentencepiece model on text.
}
\examples{
# Load the belgium_parliament dataset from the tokenizers.bpe package
library(tokenizers.bpe)
data(belgium_parliament, package = "tokenizers.bpe")
# File that will hold the training text, and the folder passed as model_dir
path   <- "traindata.txt" 
folder <- getwd() 
\dontshow{
# use temporary locations instead of the working directory
path   <- tempfile("traindata_", fileext = ".txt")
folder <- tempdir()
}
# Write the text column of the dataset to the training file
writeLines(belgium_parliament$text, con = path)
\dontshow{
# runs with small vocabulary sizes for three of the model types
model <- sentencepiece(path, type = "char", vocab_size = 30, model_dir = folder)
model <- sentencepiece(path, type = "unigram", vocab_size = 50, model_dir = folder)
model <- sentencepiece(path, type = "bpe", vocab_size = 200, model_dir = folder)
}
\donttest{
# Build a model of each type with progress output; every call reassigns model,
# so the bpe model is the one used for encoding below
model <- sentencepiece(path, type = "char", 
                       model_dir = folder, verbose = TRUE)
model <- sentencepiece(path, type = "unigram", vocab_size = 20000, 
                       model_dir = folder, verbose = TRUE)
model <- sentencepiece(path, type = "bpe", vocab_size = 4000, 
                       model_dir = folder, verbose = TRUE)

# Encode two example sentences, once as subwords and once as ids
txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.",
         "On est d'accord sur le prix de la biere?")
sentencepiece_encode(model, x = txt, type = "subwords")
sentencepiece_encode(model, x = txt, type = "ids")


# Load the sentencepiece.model file from folder and encode the same text again
model <- sentencepiece_load_model(file.path(folder, "sentencepiece.model"))
sentencepiece_encode(model, x = txt, type = "subwords")
sentencepiece_encode(model, x = txt, type = "ids")
}

\dontshow{
# clean up for CRAN
file.remove(file.path(folder, "sentencepiece.model"))
file.remove(file.path(folder, "sentencepiece.vocab"))
file.remove(path)
}
}
\seealso{
\code{\link{sentencepiece_load_model}}
}
