% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nametagger.R
\name{nametagger}
\alias{nametagger}
\title{Train a Named Entity Recognition Model using NameTag}
\usage{
nametagger(
  x.train,
  x.test = NULL,
  iter = 30L,
  lr = c(0.1, 0.01),
  lambda = 0.5,
  stages = 1L,
  weight_missing = -0.2,
  control = nametagger_options(token = list(window = 2)),
  type = if (inherits(control, "nametagger_options")) control$type else "generic",
  tagger = if (inherits(control, "nametagger_options")) control$tagger else "trivial",
  file = if (inherits(control, "nametagger_options")) control$file else
    "nametagger.ner"
)
}
\arguments{
\item{x.train}{a file with training data or a data.frame which can be passed on to \code{\link{write_nametagger}}}

\item{x.test}{optionally, a file with test data or a data.frame which can be passed on to \code{\link{write_nametagger}}}

\item{iter}{the number of iterations performed when training each stage of the recognizer. With more iterations, training takes longer (the recognition time is unaffected), but the model gets over-trained when too many iterations are used. Values from 10 to 30 or 50 are commonly used.}

\item{lr}{learning rates used. Should be a vector of length 2 where 
\itemize{
\item{element 1: learning rate used in the first iteration of SGD training method of the log-linear model. Common value is 0.1.}
\item{element 2: learning rate used in the last iteration of SGD training method of the log-linear model. Common values are in range from 0.1 to 0.001, with 0.01 working reasonably well.}
}}

\item{lambda}{the value of the Gaussian prior imposed on the weights, in other words the value of the L2-norm regularizer. Common values are either 0 for no regularization, or a small real number like 0.5.}

\item{stages}{the number of stages performed during recognition. Common values are either 1 or 2. With more stages, the model is larger and recognition is slower, but more accurate.}

\item{weight_missing}{default value of missing weights in the log-linear model. Common values are small negative real numbers like -0.2.}

\item{control}{the result of a call to \code{\link{nametagger_options}}, or the path to a file with predictive feature transformations serving as predictive elements in the model}

\item{type}{either one of 'generic', 'english' or 'czech'}

\item{tagger}{either one of 'trivial' (no lemma used in the training data) or 'external' (you provided your own lemma in the training data)}

\item{file}{path to the file where the model will be saved}
}
\value{
an object of class \code{nametagger} with an extra list element called \code{stats}, which contains information on the evolution of the log probability and of the accuracy on the training set and, optionally, on the test set
}
\description{
Train a Named Entity Recognition Model using NameTag. Details at \url{http://ufal.mff.cuni.cz/nametag/1}.
}
\examples{
% Walk-through: train a model from a data.frame, inspect it, predict with it,
% then train a second model from a training file and a feature file on disk.
% Code inside dontshow blocks is executed but hidden from the rendered help
% page. Here those blocks replace the displayed paths with temporary files,
% shrink the training set and train with iter = 1 instead of 30
% (presumably to keep automated checks fast and the working directory clean;
% confirm against the CRAN clean-up note at the end of this script).
%
% Keep one document of the europeananews data and split it on sentence_id:
% sentences above 100 are used for training, sentences up to 100 for testing.
data(europeananews)
x <- subset(europeananews, doc_id \%in\% "enp_NL.kb.bio")
traindata <- subset(x, sentence_id >  100)
testdata  <- subset(x, sentence_id <= 100)
% Displayed location of the model file.
path <- "nametagger-nl.ner" 
% Hidden overrides: model goes to a temporary file and training is limited to
% sentences with sentence_id strictly between 100 and 300.
\dontshow{
path <- tempfile("nametagger-nl_", fileext = ".ner")
traindata <- subset(x, sentence_id >  100 & sentence_id < 300)
testdata  <- subset(x, sentence_id <= 100)
} 
% Training options: the output file plus the feature settings handed to
% nametagger_options.
opts <- nametagger_options(file = path,
                           token = list(window = 2),
                           token_normalisedsuffix = list(window = 0, from = 1, to = 4),
                           ner_previous = list(window = 2),
                           time = list(use = TRUE),
                           url_email = list(url = "URL", email = "EMAIL"))
% Hidden single-iteration run; it defines model for the statements further
% down (presumably so they still work when the donttest block is skipped).
\dontshow{
model <- nametagger(x.train = traindata, x.test = testdata,
                    iter = 1, lambda = 0.5, control = opts)
}
% Displayed training call with 30 iterations; overwrites model when it runs.
\donttest{
model <- nametagger(x.train = traindata, 
                    x.test = testdata,
                    iter = 30, lambda = 0.5,
                    control = opts)
}
% Print the model and its stats element, then plot the log probability and the
% training accuracy per iteration; test accuracy is added as a red dashed line.
model
model$stats
plot(model$stats$iteration, model$stats$logprob, type = "b")
plot(model$stats$iteration, model$stats$accuracy_train, type = "b", ylim = c(95, 100))
lines(model$stats$iteration, model$stats$accuracy_test, type = "b", lty = 2, col = "red")
% Hidden guard: the prediction below only runs when udpipe can be loaded.
% The opening brace is emitted here and closed by the dontshow line after the call.
\dontshow{if(require(udpipe))\{}
predict(model, 
        "Ik heet Karel je kan me bereiken op paul@duchanel.be of www.duchanel.be", 
        split = "[[:space:]]+")
\dontshow{\} # End of main if statement running only if the required packages are installed}

% Second variant: control is a path to the features_default.txt file found
% under models in the installed nametagger package; its contents are printed.
features <- system.file(package = "nametagger", 
                        "models", "features_default.txt")
cat(readLines(features), sep = "\n")
% Displayed path for the exported training data, followed by a hidden override
% that points it at a temporary file.
path_traindata <- "traindata.txt" 
\dontshow{
path_traindata <- tempfile("traindata_", fileext = ".txt")
}
% Export the full subset x with write_nametagger and train from that file.
write_nametagger(x, file = path_traindata)
% Hidden single-iteration run from the file.
\dontshow{
model <- nametagger(path_traindata, iter = 1, control = features, file = path)
}
% Displayed 30-iteration run from the file.
\donttest{
model <- nametagger(path_traindata, iter = 30, control = features, file = path)
model
}

% Hidden clean-up: delete the model file and the exported training data.
\dontshow{
# clean up for CRAN
file.remove(path)
file.remove(path_traindata)
}
}
