\name{findk}
\alias{findk}
\title{
Estimate the Number of Clusters in a Data Set
}
\description{
Based on several descriptive statistics of the peak counts in the frequency polygons of the features, this function proposes a list of estimates of the number of clusters in a data set.
}
\usage{
findk(x, binrule, nbins, tcmethod, tc, trmethod, tv, rms=FALSE, rcs=FALSE, tpc=1)
}

\arguments{
  \item{x}{a numeric data frame or matrix.}
  \item{binrule}{a string specifying the binning rule to compute the number of classes of a frequency polygon.}
  \item{nbins}{an integer specifying the number of classes (bins). It is computed internally according to the selected binning rule, except when \code{binrule} is \option{usr}, in which case the value supplied here is used. See \code{\link{genpolygon}} for all available binning rules.}
  \item{tcmethod}{a string specifying the threshold method used to compute the threshold frequency value for discarding the small or empty bins of a frequency polygon. See all available options in \code{\link{findpolypeaks}}.}
  \item{tc}{an integer for threshold frequency value assigned by \code{tcmethod}.}
  \item{trmethod}{a string used to specify a removal method to discard the shoulders around the main peaks in a frequency polygon. See all available options in \code{\link{rmshoulders}}.}
  \item{tv}{a numeric threshold distance value assigned by \code{trmethod}.}
  \item{rms}{a logical value indicating whether the shoulders removal is applied. Default value is \option{FALSE}.}
  \item{rcs}{a logical value indicating whether the estimates of \var{k} are computed on the reduced counts set instead of the full set. Default value is \option{FALSE}; set it to \code{TRUE} in order to use the reduced counts set.}
  \item{tpc}{an integer threshold value for creating the reduced set of the peak counts. Default value is 1.}
}

\details{
The function \code{findk} returns a list of \var{k} values which are proposed as the estimates of the number of clusters in a given data set. The estimation is based on various descriptive statistics of the peak counts in the frequency polygons of the features. Firstly, the classes of the frequency polygons of the features are generated by using the function \code{\link{genpolygon}}. Then, the main peaks in the frequency polygons are determined by using the function \code{\link{findpolypeaks}}. If desired, the shoulder peaks are removed from the peaks matrix returned by the function \code{\link{findpolypeaks}} by using the function \code{\link{rmshoulders}}. In the resulting peaks matrix, the peaks are counted for each feature, and a list of estimates of \var{k} is produced by using various descriptive statistics of the peak counts.
}

\value{a list of the estimates of \var{k}, consisting of the following items, which are computed from the peak counts of the features in a given data set:
   \item{am}{arithmetic mean of peak counts.}
   \item{med}{median of peak counts.}
   \item{mod}{mode of peak counts.}
   \item{cr}{center of the range of peak counts.}
   \item{ciqr}{center of the interquartile range (IQR) of peak counts.}
   \item{mppc}{overall mean of the pairwise means of peak counts.}
   \item{mq3m}{mean of the third quartile (Q3) and maximum of peak counts.}
   \item{mtl}{mean of the two largest values of peak counts.}
   \item{avgk}{proposed \var{k} as the mean of all the estimates.}
   \item{modk}{proposed \var{k} as the mode of all the estimates.}
   \item{mtlk}{proposed \var{k} as the mean of two largest estimates.}
   \item{dst}{a string representing the type of counts set which is used in computations.}
   \item{pcounts}{an integer vector containing the peak counts of the features.}
}

\author{
Zeynel Cebeci, Cagatay Cebeci
}

\note{
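\code{findk} takes the data set itself as its input; the peaks matrices it works on are normally the outputs from the functions \code{\link{findpolypeaks}} and, when the shoulders removal is requested with \code{rms}, \code{\link{rmshoulders}}.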
}

\seealso{
 \code{\link{findpolypeaks}}, 
 \code{\link{rmshoulders}}
}

\examples{
# Estimate the number of clusters in x5p4c data set
data(x5p4c)
estk <- findk(x5p4c, binrule="sturges")
print(estk)
summary(estk$pcounts)
cat("Estimated the number of clusters as the mean of Q3 and max peak count:", estk$mq3m, fill=TRUE)
cat("Proposed number of clusters based on the mean of two largest estimates:", estk$mtlk, fill=TRUE)

# Estimate the number of clusters in x5p4c data set by using 15 user-specified bins,
# the user-specified threshold frequency (method 'usr', tc=1) and shoulders removal method 'avg'
estk <- findk(x5p4c, binrule="usr", nbins=15, tcmethod="usr", tc=1, trmethod="avg", rms=TRUE)
print(estk)
summary(estk$pcounts)
cat("Proposed number of clusters based on the mean of two largest estimates:", estk$mtlk, fill=TRUE)

# Estimate the number of clusters in iris data set
data(iris)
estk <- findk(iris[,1:4], binrule="bc", rcs=FALSE)
print(estk)
summary(estk$pcounts)
cat("Proposed number of clusters based on the mean of estimates:", estk$avgk, fill=TRUE)
cat("Proposed number of clusters based on the mode of estimates:", estk$modk, fill=TRUE)
cat("Proposed number of clusters based on the mean of two largest estimates:", estk$mtlk, fill=TRUE)
}
\concept{number of clusters}
\concept{partitioning clustering}
\concept{cluster analysis}
\keyword{cluster}