% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/TaxonInfluence.R
\encoding{UTF-8}
\name{TaxonInfluence}
\alias{TaxonInfluence}
\title{Rank taxa by their influence on phylogenetic results}
\usage{
TaxonInfluence(
  dataset,
  tree = NULL,
  Distance = ClusteringInfoDistance,
  calcWeighted = TRUE,
  savePath = NULL,
  useCache = FALSE,
  verbosity = 3L,
  ...
)
}
\arguments{
\item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class
\code{phyDat}, whose names correspond to the labels of any accompanying tree.}

\item{tree}{Optimal tree or summary tree (of class "phylo") or list of trees
(of class "list" or "multiPhylo") against which results should be evaluated.
If \code{NULL}, an optimal tree will be sought using parsimony search with
the parameters provided in \code{\dots}.}

\item{Distance}{Function to calculate tree distance; default:
\code{\link[=ClusteringInfoDistance]{ClusteringInfoDistance()}}.}

\item{calcWeighted}{Logical specifying whether to compute the
distance-weighted mean value.}

\item{savePath}{Character giving prefix of path to which reduced trees will be
saved (with \code{\link[ape:write.nexus]{write.nexus()}}). File names will follow the pattern
\code{paste0(savePath, droppedTaxonName, ".nex")}; \code{savePath} should thus contain
a trailing \code{/} if writing to a directory, which will be created if it does
not exist.  Special characters will be removed from leaf labels when
creating the file path (using \code{\link[fs:path_sanitize]{path_sanitize()}}).
If \code{NULL}, computed trees will not be saved.}

\item{useCache}{Logical vector; if \code{TRUE}, previous tree search results will
be loaded from the location given by \code{savePath}, instead of running a fresh
search with the specified dataset and parameters.}

\item{verbosity, \dots}{Parameters for \code{\link[=MaximizeParsimony]{MaximizeParsimony()}}.
Tree search will be conducted using \code{tree} as a starting tree.}
}
\value{
\code{TaxonInfluence()} returns a matrix listing the phylogenetic
influence of each taxon, measured in the units of the chosen tree distance
metric (default = bits).
Columns denote taxa; rows denote the maximum, distance-weighted mean,
and minimum distance between optimal tree sets.
}
\description{
\code{TaxonInfluence()} ranks taxa according to their influence on the most
parsimonious topology.
}
\details{
\code{TaxonInfluence()} follows the approach of
\insertCite{Mariadassou2012sb;textual}{TreeSearch} in repeating tree search
whilst leaving each taxon in turn out of the analysis, and measuring
the distance of reconstructed trees from the optimal tree obtained when
all taxa are included in phylogenetic inference.

As \insertCite{Denton2018ee;textual}{TreeSearch} emphasize, the
Robinson–Foulds distance is unsuitable for this purpose; this function
allows the user to specify a preferred tree distance measure, defaulting
to the clustering information distance \insertCite{Smith2020}{TreeSearch}.
Because optimal parsimony trees are not equiprobable, taxon influence is
ranked based on the maximum and minimum tree-to-tree distances between
optimal trees.
}
\section{Distance-weighted mean}{

Sets of equally parsimonious trees are not statistical samples of tree space,
but are biased towards areas of uncertainty.
It is possible that a set of trees contains all possible resolutions of a
particular clade, and a single other topology in which that clade does not
exist – essentially two distinct solutions, one (\emph{a}) which could be
summarized with a summary tree that contains a polytomy, and another (\emph{b})
which could be summarized by a perfectly resolved tree.
Neither of these scenarios is preferable under the principles of parsimony;
but summary statistics (e.g. mean, median) will be strongly influenced by the
many trees in group \emph{a}, thus underplaying the existence of solution \emph{b}.

\code{TaxonInfluence()} uses an \emph{ad hoc} method to produce summary statistics
after weighting for trees' distance from other trees.  Trees that have few
close neighbours contribute more to the weighted mean, thus reducing the
influence of many trees that differ only in small details.
This distance-weighted mean is thus less prone to bias than a simple mean
– it is no more statistically valid, but (potentially) provides a more
representative summary of comparisons between sets of trees.
}

\examples{
# Load data for analysis in R
library("TreeTools")
data("congreveLamsdellMatrices", package = "TreeSearch")

# Small dataset for demonstration purposes
dataset <- congreveLamsdellMatrices[[42]][1:8, ]
bestTree <- MaximizeParsimony(dataset, verbosity = 0)[[1]]

# Calculate tip influence.
# Search parameters are kept minimal so that the example runs quickly;
# `verbosity` is spelt out in full rather than relying on partial matching.
influence <- TaxonInfluence(dataset, ratchIt = 0, startIt = 0, verbosity = 0)

# Colour tip labels according to their influence
upperBound <- 2 * TreeDist::ClusteringEntropy(
  PectinateTree(NTip(dataset) - 1))
nBin <- 128
# `nBin + 1` break points delimit exactly `nBin` intervals, so that each
# interval corresponds to one of the `nBin` colours in the palette
bin <- cut(
  influence["max", ],
  breaks = seq(0, upperBound, length.out = nBin + 1),
  include.lowest = TRUE
)
palette <- hcl.colors(nBin, "inferno")

# Plot the tree, with a legend mapping colours to influence values
plot(bestTree, tip.color = palette[bin])
PlotTools::SpectrumLegend(
  "bottomleft",
  palette = palette,
  title = "Tip influence / bits",
  legend = signif(seq(upperBound, 0, length.out = 4), 3),
  bty = "n"
)
}
\references{
\insertAllCited{}
}
\seealso{
Other tree scoring: 
\code{\link{CharacterLength}()},
\code{\link{IWScore}()},
\code{\link{LengthAdded}()},
\code{\link{MinimumLength}()},
\code{\link{MorphyTreeLength}()}
}
\author{
\href{https://smithlabdurham.github.io/}{Martin R. Smith}
(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk})
}
\concept{tree scoring}
