% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sequence.R
\name{gseq.pwm}
\alias{gseq.pwm}
\title{Score DNA sequences with a PWM over a region of interest}
\usage{
gseq.pwm(
  seqs,
  pssm,
  mode = c("lse", "max", "pos", "count"),
  bidirect = TRUE,
  strand = 0L,
  score.thresh = 0,
  start_pos = NULL,
  end_pos = NULL,
  extend = FALSE,
  spat.factor = NULL,
  spat.bin = 1L,
  spat.min = NULL,
  spat.max = NULL,
  return_strand = FALSE,
  skip_gaps = TRUE,
  gap_chars = c("-", "."),
  neutral_chars = c("N", "n", "*"),
  neutral_chars_policy = c("average", "log_quarter", "na"),
  prior = 0.01
)
}
\arguments{
\item{seqs}{character vector of DNA sequences (A/C/G/T/N; case-insensitive)}

\item{pssm}{numeric matrix or data frame with columns named A, C, G, T (additional columns are allowed and will be ignored)}

\item{mode}{character; one of "lse", "max", "pos", or "count"}

\item{bidirect}{logical; if TRUE, scans both strands (default: TRUE)}

\item{strand}{integer; 1=forward, -1=reverse, 0=both strands (default: 0)}

\item{score.thresh}{numeric; score threshold for \code{mode="count"} (default: 0)}

\item{start_pos}{integer or NULL; 1-based inclusive start of ROI (default: 1)}

\item{end_pos}{integer or NULL; 1-based inclusive end of ROI (default: sequence length)}

\item{extend}{logical or integer; extension of allowed window starts (default: FALSE)}

\item{spat.factor}{numeric vector; spatial weighting factors (optional)}

\item{spat.bin}{integer; bin size for spatial weighting}

\item{spat.min}{numeric; start of scanning window}

\item{spat.max}{numeric; end of scanning window}

\item{return_strand}{logical; if TRUE and \code{mode="pos"}, returns data.frame with
\code{pos} and \code{strand} columns}

\item{skip_gaps}{logical; if TRUE, treat gap characters as holes and skip them while
scanning. Windows are w consecutive non-gap bases (default: TRUE)}

\item{gap_chars}{character vector; which characters count as gaps (default: c("-", "."))}

\item{neutral_chars}{character vector; bases treated as unknown and scored with the average
log probability per position (default: c("N", "n", "*"))}

\item{neutral_chars_policy}{character string; how to treat neutral characters. One of
\code{"average"} (default; use the column's mean log-probability), \code{"log_quarter"}
(always use \code{log(1/4)}), or \code{"na"} (return NA when a neutral character is
encountered in the scanning window).}

\item{prior}{numeric; pseudocount added to frequencies (default: 0.01). Set to 0 for no pseudocounts.}
}
\value{
Numeric vector (for "lse"/"max"/"count" modes), integer vector (for "pos" mode),
  or data.frame with \code{pos} and \code{strand} columns (for "pos" mode with
  \code{return_strand=TRUE}). Returns NA when no valid windows exist.
}
\description{
Scores full DNA sequences using a Position Weight Matrix (PWM) over a specified
region of interest (ROI). The ROI is defined by \code{start_pos} and \code{end_pos}
(1-based, inclusive), with optional extension controlled by \code{extend}.
All reported positions are on the full input sequence.
}
\details{
This function scores DNA sequences directly without requiring a genomics database.
For detailed documentation on PWM scoring modes, parameters, and spatial weighting,
see \code{\link{gvtrack.create}} (functions "pwm", "pwm.max", "pwm.max.pos", "pwm.count").

The ROI (region of interest) is defined by \code{start_pos} and \code{end_pos}.
The \code{extend} parameter controls whether motif matches can extend beyond the ROI boundaries.

When \code{skip_gaps=TRUE}, characters specified in \code{gap_chars} are treated as gaps.
Windows are defined as w consecutive non-gap bases. All positions (\code{pos}) are reported
as 1-based indices on the original full sequence (including gaps). \code{start_pos} and
\code{end_pos} are interpreted as physical coordinates on the full sequence.

Neutral characters (\code{neutral_chars}, default \code{c("N", "n", "*")}) are treated as
unknown bases in both orientations. Each neutral contributes the mean log-probability of the
corresponding PSSM column, yielding identical penalties on forward and reverse strands without
hard-coded background scores. In \code{mode = "max"} the reported value is the single best
strand score after applying any spatial weights; forward and reverse contributions are not
aggregated. This matches the default behavior of the PWM virtual tracks (\code{pwm.max},
\code{pwm.max.pos}, etc.).
}
\examples{
\dontrun{
# Create a PSSM (position-specific scoring matrix) with frequency values
pssm <- matrix(
    c(
        0.7, 0.1, 0.1, 0.1, # Position 1: mostly A
        0.1, 0.7, 0.1, 0.1, # Position 2: mostly C
        0.1, 0.1, 0.7, 0.1, # Position 3: mostly G
        0.1, 0.1, 0.1, 0.7 # Position 4: mostly T
    ),
    ncol = 4, byrow = TRUE
)
colnames(pssm) <- c("A", "C", "G", "T")

# Example sequences
seqs <- c("ACGTACGTACGT", "GGGGACGTCCCC", "TTTTTTTTTTT")

# Score sequences using log-sum-exp (default mode)
gseq.pwm(seqs, pssm, mode = "lse")

# Get maximum score
gseq.pwm(seqs, pssm, mode = "max")

# Find position of best match
gseq.pwm(seqs, pssm, mode = "pos")

# Find position with strand information
gseq.pwm(seqs, pssm, mode = "pos", bidirect = TRUE, return_strand = TRUE)

# Count matches above threshold
gseq.pwm(seqs, pssm, mode = "count", score.thresh = 0.5)

# Score only a region of interest
gseq.pwm(seqs, pssm, mode = "max", start_pos = 3, end_pos = 10)

# Allow matches to extend beyond ROI boundaries
gseq.pwm(seqs, pssm, mode = "count", start_pos = 5, end_pos = 8, extend = TRUE)

# Spatial weighting example: higher weight in the center
spatial_weights <- c(0.5, 1.0, 2.0, 1.0, 0.5)
gseq.pwm(seqs, pssm,
    mode = "lse",
    spat.factor = spatial_weights,
    spat.bin = 2
)
}

}
\seealso{
\code{\link{gvtrack.create}} for detailed PWM parameter documentation
}
