% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/chksum.R
\name{seguid}
\alias{seguid}
\alias{lsseguid}
\alias{csseguid}
\alias{ldseguid}
\alias{cdseguid}
\title{SEGUID checksums for linear, circular, single- and double-stranded sequences}
\usage{
seguid(seq, alphabet = "{DNA}", form = c("long", "short", "both"))

lsseguid(seq, alphabet = "{DNA}", form = c("long", "short", "both"))

csseguid(seq, alphabet = "{DNA}", form = c("long", "short", "both"))

ldseguid(watson, crick, alphabet = "{DNA}", form = c("long", "short", "both"))

cdseguid(watson, crick, alphabet = "{DNA}", form = c("long", "short", "both"))
}
\arguments{
\item{seq}{(character string) The sequence for which the checksum
should be calculated.  The sequence may only consist of symbols
in the alphabet specified by the \code{alphabet} argument.}

\item{alphabet}{(character string) The type of sequence used.
If \code{"{DNA}"} (default), then the input is a DNA sequence.
If \code{"{RNA}"}, then the input is an RNA sequence.
If \code{"{protein}"}, then the input is an amino-acid sequence.
If \code{"{DNA-extended}"} or \code{"{RNA-extended}"}, then the input is a
DNA or RNA sequence specified with an extended set of symbols, including
IUPAC symbols (3).
If \code{"{protein-extended}"}, then the input is an amino-acid sequence
with an extended set of symbols, including IUPAC symbols (4).
A custom alphabet may also be used.
A non-complementary alphabet is specified as a comma-separated
set of single symbols, e.g. \code{"X,Y,Z"}.
A complementary alphabet is specified as a comma-separated
set of paired symbols, e.g. \code{"AT,CG"}.
It is also possible to extend a pre-defined alphabet, e.g.
\code{"{DNA},XY"}.}

\item{form}{(character string) How the checksum is presented.
If \code{"long"} (default), the full-length checksum is outputted.
If \code{"short"}, the short, six-character checksum is outputted.
If \code{"both"}, both the short and the long checksums are outputted.}

\item{watson, crick}{(character strings) Two reverse-complementary DNA
sequences. Both sequences should be specified in the 5'-to-3' direction.}
}
\value{
The SEGUID functions return a single character string, if \code{form} is
either \code{"long"} or \code{"short"}. If \code{form} is \code{"both"}, then a character
vector of length two is returned, where the first component holds the
"short" checksum and the second the "long" checksum.
The long checksum, without the prefix, is a string with 27 characters.
The short checksum, without the prefix, is the first six characters
of the long checksum.
All checksums are prefixed with a label indicating which SEGUID
method was used.
Except for \code{seguid()}, which uses \emph{base64} encoding, all functions
produce checksums using the \emph{base64url} encoding ("Base 64 Encoding
with URL and Filename Safe Alphabet").

\code{seguid()} calculates the SEGUID v1 checksum for a linear,
single-stranded sequence.

\code{lsseguid()} calculates the SEGUID v2 checksum for a linear,
single-stranded sequence.

\code{csseguid()} calculates the SEGUID v2 checksum for a circular,
single-stranded sequence.

\code{ldseguid()} calculates the SEGUID v2 checksum for a linear,
double-stranded sequence.

\code{cdseguid()} calculates the SEGUID v2 checksum for a circular,
double-stranded sequence.
}
\description{
SEGUID checksums for linear, circular, single- and double-stranded sequences
}
\section{Base64 and Base64url encodings}{

The base64url encoding is the base64 encoding with non-URL-safe characters
substituted with URL-safe ones. Specifically, the plus symbol (\code{+}) is
replaced by the minus symbol (\code{-}), and the forward slash (\code{/}) is
replaced by the underscore symbol (\verb{_}).

The Base64 checksum, which is used for the original SEGUID checksum,
is not guaranteed to comprise symbols that can
safely be used as-is in a Uniform Resource Locator (URL). Specifically,
it may contain forward slashes (\code{/}) and plus symbols (\code{+}), which
are characters that carry special meaning in a URL.
For the same reason, a Base64 checksum cannot safely be used
as a file or directory name, because it may have a forward slash.

The checksum returned, without the prefix, is always 27 characters long.
This is because the standard representation always ends with a padding
character (\code{=}) so that the length is a multiple of four characters.
We relax this requirement by dropping the padding character.
}

\examples{
## SEGUID v1 on linear single-stranded DNA
seguid("GATTACA")
#> seguid=tp2jzeCM2e3W4yxtrrx09CMKa/8

## SEGUID v2 on linear single-stranded DNA
lsseguid("GATTACA")
#> lsseguid=tp2jzeCM2e3W4yxtrrx09CMKa_8

## SEGUID v2 on circular single-stranded DNA
## GATTACA = ATTACAG = ... = AGATTAC
csseguid("GATTACA")
#> csseguid=mtrvbtuwr6_MoBxvtm4BEpv-jKQ

## SEGUID v2 on blunt, linear double-stranded DNA
##   GATTACA
##   CTAATGT
ldseguid("GATTACA", "TGTAATC")
#> ldseguid=AcRsEcNFrui5wCxI7xxo6wnDYPY

## SEGUID v2 on staggered, linear double-stranded DNA
##   -ATTACA
##   CTAAT--
ldseguid("-ATTACA", "--TAATC")
#> ldseguid=98Klwxd3ZQPGHqnH3BheIuZVHQQ

## SEGUID v2 on circular double-stranded DNA
## GATTACA = ATTACAG = ... = AGATTAC
## CTAATGT = TAATGTC = ... = TCTAATG
cdseguid("GATTACA", "TGTAATC")
#> cdseguid=zCuq031K3_-40pArbl-Y4N9RLnA

## SEGUID v2 on linear single-stranded expanded
## epigenetic sequence (Viner et al., 2024)
viner_DNA <- "{DNA},m1,h2,f3,c4"
lsseguid("AmT2C", alphabet = viner_DNA)
#> lsseguid=MW4Rh3lGY2mhwteaSKh1-Kn2fGA

## SEGUID v2 on linear double-stranded expanded
## epigenetic sequence (Viner et al., 2024)
ldseguid("AmT2C", "GhA1T", alphabet = viner_DNA)
#> ldseguid=rsPDjP4SWr3-ploCeXTdTA80u0Y
}
\references{
\enumerate{
\item Babnigg, G., Giometti, CS. A database of unique protein sequence
identifiers for proteome studies. Proteomics.
2006 Aug;6(16):4514-22. \doi{10.1002/pmic.200600032}.
\item Josefsson, S., The Base16, Base32, and Base64 Data Encodings,
RFC 4648, \doi{10.17487/RFC4648}, October 2006,
\url{https://www.rfc-editor.org/info/rfc4648}.
\item Wikipedia article 'Nucleic acid notation', February 2024.
\url{https://en.wikipedia.org/wiki/Nucleic_acid_notation}.
\item Wikipedia article 'Amino acid', February 2024.
\url{https://en.wikipedia.org/wiki/Amino_acid}.
\item Wikipedia article 'SHA-1' (Secure Hash Algorithm 1), December 2023.
\url{https://en.wikipedia.org/wiki/SHA-1}.
}
}
