% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/multiResGrid.R, R/zzz.R
\name{multiResGrid.MRG}
\alias{multiResGrid.MRG}
\alias{multiResGrid.data.frame}
\alias{multiResGrid.list}
\alias{multiResGrid}
\title{Function that creates a multi-resolution grid with larger grid cells in
regions with lower resolution of data, or where data needs to
be anonymized for disclosure control reasons. The function can also be used
to create a grid of new variables, following an existing grid.}
\usage{
\method{multiResGrid}{MRG}(MRGobject, ...)

\method{multiResGrid}{data.frame}(himg, ...)

\method{multiResGrid}{list}(
  gdl,
  ifg,
  vars,
  weights,
  countFeatureOrTotal = "feature",
  mincount = 10,
  nlarge = 2,
  plim = 0.85,
  verbose = FALSE,
  nclus = 1,
  clusType,
  domEstat = TRUE,
  outfile = NULL,
  checkDominance = TRUE,
  checkReliability = FALSE,
  userfun,
  strat = NULL,
  confrules = "individual",
  suppresslim = 0,
  sumsmall = FALSE,
  suppresslimSum = NULL,
  reliabilitySplit = TRUE,
  plotIntermediate = FALSE,
  addIntermediate = FALSE,
  postProcess = TRUE,
  rounding = -1,
  remCols = TRUE,
  ...
)

multiResGrid(...)
}
\arguments{
\item{MRGobject}{An object including all
the necessary variables and parameters, from a call to createMRGobject}

\item{...}{Possible arguments to userfun or other internal functions}

\item{himg}{The grid resulting from a call to multiResGrid}

\item{gdl}{A list of gridded data - with different resolutions}

\item{ifg}{Either a data.frame or tibble or sf-object with the locations and the data of the survey or census data,
or a list of such objects.}

\item{vars}{Variable(s) of interest that should be aggregated (necessary when ifg is
used for individual farm specific anonymization rules)}

\item{weights}{Extrapolation factor(s) (weights) wi of unit i in the sample of units nc 
falling into
         a specific cell c. Weights are used for disclosure control measures. 
         A weight of 1 will be used if missing.
       If only one weight is given, it will be used for all variables. If the length is more than one,
       the length has to be equal to the number of variables. If the same weight is used for several variables,
       it must be repeated in the weights-vector}

\item{countFeatureOrTotal}{Should the frequency limit be applied on records with a positive
value for a certain feature, or on all records, independent of value of feature}

\item{mincount}{The minimum number of farms for a grid cell (threshold rule)}

\item{nlarge}{Parameter to be used if the nlarge(st) farms should count for maximum plim percent of
the total value for the variable in the grid cell (see details of \code{\link{gridData}})}

\item{plim}{See nlarge}

\item{verbose}{indicates if some extra output should be printed}

\item{nclus}{Number of clusters to use for parallel processing. No parallelization is used
for \code{nclus = 1}.}

\item{clusType}{The type of cluster; see \code{\link[parallel]{makeCluster}} for more details.
The default of makeCluster is used if type is missing or NA}

\item{domEstat}{Should the dominance rule be applied as in the IFS handbook (TRUE), where 
 the weights are rounded before finding the  first nlarge contributors, or should 
it be the first nlarge contributors*weight, where also fractions are considered (FALSE)?}

\item{outfile}{File to direct the output in case of parallel processing, 
see \code{\link[parallel]{makeCluster}} for more details.}

\item{checkDominance}{Logical - should the dominance rule be applied?}

\item{checkReliability}{Logical - should the prediction variance be checked, and used for the aggregation?
This considerably increases computation time}

\item{userfun}{This gives the possibility to add a user defined function with additional confidentiality rules which 
the grid cell has to pass}

\item{strat}{Column name defining the strata for stratified sampling, used if checkReliability is TRUE}

\item{confrules}{Should the frequency rule (number of holdings) refer to the number of holdings with 
a value of the individual vars above zero ("individual") or the total number of holdings in 
the data set ("total")?}

\item{suppresslim}{Parameter that can be used to avoid that almost empty grid cells are merged with cells 
with considerably higher number of observations. The value is a minimum percentage of the total
potential new cell for a grid cell to be aggregated.}

\item{sumsmall}{Logical; should the suppresslimSum value be applied on the sum of
small grid cells within the lower resolution grid cell?
Note that different combinations of suppreslim and suppreslimSum values 
might not give completely intuitive results.For instance, if both are equal, then
a higher value can lead to more grid cells being left unaggregated for smaller grid sizes, leading
to aggregation for a large grid cell}

\item{suppresslimSum}{Parameter similar to suppreslim, but affecting the total
of grid cells to be suppressed}

\item{reliabilitySplit}{Logical or number - parameter to be used in
calculation of the reliability (if checkReliability = TRUE). It can either give the 
number of groups, or if TRUE, it will create groups of approdcimately 50,000 records per group. 
If FALSE, the data set will not be split, independent on the size.}

\item{plotIntermediate}{Logical or number - make a simple plot showing which grid cells have already 
passed the frequency rule. plotintermediate = TRUE, the function will wait 5 seconds after plotting 
before continuing, otherwise it will wait plotintermediate seconds.}

\item{addIntermediate}{Logical; will add a list of all intermediate himgs
and lohs (overlay of himg and the lower resolution grid) as an attribute to
the object to be returned}

\item{postProcess}{Logical; should the postprocessing be done as part
of creation of the multiresolution grid (TRUE), or be done in a separate 
step afterwards (FALSE). The second option is useful when wanting
to check the confidential grid cells of the final map}

\item{rounding}{either logical (FALSE) or an integer indicating the number 
of decimal places 
 to be used. Negative values are allowed (such as the default
 value rounding to the closest 10). See also the details
 for \code{digits} in \code{\link{round}}.}

\item{remCols}{Logical; Should intermediate columns be removed? Can be set
to FALSE for further analyses. Temporary columns will not be removed if their names 
partly match the variable names of \code{vars}}
}
\value{
The function will return a multi-resolution grid with observations
gridded to different grid cell sizes according to the confidentiality rules
to be applied. It can also include some additional columns that indicates 
which of the different confidentiality rules that have been applied.

Note that the function might (if \code{postProcess = FALSE})
return values also for the confidential grid-cells.
}
\description{
Two main confidentiality rules are considered:
- Threshold rule (suppression due to a minimum number of counts)
- Dominance rule (suppression due to dominance by one or more units)
}
\details{
This function will find the highest resolution data set that fulfills the 
       confidentiality rules and potential reliability rules for variable(s) of interest.
       Starting with the second highest resolution (5 km in the default settings),
       the function will check if any of the 1 km sub pixels will have values not fulfilling
       any of the confidentiality rules (number of farms, values of the 2 largest compared to values of the 
       entire grid cell).
       If all values are above the confidentiality limits, the grid cells will be kept at a 1 km resolution,
       otherwise only the 5 km grid cell will be kept. This will again be tested against the confidentiality
       rules in the next iteration, when grid cells will possibly be merged to 10 km grid cells.

       The function can also be called if it is necessary to create a grid of a new variable for the same
       grid as an already existing variable. The confidentiality rules will then be applied to the new
       variables for the existing grid cells, and mask the ones that do not respect the rules.
       The function will not do any further merging
       of grid cells, for this it is necessary to grid the variables together.
       This feature is useful when the new data set has a similar resolution 
       as the original data set. It will give a high number of missing values if
       the resolution of the new data is more sparse than the original. In the examples below,
       this means that it is possible to copy the grid of organic organic 
       agricultural area to a grid of all agricultural area, whereas the opposite
       will not work well.

       The standard threshold rule for spatial data is at least 10 units (mincount). 
       The parameters nlarge and plim are used for determining the dominance treatment for the variable of interest,
       with default values of \code{nlarge = 2} and \code{plim = 0.85}. 
       If more than plim of the values of the grid cell (e.g. UAA, arable land, number of livestock)
       is explained by 1-nlarge weighted holdings, the grid cell will not pass the confidentiality rule.
       
       The concept of reliability is explained in details in section 4.6 in the integrated farm survey handbook for 2023:
       https://wikis.ec.europa.eu/display/IFS/Integrated+Farm+Statistics+Manual+%7C+2023+edition
       In short, it is an estimate of the coefficient of variation for an estimate (a grid cell in this case),
       based on the number in the sample relative to the number in the population, and taking into account
       possible stratified sampling approaches. The number is zero if all holdings in the population in 
       a grid cell has been sampled, and the default requirement is that the CV is less than 35%.
       
       There are some cases where aggregation might not be desired. In the situation where a 
       relatively large single grid cell does not respect the confidentiality rules, it is fine to 
       aggregate it if the neighbouring grid cells are also relatively large. However, it can be seen
       as unfortunate if the single cell was aggregated with many smaller grid cells that could otherwise 
       be disseminated at a high resolution. The added value of being able to present a value for a 
       region with very few farms is perhaps lower than what is lost by having to aggregate to a 
       lower resolution. The parameter \code{suppresslim} indicates the minimum value in a grid cell 
       relative to the possible lower resolution grid cell 
       before it is necessary to aggregate. If the limit is 0.05, a grid cell would only cause an aggregation
       to lower resolution if the value in the grid cell is more than 5% of the value in the lower resolution 
       grid cell. Instead, it would be left as it is, and will be suppressed in the post-processing step.

       There are cases when the built-in confidentiality checks are not what the user needs. 
       That is why it is possible to submit a user defined function. This function needs to follow
       certain rules. 
       \enumerate{
         \item The first argument must be a data.frame with name \code{df}. 
               This is a data.frame with the individual records
               for a particular grid cell. It has three columns: 
             \enumerate{
                \item himgid - the ID of the current grid cell. This is the grouping variable
                       and is constant for the data.frame
                \item gridvar - a new common name for the current variable to be gridded
                \item weight - the weight of the variable to be gridded
             }  
         \item The function can include additional parameters for calculation of confidentiality
                (or reliability, or suitability, if the meaning of the function refers to something else).
                This can be new parameters to this particular function (through the 
                ellipsis argument (...) of \code{multiResGrid}), existing parameters to \code{multiResGrid},
                or potentially internal variables of \code{multiResGrid.})
         \item The result of the function must be a logical, either the rule was passed 
               for the records of this grid cell, or not (TRUE/FALSE)
         \item The function can potentially use internal variables of \code{multiResGrid}, however,
               the meaning of these will have to be understood from the code
       }
       A simple example of a \code{userfun} is given in the example section below (the one producing \code{himg6})
}
\examples{
\donttest{
library(sf)
library(viridis)
library(ggplot2)
library(patchwork)
library(giscoR)

# These are SYNTHETIC agricultural FSS data 
data(ifs_dk) # Census data
ifs_weight = ifs_dk \%>\% dplyr::filter(Sample == 1) # Extract weighted subsample

# Create spatial data
ifg = fssgeo(ifs_dk, locAdj = "LL")
fsg = fssgeo(ifs_weight, locAdj = "LL")
# Read country borders, only used for plotting
borders = gisco_get_nuts(nuts_level = 0)
dkb = borders[borders$CNTR_CODE == "DK",] \%>\% st_transform(crs = 3035)

ress = c(1,5,10,20,40, 80, 160)*1000
# Gridding Utilized agricultural area (UAA)
ifl = gridData(ifg, "UAA",res = ress)
# Gridding organic utilized agricultural area
ifl2 = gridData(ifg, vars = "UAAXK0000_ORG", res = ress)

# Gridding UAA and organic UAA together
ifl3 = gridData(ifg, vars = c("UAA", "UAAXK0000_ORG"), res = ress)

# Gridding the UAA from the survey - the survey weights are in the column EXT_MODULE
fsl = gridData(fsg,  vars = c("UAA"), weights = "EXT_MODULE",  res = ress)

# Create a multi-resolution grid only with farm number as confidentiality rule, then plot results
himg0 = multiResGrid(ifl, checkReliability = FALSE, suppresslim = 0)
ggplot(himg0) + geom_sf(aes(fill = count))

# Create a multi-resolution grid of UAA, also based on the dominance rule (default)
himg1 = multiResGrid(ifl, vars = "UAA", ifg = ifg)
p1 = ggplot(himg1) + geom_sf(aes(fill = UAA))
p1
# Create multi-resolution grid of organic UAA
himg2 = multiResGrid(ifl2, vars = "UAAXK0000_ORG", ifg = ifg)
himg21 = multiResGrid(ifl2, vars = "UAAXK0000_ORG", ifg = ifg, postProcess = FALSE)

ggplot(himg2) + geom_sf(aes(fill = UAAXK0000_ORG))

# Create joint multi-resolution grid of organic UAA and total UAA
himg3 = multiResGrid(ifl3, vars = c("UAA", "UAAXK0000_ORG"), ifg = ifg, 
                  checkReliability = FALSE, suppresslim = 0)
# Create multi-resolution grid of organic UAA, based on the UAA grid
# The large number of missing values indicates that this feature should
# mainly be used for data that have similar or higher resolution as the
# original data set.
himg33 = multiResGrid(himg1, vars = c("UAAXK0000_ORG"), ifg = ifg, 
                  checkReliability = FALSE, suppresslim = 0)
p31 = ggplot(himg3) + geom_sf(aes(fill = UAA))
p32 = ggplot(himg3) + geom_sf(aes(fill = UAAXK0000_ORG))
p33 = ggplot(himg33) + geom_sf(aes(fill = UAAXK0000_ORG))
p31 + p32 + p33

# Create multi-resolution grid of UAA, based on survey data,
# with and without applying reliability check
# Slow!
himg4 = multiResGrid(fsl,  vars = c("UAA"), weights = "EXT_MODULE", ifg = fsg, 
                      strat = "STRA_ID_CORE", checkReliability = FALSE)
# The parameter reliabilitySplit = 15 will divide the data set in 15 groups for the 
# reliabilityCheck.
# This is more than recommended, but speeds up the computation for this example
himg5 = multiResGrid(fsl,  vars = c("UAA"), weights = "EXT_MODULE", ifg = fsg, 
                      strat = "STRA_ID_CORE", checkReliability = TRUE, 
                      reliabilitySplit = 15)
                      
# Apply suppreslim to suppress insignificant grid cells
# Show intermediate maps of confidential cells (wait 5 seconds)
pint = ifelse(interactive(), 5, FALSE)
#himg11 = multiResGrid(ifl, vars = "UAA", ifg = ifg, 
#                  suppresslim = 0, plotIntermediate = pint)
himg11 = himg1
himg12 = multiResGrid(ifl, vars = "UAA", ifg = ifg, 
                 suppresslim = 0.02, plotIntermediate = pint)
himg13 = multiResGrid(ifl, vars = "UAA", ifg = ifg, 
                 suppresslim = 0.05, plotIntermediate = pint)
himg14 = multiResGrid(ifl, vars = "UAA", ifg = ifg, 
                 suppresslim = 0.1, plotIntermediate = pint)
 
 
 # This is an example of a userfun that can be used for alternative restrictions
 # for a grid cell. This particular toy example assures that there are at least
 # nabove records with a value (UAA in this case) above limit. 
 ufun = function(df, nabove, limit) {
   sum(df$gridvar > limit) < nabove
 }
 
himg6 = multiResGrid(ifl, vars = "UAA", ifg = ifg, 
                 suppresslim = 0.2, plotIntermediate = pint, userfun = ufun, nabove = 5, limit = 10)
 
 
 
himg00 = st_intersection(dkb, himg0)
ggplot() + geom_sf(data = himg00, aes(fill = count, color = count)) +
  scale_fill_viridis( name = "number of farms", trans = "log10") +
  scale_color_viridis( name = "number of farms", trans = "log10") +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("Number of farms for variable grid cell size, only frequency confidentiality") +
  theme_bw()
 
himg01 = st_intersection(dkb, himg1)
ggplot() + geom_sf(data = himg01, aes(fill = count, color = count)) +
  scale_fill_viridis( name = "number of farms", trans = "log10") +
  scale_color_viridis( name = "number of farms", trans = "log10") +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("Number of farms for variable grid cell size, frequency and dominance confidentiality") +
  theme_bw()

  
# Plot the density of organic agriculture, as hectares per square km
himg02 = st_intersection(dkb, himg2)
himg02$orgarea = himg02$UAAXK0000_ORG/units::set_units(st_area(himg02), "km^2")
units(himg02$orgarea) = NULL
ggplot() + geom_sf(data = himg02, aes(fill = orgarea), lwd = 0) +
  scale_fill_viridis( name = "ha / km2") +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("Organic UAA density")  +
  theme_bw()

# Plot the relative abundance of organic UAA relative to total UAA
himg03 = st_intersection(dkb, himg3)
himg03$ouaashare = himg03$UAAXK0000_ORG/himg03$UAA*100
ggplot() + geom_sf(data = himg03, aes(fill = ouaashare), lwd = 0) +
  scale_fill_viridis( name = "\% Organic") +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("Organic share")  +
  theme_bw()
  
  
# Plot maps from survey data before and after adding the reliability constraint 
# The percentage of UAA can be above 100\% due to farm area being registered at the location
# of the administration building, but the map without reliability check has too high values 
# for too many cells

himg04 = st_intersection(dkb, himg4)
himg04$area = st_area(himg04)/1e6
units(himg04$area) = NULL
himg04$uaashare = himg04$UAA/himg04$area
himg04$uaashare[himg04$uaashare > 1000] = 1000
g4 = ggplot() + geom_sf(data = himg04, aes(fill = uaashare), lwd = 0) +
  scale_fill_viridis( name = "\% UAA",  trans = "log10", limits = c(1,1000)) +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("UAA share (sample without reliability check)")  +
  theme_bw()
  
himg05 = st_intersection(dkb, himg5)
himg05$area = st_area(himg05)/1e6
units(himg05$area) = NULL
himg05$uaashare = himg05$UAA/himg05$area
himg05$uaashare[himg05$uaashare > 1000] = 1000
g5 = ggplot() + geom_sf(data = himg05, aes(fill = uaashare), lwd = 0) +
  scale_fill_viridis( name = "\% UAA",  trans = "log10", limits = c(1,1000)) +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("UAA share (sample with reliability check)")  +
  theme_bw()
  
g4 + g5 + plot_layout(guides = "collect")
  
himg06 = st_intersection(dkb, himg6)
ggplot() + geom_sf(data = himg06, aes(fill = UAA), lwd = 0) +
  scale_fill_viridis( name = "ha") +
  geom_sf(data = dkb, fill = NA, colour='black', lwd = 1) +
  coord_sf(crs = 3035) +#, xlim = c(2377294, 6400000), ylim = c(1313597, 5628510)) +
  ggtitle("UAA, with additional user defined function")  +
  theme_bw()
  
     
# Plot the different maps from using different suppreslim values
himgs = list(himg11, himg12, himg13, himg14)
slims = c(0, 0.02, 0.05, 0.1, 0.2)
plots = list()
uaas = c(himg11$UAA, himg12$UAA, himg13$UAA, himg14$UAA)
lims = range(uaas[uaas > 0], na.rm = TRUE)
for (ii in 1:4) {
  himg = st_intersection(dkb, himgs[[ii]])
  plots[[ii]] = 
   ggplot() + geom_sf(data = himg, aes(fill = UAA), lwd = 0) +
    scale_fill_viridis( name = "UAA (ha)", trans = "log10", limits = lims, na.value="red") +
    geom_sf(data = dkb, fill = NA, colour='black', lwd = 0.5) +
    ggtitle(paste("Suppresslim = ", slims[[ii]])) +
    xlab("") + ylab("") +
    theme_bw()
}

plots[[1]]  + plots[[2]] + plots[[3]]  + plots[[4]] + plot_layout(guides = "collect")
 

}


}
