12 Statistical learning

The solutions assume the following packages are attached (other packages will be attached when needed):

library(dplyr)
# library(kernlab)
library(mlr3)
library(mlr3learners)
library(mlr3extralearners)
library(mlr3spatiotempcv)
library(mlr3tuning)
library(qgisprocess)
library(terra)
library(sf)
library(tmap)

E1. Compute the following terrain attributes from the elev dataset loaded with terra::rast(system.file("raster/ta.tif", package = "spDataLarge"))$elev with the help of R-GIS bridges (see the bridges to GIS software chapter):

Slope
Plan curvature
Profile curvature
Catchment area

# attach data
dem = terra::rast(system.file("raster/ta.tif", package = "spDataLarge"))$elev

algs = qgisprocess::qgis_algorithms()
qgis_search_algorithms("curvature")
alg = "sagang:slopeaspectcurvature"
qgisprocess::qgis_show_help(alg)
qgisprocess::qgis_get_argument_specs(alg)
# terrain attributes (ta)
out_nms = paste0(tempdir(), "/", c("slope", "cplan", "cprof"),
                 ".sdat")
args = rlang::set_names(out_nms, c("SLOPE", "C_PLAN", "C_PROF"))
out = qgis_run_algorithm(alg, ELEVATION = dem, METHOD = 6, 
                         UNIT_SLOPE = "[1] degree",
                         !!!args,
                         .quiet = TRUE
                         )
ta = out[names(args)] |> unlist() |> terra::rast()
names(ta) = c("slope", "cplan", "cprof")
# catchment area
qgis_search_algorithms("[Cc]atchment")
alg = "sagang:catchmentarea"
qgis_show_help(alg)
qgis_get_argument_specs(alg)
carea = qgis_run_algorithm(alg,
                           ELEVATION = dem, 
                           METHOD = 4, 
                           FLOW = file.path(tempdir(), "carea.sdat"))
# transform carea
carea = terra::rast(carea$FLOW[1])
log10_carea = log10(carea)
names(log10_carea) = "log10_carea"
# add log_carea and dem to the terrain attributes
ta = c(ta, dem, log10_carea)

E2. Extract the values from the corresponding output rasters to the lsl data frame (data("lsl", package = "spDataLarge") by adding new variables called slope, cplan, cprof, elev and log_carea.

# attach terrain attribute raster stack (in case you have skipped the previous
# exercise)
data("lsl", package = "spDataLarge")
ta = terra::rast(system.file("raster/ta.tif", package = "spDataLarge"))
lsl = select(lsl, x, y, lslpts)
# extract values to points, i.e., create predictors
lsl[, names(ta)] = terra::extract(ta, lsl[, c("x", "y")]) |>
  select(-ID)

E3. Use the derived terrain attribute rasters in combination with a GLM to make a spatial prediction map similar to that shown in Figure 12.2. Running data("study_mask", package = "spDataLarge") attaches a mask of the study area.

# attach data (in case you have skipped exercises 1) and 2)
# landslide points with terrain attributes and terrain attribute raster stack
data("lsl", "study_mask", package = "spDataLarge")
ta = terra::rast(system.file("raster/ta.tif", package = "spDataLarge"))

# fit the model
fit = glm(lslpts ~ slope + cplan + cprof + elev + log10_carea, 
          data = lsl, family = binomial())

# make the prediction
pred = terra::predict(object = ta, model = fit, type = "response")

# make the map
lsl_sf = sf::st_as_sf(lsl, coords = c("x", "y"), crs = 32717)
lsl_sf = sf::st_as_sf(lsl, coords = c("x", "y"), crs = 32717)
hs = terra::shade(ta$slope * pi / 180,
                  terra::terrain(ta$elev, v = "aspect", unit = "radians"))
rect = tmaptools::bb_poly(raster::raster(hs))
bbx = tmaptools::bb(raster::raster(hs), xlim = c(-0.00001, 1),
                    ylim = c(-0.00001, 1), relative = TRUE)

tm_shape(terra::mask(hs, study_mask), bbox = bbx) +
  tm_grid(col = "black", n.x = 1, n.y = 1, labels.inside.frame = FALSE,
          labels.rot = c(0, 90), lines = FALSE) +
    tm_raster(col.scale = tm_scale(values = gray(0:100 / 100), n = 100),
            col.legend = tm_legend_hide()) +
    # prediction raster
  tm_shape(terra::mask(pred, study_mask)) +
    tm_raster(col_alpha = 0.5,
            col.scale = tm_scale(values = "Reds", n = 6),
            col.legend = tm_legend(title = "Susceptibility")) +
    # rectangle and outer margins
  tm_shape(rect) + 
  tm_borders() +
    tm_layout(legend.position = c("left", "bottom"),
              legend.title.size = 0.9)

E4. Compute a 100-repeated 5-fold non-spatial cross-validation and spatial CV based on the GLM learner and compare the AUROC values from both resampling strategies with the help of boxplots.

Hint: You need to specify a non-spatial resampling strategy.

Another hint: You might want to solve Excercises 4 to 6 in one go with the help of mlr3::benchmark() and mlr3::benchmark_grid() (for more information, please refer to https://mlr3book.mlr-org.com/chapters/chapter10/advanced_technical_aspects_of_mlr3.html#sec-fallback). When doing so, keep in mind that the computation can take very long, probably several days. This, of course, depends on your system. Computation time will be shorter the more RAM and cores you have at your disposal.

# attach data (in case you have skipped exercises 1) and 2)
data("lsl", package = "spDataLarge")  # landslide points with terrain attributes

# create task
task = TaskClassifST$new(
  id = "lsl_ecuador",
  backend = mlr3::as_data_backend(lsl), target = "lslpts", positive = "TRUE",
  coordinate_names = c("x", "y"),
  coords_as_features = FALSE,
  crs = 32717
)

# construct learners (for all subsequent exercises)
# GLM
lrn_glm = lrn("classif.log_reg", predict_type = "prob")
lrn_glm$fallback = lrn("classif.featureless", predict_type = "prob")

# SVM
# construct SVM learner (using ksvm function from the kernlab package)
lrn_ksvm = lrn("classif.ksvm", predict_type = "prob", kernel = "rbfdot",
               type = "C-svc")
lrn_ksvm$fallback = lrn("classif.featureless", predict_type = "prob")

# specify nested resampling and adjust learner accordingly
# five spatially disjoint partitions
tune_level = rsmp("spcv_coords", folds = 5)
# use 50 randomly selected hyperparameters
terminator = trm("evals", n_evals = 50)
tuner = tnr("random_search")
# define the outer limits of the randomly selected hyperparameters
ps = ps(
  C = p_dbl(lower = -12, upper = 15, trafo = function(x) 2^x),
  sigma = p_dbl(lower = -15, upper = 6, trafo = function(x) 2^x)
)
at_ksvm = AutoTuner$new(
  learner = lrn_ksvm,
  resampling = tune_level,
  measure = msr("classif.auc"),
  search_space = ps,
  terminator = terminator,
  tuner = tuner
)

# QDA
lrn_qda = lrn("classif.qda", predict_type = "prob")
lrn_qda$fallback = lrn("classif.featureless", predict_type = "prob")

# SVM without tuning hyperparameters
vals = lrn_ksvm$param_set$values
lrn_ksvm_notune = lrn_ksvm$clone()
lrn_ksvm_notune$param_set$values = c(vals, C = 1, sigma = 1)

# define resampling strategies
# specify the reampling method, i.e. spatial CV with 100 repetitions and 5 folds
# -> in each repetition dataset will be splitted into five folds
# method: repeated_spcv_coords -> spatial partioning
rsmp_sp = rsmp("repeated_spcv_coords", folds = 5, repeats = 100)
# method: repeated_cv -> non-spatial partitioning
rsmp_nsp = rsmp("repeated_cv", folds = 5, repeats = 100)

# (spatial) cross-validataion
#****************************
# create your design
grid = benchmark_grid(tasks = task, 
                      learners = list(lrn_glm, at_ksvm, lrn_qda, 
                                      lrn_ksvm_notune),
                      resamplings = list(rsmp_sp, rsmp_nsp))
# execute the cross-validation
library(future)
# execute the outer loop sequentially and parallelize the inner loop
future::plan(list("sequential", "multisession"), 
             workers = floor(availableCores() / 2))
set.seed(021522)
bmr = benchmark(grid, 
                store_backends = FALSE, 
                store_models = FALSE, 
                encapsulate = "evaluate")
# stop parallelization
future:::ClusterRegistry("stop")
# save your result, e.g., to 
# saveRDS(bmr, file = "extdata/12-bmr.rds")

# plot your result
autoplot(bmr, measure = msr("classif.auc"))

E5. Model landslide susceptibility using a quadratic discriminant analysis (QDA). Assess the predictive performance of the QDA. What is the a difference between the spatially cross-validated mean AUROC value of the QDA and the GLM?

# attach data (in case you have skipped exercise 4)
bmr = readRDS("extdata/12-bmr.rds")

# plot your result
autoplot(bmr, measure = msr("classif.auc"))
# QDA has higher AUROC values on average than GLM which indicates moderately
# non-linear boundaries

E6. Run the SVM without tuning the hyperparameters. Use the rbfdot kernel with $\sigma$ = 1 and C = 1. Leaving the hyperparameters unspecified in kernlab’s ksvm() would otherwise initialize an automatic non-spatial hyperparameter tuning.

# attach data (in case you have skipped exercise 4)
bmr = readRDS("extdata/12-bmr.rds")
# plot your result
autoplot(bmr, measure = msr("classif.auc"))

11 Scripts, algorithms and functions

13 Transportation

Geocomputation with R: Solutions

12 Statistical learning

Solutions to the Second Edition's exercises