Deep Learning and Computer Vision in R: A Practical Introduction (2.3)

BIBC2025 workshop - Hyperparameter tuning

Patrick Li

RSFAS, ANU

Content summary

  • Overview of computer vision (CV)
  • reticulate basics
  • Image classification
  • > Hyperparameter tuning
  • CV model interpretation
  • Object detection
  • Image segmentation

Hyperparameter tuning

Hyperparameter tuning

Hyperparameter tuning in CNNs is crucial: with the wrong choices, such as a network that is too small or a learning rate that destabilizes training, the model may not just perform poorly, it may fail to learn anything at all.

CNNs involve many hyperparameters, including:

  • Topology: number of layers, filters, layout, and units in dense layers
  • Regularization: dropout, L1/L2 penalties, batch normalization, layer normalization
  • Training setup: optimizer, learning rate and schedules, batch size, epochs

keras_tuner

keras_tuner is a hyperparameter tuning library built for keras.

It offers:

  • Specification of various types of hyperparameters
  • Multiple algorithms for hyperparameter tuning
  • Pre-defined search spaces for specific model families, such as ResNet.

Hyperparameters

Boolean

keras_tuner <- import("keras_tuner", 
                      convert = FALSE)
hp <- keras_tuner$HyperParameters()
hp$Boolean(name = "bool")
False

Choice

hp$Choice(name = "class", 
          values = c("a", "b"))
'a'

Float

hp$Float(name = "f", 
         min_value = 0, 
         max_value = 1, 
         step = 0.1)
0.0

Integer

hp$Int(name = "i", 
       min_value = 0, 
       max_value = 10, 
       step = 2)
0

Hyperparameters

All of the defined hyperparameters are recorded in the hyperparameter search space.

hp$space
[Boolean(name: "bool", default: False), Choice(name: 'class', values: ['a', 'b'], ordered: False, default: a), Float(name: 'f', min_value: '0.0', max_value: '1.0', step: '0.1', sampling: 'linear', default: '0.0'), Int(name: 'i', min_value: 0, max_value: 10, step: 2, sampling: linear, default: 0)]
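
The recorded values can also be read back from the hp object; a minimal sketch using the objects defined above (hp$get() and hp$values are standard HyperParameters accessors):

# Current value of a single hyperparameter
hp$get("f")
# All current values as a Python dict (convert = FALSE)
hp$values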

Model building with hyperparameters

Consider a simple CNN model, where we want to tune the number of filters:

input <- keras$layers$Input(tuple(32L, 32L, 3L))
x <- keras$layers$Conv2D(16L, tuple(3L, 3L), padding = "same", activation = "relu")(input)
x <- keras$layers$MaxPool2D(tuple(2L, 2L))(x)
x <- keras$layers$GlobalAveragePooling2D()(x)
output <- keras$layers$Dense(1L, activation = "sigmoid")(x)
model <- keras$Model(input, output)
model$compile(optimizer = "sgd", 
              loss = "binary_crossentropy", 
              metrics = list("accuracy"))

Model building with hyperparameters

We can use an Int hyperparameter, filters, as a placeholder for the filters argument.

build_model <- function(hp) {
  filters <- hp$Int("filters", min_value = 8L, max_value = 64L)
  
  input <- keras$layers$Input(tuple(32L, 32L, 3L))
  x <- keras$layers$Conv2D(filters, tuple(3L, 3L), padding = "same", activation = "relu")(input)
  x <- keras$layers$MaxPool2D(tuple(2L, 2L))(x)
  x <- keras$layers$GlobalAveragePooling2D()(x)
  output <- keras$layers$Dense(1L, activation = "sigmoid")(x)
  model <- keras$Model(input, output)
  model$compile(optimizer = "sgd", 
                loss = "binary_crossentropy",
                metrics = list("accuracy"))
  return(model)
}
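
As a quick sanity check (a sketch, not part of the tuning workflow itself), the builder can be called with a fresh HyperParameters object; each hyperparameter then takes its default value, which for an Int is the minimum unless a default is given:

# Build once with the default hyperparameter values (filters = 8 here)
model <- build_model(keras_tuner$HyperParameters())
model$summary()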

Model building with hyperparameters

Similarly, we can tune the learning rate using a Float hyperparameter.

build_model <- function(hp) {
  filters <- hp$Int("filters", min_value = 8L, max_value = 64L)
  lr <- hp$Float("lr", min_value = 1e-5, max_value = 1e-2, 
                 step = 2L, sampling = "log")
  
  input <- keras$layers$Input(tuple(32L, 32L, 3L))
  x <- keras$layers$Conv2D(filters, tuple(3L, 3L), padding = "same", activation = "relu")(input)
  x <- keras$layers$MaxPool2D(tuple(2L, 2L))(x)
  x <- keras$layers$GlobalAveragePooling2D()(x)
  output <- keras$layers$Dense(1L, activation = "sigmoid")(x)
  model <- keras$Model(input, output)
  model$compile(optimizer = keras$optimizers$SGD(learning_rate = lr), 
                loss = "binary_crossentropy",
                metrics = list("accuracy"))
  return(model)
}
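
With sampling = "log" and step = 2L, the candidate learning rates are multiplied by 2 at each step rather than incremented; a quick sketch of the implied grid (an illustration, consistent with the tuned values reported later):

# Candidate learning rates between 1e-5 and 1e-2 on a log grid with factor 2
1e-5 * 2^(0:9)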

Model building with hyperparameters

Branching between alternative architectures can be done via a Choice or Boolean hyperparameter.

build_model <- function(hp) {
  filters <- hp$Int("filters", min_value = 8L, max_value = 64L)
  lr <- hp$Float("lr", min_value = 1e-5, max_value = 1e-2, 
                 step = 2L, sampling = "log")
  gp <- hp$Choice("gp", values = c("max", "ave"))
  
  input <- keras$layers$Input(tuple(32L, 32L, 3L))
  x <- keras$layers$Conv2D(filters, tuple(3L, 3L), padding = "same", activation = "relu")(input)
  x <- keras$layers$MaxPool2D(tuple(2L, 2L))(x)
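  # gp is a Python string object (convert = FALSE), so convert it to R before comparing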
  if (py_to_r(gp) == "ave") {
    x <- keras$layers$GlobalAveragePooling2D()(x)
  } else {
    x <- keras$layers$GlobalMaxPool2D()(x)
  }
  output <- keras$layers$Dense(1L, activation = "sigmoid")(x)
  model <- keras$Model(input, output)
  model$compile(optimizer = keras$optimizers$SGD(learning_rate = lr), 
                loss = "binary_crossentropy",
                metrics = list("accuracy"))
  return(model)
}

Model building with hyperparameters

Combined with a for loop, an Int hyperparameter can be used to stack a variable number of convolutional blocks.

build_model <- function(hp) {
  filters <- hp$Int("filters", min_value = 8L, max_value = 64L)
  lr <- hp$Float("lr", min_value = 1e-5, max_value = 1e-2, 
                 step = 2L, sampling = "log")
  gp <- hp$Choice("gp", values = c("max", "ave"))
  blocks <- hp$Int("blocks", min_value = 1L, max_value = 4L)
  
  input <- keras$layers$Input(tuple(32L, 32L, 3L))
  x <- input
  py_for(i ~ py_builtins$range(blocks), {
    x <- keras$layers$Conv2D(filters, tuple(3L, 3L), padding = "same", activation = "relu")(x)
    x <- keras$layers$MaxPool2D(tuple(2L, 2L))(x)
  })
  if (py_to_r(gp) == "ave") {
    x <- keras$layers$GlobalAveragePooling2D()(x)
  } else {
    x <- keras$layers$GlobalMaxPool2D()(x)
  }
  output <- keras$layers$Dense(1L, activation = "sigmoid")(x)
  model <- keras$Model(input, output)
  model$compile(optimizer = keras$optimizers$SGD(learning_rate = lr), 
                loss = "binary_crossentropy",
                metrics = list("accuracy"))
  return(model)
}
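
The py_for() and py_builtins helpers are assumed to come from the earlier reticulate material; the same stacking can also be written as a plain R loop by converting blocks first. A drop-in replacement for the py_for() call, as a sketch:

# Equivalent plain R loop: blocks is a Python integer under convert = FALSE
for (i in seq_len(py_to_r(blocks))) {
  x <- keras$layers$Conv2D(filters, tuple(3L, 3L), padding = "same", activation = "relu")(x)
  x <- keras$layers$MaxPool2D(tuple(2L, 2L))(x)
}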

Search strategies

  • Random search (keras_tuner$RandomSearch)
    • Randomly samples hyperparameter combinations and often finds good results faster than exhaustive methods (see the sketch after this list).
  • Grid search (keras_tuner$GridSearch)
    • Systematically tests all possible hyperparameter combinations in a predefined grid.
  • Bayesian optimization (keras_tuner$BayesianOptimization)
    • Uses Bayesian inference to build a probabilistic model of performance and selects new hyperparameters that maximize the expected improvement based on prior evaluations.
  • Hyperband (keras_tuner$Hyperband)
    • Allocates resources dynamically by early-stopping poor configurations and focusing on promising ones.
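
Swapping strategies only changes the tuner class; for example, a random-search tuner can be configured with the same builder. A minimal sketch mirroring the Bayesian configuration below (the project_name here is illustrative):

tuner_rs <- keras_tuner$RandomSearch(
  hypermodel = build_model,
  objective = "val_accuracy",
  max_trials = 10L,
  executions_per_trial = 1L,
  overwrite = FALSE,
  directory = "keras_tuner",
  project_name = "cat_and_dog_random"
)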

Configure the tuner

tuner <- keras_tuner$BayesianOptimization(
  hypermodel = build_model,
  objective = "val_accuracy",
  max_trials = 10L,
  num_initial_points = 5L,
  executions_per_trial = 1L,
  overwrite = FALSE,
  directory = "keras_tuner",
  project_name = "cat_and_dog_bayesian"
)

tuner$search_space_summary()
Reloading Tuner from keras_tuner/cat_and_dog_bayesian/tuner0.json
Search space summary
Default search space size: 4
filters (Int)
{'default': None, 'conditions': [], 'min_value': 8, 'max_value': 64, 'step': 1, 'sampling': 'linear'}
lr (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 0.01, 'step': 2, 'sampling': 'log'}
gp (Choice)
{'default': 'max', 'conditions': [], 'values': ['max', 'ave'], 'ordered': False}
blocks (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 4, 'step': 1, 'sampling': 'linear'}

Tune model training

To tune the model training process, we need to subclass HyperModel.

HYPER_MODEL <- py_class("HYPER_MODEL", inherit = keras_tuner$HyperModel,
                        build = function(self, hp) {
                          return(build_model(hp))
                        },
                        
                        fit = function(self, hp, model, ...) {
                          epochs <- hp$Int("epochs", 
                                           min_value = 10L, 
                                           max_value = 100L, 
                                           step = 10L)
                          model$fit(epochs = epochs, ...)
                        })
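
Other arguments of fit() can be tuned in the same way. As an illustration (the batch_size hyperparameter below is not part of the workshop example), the fit method could also draw the batch size from a Choice:

fit = function(self, hp, model, ...) {
  epochs <- hp$Int("epochs", min_value = 10L, max_value = 100L, step = 10L)
  # Illustrative extra hyperparameter: tune the batch size as well
  batch_size <- hp$Choice("batch_size", values = c(16L, 32L, 64L, 128L))
  model$fit(epochs = epochs, batch_size = batch_size, ...)
}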

Tune model training

tuner <- keras_tuner$BayesianOptimization(
  hypermodel = HYPER_MODEL(),
  objective = "val_accuracy",
  max_trials = 10L,
  num_initial_points = 5L,
  executions_per_trial = 1L,
  overwrite = FALSE,
  directory = "keras_tuner",
  project_name = "cat_and_dog_bayesian_fit"
)

tuner$search_space_summary()
Reloading Tuner from keras_tuner/cat_and_dog_bayesian_fit/tuner0.json
Search space summary
Default search space size: 5
filters (Int)
{'default': None, 'conditions': [], 'min_value': 8, 'max_value': 64, 'step': 1, 'sampling': 'linear'}
lr (Float)
{'default': 1e-05, 'conditions': [], 'min_value': 1e-05, 'max_value': 0.01, 'step': 2, 'sampling': 'log'}
gp (Choice)
{'default': 'max', 'conditions': [], 'values': ['max', 'ave'], 'ordered': False}
blocks (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 4, 'step': 1, 'sampling': 'linear'}
epochs (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 100, 'step': 10, 'sampling': 'linear'}

Start the search

tuner$search(x, y, validation_split = 0.2)
tuner$results_summary()
Results summary
Results in keras_tuner/cat_and_dog_bayesian_fit
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 07 summary
Hyperparameters:
filters: 50
lr: 0.00512
gp: max
blocks: 1
epochs: 100
Score: 0.6940000057220459

Trial 05 summary
Hyperparameters:
filters: 32
lr: 0.00512
gp: max
blocks: 1
epochs: 70
Score: 0.6909999847412109

Trial 09 summary
Hyperparameters:
filters: 64
lr: 0.00512
gp: max
blocks: 1
epochs: 60
Score: 0.6884999871253967

Trial 01 summary
Hyperparameters:
filters: 25
lr: 0.00512
gp: max
blocks: 1
epochs: 90
Score: 0.6884999871253967

Trial 08 summary
Hyperparameters:
filters: 39
lr: 0.00512
gp: max
blocks: 3
epochs: 100
Score: 0.6759999990463257

Trial 04 summary
Hyperparameters:
filters: 56
lr: 0.00128
gp: max
blocks: 2
epochs: 20
Score: 0.6464999914169312

Trial 06 summary
Hyperparameters:
filters: 8
lr: 0.00512
gp: max
blocks: 1
epochs: 30
Score: 0.6230000257492065

Trial 03 summary
Hyperparameters:
filters: 52
lr: 0.00032
gp: max
blocks: 3
epochs: 40
Score: 0.6175000071525574

Trial 00 summary
Hyperparameters:
filters: 61
lr: 0.00256
gp: ave
blocks: 2
epochs: 10
Score: 0.5684999823570251

Trial 02 summary
Hyperparameters:
filters: 55
lr: 0.00016
gp: ave
blocks: 1
epochs: 50
Score: 0.5460000038146973

Get the best model

best_model <- tuner$get_best_models()[0]
best_model$summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 32, 32, 3)      │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d (Conv2D)                 │ (None, 32, 32, 50)     │         1,400 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d (MaxPooling2D)    │ (None, 16, 16, 50)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ global_max_pooling2d            │ (None, 50)             │             0 │
│ (GlobalMaxPooling2D)            │                        │               │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 1)              │            51 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 1,451 (5.67 KB)
 Trainable params: 1,451 (5.67 KB)
 Non-trainable params: 0 (0.00 B)

Retrain the best model

After selecting the optimal hyperparameters, we typically retrain the model on the full training set.

This differs from the hyperparameter search stage, where a portion of the training data is usually held out for validation.

best_hp <- tuner$get_best_hyperparameters()[0]
model <- build_model(best_hp)
model$fit(x, y, epochs = best_hp$values$epochs,
          validation_data = list(x_test, y_test))