In this post, we will walk through the process of creating a deep segmentation model using the PyTorch library SMP (Segmentation Models PyTorch). To verify that the modeling pipeline works and that the parameters are set up properly (as an initial experiment), we will also train the model on a small subset of the cloud satellite data from the Kaggle competition "Understanding Clouds from Satellite Images".
Highlights:
- Load the cloud image dataset and build a small training/validation split (50 images) for a quick sanity check.
- Wrap the data in a CloudDataset and PyTorch DataLoaders.
- Build a UNet with a pretrained resnet18 encoder using SMP.
- Train with a combined BCE + Dice loss and monitor the Dice score on the validation set.
First, let's import the necessary libraries and set up the environment.
# Quick check that the Colab runtime has a GPU attached:
import tensorflow as tf
tf.test.gpu_device_name()
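The TensorFlow call above is just a convenient way to confirm that the Colab runtime has a GPU. Since the rest of the post uses PyTorch, an equivalent check (an alternative, not part of the original notebook) would be:
import torch
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU available")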
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
PATH = '/content/drive/My Drive/kaggle_cloud/data'
%cd /content/drive/My\ Drive/kaggle_cloud/src_cloudflower2/CloudFlower2
# Pull the latest code from the repository:
!git pull
# The necessary libraries and their version numbers are specified in requirements.txt:
!pip install -r requirements.txt
import cv2
import matplotlib.pyplot as plt
import albumentations as albu
# Note: in newer versions of albumentations this module lives in albumentations.pytorch instead.
from albumentations import torch as AT
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch import nn
################################################################
# Set a seed for the random number generator (so the results can be reproduced)
################################################################
torch.manual_seed(0)
np.random.seed(0)
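If training runs on a GPU, the CUDA random number generators and cuDNN behaviour can also be pinned down. This is an optional addition, not part of the original setup, and full determinism can slow training slightly:
# Optional: stricter reproducibility on GPU (may be slightly slower).
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False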
In this part, we will load the cloud image dataset and take a quick look at it. To verify that the network architecture is defined correctly and that the optimization is working properly, we will use only fifty images for training and validation.
from utils.dataset_helper import read_train_df, split_image_dataset
PATH = '/content/drive/My Drive/kaggle_cloud/data'
csvfile = f'{PATH}/train.csv'
data_df = read_train_df(csvfile)
print(data_df.shape)
data_df.head()
FOLDER = 'train_images'
sample_file = 'fStafea4f4.jpg'
CLOUD_LABELS = ['Fish', 'Flower', 'Gravel', 'Sugar']
torch.cuda.empty_cache()
################################################################
# Set up the data size in the experiment using max_n_images
################################################################
# Since we are only testing that the model is implemented correctly, we use
# a small subset of the data.
df_train, df_valid = split_image_dataset(data_df, train_ratio=0.75, max_n_images=50)
print("#rows for the available dataset:", data_df.shape)
print("#rows for the training data: ", df_train.shape)
print("#rows for the validation data: ", df_valid.shape)
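The helper split_image_dataset comes from utils.dataset_helper and is not shown in this post. A rough sketch of what such a helper might do is given below; the column name 'im_id' is an assumption for illustration, and the real helper may differ:
def split_image_dataset_sketch(df, train_ratio=0.75, max_n_images=None, im_col='im_id'):
    # Split at the image level so that all label rows of one image stay on the same side.
    image_ids = df[im_col].unique()
    rng = np.random.RandomState(0)
    rng.shuffle(image_ids)
    if max_n_images is not None:
        image_ids = image_ids[:max_n_images]    # cap the experiment size, e.g. 50 images
    n_train = int(len(image_ids) * train_ratio)
    train_ids, valid_ids = image_ids[:n_train], image_ids[n_train:]
    df_train = df[df[im_col].isin(train_ids)].reset_index(drop=True)
    df_valid = df[df[im_col].isin(valid_ids)].reset_index(drop=True)
    return df_train, df_valid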
In this part, we will set up a UNet model using Segmentation Models PyTorch (smp). For detailed steps, please refer to: https://github.com/qubvel/segmentation_models.pytorch#start.
For the UNet model, we provide two dataset abstractions: (1) CloudDataset, which pairs images with their masks; (2) DataLoader, which supports batching and shuffling. Together they form the inputs for the fully convolutional model we will build in the next sections (a sketch of such a dataset class follows the note below).
A tutorial on writing a DataLoader in PyTorch is available at: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html?highlight=dataloader.
For simplicity, we scale the pixel values into the range [0, 1] using per-image min-max scaling (implemented in simple_preprocessing below).
NOTE: Image augmentation is skipped for this experiment and the augmentation function involves only a resize. No further normalization is applied.
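The CloudDataset class itself lives in utils.cloud_dataset and is imported below. To make the interface concrete, here is a minimal sketch of what such a dataset class could look like; the column name 'im_id' and the helper make_mask are assumptions for illustration, and the mask decoding is only a placeholder (the real data stores run-length-encoded masks in train.csv):
import os
import cv2
import numpy as np
from torch.utils.data import Dataset

def make_mask(df, im_id, shape=(1400, 2100), n_classes=4):
    # Placeholder only: the real helper decodes the run-length-encoded labels for
    # im_id into one binary channel per cloud class (Fish, Flower, Gravel, Sugar).
    return np.zeros((shape[0], shape[1], n_classes), dtype=np.float32)

class CloudDatasetSketch(Dataset):
    """One item = (image, 4-channel mask) for a single satellite image."""

    def __init__(self, df, image_folder, transforms=None, preprocessing=None):
        self.df = df
        self.image_folder = image_folder
        self.image_ids = df['im_id'].unique()   # 'im_id' is an assumed column name
        self.transforms = transforms
        self.preprocessing = preprocessing

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        im_id = self.image_ids[idx]
        image = cv2.imread(os.path.join(self.image_folder, im_id))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        mask = make_mask(self.df, im_id)
        if self.transforms is not None:          # e.g. a resize, as in this experiment
            augmented = self.transforms(image=image, mask=mask)
            image, mask = augmented['image'], augmented['mask']
        if self.preprocessing is not None:       # e.g. simple_preprocessing below
            preprocessed = self.preprocessing(image, mask)
            image, mask = preprocessed['image'], preprocessed['mask']
        return image, mask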
from utils.cloud_dataset import CloudDataset
def simple_preprocessing(image, mask):
    # Re-scale the image into the range [0, 1] with per-image min-max scaling,
    # then move both image and mask to channel-first float32 arrays.
    gv_max, gv_min = np.max(image), np.min(image)
    image = (image - gv_min) / (gv_max - gv_min)
    image = image.transpose(2, 0, 1).astype('float32')
    mask = mask.transpose(2, 0, 1).astype('float32')
    preprocessed = {'image': image, 'mask': mask}
    return preprocessed
def test_simple_preprocessing():
    image = 100. * np.random.rand(4, 4, 3)
    mask = np.ones((4, 4, 1))
    rst = simple_preprocessing(image, mask)
    print(np.max(rst['image']))
    print(np.min(rst['image']))
    # assert(-1e-6 < np.max(rst['image']) - 1.0 < 1e-6)
test_simple_preprocessing()
import segmentation_models_pytorch as smp
from utils.dataset_helper import get_training_augmentation
from utils.dataset_helper import get_validation_augmentation
from utils.dataset_helper import get_preprocessing
# ENCODER = 'resnet50'
# ENCODER = 'inceptionresnetv2'
ENCODER = 'resnet18'
ENCODER_WEIGHTS = 'imagenet'
# Encoder-specific input normalization for the pretrained weights (not applied in
# this experiment; we use simple_preprocessing instead):
preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS)
image_folder = f'{PATH}/{FOLDER}'
# get_validation_augmentation() only has a resize transformation:
train_dataset = CloudDataset(df_train, image_folder,
                             transforms=get_validation_augmentation(),
                             preprocessing=simple_preprocessing)
valid_dataset = CloudDataset(df_valid, image_folder,
                             transforms=get_validation_augmentation(),
                             preprocessing=simple_preprocessing)
#####################################
# Data:
#####################################
num_workers = 0 # number of subprocesses to load the data
bs = 16 # batch_size
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True,
                          num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=True,
                          num_workers=num_workers)
loaders = {'train': train_loader,
           'valid': valid_loader}
sample_ind = 0
# The original data sample:
data_ori = train_dataset.get_data_by_index(sample_ind)
# Preprocessed data:
data_pp = train_dataset[sample_ind]
assert(data_ori[0].ndim==3 and data_ori[0].shape[0]==3)
assert(data_pp[0].ndim==3 and data_pp[0].shape[0]==3)
from utils.dataset_helper import viz_image_mask_arrays
viz_image_mask_arrays(data_ori[0], data_ori[1])
viz_image_mask_arrays(data_pp[0], data_pp[1])
Since we only applied a resize to the original image, we don't observe any change other than the size (now 320 x 640). We can also verify that the image and the mask are resized consistently.
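As a quick extra check (not in the original notebook), the preprocessed image and mask should share the same spatial size:
# Both arrays are channel-first after simple_preprocessing: image (3, H, W), mask (4, H, W).
assert data_pp[0].shape[1:] == data_pp[1].shape[1:]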
print("The size of the training dataset:", len(train_dataset))
print("The shape of input image data: ", data_pp[0].shape)
print("The shape of the input labels:", data_pp[1].shape)
print()
print("The max/min of the original image:", np.max(data_ori[0]), np.min(data_ori[0]))
print("The max/min of the preprocessed image:", np.max(data_pp[0]), np.min(data_pp[0]))
print("The mean/std of the preprocessed image:", np.mean(data_pp[0]), np.std(data_pp[0]))
#####################################
# UNET model using smp:
# Reference: https://github.com/qubvel/segmentation_models.pytorch#models
#####################################
# NOTE: The activation is applied inside the loss function (see BCEDiceLoss below),
# so there is no need to repeat it here.
ACTIVATION = None
# ACTIVATION = 'sigmoid'
model = smp.Unet(
    encoder_name=ENCODER,
    encoder_weights=ENCODER_WEIGHTS,
    classes=4,
    activation=ACTIVATION,
)
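As a quick sanity check (not in the original notebook), the model should map a 3-channel input to a 4-channel mask with the same spatial size; for example, for the 320 x 640 size used after resizing:
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 320, 640))
print(out.shape)  # expected: torch.Size([1, 4, 320, 640])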
from loss.dice_loss import BCEDiceLoss, DiceLoss, dice_no_threshold
criterion = BCEDiceLoss(activation='sigmoid')
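BCEDiceLoss and dice_no_threshold are imported from loss.dice_loss and are not shown in this post. As a rough reference, a combined BCE + Dice loss in the same spirit can be sketched as follows; this is an illustrative sketch assuming the model outputs raw logits and activation='sigmoid' is applied inside the loss, not the exact implementation used here:
import torch
from torch import nn

def soft_dice(probs, targets, eps=1e-7):
    # Soft Dice coefficient over all pixels/channels, with no thresholding
    # (the same idea as a metric like dice_no_threshold).
    intersection = torch.sum(probs * targets)
    return (2.0 * intersection + eps) / (torch.sum(probs) + torch.sum(targets) + eps)

class BCEDiceLossSketch(nn.Module):
    def __init__(self, activation='sigmoid', bce_weight=1.0, dice_weight=1.0):
        super().__init__()
        assert activation == 'sigmoid', "this sketch only covers the sigmoid case"
        self.bce = nn.BCEWithLogitsLoss()   # applies the sigmoid to the logits internally
        self.bce_weight = bce_weight
        self.dice_weight = dice_weight

    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        dice_loss = 1.0 - soft_dice(probs, targets)
        return self.bce_weight * self.bce(logits, targets) + self.dice_weight * dice_loss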
#####################################
# Optimizer settings:
#####################################
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
optimizer = torch.optim.Adam([
    {'params': model.decoder.parameters(), 'lr': 1e-2},
    {'params': model.encoder.parameters(), 'lr': 1e-3},
])
# Other values of learning rates which have been tested:
# (1e-3, 1e-4), (5e-3, 5e-4), (5e-2, 1e-2)
# opt_level = 'O1'
# model.cuda()
# model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
# Use a scheduler that reduces the learning rate when the monitored metric plateaus:
scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
from tqdm.auto import tqdm as tq
# for i, p in enumerate(model.encoder.parameters()):
# print(p.data.max(), p.data.min())
print("BEFORE training:")
params = list(model.encoder.parameters())
print("#parameter groups:", len(params))
print("The max/min of the params[0]:", params[0].data.max(), params[0].data.min())
print("The mean/std of the params[0]:", params[0].data.mean(), params[0].data.std())
# No backward pass has run yet, so no gradient exists (expect True):
print(params[0].grad is None)
torch.cuda.empty_cache()
from models.runner import Runner
from datetime import datetime
rst_path = "/content/drive/My Drive/kaggle_cloud/run_20200617"
cld_runner = Runner(model, criterion)
train_rst = cld_runner.train(train_loader, valid_loader,
                             optimizer, scheduler,
                             valid_score_fn=dice_no_threshold,
                             n_epochs=50, train_on_gpu=True, verbose=True, rst_path=rst_path)
train_loss_list, valid_loss_list, dice_score_list = train_rst[0], train_rst[1], train_rst[2]
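The Runner class (in models.runner) encapsulates the epoch loop and is not shown in this post. Below is a minimal sketch of what such a train method typically does, under assumptions about its interface that match the call above; the checkpoint file name and the use of a sigmoid before the Dice score are illustrative details, not the actual implementation:
import torch

class RunnerSketch:
    def __init__(self, model, criterion):
        self.model = model
        self.criterion = criterion

    def train(self, train_loader, valid_loader, optimizer, scheduler, valid_score_fn,
              n_epochs=50, train_on_gpu=True, verbose=True, rst_path=None):
        device = torch.device('cuda' if train_on_gpu and torch.cuda.is_available() else 'cpu')
        self.model.to(device)
        train_losses, valid_losses, dice_scores = [], [], []
        best_valid_loss = float('inf')
        for epoch in range(n_epochs):
            # Training phase: one pass over the training loader.
            self.model.train()
            running = 0.0
            for images, masks in train_loader:
                images, masks = images.to(device), masks.to(device)
                optimizer.zero_grad()
                loss = self.criterion(self.model(images), masks)
                loss.backward()
                optimizer.step()
                running += loss.item() * images.size(0)
            train_losses.append(running / len(train_loader.dataset))
            # Validation phase: loss and Dice score, no gradient tracking.
            self.model.eval()
            running, score = 0.0, 0.0
            with torch.no_grad():
                for images, masks in valid_loader:
                    images, masks = images.to(device), masks.to(device)
                    logits = self.model(images)
                    running += self.criterion(logits, masks).item() * images.size(0)
                    score += float(valid_score_fn(torch.sigmoid(logits), masks)) * images.size(0)
            valid_losses.append(running / len(valid_loader.dataset))
            dice_scores.append(score / len(valid_loader.dataset))
            # ReduceLROnPlateau is stepped with the monitored quantity (here the validation loss).
            scheduler.step(valid_losses[-1])
            # Keep the best weights so far ("best_model.pth" is an illustrative file name).
            if rst_path is not None and valid_losses[-1] < best_valid_loss:
                best_valid_loss = valid_losses[-1]
                torch.save(self.model.state_dict(), f"{rst_path}/best_model.pth")
            if verbose:
                print(f"epoch {epoch + 1}/{n_epochs}: "
                      f"train loss {train_losses[-1]:.4f}, "
                      f"valid loss {valid_losses[-1]:.4f}, dice {dice_scores[-1]:.4f}")
        return train_losses, valid_losses, dice_scores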
params = list(model.encoder.parameters())
print("AFTER training:")
print("")
print("#parameter groups:", len(params))
print("The max/min of the params[0]:", params[0].data.max(), params[0].data.min())
print("The mean/std of the params[0]:", params[0].data.mean(), params[0].data.std())
# After training, gradients have been populated by backpropagation (expect False):
print(params[0].grad is None)
import matplotlib.pyplot as plt
plt.figure(figsize=(8,5))
plt.plot(train_loss_list, marker='o', label="Training Loss")
plt.ylabel('loss', fontsize=22)
plt.legend()
plt.show()
plt.figure(figsize=(8, 5))
# Plot the validation loss from epoch 5 onward:
plt.plot(valid_loss_list[5:], marker='o', label="Validation Loss")
plt.ylabel('loss', fontsize=22)
plt.legend()
plt.show()
plt.figure(figsize=(8,5))
plt.plot(dice_score_list)
plt.ylabel('Dice score')
plt.show()
Summary:
In this simple experiment, we have achieved the following: