Species mapping using Random Forest
Mapping species in Neukalen, Germany from 2020 to 2022 using Python based on Random Forest classifier and UAV multispectral images
Author: Viet Nguyen
Earth Observation and Geoinformation Science Lab
University of Greifswald | Institute of Geography and Geology
Email: duc.nguyen@uni-greifswald.de
Date: November 2024
🛈 Information: This project was hosted at GitHub.
import library
In [2]:
import os
import glob
from osgeo import gdal
from multiprocessing import Pool
import time
import rasterio as rio
from rasterio.plot import show
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import geopandas as gpd
import fiona
import seaborn as sns
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from joblib import dump
# set wd
os.getcwd()
os.chdir(os.getcwd().replace('\\script',''))
os.getcwd()
Out[2]:
'h:\\02_work\\202312_Greifswald\\07_Teichweide_map'
Data paths
In [3]:
# get raw raster paths (raw dem + orthomosaic)
raw_rast_paths = glob.glob('00_data/*.tif')
print('\n Raw raster paths:')
print(raw_rast_paths)
# set preprocessd raster directory (clip and aggregate)
prep_rast_dir = '01_output/01_prep_rast/'
# get preprocessed ortho paths
prep_ortho_paths = glob.glob(f'{prep_rast_dir}/*ortho*')
print(prep_ortho_paths)
# get preprocessed raster paths by year
rast_2020_paths = glob.glob(f'{prep_rast_dir}/2020*.tif')
rast_2020_paths
rast_2021_paths = glob.glob(f'{prep_rast_dir}/2021*.tif')
rast_2021_paths
rast_2022_paths = glob.glob(f'{prep_rast_dir}/2022*.tif')
rast_2022_paths
rast_paths = [rast_2020_paths, rast_2021_paths, rast_2022_paths]
rast_paths
Raw raster paths: ['00_data\\20200917_dem.tif', '00_data\\20200917_ortho.tif', '00_data\\20210907_dem.tif', '00_data\\20210907_ortho.tif', '00_data\\20220831_dem.tif', '00_data\\20220831_ortho.tif'] ['01_output/01_prep_rast\\20200917_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q3.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q3.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_med.tif.aux.xml', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q3.tif']
Out[3]:
[['01_output/01_prep_rast\\20200917_dem_clip_50cm_med.tif', '01_output/01_prep_rast\\20200917_dem_clip_50cm_q1.tif', '01_output/01_prep_rast\\20200917_dem_clip_50cm_q3.tif', '01_output/01_prep_rast\\20200917_ndvi_clip_50cm_med.tif', '01_output/01_prep_rast\\20200917_ndvi_clip_50cm_q1.tif', '01_output/01_prep_rast\\20200917_ndvi_clip_50cm_q3.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q3.tif'], ['01_output/01_prep_rast\\20210907_dem_clip_50cm_med.tif', '01_output/01_prep_rast\\20210907_dem_clip_50cm_q1.tif', '01_output/01_prep_rast\\20210907_dem_clip_50cm_q3.tif', '01_output/01_prep_rast\\20210907_ndvi_clip_50cm_med.tif', '01_output/01_prep_rast\\20210907_ndvi_clip_50cm_q1.tif', '01_output/01_prep_rast\\20210907_ndvi_clip_50cm_q3.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q3.tif'], ['01_output/01_prep_rast\\20220831_dem_clip_50cm_med.tif', '01_output/01_prep_rast\\20220831_dem_clip_50cm_q1.tif', '01_output/01_prep_rast\\20220831_dem_clip_50cm_q3.tif', '01_output/01_prep_rast\\20220831_ndvi_clip_50cm_med.tif', '01_output/01_prep_rast\\20220831_ndvi_clip_50cm_q1.tif', '01_output/01_prep_rast\\20220831_ndvi_clip_50cm_q3.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q3.tif']]
1. Raster preparation
1.1. Clip and resample drone images
In [ ]:
# Tell GDAL to throw Python exceptions, and register all drivers
gdal.UseExceptions()
gdal.AllRegister()
# list of quantiles for resampling
quantiles = ['med','q1','q3']
### function to clip and resampling raster
def clip_func(path):
name = path.split('\\')[1].split('.')[0]
print(f'\n clipping {name} ...\n')
for quantile in quantiles:
with gdal.Open(path) as source:
gdal.Warp(destNameOrDestDS = f'{prep_rast_dir}{name}_clip_50cm_{quantile}.tif',
srcDSOrSrcDSTab = source,
options=f'-t_srs EPSG:5650 -of GTiff -ot Float32\
-cutline 00_data/area_studysite.shp \
-cl area_studysite -crop_to_cutline \
-tr 0.5 0.5 -r {quantile}\
-multi -co NUM_THREADS=ALL_CPUS -wo NUM_THREADS=ALL_CPUS \
-co TILED=YES -co BIGTIFF=YES')
# -co COMPRESS=DEFLATE
print(f'\n Done {name}_clip !\n')
source = None
return None
if __name__ == "__main__":
# Use multiprocessing Pool to process rasters in parallel
start_time = time.time()
with Pool(processes = os.cpu_count()) as pool:
pool.map(clip_func, raw_rast_paths)
pool.close()
end_time = time.time()
print(f"Elapsed time: {(end_time - start_time)/60} minutes")
1.2. NDVI calculation
In [ ]:
for path in prep_ortho_paths:
# get base name
year = path.split('\\')[1].split('_')[0]
quantile = path.split('_')[7].split('.')[0]
# load red and NIR band
with rasterio.open(path) as src:
red_band = src.read(3)
nir_band = src.read(5)
# Allow division by zero
numpy.seterr(divide='ignore', invalid='ignore')
# calculte NDVI
ndvi = (nir_band.astype(float) - red_band.astype(float))/(nir_band+red_band)
# Set metadata spatial characteristics of the output raster to mirror the input
kwargs = src.meta
kwargs.update(dtype=rasterio.float32, count = 1)
kwargs
# write ndvi raster
with rasterio.open(f'{prep_rast_dir}/{year}_ndvi_clip_50cm_{quantile}.tif', 'w', **kwargs) as dst:
dst.write_band(1, ndvi.astype(rasterio.float32))
In [ ]:
# check ndvi
print(numpy.nanmin(ndvi))
print(numpy.nanmax(ndvi))
#### plot NDVI
# Set min/max values from NDVI range for image
min=numpy.nanmin(ndvi)
max=numpy.nanmax(ndvi)
# color scheme
class MidpointNormalize(colors.Normalize):
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
colors.Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return numpy.ma.masked_array(numpy.interp(value, x, y), numpy.isnan(value))
# Set our custom midpoint for most effective NDVI analysis
mid=0.1
# Setting color scheme ref:https://matplotlib.org/users/colormaps.html as a reference
colormap = plt.cm.RdYlGn
norm = MidpointNormalize(vmin=min, vmax=max, midpoint=mid)
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
# Use 'imshow' to specify the input data, colormap, min, max, and norm for the colorbar
cbar_plot = ax.imshow(ndvi, cmap=colormap, norm=norm)
# Turn off the display of axis labels
ax.axis('off')
# Set a title
ax.set_title('Normalized Difference Vegetation Index', fontsize=17, fontweight='bold')
# Configure the colorbar
cbar = fig.colorbar(cbar_plot, orientation='horizontal', shrink=0.65)
# Call 'savefig' to save this plot to an image file
# fig.savefig("ndvi-image.png", dpi=200, bbox_inches='tight', pad_inches=0.7)
# let's visualize
plt.show()
1.3. Raster stacking
Stack DEM + NDVI + orthomosaic together per year
In [ ]:
for rast_year_path in rast_paths:
year = rast_year_path[0].split('\\')[1].split('_')[0]
for i in range(len(rast_year_path)):
if i == 0:
# read dem med
with rasterio.open(rast_year_path[i]) as dem_src:
dem_med = dem_src.read(1)
if i == 1:
# read dem q1
with rasterio.open(rast_year_path[i]) as dem_src:
dem_q1 = dem_src.read(1)
if i == 2:
# read dem q3
with rasterio.open(rast_year_path[i]) as dem_src:
dem_q3 = dem_src.read(1)
if i == 3:
# read ndvi med
with rasterio.open(rast_year_path[i]) as ndvi_src:
ndvi_med = ndvi_src.read(1)
if i == 4:
# read ndvi q1
with rasterio.open(rast_year_path[i]) as ndvi_src:
ndvi_q1 = ndvi_src.read(1)
if i == 5:
# read ndvi q3
with rasterio.open(rast_year_path[i]) as ndvi_src:
ndvi_q3 = ndvi_src.read(1)
if i == 6:
# read ortho med
with rasterio.open(rast_year_path[i]) as ortho_src:
ortho_med_1 = ortho_src.read(1)
ortho_med_2 = ortho_src.read(2)
ortho_med_3 = ortho_src.read(3)
ortho_med_4 = ortho_src.read(4)
ortho_med_5 = ortho_src.read(5)
if i == 7:
# read ortho q1
with rasterio.open(rast_year_path[i]) as ortho_src:
ortho_q1_1 = ortho_src.read(1)
ortho_q1_2 = ortho_src.read(2)
ortho_q1_3 = ortho_src.read(3)
ortho_q1_4 = ortho_src.read(4)
ortho_q1_5 = ortho_src.read(5)
if i == 8:
# read ortho q3
with rasterio.open(rast_year_path[i]) as ortho_src:
ortho_q3_1 = ortho_src.read(1)
ortho_q3_2 = ortho_src.read(2)
ortho_q3_3 = ortho_src.read(3)
ortho_q3_4 = ortho_src.read(4)
ortho_q3_5 = ortho_src.read(5)
# metadata for stacked raster ouput
meta = dem_src.meta
meta.update(nodata = None, count = 21)
# write stacked raster output
with rasterio.open(f'01_output/02_stack_rast/{year}_stack.tif',mode='w',**meta) as dst:
dst.write_band(1, dem_med)
dst.write_band(2, dem_q1)
dst.write_band(3, dem_q3)
dst.write_band(4, ndvi_med)
dst.write_band(5, ndvi_q1)
dst.write_band(6, ndvi_q3)
dst.write_band(7, ortho_med_1)
dst.write_band(8, ortho_med_2)
dst.write_band(9, ortho_med_3)
dst.write_band(10, ortho_med_4)
dst.write_band(11, ortho_med_5)
dst.write_band(12, ortho_q1_1)
dst.write_band(13, ortho_q1_2)
dst.write_band(14, ortho_q1_3)
dst.write_band(15, ortho_q1_4)
dst.write_band(16, ortho_q1_5)
dst.write_band(17, ortho_q3_1)
dst.write_band(18, ortho_q3_2)
dst.write_band(19, ortho_q3_3)
dst.write_band(20, ortho_q3_4)
dst.write_band(21, ortho_q3_5)
Visualize raster
In [7]:
# orthomosaic 2020
with rio.open('01_output/02_stack_rast/20200917_stack.tif') as src:
blue = src.read(7)
green = src.read(8)
red = src.read(9)
In [8]:
def normalize(array):
"""Normalizes numpy arrays into scale 0.0 - 1.0"""
array_min, array_max = array.min(), array.max()
return ((array - array_min)/(array_max - array_min))
# Normalize the bands
redn = normalize(red)
greenn = normalize(green)
bluen = normalize(blue)
# plot RGB
rgb = np.dstack((redn, greenn, bluen))
plt.imshow(rgb)
Out[8]:
<matplotlib.image.AxesImage at 0x1efe3ec6850>
Visualize each band
In [9]:
# Initialize subplots
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1, figsize=(10, 4), sharey=True)
# Plot Red, Green and Blue (rgb)
ax1.imshow(redn, cmap='Reds')
ax2.imshow(greenn, cmap='Greens')
ax3.imshow(bluen, cmap='Blues')
# Add titles
ax1.set_title("Red")
ax2.set_title("Green")
ax3.set_title("Blue")
Out[9]:
Text(0.5, 1.0, 'Blue')
2. Feature extraction
In [160]:
# list of processing year
years = ['2020', '2021', '2022']
# get all ground truth point data paths
point_path = glob.glob('*00_data/groundTruth*')
point_path
Out[160]:
['00_data\\groundTruth.gpkg']
In [161]:
# get all stacked raster paths
stack_rast_paths = glob.glob('01_output/02_stack_rast/*.tif')
stack_rast_paths
Out[161]:
['01_output/02_stack_rast\\20200917_stack.tif', '01_output/02_stack_rast\\20210907_stack.tif', '01_output/02_stack_rast\\20220831_stack.tif']
2.1. Extract spectral for correspoinding groundtruth points
In [162]:
# dictionary to store spectral data of groundtruth points
point_spec_val_dict = {}
# dictionary to store
data_dict={}
#### loop through each stacked raster and extract spectral from ground truth points
for i in range(len(stack_rast_paths)):
# get the year being processed
year = stack_rast_paths[i].split('\\')[1].split('_')[0][0:4]
print(f'\n \n Processing the year of: {year} ..... \n ----------------------------------------- \n')
### load correspoinding ground truth data by year
point_gdf = gpd.read_file(filename = point_path[0], layer=years[i])
# converting gdf to df and removing geometry
point_df = pd.DataFrame(point_gdf.drop(columns='geometry'))
# how many bands in stack raster?
with rio.open(stack_rast_paths[i]) as img:
n_band = (img.read()).shape[0]
print(f'Bands of {year} stacked raster: ', n_band)
# create a list of band names
# band_names = []
# for n in range(n_band):
# band_names.append('band'+str(n+1))
# print('Bands names: ', band_names)
band_names = ['dem_med', 'dem_25', 'dem_75', 'ndvi_med', 'ndvi_25', 'ndvi_75', 'ortho_med_blue',
'ortho_med_green', 'ortho_med_red', 'ortho_med_RE', 'ortho_med_NIR', 'ortho_25_blue',
'ortho_25_green', 'ortho_25_red', 'ortho_25_RE', 'ortho_25_NIR', 'ortho_75_blue',
'ortho_75_green', 'ortho_75_red', 'ortho_75_RE', 'ortho_75_NIR']
#### loop through each ground-truth point and extract n_band values correspondingly
point_spec_val_dict[f'point_spec_val_{year}']= pd.DataFrame(columns = band_names)
for id, point in point_gdf.iterrows():
coords = [point['geometry'].x, point['geometry'].y]
# Read pixel value at the given coordinates using Rasterio
# NB: `sample()` returns an iterable of ndarrays.
with rio.open(stack_rast_paths[i]) as stack_rast:
value = [v for v in stack_rast.sample([coords])]
# add spectral value of current point to a df
point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],
pd.DataFrame(value, columns=band_names)],
ignore_index=True)
# merge spectral of groundtruth point with species class
data_dict[f'{year}'] = pd.merge(point_spec_val_dict[f'point_spec_val_{year}'], point_df,
left_index=True, right_index=True)
print(f'\n Spectral data of groundtruth point in {year}: \n')
print(data_dict[f'{year}'])
Processing the year of: 2020 ..... ----------------------------------------- Bands of 2020 stacked raster: 21
C:\Users\nguyend\AppData\Local\Temp\10\ipykernel_9956\320025553.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],
Spectral data of groundtruth point in 2020: dem_med dem_25 dem_75 ... ortho_75_RE ortho_75_NIR class 0 38.243900 38.228729 38.291622 ... 0.296504 0.445417 6 1 38.367538 38.322418 38.385239 ... 0.235840 0.455389 4 2 38.447495 38.429539 38.499443 ... 0.202871 0.375671 4 3 38.521294 38.507980 38.531776 ... 0.180146 0.356689 4 4 38.719654 38.705757 38.750511 ... 0.238447 0.477911 4 .. ... ... ... ... ... ... ... 166 37.709091 37.707867 37.709782 ... 0.166026 0.176763 2 167 37.965477 37.963757 38.027512 ... 0.121712 0.203327 3 168 37.968903 37.956253 37.982864 ... 0.102138 0.175629 3 169 37.998089 37.994606 38.011143 ... 0.118582 0.205304 3 170 39.061253 39.046356 39.144108 ... 0.314284 0.571336 5 [171 rows x 22 columns] Processing the year of: 2021 ..... ----------------------------------------- Bands of 2021 stacked raster: 21
C:\Users\nguyend\AppData\Local\Temp\10\ipykernel_9956\320025553.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],
Spectral data of groundtruth point in 2021: dem_med dem_25 dem_75 ... ortho_75_RE ortho_75_NIR class 0 38.703575 38.609634 38.913795 ... 0.259286 0.352564 6 1 38.592854 38.559116 38.820065 ... 0.264986 0.345769 6 2 38.701145 38.678696 38.727768 ... 0.207185 0.277126 6 3 38.788811 38.764797 38.833294 ... 0.265108 0.368318 6 4 39.562054 39.548443 39.688572 ... 0.177118 0.340976 4 .. ... ... ... ... ... ... ... 251 38.807865 38.638855 38.877888 ... 0.147262 0.230815 7 252 39.027084 39.006386 39.066467 ... 0.349206 0.605853 7 253 39.487732 39.461876 39.567738 ... 0.372454 0.632818 7 254 39.088299 39.087814 39.103333 ... 0.227871 0.388484 7 255 38.973553 38.967258 38.996727 ... 0.158475 0.253910 4 [256 rows x 22 columns] Processing the year of: 2022 ..... ----------------------------------------- Bands of 2022 stacked raster: 21
C:\Users\nguyend\AppData\Local\Temp\10\ipykernel_9956\320025553.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],
Spectral data of groundtruth point in 2022: dem_med dem_25 dem_75 ... ortho_75_RE ortho_75_NIR class 0 42.497337 42.485523 42.554829 ... 0.271389 0.330226 2 1 42.465767 42.454601 42.535225 ... 0.408643 0.509144 2 2 42.487720 42.481647 42.730988 ... 0.351485 0.450867 2 3 42.457203 42.453789 42.465508 ... 0.433127 0.530413 2 4 43.430702 43.430519 43.451508 ... 0.119938 0.217432 4 .. ... ... ... ... ... ... ... 187 42.775085 42.686943 42.820290 ... 0.251538 0.361172 7 188 0.000000 0.000000 0.000000 ... 0.000000 0.000000 2 189 43.879959 43.851959 43.939892 ... 0.117746 0.222496 4 190 42.700260 42.697182 42.818119 ... 0.189733 0.307613 7 191 43.029823 42.982452 43.054001 ... 0.211307 0.350419 7 [192 rows x 22 columns]
2.2. Split data for training and testing
In [163]:
# dictionary to store spectral data
spectral_dict = {}
# dictionary to store species class
class_dict = {}
# dictionary to store spectral training data
spectral_train_dict = {}
# dictionary to store spectral testing data
spectral_test_dict = {}
# dictionary to store species class training data
class_train_dict = {}
# dictionary to store species class testing data
class_test_dict = {}
# loop through each year data and split
for year in years:
# get spectral only
spectral_dict[year] = data_dict[year][band_names].values
# get species class only
class_dict[year] = data_dict[year]['class'].values
# split to 80% training and 20% testing
spectral_train_dict[year], spectral_test_dict[year], class_train_dict[year], class_test_dict[year] = train_test_split(spectral_dict[year], class_dict[year], test_size=0.2, stratify = class_dict[year], random_state=10)
# print data shape
print(f"\n spectral_train_{year} shape: {spectral_train_dict[year].shape} \n spectral test_{year} shape: {spectral_test_dict[year].shape} \n class_train_{year} shape: {class_train_dict[year].shape} \n class_test_{year} shape: {class_test_dict[year].shape}")
spectral_train_2020 shape: (136, 21) spectral test_2020 shape: (35, 21) class_train_2020 shape: (136,) class_test_2020 shape: (35,) spectral_train_2021 shape: (204, 21) spectral test_2021 shape: (52, 21) class_train_2021 shape: (204,) class_test_2021 shape: (52,) spectral_train_2022 shape: (153, 21) spectral test_2022 shape: (39, 21) class_train_2022 shape: (153,) class_test_2022 shape: (39,)
3. Random Forest
3.1. Train RF classifier
In [ ]:
# dictionary to store classifier
clf_dict = {}
#### loop through each year data and train RF classifier
for year in years:
clf = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None,
min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
bootstrap=True, n_jobs = os.cpu_count()-4, random_state=10,
max_samples=None, oob_score=True)
# fit model to training data
clf.fit(spectral_train_dict[year], class_train_dict[year])
# add model to model list
clf_dict[year] = clf
# save model to disk
dump(clf, f'01_output/04_model/{year}_RF_model.joblib')
# apply trained model to the test data
clf_pred = clf.predict(spectral_test_dict[year])
# feature importance scores
print(f'\nfeature importance scores {year} \n')
for band, score in zip(band_names, clf.feature_importances_):
print(f'Band {band} importance: {score}')
# plot feature importance scores
sorted_idx = np.argsort(clf.feature_importances_)
fig = plt.figure(figsize=(5, 6))
plt.barh(range(len(sorted_idx)), clf.feature_importances_[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)),np.array(band_names)[sorted_idx])
plt.title(f'Feature Importance {year}')
# out-of-bag error
print(f'\n out-of-bag error {year} is: {clf.oob_score_*100}%')
print(f'\n Accuracy {year}: {accuracy_score(class_test_dict[year], clf_pred)*100}')
print(classification_report(class_test_dict[year], clf_pred))
# Confusion matrix
cm = confusion_matrix(class_test_dict[year], clf_pred)
# print(f'\n Confusion Matrix {year}: \n',cm)
cm_percent = cm/np.sum(cm)
plt.figure(figsize=(5,5), facecolor='w', edgecolor='k')
sns.set(font_scale=1.5)
sns.heatmap(cm, # or cm_percent
xticklabels=['water','Algae','Juncus','Typha','Phragmites','Phalaris','others'],
yticklabels=['water','Algae','Juncus','Typha','Phragmites','Phalaris','others'],
cmap="YlGn",
annot=True,
# fmt='.2%',
cbar=False,
linewidths=2,
linecolor='black')
plt.title(f'RF Confusion Matrix {year} \n')
plt.xlabel('Predicted')
plt.ylabel('Measured')
plt.savefig(f'01_output/05_accuracy/{year}_confusionMatrix.png', dpi=200,bbox_inches='tight')
feature importance scores 2020 Band dem_med importance: 0.06175971519655704 Band dem_25 importance: 0.061812903310249606 Band dem_75 importance: 0.05487287292578411 Band ndvi_med importance: 0.05378750679340201 Band ndvi_25 importance: 0.057841568771800286 Band ndvi_75 importance: 0.057549705707713686 Band ortho_med_blue importance: 0.038636604319977404 Band ortho_med_green importance: 0.041308740612326235 Band ortho_med_red importance: 0.06481009205425638 Band ortho_med_RE importance: 0.024609754584435102 Band ortho_med_NIR importance: 0.04778166767288972 Band ortho_25_blue importance: 0.036739505527101785 Band ortho_25_green importance: 0.03168338637773056 Band ortho_25_red importance: 0.05181114439960789 Band ortho_25_RE importance: 0.017874585701086215 Band ortho_25_NIR importance: 0.032877016168672175 Band ortho_75_blue importance: 0.05533425265588289 Band ortho_75_green importance: 0.05344915112556545 Band ortho_75_red importance: 0.08622205127358054 Band ortho_75_RE importance: 0.031550192048615644 Band ortho_75_NIR importance: 0.03768758277276534 out-of-bag error 2020 is: 93.38235294117648% Accuracy 2020: 97.14285714285714 precision recall f1-score support 1 1.00 1.00 1.00 4 2 1.00 1.00 1.00 5 3 1.00 1.00 1.00 5 4 1.00 0.67 0.80 3 5 1.00 1.00 1.00 4 6 1.00 1.00 1.00 10 7 0.80 1.00 0.89 4 accuracy 0.97 35 macro avg 0.97 0.95 0.96 35 weighted avg 0.98 0.97 0.97 35 feature importance scores 2021 Band dem_med importance: 0.13207024057055713 Band dem_25 importance: 0.1091061975955221 Band dem_75 importance: 0.10592287888505308 Band ndvi_med importance: 0.04493456294350694 Band ndvi_25 importance: 0.04269630495760121 Band ndvi_75 importance: 0.04380672371272012 Band ortho_med_blue importance: 0.04710066738335477 Band ortho_med_green importance: 0.030127542953546093 Band ortho_med_red importance: 0.05171767803970204 Band ortho_med_RE importance: 0.02883692936803497 Band ortho_med_NIR importance: 0.027861213348664336 Band ortho_25_blue importance: 0.027456132502241774 Band ortho_25_green importance: 0.023459869010858204 Band ortho_25_red importance: 0.03453647577960472 Band ortho_25_RE importance: 0.02104401330188131 Band ortho_25_NIR importance: 0.02799166093386803 Band ortho_75_blue importance: 0.046925947025521116 Band ortho_75_green importance: 0.02720517015162966 Band ortho_75_red importance: 0.06051976396308031 Band ortho_75_RE importance: 0.03429867373262007 Band ortho_75_NIR importance: 0.03238135384043208 out-of-bag error 2021 is: 96.07843137254902% Accuracy 2021: 92.3076923076923 precision recall f1-score support 1 1.00 1.00 1.00 4 2 1.00 1.00 1.00 5 3 0.86 1.00 0.92 6 4 0.88 1.00 0.93 7 5 1.00 0.86 0.92 7 6 0.94 1.00 0.97 17 7 0.75 0.50 0.60 6 accuracy 0.92 52 macro avg 0.92 0.91 0.91 52 weighted avg 0.92 0.92 0.92 52 feature importance scores 2022 Band dem_med importance: 0.08785085458059329 Band dem_25 importance: 0.09363523032297347 Band dem_75 importance: 0.07084228211415876 Band ndvi_med importance: 0.01904105523432381 Band ndvi_25 importance: 0.017547499807743467 Band ndvi_75 importance: 0.01961063038727275 Band ortho_med_blue importance: 0.07935565889120699 Band ortho_med_green importance: 0.048389220690924974 Band ortho_med_red importance: 0.027111873810045702 Band ortho_med_RE importance: 0.03915605989157391 Band ortho_med_NIR importance: 0.028432236129968348 Band ortho_25_blue importance: 0.07865214822990128 Band ortho_25_green importance: 0.04664574130225781 Band ortho_25_red importance: 0.024784775045543772 Band ortho_25_RE importance: 0.02930079772509586 Band ortho_25_NIR importance: 0.03091248677752542 Band ortho_75_blue importance: 0.09040121841202761 Band ortho_75_green importance: 0.050856114599807996 Band ortho_75_red importance: 0.01993043881319182 Band ortho_75_RE importance: 0.05680656512469081 Band ortho_75_NIR importance: 0.04073711210917217 out-of-bag error 2022 is: 94.11764705882352% Accuracy 2022: 92.3076923076923 precision recall f1-score support 1 1.00 1.00 1.00 4 2 1.00 0.89 0.94 9 3 1.00 1.00 1.00 2 4 0.83 1.00 0.91 5 5 1.00 0.88 0.93 8 6 0.67 1.00 0.80 4 7 1.00 0.86 0.92 7 accuracy 0.92 39 macro avg 0.93 0.95 0.93 39 weighted avg 0.94 0.92 0.93 39
3.2. Predict and map
In [165]:
for i in range(len(stack_rast_paths)):
# open stack raster
with rio.open(stack_rast_paths[i]) as img:
img_arr = img.read()
# print raster shape
print(f'\nstacked raster {years[i]} shape: ')
print(f'Height: {img_arr.shape[1]}\nWidth: {img_arr.shape[2]}\nBands: {img_arr.shape[0]}\n')
# reshape array
img_n = np.moveaxis(img_arr, 0, -1)
img_n = img_n.reshape(-1, len(band_names))
print('reshaped full data shape for prediction: ',img_n.shape)
# mask the NA value
img_n[img_n == 0] = np.nan
# predict
pred_full = clf_dict[years[i]].predict(img_n)
# export predicted map
print('Prediction Done, now exporting raster... \n')
# Predefining out raster meta
temp_arr = img.read(1)
temp_arr = temp_arr.reshape(-1,1)
metadata = img.meta
height = metadata.get('height')
width = metadata.get('width')
crs = metadata.get('crs')
transform = metadata.get('transform')
# reshape prediction array
pred_reshape = pred_full.reshape(height, width)
# mask the output raster
img_mask = img_arr[1] # get dem array
pred_reshape = pred_reshape.astype(np.float16) # change to float 16 datatype to be able to hold nan
pred_reshape[np.isnan(img_mask)] = 0 # take nan where it located in dem
# write classification raster output
with rio.open(f'01_output/03_pred_rast/Neukalen_{years[i]}_RF_classified.tif', mode='w', driver='GTiff',
height=height, width=width, count=1, dtype='int8', crs=crs,
transform = transform, nodata = 0) as output:
output.write_band(1, pred_reshape)
print(f'Map {years[i]} save!')
stacked raster 2020 shape: Height: 619 Width: 940 Bands: 21 reshaped full data shape for prediction: (581860, 21) Prediction Done, now exporting raster... Map 2020 save! stacked raster 2021 shape: Height: 619 Width: 940 Bands: 21 reshaped full data shape for prediction: (581860, 21) Prediction Done, now exporting raster... Map 2021 save! stacked raster 2022 shape: Height: 619 Width: 940 Bands: 21 reshaped full data shape for prediction: (581860, 21) Prediction Done, now exporting raster... Map 2022 save!
In [ ]:
Reference
- https://github.com/waleedgeo/lulc_py/blob/main/lulc_py.ipynb
- https://github.com/ramiqcom/lc-classification-python/blob/master/classification.ipynb
- https://github.com/MasoumehVahedi/Random_Forest_LandUse_LandCover_Classification-/blob/main/Random%20Forest%20Land%20cover%20Classification.ipynb
- https://ceholden.github.io/open-geo-tutorial/python/chapter_5_classification.html
- https://menvuthy.readthedocs.io/en/latest/Content/Documentation/machine-learning/1-land_use_classification_with_random_forests.html