import os
import glob
from osgeo import gdal
from multiprocessing import Pool
import time
import rasterio as rio
from rasterio.plot import show
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import geopandas as gpd
import fiona
import seaborn as sns
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from joblib import dump

# set wd
os.getcwd()
os.chdir(os.getcwd().replace('\\script',''))
os.getcwd()

'h:\\02_work\\202312_Greifswald\\07_Teichweide_map'

# get raw raster paths (raw dem + orthomosaic)
raw_rast_paths = glob.glob('00_data/*.tif')
print('\n Raw raster paths:')
print(raw_rast_paths)

# set preprocessd raster directory (clip and aggregate)
prep_rast_dir = '01_output/01_prep_rast/'

# get preprocessed ortho paths
prep_ortho_paths = glob.glob(f'{prep_rast_dir}/*ortho*')
print(prep_ortho_paths)

# get preprocessed raster paths by year
rast_2020_paths = glob.glob(f'{prep_rast_dir}/2020*.tif')
rast_2020_paths
rast_2021_paths = glob.glob(f'{prep_rast_dir}/2021*.tif')
rast_2021_paths
rast_2022_paths = glob.glob(f'{prep_rast_dir}/2022*.tif')
rast_2022_paths
rast_paths = [rast_2020_paths, rast_2021_paths, rast_2022_paths]
rast_paths

 Raw raster paths:
['00_data\\20200917_dem.tif', '00_data\\20200917_ortho.tif', '00_data\\20210907_dem.tif', '00_data\\20210907_ortho.tif', '00_data\\20220831_dem.tif', '00_data\\20220831_ortho.tif']
['01_output/01_prep_rast\\20200917_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q3.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q3.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_med.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_med.tif.aux.xml', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q1.tif', '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q3.tif']

[['01_output/01_prep_rast\\20200917_dem_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20200917_dem_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20200917_dem_clip_50cm_q3.tif',
  '01_output/01_prep_rast\\20200917_ndvi_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20200917_ndvi_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20200917_ndvi_clip_50cm_q3.tif',
  '01_output/01_prep_rast\\20200917_ortho_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20200917_ortho_clip_50cm_q3.tif'],
 ['01_output/01_prep_rast\\20210907_dem_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20210907_dem_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20210907_dem_clip_50cm_q3.tif',
  '01_output/01_prep_rast\\20210907_ndvi_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20210907_ndvi_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20210907_ndvi_clip_50cm_q3.tif',
  '01_output/01_prep_rast\\20210907_ortho_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20210907_ortho_clip_50cm_q3.tif'],
 ['01_output/01_prep_rast\\20220831_dem_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20220831_dem_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20220831_dem_clip_50cm_q3.tif',
  '01_output/01_prep_rast\\20220831_ndvi_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20220831_ndvi_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20220831_ndvi_clip_50cm_q3.tif',
  '01_output/01_prep_rast\\20220831_ortho_clip_50cm_med.tif',
  '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q1.tif',
  '01_output/01_prep_rast\\20220831_ortho_clip_50cm_q3.tif']]

# Tell GDAL to throw Python exceptions, and register all drivers
gdal.UseExceptions()
gdal.AllRegister()

# list of quantiles for resampling
quantiles = ['med','q1','q3']

### function to clip and resampling raster
def clip_func(path):
  name = path.split('\\')[1].split('.')[0]
  print(f'\n clipping {name} ...\n')
  for quantile in quantiles:
      with gdal.Open(path) as source:
          gdal.Warp(destNameOrDestDS = f'{prep_rast_dir}{name}_clip_50cm_{quantile}.tif',
                    srcDSOrSrcDSTab = source,
                    options=f'-t_srs EPSG:5650 -of GTiff -ot Float32\
                        -cutline 00_data/area_studysite.shp \
                            -cl area_studysite -crop_to_cutline \
                                -tr 0.5 0.5 -r {quantile}\
                                    -multi -co NUM_THREADS=ALL_CPUS -wo NUM_THREADS=ALL_CPUS \
                                        -co TILED=YES -co BIGTIFF=YES') 
                                    # -co COMPRESS=DEFLATE 
                                    
  print(f'\n Done {name}_clip !\n')
  source = None
  return None

if __name__ == "__main__":
  # Use multiprocessing Pool to process rasters in parallel
  start_time = time.time()
  with Pool(processes = os.cpu_count()) as pool:
    pool.map(clip_func, raw_rast_paths)
    pool.close()
  end_time = time.time()
  print(f"Elapsed time: {(end_time - start_time)/60} minutes")

for path in prep_ortho_paths:
    # get base name
    year = path.split('\\')[1].split('_')[0]
    quantile = path.split('_')[7].split('.')[0]
    
    # load red and NIR band
    with rasterio.open(path) as src:
        red_band = src.read(3)
        nir_band = src.read(5)
        
    # Allow division by zero
    numpy.seterr(divide='ignore', invalid='ignore')
        
    # calculte NDVI
    ndvi = (nir_band.astype(float) - red_band.astype(float))/(nir_band+red_band)
    
    # Set metadata spatial characteristics of the output raster to mirror the input
    kwargs = src.meta
    kwargs.update(dtype=rasterio.float32, count = 1)
    kwargs
    
    # write ndvi raster
    with rasterio.open(f'{prep_rast_dir}/{year}_ndvi_clip_50cm_{quantile}.tif', 'w', **kwargs) as dst:
        dst.write_band(1, ndvi.astype(rasterio.float32))

# check ndvi
print(numpy.nanmin(ndvi)) 
print(numpy.nanmax(ndvi))

#### plot NDVI
# Set min/max values from NDVI range for image
min=numpy.nanmin(ndvi)
max=numpy.nanmax(ndvi)

# color scheme
class MidpointNormalize(colors.Normalize):
   
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        colors.Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
       
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return numpy.ma.masked_array(numpy.interp(value, x, y), numpy.isnan(value))
    
# Set our custom midpoint for most effective NDVI analysis
mid=0.1

# Setting color scheme ref:https://matplotlib.org/users/colormaps.html as a reference
colormap = plt.cm.RdYlGn 
norm = MidpointNormalize(vmin=min, vmax=max, midpoint=mid)
fig = plt.figure(figsize=(20,10))


ax = fig.add_subplot(111)

# Use 'imshow' to specify the input data, colormap, min, max, and norm for the colorbar
cbar_plot = ax.imshow(ndvi, cmap=colormap, norm=norm)

# Turn off the display of axis labels 
ax.axis('off')

# Set a title 
ax.set_title('Normalized Difference Vegetation Index', fontsize=17, fontweight='bold')

# Configure the colorbar
cbar = fig.colorbar(cbar_plot, orientation='horizontal', shrink=0.65)

# Call 'savefig' to save this plot to an image file
# fig.savefig("ndvi-image.png", dpi=200, bbox_inches='tight', pad_inches=0.7)

# let's visualize
plt.show()

for rast_year_path in rast_paths:
    year = rast_year_path[0].split('\\')[1].split('_')[0]
    for i in range(len(rast_year_path)):
        if i == 0:
            # read dem med
            with rasterio.open(rast_year_path[i]) as dem_src:
                dem_med = dem_src.read(1)
        if i == 1:
            # read dem q1
            with rasterio.open(rast_year_path[i]) as dem_src:
                dem_q1 = dem_src.read(1)
        if i == 2:
            # read dem q3
            with rasterio.open(rast_year_path[i]) as dem_src:
                dem_q3 = dem_src.read(1)
        if i == 3:
            # read ndvi med
            with rasterio.open(rast_year_path[i]) as ndvi_src:
                ndvi_med = ndvi_src.read(1)
        if i == 4:
            # read ndvi q1 
            with rasterio.open(rast_year_path[i]) as ndvi_src:
                ndvi_q1 = ndvi_src.read(1)
        if i == 5:
            # read ndvi q3
            with rasterio.open(rast_year_path[i]) as ndvi_src:
                ndvi_q3 = ndvi_src.read(1)
        if i == 6:
            # read ortho med
            with rasterio.open(rast_year_path[i]) as ortho_src:
                ortho_med_1 = ortho_src.read(1)
                ortho_med_2 = ortho_src.read(2)
                ortho_med_3 = ortho_src.read(3)
                ortho_med_4 = ortho_src.read(4)
                ortho_med_5 = ortho_src.read(5)
        if i == 7:
            # read ortho q1
            with rasterio.open(rast_year_path[i]) as ortho_src:
                ortho_q1_1 = ortho_src.read(1)
                ortho_q1_2 = ortho_src.read(2)
                ortho_q1_3 = ortho_src.read(3)
                ortho_q1_4 = ortho_src.read(4)
                ortho_q1_5 = ortho_src.read(5)
        if i == 8:
            # read ortho q3
            with rasterio.open(rast_year_path[i]) as ortho_src:
                ortho_q3_1 = ortho_src.read(1)
                ortho_q3_2 = ortho_src.read(2)
                ortho_q3_3 = ortho_src.read(3)
                ortho_q3_4 = ortho_src.read(4)
                ortho_q3_5 = ortho_src.read(5)
                
            # metadata for stacked raster ouput
            meta = dem_src.meta
            meta.update(nodata = None, count = 21)
            
            # write stacked raster output
            with rasterio.open(f'01_output/02_stack_rast/{year}_stack.tif',mode='w',**meta) as dst:
                dst.write_band(1, dem_med)
                dst.write_band(2, dem_q1)
                dst.write_band(3, dem_q3)
                dst.write_band(4, ndvi_med)
                dst.write_band(5, ndvi_q1)
                dst.write_band(6, ndvi_q3)
                dst.write_band(7, ortho_med_1)
                dst.write_band(8, ortho_med_2)
                dst.write_band(9, ortho_med_3)
                dst.write_band(10, ortho_med_4)
                dst.write_band(11, ortho_med_5)
                dst.write_band(12, ortho_q1_1)
                dst.write_band(13, ortho_q1_2)
                dst.write_band(14, ortho_q1_3)
                dst.write_band(15, ortho_q1_4)
                dst.write_band(16, ortho_q1_5)
                dst.write_band(17, ortho_q3_1)
                dst.write_band(18, ortho_q3_2)
                dst.write_band(19, ortho_q3_3)
                dst.write_band(20, ortho_q3_4)
                dst.write_band(21, ortho_q3_5)

# orthomosaic 2020
with rio.open('01_output/02_stack_rast/20200917_stack.tif') as src:
    blue = src.read(7)
    green = src.read(8)
    red = src.read(9)

def normalize(array):
    """Normalizes numpy arrays into scale 0.0 - 1.0"""
    array_min, array_max = array.min(), array.max()
    return ((array - array_min)/(array_max - array_min))

# Normalize the bands
redn = normalize(red)
greenn = normalize(green)
bluen = normalize(blue)

# plot RGB
rgb = np.dstack((redn, greenn, bluen))
plt.imshow(rgb)

<matplotlib.image.AxesImage at 0x1efe3ec6850>

# Initialize subplots
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1, figsize=(10, 4), sharey=True)

# Plot Red, Green and Blue (rgb)
ax1.imshow(redn, cmap='Reds')
ax2.imshow(greenn, cmap='Greens')
ax3.imshow(bluen, cmap='Blues')

# Add titles
ax1.set_title("Red")
ax2.set_title("Green")
ax3.set_title("Blue")

Text(0.5, 1.0, 'Blue')

# list of processing year
years = ['2020', '2021', '2022']
# get all ground truth point data paths
point_path = glob.glob('*00_data/groundTruth*')
point_path

['00_data\\groundTruth.gpkg']

# get all stacked raster paths
stack_rast_paths = glob.glob('01_output/02_stack_rast/*.tif')
stack_rast_paths

['01_output/02_stack_rast\\20200917_stack.tif',
 '01_output/02_stack_rast\\20210907_stack.tif',
 '01_output/02_stack_rast\\20220831_stack.tif']

# dictionary to store spectral data of groundtruth points
point_spec_val_dict = {}
# dictionary to store
data_dict={}

#### loop through each stacked raster and extract spectral from ground truth points
for i in range(len(stack_rast_paths)):
    # get the year being processed
    year = stack_rast_paths[i].split('\\')[1].split('_')[0][0:4]
    print(f'\n \n Processing the year of: {year} ..... \n ----------------------------------------- \n')
    
    ### load correspoinding ground truth data by year
    point_gdf = gpd.read_file(filename = point_path[0], layer=years[i])
    # converting gdf to df and removing geometry
    point_df = pd.DataFrame(point_gdf.drop(columns='geometry'))

    #  how many bands in stack raster?
    with rio.open(stack_rast_paths[i]) as img:
        n_band = (img.read()).shape[0]
    print(f'Bands of {year} stacked raster: ', n_band)
    
    # create a list of band names
    # band_names = []
    # for n in range(n_band):
    #     band_names.append('band'+str(n+1))
    # print('Bands names: ', band_names)
    band_names = ['dem_med', 'dem_25', 'dem_75', 'ndvi_med', 'ndvi_25', 'ndvi_75', 'ortho_med_blue', 
                  'ortho_med_green', 'ortho_med_red', 'ortho_med_RE', 'ortho_med_NIR', 'ortho_25_blue', 
                  'ortho_25_green', 'ortho_25_red', 'ortho_25_RE', 'ortho_25_NIR', 'ortho_75_blue', 
                  'ortho_75_green', 'ortho_75_red', 'ortho_75_RE', 'ortho_75_NIR']

    #### loop through each ground-truth point and extract n_band values correspondingly
    point_spec_val_dict[f'point_spec_val_{year}']= pd.DataFrame(columns = band_names)

    for id, point in point_gdf.iterrows():
        coords = [point['geometry'].x, point['geometry'].y]
        # Read pixel value at the given coordinates using Rasterio
        # NB: `sample()` returns an iterable of ndarrays.
        with rio.open(stack_rast_paths[i]) as stack_rast:
                    value = [v for v in stack_rast.sample([coords])]
        # add spectral value of current point to a df
        point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],
                                                                   pd.DataFrame(value, columns=band_names)],
                                                                  ignore_index=True)

    # merge spectral of groundtruth point with species class
    data_dict[f'{year}'] = pd.merge(point_spec_val_dict[f'point_spec_val_{year}'], point_df,
                                         left_index=True, right_index=True)
    print(f'\n Spectral data of groundtruth point in {year}: \n')
    print(data_dict[f'{year}'])

 
 Processing the year of: 2020 ..... 
 ----------------------------------------- 

Bands of 2020 stacked raster:  21

C:\Users\nguyend\AppData\Local\Temp\10\ipykernel_9956\320025553.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],

 Spectral data of groundtruth point in 2020: 

       dem_med     dem_25     dem_75  ...  ortho_75_RE  ortho_75_NIR  class
0    38.243900  38.228729  38.291622  ...     0.296504      0.445417      6
1    38.367538  38.322418  38.385239  ...     0.235840      0.455389      4
2    38.447495  38.429539  38.499443  ...     0.202871      0.375671      4
3    38.521294  38.507980  38.531776  ...     0.180146      0.356689      4
4    38.719654  38.705757  38.750511  ...     0.238447      0.477911      4
..         ...        ...        ...  ...          ...           ...    ...
166  37.709091  37.707867  37.709782  ...     0.166026      0.176763      2
167  37.965477  37.963757  38.027512  ...     0.121712      0.203327      3
168  37.968903  37.956253  37.982864  ...     0.102138      0.175629      3
169  37.998089  37.994606  38.011143  ...     0.118582      0.205304      3
170  39.061253  39.046356  39.144108  ...     0.314284      0.571336      5

[171 rows x 22 columns]

 
 Processing the year of: 2021 ..... 
 ----------------------------------------- 

Bands of 2021 stacked raster:  21

C:\Users\nguyend\AppData\Local\Temp\10\ipykernel_9956\320025553.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],

 Spectral data of groundtruth point in 2021: 

       dem_med     dem_25     dem_75  ...  ortho_75_RE  ortho_75_NIR  class
0    38.703575  38.609634  38.913795  ...     0.259286      0.352564      6
1    38.592854  38.559116  38.820065  ...     0.264986      0.345769      6
2    38.701145  38.678696  38.727768  ...     0.207185      0.277126      6
3    38.788811  38.764797  38.833294  ...     0.265108      0.368318      6
4    39.562054  39.548443  39.688572  ...     0.177118      0.340976      4
..         ...        ...        ...  ...          ...           ...    ...
251  38.807865  38.638855  38.877888  ...     0.147262      0.230815      7
252  39.027084  39.006386  39.066467  ...     0.349206      0.605853      7
253  39.487732  39.461876  39.567738  ...     0.372454      0.632818      7
254  39.088299  39.087814  39.103333  ...     0.227871      0.388484      7
255  38.973553  38.967258  38.996727  ...     0.158475      0.253910      4

[256 rows x 22 columns]

 
 Processing the year of: 2022 ..... 
 ----------------------------------------- 

Bands of 2022 stacked raster:  21

C:\Users\nguyend\AppData\Local\Temp\10\ipykernel_9956\320025553.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  point_spec_val_dict[f'point_spec_val_{year}'] = pd.concat([point_spec_val_dict[f'point_spec_val_{year}'],

 Spectral data of groundtruth point in 2022: 

       dem_med     dem_25     dem_75  ...  ortho_75_RE  ortho_75_NIR  class
0    42.497337  42.485523  42.554829  ...     0.271389      0.330226      2
1    42.465767  42.454601  42.535225  ...     0.408643      0.509144      2
2    42.487720  42.481647  42.730988  ...     0.351485      0.450867      2
3    42.457203  42.453789  42.465508  ...     0.433127      0.530413      2
4    43.430702  43.430519  43.451508  ...     0.119938      0.217432      4
..         ...        ...        ...  ...          ...           ...    ...
187  42.775085  42.686943  42.820290  ...     0.251538      0.361172      7
188   0.000000   0.000000   0.000000  ...     0.000000      0.000000      2
189  43.879959  43.851959  43.939892  ...     0.117746      0.222496      4
190  42.700260  42.697182  42.818119  ...     0.189733      0.307613      7
191  43.029823  42.982452  43.054001  ...     0.211307      0.350419      7

[192 rows x 22 columns]

# dictionary to store spectral data
spectral_dict = {}
# dictionary to store species class
class_dict = {}
# dictionary to store spectral training data
spectral_train_dict = {}
# dictionary to store spectral testing data
spectral_test_dict = {}
# dictionary to store species class training data
class_train_dict = {}
# dictionary to store species class testing data
class_test_dict = {}

# loop through each year data and split
for year in years:
    # get spectral only
    spectral_dict[year] = data_dict[year][band_names].values
    # get species class only
    class_dict[year] = data_dict[year]['class'].values
    # split to 80% training and 20% testing
    spectral_train_dict[year], spectral_test_dict[year], class_train_dict[year], class_test_dict[year] = train_test_split(spectral_dict[year], class_dict[year], test_size=0.2, stratify = class_dict[year], random_state=10)
    # print data shape
    print(f"\n spectral_train_{year} shape: {spectral_train_dict[year].shape} \n spectral test_{year} shape: {spectral_test_dict[year].shape} \n class_train_{year} shape: {class_train_dict[year].shape} \n class_test_{year} shape: {class_test_dict[year].shape}")

 spectral_train_2020 shape: (136, 21) 
 spectral test_2020 shape: (35, 21) 
 class_train_2020 shape: (136,) 
 class_test_2020 shape: (35,)

 spectral_train_2021 shape: (204, 21) 
 spectral test_2021 shape: (52, 21) 
 class_train_2021 shape: (204,) 
 class_test_2021 shape: (52,)

 spectral_train_2022 shape: (153, 21) 
 spectral test_2022 shape: (39, 21) 
 class_train_2022 shape: (153,) 
 class_test_2022 shape: (39,)

# dictionary to store classifier
clf_dict = {}

#### loop through each year data and train RF classifier
for year in years:
    clf = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1, max_features='sqrt', 
                                 bootstrap=True, n_jobs = os.cpu_count()-4, random_state=10, 
                                 max_samples=None, oob_score=True)
    # fit model to training data
    clf.fit(spectral_train_dict[year], class_train_dict[year])
    # add model to model list
    clf_dict[year] = clf
    # save model to disk
    dump(clf, f'01_output/04_model/{year}_RF_model.joblib')
    # apply trained model to the test data
    clf_pred = clf.predict(spectral_test_dict[year])

    # feature importance scores
    print(f'\nfeature importance scores {year} \n')
    for band, score in zip(band_names, clf.feature_importances_):
        print(f'Band {band} importance: {score}')

    # plot feature importance scores
    sorted_idx = np.argsort(clf.feature_importances_)
    fig = plt.figure(figsize=(5, 6))
    plt.barh(range(len(sorted_idx)), clf.feature_importances_[sorted_idx], align='center')
    plt.yticks(range(len(sorted_idx)),np.array(band_names)[sorted_idx])
    plt.title(f'Feature Importance {year}')

    # out-of-bag error
    print(f'\n out-of-bag error {year} is: {clf.oob_score_*100}%')

    print(f'\n Accuracy {year}: {accuracy_score(class_test_dict[year], clf_pred)*100}')
    print(classification_report(class_test_dict[year], clf_pred))
    
    # Confusion matrix
    cm = confusion_matrix(class_test_dict[year], clf_pred)
    # print(f'\n Confusion Matrix {year}: \n',cm)
    cm_percent = cm/np.sum(cm)

    plt.figure(figsize=(5,5), facecolor='w', edgecolor='k')
    sns.set(font_scale=1.5)

    sns.heatmap(cm, # or cm_percent
                xticklabels=['water','Algae','Juncus','Typha','Phragmites','Phalaris','others'],
                yticklabels=['water','Algae','Juncus','Typha','Phragmites','Phalaris','others'],
                cmap="YlGn",
                annot=True,
                # fmt='.2%',
                cbar=False,
                linewidths=2,
                linecolor='black')
    plt.title(f'RF Confusion Matrix {year} \n')
    plt.xlabel('Predicted')
    plt.ylabel('Measured')
    plt.savefig(f'01_output/05_accuracy/{year}_confusionMatrix.png', dpi=200,bbox_inches='tight')

feature importance scores 2020 

Band dem_med importance: 0.06175971519655704
Band dem_25 importance: 0.061812903310249606
Band dem_75 importance: 0.05487287292578411
Band ndvi_med importance: 0.05378750679340201
Band ndvi_25 importance: 0.057841568771800286
Band ndvi_75 importance: 0.057549705707713686
Band ortho_med_blue importance: 0.038636604319977404
Band ortho_med_green importance: 0.041308740612326235
Band ortho_med_red importance: 0.06481009205425638
Band ortho_med_RE importance: 0.024609754584435102
Band ortho_med_NIR importance: 0.04778166767288972
Band ortho_25_blue importance: 0.036739505527101785
Band ortho_25_green importance: 0.03168338637773056
Band ortho_25_red importance: 0.05181114439960789
Band ortho_25_RE importance: 0.017874585701086215
Band ortho_25_NIR importance: 0.032877016168672175
Band ortho_75_blue importance: 0.05533425265588289
Band ortho_75_green importance: 0.05344915112556545
Band ortho_75_red importance: 0.08622205127358054
Band ortho_75_RE importance: 0.031550192048615644
Band ortho_75_NIR importance: 0.03768758277276534

 out-of-bag error 2020 is: 93.38235294117648%

 Accuracy 2020: 97.14285714285714
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      0.67      0.80         3
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00        10
           7       0.80      1.00      0.89         4

    accuracy                           0.97        35
   macro avg       0.97      0.95      0.96        35
weighted avg       0.98      0.97      0.97        35


feature importance scores 2021 

Band dem_med importance: 0.13207024057055713
Band dem_25 importance: 0.1091061975955221
Band dem_75 importance: 0.10592287888505308
Band ndvi_med importance: 0.04493456294350694
Band ndvi_25 importance: 0.04269630495760121
Band ndvi_75 importance: 0.04380672371272012
Band ortho_med_blue importance: 0.04710066738335477
Band ortho_med_green importance: 0.030127542953546093
Band ortho_med_red importance: 0.05171767803970204
Band ortho_med_RE importance: 0.02883692936803497
Band ortho_med_NIR importance: 0.027861213348664336
Band ortho_25_blue importance: 0.027456132502241774
Band ortho_25_green importance: 0.023459869010858204
Band ortho_25_red importance: 0.03453647577960472
Band ortho_25_RE importance: 0.02104401330188131
Band ortho_25_NIR importance: 0.02799166093386803
Band ortho_75_blue importance: 0.046925947025521116
Band ortho_75_green importance: 0.02720517015162966
Band ortho_75_red importance: 0.06051976396308031
Band ortho_75_RE importance: 0.03429867373262007
Band ortho_75_NIR importance: 0.03238135384043208

 out-of-bag error 2021 is: 96.07843137254902%

 Accuracy 2021: 92.3076923076923
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         5
           3       0.86      1.00      0.92         6
           4       0.88      1.00      0.93         7
           5       1.00      0.86      0.92         7
           6       0.94      1.00      0.97        17
           7       0.75      0.50      0.60         6

    accuracy                           0.92        52
   macro avg       0.92      0.91      0.91        52
weighted avg       0.92      0.92      0.92        52


feature importance scores 2022 

Band dem_med importance: 0.08785085458059329
Band dem_25 importance: 0.09363523032297347
Band dem_75 importance: 0.07084228211415876
Band ndvi_med importance: 0.01904105523432381
Band ndvi_25 importance: 0.017547499807743467
Band ndvi_75 importance: 0.01961063038727275
Band ortho_med_blue importance: 0.07935565889120699
Band ortho_med_green importance: 0.048389220690924974
Band ortho_med_red importance: 0.027111873810045702
Band ortho_med_RE importance: 0.03915605989157391
Band ortho_med_NIR importance: 0.028432236129968348
Band ortho_25_blue importance: 0.07865214822990128
Band ortho_25_green importance: 0.04664574130225781
Band ortho_25_red importance: 0.024784775045543772
Band ortho_25_RE importance: 0.02930079772509586
Band ortho_25_NIR importance: 0.03091248677752542
Band ortho_75_blue importance: 0.09040121841202761
Band ortho_75_green importance: 0.050856114599807996
Band ortho_75_red importance: 0.01993043881319182
Band ortho_75_RE importance: 0.05680656512469081
Band ortho_75_NIR importance: 0.04073711210917217

 out-of-bag error 2022 is: 94.11764705882352%

 Accuracy 2022: 92.3076923076923
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       1.00      0.89      0.94         9
           3       1.00      1.00      1.00         2
           4       0.83      1.00      0.91         5
           5       1.00      0.88      0.93         8
           6       0.67      1.00      0.80         4
           7       1.00      0.86      0.92         7

    accuracy                           0.92        39
   macro avg       0.93      0.95      0.93        39
weighted avg       0.94      0.92      0.93        39

for i in range(len(stack_rast_paths)):
    # open stack raster
    with rio.open(stack_rast_paths[i]) as img:
        img_arr = img.read()
        # print raster shape
        print(f'\nstacked raster {years[i]} shape: ')
        print(f'Height: {img_arr.shape[1]}\nWidth: {img_arr.shape[2]}\nBands: {img_arr.shape[0]}\n')
        
        # reshape array
        img_n = np.moveaxis(img_arr, 0, -1)
        img_n = img_n.reshape(-1, len(band_names))
        print('reshaped full data shape  for prediction: ',img_n.shape)

        # mask the NA value
        img_n[img_n == 0] = np.nan
        
        # predict
        pred_full = clf_dict[years[i]].predict(img_n)
        
        # export predicted map
        print('Prediction Done, now exporting raster... \n')
        # Predefining out raster meta
        temp_arr = img.read(1)
        temp_arr = temp_arr.reshape(-1,1)
        metadata = img.meta
        height = metadata.get('height')
        width = metadata.get('width')
        crs = metadata.get('crs')
        transform = metadata.get('transform')

        # reshape prediction array
        pred_reshape = pred_full.reshape(height, width)
        # mask the output raster
        img_mask = img_arr[1] # get dem array
        pred_reshape = pred_reshape.astype(np.float16) # change to float 16 datatype to be able to hold nan
        pred_reshape[np.isnan(img_mask)] = 0 # take nan where it located in dem
        
        # write classification raster output
        with rio.open(f'01_output/03_pred_rast/Neukalen_{years[i]}_RF_classified.tif', mode='w', driver='GTiff', 
                      height=height, width=width, count=1, dtype='int8', crs=crs, 
                      transform = transform, nodata = 0) as output:
            output.write_band(1, pred_reshape)

    print(f'Map {years[i]} save!')

stacked raster 2020 shape: 
Height: 619
Width: 940
Bands: 21

reshaped full data shape  for prediction:  (581860, 21)
Prediction Done, now exporting raster... 

Map 2020 save!

stacked raster 2021 shape: 
Height: 619
Width: 940
Bands: 21

reshaped full data shape  for prediction:  (581860, 21)
Prediction Done, now exporting raster... 

Map 2021 save!

stacked raster 2022 shape: 
Height: 619
Width: 940
Bands: 21

reshaped full data shape  for prediction:  (581860, 21)
Prediction Done, now exporting raster... 

Map 2022 save!

Species mapping using Random Forest

Mapping species in Neukalen, Germany from 2020 to 2022 using Python based on Random Forest classifier and UAV multispectral images

import library

Data paths

1. Raster preparation

1.1. Clip and resample drone images

1.2. NDVI calculation

1.3. Raster stacking

Visualize raster

Visualize each band

2. Feature extraction

2.1. Extract spectral for correspoinding groundtruth points

2.2. Split data for training and testing

3. Random Forest

3.1. Train RF classifier

3.2. Predict and map

Reference