Calculate correlation of sPlot CWM and iNaturalist averages

Here we correlate the aggregated iNaturalist trait values close to each sPlot plot (within a certain buffer range) with the community-weighted trait mean (CWM) of that plot.

This section includes:

  • Plot r for each buffer size

  • Scatter correlation plots for 64,000 m buffer size

import pandas as pd
import numpy as np
import os

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm, Normalize
import cartopy.crs as ccrs
from matplotlib.colors import BoundaryNorm
from matplotlib.ticker import MaxNLocator

import math
# Load the sPlotOpen table; the cells below read its PlotObservationID,
# location/habitat columns and one column per trait from this frame.
sPlot = pd.read_csv("sPlotOpen/cwm_loc.csv")

Plot r for each buffer size

# Buffer sizes to evaluate (plotted further down as "Buffer Size in meters").
buffer_sizes = [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000]

# Trait columns that exist both in the sPlot table and in every buffer-mean file.
trait = [
    'Leaf Area',
    'SSD',
    'SLA',
    'Leaf C',
    'Leaf N per mass',
    'Leaf P',
    'Plant Height',
    'Seed mass',
    'Seed length',
    'LDMC',
    'Leaf N per area',
    'Leaf N P ratio',
    'Leaf delta15N',
    'Seeds per rep. unit',
    'Leaf fresh mass',
    'Stem conduit density',
    'Dispersal unit length',
    'Conduit element length',
]

# One list of r values (ordered like `trait`) is collected per buffer size.
# The frame is built once after the loop: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
r_rows = []

for buffer in buffer_sizes:

    file_name = "Buffer_Rerun/all_buffer_means_" + str(buffer) + ".csv"
    buffer_means = pd.read_csv(file_name,
                               sep=",",
                               usecols=['NumberiNatObservations', 'PlotObservationID'] + trait,
                               index_col=False)

    # Drop every row that has a missing or infinite value in any column.
    # `axis` is passed by keyword; the positional form `.any(1)` no longer
    # works in pandas 2.0+.
    buffer_means = buffer_means[~buffer_means.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

    # Pair the plot-level CWM with the buffer mean of the *same* plot by
    # joining on PlotObservationID. Correlating the two frames directly would
    # pair rows by positional index, which is only correct if both CSV files
    # list the plots in exactly the same order.
    paired = pd.merge(sPlot[["PlotObservationID"] + trait],
                      buffer_means[["PlotObservationID"] + trait],
                      on="PlotObservationID",
                      suffixes=("_cwm", "_inat"))

    # calculate r (Series.corr default: Pearson, NaN pairs ignored) for all traits
    r_rows.append([paired[t + "_cwm"].corr(paired[t + "_inat"]) for t in trait])


# Rows follow the order of buffer_sizes (default integer index 0..n-1).
r_all = pd.DataFrame(r_rows, columns=trait)
r_all['BufferSize'] = buffer_sizes
r_all
Leaf Area SSD SLA Leaf C Leaf N per mass Leaf P Plant Height Seed mass Seed length LDMC Leaf N per area Leaf N P ratio Leaf delta15N Seeds per rep. unit Leaf fresh mass Stem conduit density Dispersal unit length Conduit element length BufferSize
0 0.327212 0.393096 0.388285 0.310698 0.318676 0.417778 0.599255 0.491661 0.394473 0.285530 0.228102 0.229549 0.050140 0.132439 0.397651 0.345894 0.007653 0.085008 1000
1 0.426574 0.357159 0.444459 0.285129 0.399287 0.432745 0.593935 0.477698 0.325800 0.291813 0.332025 0.348768 0.071325 0.116958 0.364058 0.392696 0.042383 0.126451 2000
2 0.434995 0.379288 0.471005 0.275669 0.418899 0.458121 0.585162 0.483190 0.304954 0.331761 0.384651 0.374946 0.094760 0.087250 0.370357 0.399894 -0.000913 0.116891 4000
3 0.433900 0.386507 0.488867 0.252580 0.437119 0.452988 0.567086 0.468941 0.262529 0.373375 0.418641 0.362618 0.118075 0.102610 0.336556 0.417393 -0.019950 0.131345 8000
4 0.423593 0.421343 0.484484 0.240975 0.439395 0.433246 0.562454 0.454250 0.258084 0.390314 0.440648 0.356950 0.151906 0.127337 0.349924 0.422772 0.000643 0.152324 16000
5 0.447045 0.466672 0.487007 0.244310 0.407360 0.414086 0.552194 0.463876 0.253451 0.401911 0.490071 0.402536 0.193645 0.140352 0.372263 0.447351 0.029196 0.202838 32000
6 0.469007 0.513363 0.496539 0.252682 0.389497 0.420769 0.552098 0.470084 0.277404 0.419255 0.531011 0.425185 0.246662 0.126832 0.409334 0.465274 0.025032 0.259420 64000
7 0.450491 0.525816 0.487563 0.254286 0.405706 0.437688 0.541702 0.445024 0.258906 0.428992 0.523541 0.433611 0.244218 0.120030 0.405460 0.477046 -0.009207 0.262008 128000
8 0.454712 0.545861 0.488528 0.235307 0.389228 0.422237 0.537227 0.431674 0.205320 0.427026 0.506207 0.414711 0.250333 0.040132 0.411150 0.513977 -0.041144 0.289280 256000
# https://stackoverflow.com/questions/44941082/plot-multiple-columns-of-pandas-dataframe-using-seaborn
# https://lost-stats.github.io/Presentation/Figures/line_graph_with_labels_at_the_beginning_or_end.html

# data: drop traits that have no r value at all, then reshape to long format
# (one row per BufferSize/trait pair, r in column "r")
data_dropnan = r_all.dropna(axis=1, how='all')
data_melt = pd.melt(data_dropnan, ['BufferSize'], value_name="r")
# BufferSize is cast to string, so the x axis is categorical
# (presumably to space the doubling buffer sizes evenly -- confirm)
data_melt = data_melt.astype({"BufferSize": str}, errors='raise')

# label names, in the order the traits appear in the melted frame
trait_names = data_melt["variable"].unique()

sns.set(rc={'figure.figsize':(8,10)})
sns.set_theme(style="white")
fig, ax = plt.subplots()

# plot all lines into one plot

sns.lineplot(x='BufferSize',
            y='r',
            hue='variable',
            data=data_melt,
            ax=ax,
            marker='o',
            legend=None,
            linewidth=0.6)

# positions already used by a label, to nudge labels that would coincide
label_pos = []

# Add the text--for each line, find the end, annotate it with a label
for line, variable in zip(ax.lines, trait_names):
    # Work on a plain float array with masked entries turned into NaN, so the
    # "last finite value" lookup works for both ndarray and masked-array data
    # (the previous `.mask` access raised AttributeError on a plain ndarray).
    y_values = np.ma.filled(np.ma.asarray(line.get_ydata(), dtype=float), np.nan)
    finite_y = y_values[np.isfinite(y_values)]
    x = line.get_xdata()[-1]
    if finite_y.size == 0 or not np.isfinite(x):
        continue
    # label height: the last finite r value of the line
    y = finite_y[-1]
    x = round(x)
    y = round(y, 2)
    xy = (x*1.02, y)
    # nudge the label down, then up, if the spot is already taken
    if xy in label_pos:
        xy = (x*1.02, y-0.01)
    if xy in label_pos:
        xy = (x*1.02, y+0.01)
    label_pos.append(xy)
    ax.annotate(variable,
                xy=(xy),
                xytext=(0, 0),
                color=line.get_color(),
                xycoords=(ax.get_xaxis_transform(),
                          ax.get_yaxis_transform()),
                textcoords="offset points")

# Remove the top and right axes spines.
sns.despine()
plt.xlabel("Buffer Size in meters")
plt.ylabel("r")
plt.tight_layout()

plt.savefig('../Figures/r_buffer.pdf', bbox_inches='tight')
_images/Chapter_13_Correlation_Buffers_7_0.png

Scatter correlation plots for 64,000 m buffer size

# Buffer size whose plot-level pairs are shown in the scatter/KDE plots.
optimal_buffer_size = 64000
file_name = "Buffer_Rerun/all_buffer_means_" + str(optimal_buffer_size) + ".csv"
buffer_means = pd.read_csv(file_name,
                           sep=",",
                           usecols=['NumberiNatObservations', 'PlotObservationID'] + trait,
                           index_col=False)

# Drop every row that has a missing or infinite value in any column.
# `axis` is passed by keyword; the positional form `.any(1)` no longer works
# in pandas 2.0+.
buffer_means = buffer_means[~buffer_means.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

#transform dataframe from wide to long

sPlot_t = sPlot.melt(id_vars=["PlotObservationID", "Latitude", "Longitude", "Biome", "Naturalness", "Forest",
                              "Shrubland", "Grassland", "Wetland", "Sparse_vegetation"],
                     value_name="TraitValue",
                     var_name="Trait",
                     value_vars=trait)

buffer_means_t = buffer_means.melt(id_vars=["PlotObservationID", "NumberiNatObservations"],
                                   value_name="TraitValue",
                                   var_name="Trait",
                                   value_vars=trait)

# Inner join on plot and trait. Both frames carry a "TraitValue" column, so
# pandas' default suffixes apply: TraitValue_x comes from sPlot_t (left) and
# TraitValue_y from buffer_means_t (right).
sPlot_buffers_merged = pd.merge(sPlot_t, buffer_means_t, on=["PlotObservationID", "Trait"])
# Trait columns to plot, in panel order.
trait = [
    'Leaf Area',
    'SSD',
    'SLA',
    'Leaf C',
    'Leaf N per mass',
    'Leaf P',
    'Plant Height',
    'Seed mass',
    'Seed length',
    'LDMC',
    'Leaf N per area',
    'Leaf N P ratio',
    'Leaf delta15N',
    'Seeds per rep. unit',
    'Leaf fresh mass',
    'Stem conduit density',
    'Dispersal unit length',
    'Conduit element length',
]
# calculate max-min ranges

def min__max_ranges(df, col_1, col_2, variable_col, variables):
    """Return one shared [min, max] range per variable for two value columns.

    For every entry of ``variables`` the rows of ``df`` whose
    ``variable_col`` equals that entry are selected, rows with a missing
    value in ``col_1`` or ``col_2`` are dropped, and the 1 % and 99 %
    quantiles of both columns are computed.

    The range starts from the ``col_1`` quantiles. The lower bound is replaced
    by the ``col_2`` lower quantile when that one is smaller and finite; the
    upper bound is replaced by the ``col_2`` upper quantile when that one is
    larger.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table holding ``col_1``, ``col_2`` and ``variable_col``.
    col_1, col_2 : str
        Names of the two value columns.
    variable_col : str
        Name of the column that identifies the variable of each row.
    variables : list
        Variable names to compute a range for; also the order of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``variable``, with the columns ``min`` and ``max``.
    """
    lower_bounds = []
    upper_bounds = []

    for name in variables:
        subset = df.loc[df[variable_col] == name].dropna(subset=[col_1, col_2])

        low_1, high_1 = subset[col_1].quantile(0.01), subset[col_1].quantile(0.99)
        low_2, high_2 = subset[col_2].quantile(0.01), subset[col_2].quantile(0.99)

        # widen downwards only with a finite second-column quantile
        if low_1 > low_2 and np.isfinite(low_2):
            low_1 = low_2

        # widen upwards whenever the second column reaches further
        if high_1 < high_2:
            high_1 = high_2

        lower_bounds.append(low_1)
        upper_bounds.append(high_1)

    ranges = pd.DataFrame({
        'variable': variables,
        'min': lower_bounds,
        'max': upper_bounds,
    })

    return ranges.set_index('variable')
# One shared value range per trait, spanning the sPlot values (TraitValue_x)
# and the buffer means (TraitValue_y) of the merged long-format table.
ranges = min__max_ranges(
    df=sPlot_buffers_merged,
    col_1='TraitValue_x',
    col_2='TraitValue_y',
    variable_col='Trait',
    variables=trait,
)
ranges
min max
variable
Leaf Area 2.491325 9.293933
SSD -1.780534 -0.281056
SLA 1.391215 3.694397
Leaf C 5.987216 6.251625
Leaf N per mass 2.325956 3.522470
Leaf P -0.430572 1.155212
Plant Height -2.443895 3.027281
Seed mass -2.659446 5.554995
Seed length -0.120123 2.489541
LDMC -2.016191 -0.565417
Leaf N per area -0.303591 1.325981
Leaf N P ratio 1.918633 3.063715
Leaf delta15N -0.611625 2.018824
Seeds per rep. unit 2.394954 11.270200
Leaf fresh mass -5.332373 1.764311
Stem conduit density 1.988650 7.005393
Dispersal unit length -0.114169 2.806126
Conduit element length 4.705128 6.971030

This might take a few minutes to plot:

# 4 x 5 grid of panels; one panel is filled per entry of `trait`.
fig, axes = plt.subplots(ncols=4, nrows=5, figsize=(20,25))

sns.set_theme(style="white", font_scale=1.7)

# Correlations for the buffer size being plotted, looked up by its BufferSize
# value instead of a hard-coded row position in r_all.
r_optimal = r_all.loc[r_all["BufferSize"] == optimal_buffer_size].iloc[0]

for i, ax in zip(trait, axes.flat):

    sub_df = sPlot_buffers_merged[sPlot_buffers_merged["Trait"]==i]

    trait_title = str(i) + "\n" + "r = " + str(round(r_optimal[i], 2))

    # 2D kernel density of plot value (x) against buffer mean (y)
    sns.kdeplot(
        data=sub_df,
        x="TraitValue_x",
        y="TraitValue_y",
        ax=ax,
        ).set(title=trait_title, xlabel='sPlotOpen plot', ylabel='iNaturalist obs. mean')

    # 1:1 reference line
    ax.axline([0, 0], [1, 1], color= "black", alpha=0.6, ls = ":")

    # Same limits on both axes: the trait's shared range padded by 20 % of
    # its width on each side. The padding is a plain scalar; the former
    # `max - [min]` produced a one-element array that was handed to
    # set_xlim/set_ylim.
    lower = ranges.loc[i, "min"]
    upper = ranges.loc[i, "max"]
    space = abs((upper - lower) * 0.2)
    ax.set_xlim(lower - space, upper + space)
    ax.set_ylim(lower - space, upper + space)

fig.tight_layout()

plt.savefig('../Figures/corr_buffer_all_64k_kde.pdf', bbox_inches='tight')
_images/Chapter_13_Correlation_Buffers_16_0.png