Compare sPlotOpen and iNaturalist trait maps#

We will create trait maps with the iNaturalist observation data and evaluate the maps using the sPlotOpen data.

This section covers:

  • Load Data

  • Visualizing the trait maps - a first look

  • Grid mean trait values at different resolutions

  • Calculate weighted r

  • Determine slope of correlation

  • Plot correlation plots at 2 degree resolution

# packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from matplotlib.colors import LogNorm, Normalize
from matplotlib.ticker import MaxNLocator
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from matplotlib.colors import BoundaryNorm
import geopandas as gpd

from pyproj import Proj # allows for different projections 
from shapely.geometry import shape # for calculating areas

Load Data#

iNat_TRY = pd.read_csv("iNat_TRY_log.csv")
iNat_TRY.head(2)
gbifID scientificName decimalLatitude decimalLongitude eventDate dateIdentified Dispersal unit length Leaf Area SLA Leaf C ... Leaf delta15N Leaf N P ratio Leaf P Plant Height Seed mass Seed length Seeds per rep. unit Stem conduit density SSD Conduit element length
0 1229615436 Commelina communis 35.987483 -79.057546 2013-07-07T00:00:00 2013-07-07T20:33:11 NaN NaN NaN NaN ... NaN 2.5362 0.536493 NaN 2.13771 NaN NaN NaN NaN NaN
1 3384000233 Commelina communis 42.093762 -75.923660 2021-08-23T13:06:06 2021-09-17T21:15:37 NaN NaN NaN NaN ... NaN 2.5362 0.536493 NaN 2.13771 NaN NaN NaN NaN NaN

2 rows × 24 columns

sPlot = pd.read_csv("sPlotOpen/cwm_loc.csv")
sPlot.head(2)
PlotObservationID TraitCoverage_cover Species_richness TraitCoverage_pa Leaf Area SSD SLA Leaf C Leaf N per mass Leaf P ... Height_shrubs_lowest Height_herbs_average Height_herbs_lowest Height_herbs_highest SoilClim_PC1 SoilClim_PC2 Resample_1 Resample_2 Resample_3 Resample_1_consensus
0 16 0.277778 3 0.333333 3.678311 -1.047293 2.890748 6.128157 2.873263 1.114036 ... NaN NaN NaN NaN -3.66 0.546 True False False True
1 17 0.038462 2 0.500000 3.678311 -1.047293 2.890748 6.128157 2.873263 1.114036 ... NaN NaN NaN NaN -3.66 0.546 True False False True

2 rows × 86 columns
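
Later steps match the two datasets on trait names, so it is worth checking that the 18 iNaturalist trait columns also appear in the sPlotOpen CWM table. A minimal sketch (assuming the columns were named consistently when the tables were prepared):

# Which of the 18 iNaturalist trait columns (columns 6 to 23) also exist in sPlotOpen?
iNat_traits = set(iNat_TRY.columns[6:24])
sPlot_traits = set(sPlot.columns)

print(sorted(iNat_traits & sPlot_traits))  # traits available in both tables
print(sorted(iNat_traits - sPlot_traits))  # traits missing from sPlotOpen, if any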

Visualize trait maps#

def plot_grid(df, lon, lat, variable, dataset_name, deg, log=True):

    plt.rcParams.update({'font.size': 15})

    # define raster shape for plotting
    step = int((360/deg) + 1)
    bins_x = np.linspace(-180, 180, step)
    bins_y = np.linspace(-90, 90, int(((step - 1)/2) + 1))

    # group coordinates into grid cells
    df['x_bin'] = pd.cut(df[lon], bins=bins_x)
    df['y_bin'] = pd.cut(df[lat], bins=bins_y)

    df['x_bin'] = df['x_bin'].apply(lambda x: x.left)
    df['y_bin'] = df['y_bin'].apply(lambda x: x.left)

    grouped_df = df.groupby(['x_bin', 'y_bin'], as_index=False)[variable].mean()
    raster = grouped_df.pivot(index='y_bin', columns='x_bin', values=variable)

    # data format
    data_crs = ccrs.PlateCarree()
    
    # for colorbar
    levels = MaxNLocator(nbins=15).tick_values(grouped_df[variable].min(), grouped_df[variable].max())
    cmap = plt.get_cmap('YlGnBu') # colormap
    norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True) # (not used below; vmin/vmax set the color scale)
    im_ratio = raster.shape[0]/raster.shape[1] # for size of colorbar

    # create base plot of a world map
    # note: plot_grid() draws into the figure created before it is called (see loop below)
    ax = fig.add_subplot(1, 1, 1, projection=ccrs.Robinson()) # Robinson projection from cartopy
    ax.set_global()
        
    # add grid with values
    im = ax.pcolormesh(bins_x, bins_y, raster, cmap="YlGnBu", 
                       vmin=grouped_df[variable].min(),
                       vmax=grouped_df[variable].max(),
                       transform=data_crs) 
    
    # add color bar
    if log:
        label = "log " + str(variable)
    else:
        label = str(variable)
    
    fig.colorbar(im, fraction=0.046*im_ratio, pad=0.04, label=label)

    # add coastlines
    ax.coastlines(resolution='110m', color='pink', linewidth=1.5)

    # set title
    ax.set_title(variable + ' ' + dataset_name, size=14)

To visualize the trait maps we call the custom function plot_grid(). Note that plot_grid() draws into the current figure, so a new figure is created before each call.

trait = iNat_TRY.columns[6:24]

for t in trait:

    fig = plt.figure(figsize=(12, 12))
    filename = '../Figures/iNat_traitmap_' + t +'.pdf'
    plot_grid(iNat_TRY, "decimalLongitude", "decimalLatitude",variable=t, dataset_name="iNaturalist", deg=2)
    plt.savefig(filename, bbox_inches='tight')
[18 output figures: one global iNaturalist trait map per trait at 2 degree resolution]

Compare to maps created using R raster: load trait maps from GeoTIFF data#

from os import listdir
from os.path import isfile, join

import xarray as xr

path = "./iNaturalist_traits-main/iNat_log/"
files = [f for f in listdir(path) if isfile(join(path, f))]
files.sort()
files
['iNat_Conduit.element.length_2_ln.tif',
 'iNat_Dispersal.unit.length_2_ln.tif',
 'iNat_LDMC_2_ln.tif',
 'iNat_Leaf.Area_2_ln.tif',
 'iNat_Leaf.C_2_ln.tif',
 'iNat_Leaf.N.P.ratio_2_ln.tif',
 'iNat_Leaf.N.per.area_2_ln.tif',
 'iNat_Leaf.N.per.mass_2_ln.tif',
 'iNat_Leaf.P_2_ln.tif',
 'iNat_Leaf.delta15N_2_ln.tif',
 'iNat_Leaf.fresh.mass_2_ln.tif',
 'iNat_Plant.Height_2_ln.tif',
 'iNat_SLA_2_ln.tif',
 'iNat_SSD_2_ln.tif',
 'iNat_Seed.length_2_ln.tif',
 'iNat_Seed.mass_2_ln.tif',
 'iNat_Seeds.per.rep..unit_2_ln.tif',
 'iNat_Stem.conduit.density_2_ln.tif']
def cubeFile(file):
    # open one GeoTIFF and label it with the trait name taken from the file name
    name = file.replace(".tif", "")
    sr = xr.open_dataset(path + file, engine="rasterio", chunks=1024).sel(band=1)
    sr = sr.assign_coords({"variable": name})
    return sr

da = xr.concat([cubeFile(x) for x in files],dim = "variable")
da.band_data.sel(variable = "iNat_Leaf.Area_2_ln").plot.imshow()
<matplotlib.image.AxesImage at 0x7f9f25c954c0>
_images/Chapter_6_Compare_trait_maps_sPlot_iNat_16_1.png

Plot one band of multidimensional xarray:

fig = plt.figure(figsize=(12, 12))

axis = fig.subplots(
    1, 1, subplot_kw=dict(projection=ccrs.Robinson())
)

variable = "iNat_Leaf.Area_2_ln"
im= da.band_data.sel(variable = variable).plot.pcolormesh(
    ax=axis,
    transform=ccrs.PlateCarree(),  # this is important!
    # usual xarray stuff
    cbar_kwargs={"orientation": "horizontal", "shrink": 0.7},
    cmap='YlGnBu',
    #vmin=da.band_data.sel(variable = "iNat_Conduit.element.length").min(),
    #vmax=da.band_data.sel(variable = "iNat_Conduit.element.length").max(),
    #norm=LogNorm()

)

axis.coastlines(resolution='110m', color='pink', linewidth=1.5)  # cartopy function
#plt.tight_layout()
<cartopy.mpl.feature_artist.FeatureArtist at 0x7f9f25d10790>
_images/Chapter_6_Compare_trait_maps_sPlot_iNat_18_1.png
df = da.band_data.to_dataset().to_dataframe().reset_index()
df.head()
x y variable band spatial_ref band_data
0 -179.0 89.0 iNat_Conduit.element.length_2_ln 1 0 NaN
1 -179.0 89.0 iNat_Dispersal.unit.length_2_ln 1 0 NaN
2 -179.0 89.0 iNat_LDMC_2_ln 1 0 NaN
3 -179.0 89.0 iNat_Leaf.Area_2_ln 1 0 NaN
4 -179.0 89.0 iNat_Leaf.C_2_ln 1 0 NaN
df_spread = df.pivot(index= ['x','y'],columns='variable',values='band_data').reset_index()
df_spread.head()
variable x y iNat_Conduit.element.length_2_ln iNat_Dispersal.unit.length_2_ln iNat_LDMC_2_ln iNat_Leaf.Area_2_ln iNat_Leaf.C_2_ln iNat_Leaf.N.P.ratio_2_ln iNat_Leaf.N.per.area_2_ln iNat_Leaf.N.per.mass_2_ln iNat_Leaf.P_2_ln iNat_Leaf.delta15N_2_ln iNat_Leaf.fresh.mass_2_ln iNat_Plant.Height_2_ln iNat_SLA_2_ln iNat_SSD_2_ln iNat_Seed.length_2_ln iNat_Seed.mass_2_ln iNat_Seeds.per.rep..unit_2_ln iNat_Stem.conduit.density_2_ln
0 -179.0 -89.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 -179.0 -87.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 -179.0 -85.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 -179.0 -83.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 -179.0 -81.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
df_spread.shape
(16200, 20)
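
The 16,200 rows correspond to the 180 × 90 cells of a 2 degree global grid. A quick check of the coordinate spacing (a small sketch) confirms the resolution of the GeoTIFF layers:

# spacing between neighbouring cell centres should be 2 degrees in x and y
print(np.diff(np.sort(df_spread['x'].unique()))[:3])
print(np.diff(np.sort(df_spread['y'].unique()))[:3])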

Grid mean trait values at different resolutions#

def global_grid_data(df, long, lat, deg, variables):
    
    # create new dataframe to save the average value of each grid cell and variable
    grouped_df = pd.DataFrame()
    
    # convert degree into step size
    step = int((360/deg) + 1)
    
    bins_x = np.linspace(-180,180,step) 
    bins_y= np.linspace(-90,90,int(((step - 1)/2)+1))
    
    # group latitude and longitude coordinates into bins
    # create new columns 'x_bin' and 'y_bin'
    df['x_bin'] = pd.cut(df[long], bins=bins_x)
    df['y_bin'] = pd.cut(df[lat], bins=bins_y)
    
    # raster coordinates are in center of raster cell
    df['x_bin'] = df['x_bin'].apply(lambda x: ((x.left + x.right) /2) )
    df['y_bin'] = df['y_bin'].apply(lambda x: ((x.left + x.right) /2) )
    
    grouped_df = df.drop_duplicates(subset=['x_bin', 'y_bin'], keep='last')
    grouped_df = grouped_df[['x_bin', 'y_bin']]
    
    for v in variables:
        
        sub_df = df[['y_bin', 'x_bin', v]]
        grouped_v = sub_df.groupby(['x_bin', 'y_bin'], as_index=False)[v].mean()
        
        
        grouped_df = pd.merge(grouped_df, grouped_v, 
                    on= ['x_bin', 'y_bin'], 
                    how='left')
        
    return grouped_df

Check function at 2 degree resolution#

To create a dataframe with all traits and the mean log trait values per cell we call global_grid_data().

deg = 2

trait = iNat_TRY.columns[6:24]

df_iNat = global_grid_data(iNat_TRY, 'decimalLongitude', 'decimalLatitude', deg, trait)

df_iNat_t = df_iNat.melt(id_vars=["x_bin", "y_bin"], 
                     value_name="TraitValue_iNat", 
                     var_name="Trait")
df_iNat_t
x_bin y_bin Trait TraitValue_iNat
0 -85.0 5.0 Dispersal unit length NaN
1 -49.0 61.0 Dispersal unit length 0.893636
2 -173.0 69.0 Dispersal unit length 0.893636
3 1.0 31.0 Dispersal unit length 0.388658
4 37.0 39.0 Dispersal unit length 1.189367
... ... ... ... ...
69313 23.0 -33.0 Conduit element length 6.299584
69314 147.0 -43.0 Conduit element length 6.089187
69315 -117.0 33.0 Conduit element length 5.760924
69316 115.0 23.0 Conduit element length 5.893173
69317 95.0 51.0 Conduit element length 5.461003

69318 rows × 4 columns

df_iNat_t['Trait'].nunique()
18
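
Since the GeoTIFF layers were produced from the same observations, the 2 degree grid computed here should in principle agree closely with the corresponding raster band. A rough consistency check for Leaf Area (a sketch; it assumes both grids use the same 2 degree cell centres and that df_spread from the previous section is still in memory):

# merge the grid computed here with the GeoTIFF-derived grid on cell centres
check = df_iNat[['x_bin', 'y_bin', 'Leaf Area']].copy()
check[['x_bin', 'y_bin']] = check[['x_bin', 'y_bin']].astype(float)

check = pd.merge(check, df_spread[['x', 'y', 'iNat_Leaf.Area_2_ln']],
                 left_on=['x_bin', 'y_bin'], right_on=['x', 'y'],
                 how='inner').dropna()

# correlation between the two versions of the same trait map
print(check['Leaf Area'].corr(check['iNat_Leaf.Area_2_ln']))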

Calculate averages for different resolutions#

We now compute these global grids for various grid sizes (in degrees) and all traits. We merge iNat and sPlot into one dataframe and save it.

deg = [4, 2, 1, 0.5, 0.25, 0.125, 0.0625]


trait = iNat_TRY.columns[6:24]

for d in deg:
    df_iNat = global_grid_data(iNat_TRY, 'decimalLongitude', 'decimalLatitude', d, trait)
    
    df_sPlot = global_grid_data(sPlot, 'Longitude', 'Latitude', d, trait)
    
    
    # reshape data, so that we have only one Trait column
    df_iNat_t = df_iNat.melt(id_vars=["x_bin", "y_bin"], 
                     value_name="TraitValue_iNat", 
                     var_name="Trait")
    
    df_sPlot_t = df_sPlot.melt(id_vars=["x_bin", "y_bin"], 
                     value_name="TraitValue_sPlot", 
                     var_name="Trait")
    
    # merge sPlot and iNat data into one dataframe
    df_merged = pd.merge(df_sPlot_t, df_iNat_t, on=["x_bin", "y_bin", "Trait"] )
    
    # keep only lines where we have a pixel in both datasets
    df_merged = df_merged.dropna()
    
    
    # save result to csv
    filename="grid_means_" + str(d) + "_deg.csv"
    df_merged.to_csv(filename, index=False)
    

Calculate weighted r#

Calculate weights per grid cell area and weighted r for all traits and all grid sizes.

def lat_weights(lat_unique, deg):
    
    from pyproj import Proj
    from shapely.geometry import shape

    # determine weights per grid cell based on latitude
    # keep only one exemplary cell at each distance from equator
    # weights per approximated area of grid size depending on distance from equator
    
    # make dictionary
    weights = dict()

    for j in lat_unique:
        
        # the four corner points of the grid cell
        
        p1 = (0 , j+(deg/2))
        p2 = (deg , j+(deg/2))
        p3 = (deg, j-(deg/2))
        p4 = (0, j-(deg/2))
        
        # Calculate polygon surface area
        # https://stackoverflow.com/questions/4681737/how-to-calculate-the-area-of-a-polygon-on-the-earths-surface-using-python
        
        # Define corner points
        co = {"type": "Polygon", "coordinates": [[p1,p2,p3,p4]]}
        lat_1=p1[1]
        lat_2=p3[1]
        lat_0=(p1[1]+p3[1])/2
        lon_0=deg/2
        
        # Caveat: the equal-area projection cannot span the equator,
        # so check whether the cell crosses it
        value1 = abs(lat_1 + lat_2)
        value2 = abs(lat_1) + abs(lat_2)
        
        # if grid cell overlaps equator:
        if value1 < value2:
            
            lat_1=p1[1]
            lat_2=0
            lat_0=(p1[1]+lat_2)/2
            lon_0=deg/2
            
            # Projection equal area used: https://proj.org/operations/projections/aea.html
            projection_string="+proj=aea +lat_1=" + str(lat_1) + " +lat_2=" + str(lat_2) + " +lat_0=" + str(lat_0) + " +lon_0=" + str(lon_0) 
            lon, lat = zip(*co['coordinates'][0])
        
            pa = Proj(projection_string)
            x, y = pa(lon, lat)
            cop = {"type": "Polygon", "coordinates": [zip(x, y)]}
        
            area = (shape(cop).area/1000000)*2
        
        # if grid cell is on one side of equator:
        else:
            # Projection equal area used: https://proj.org/operations/projections/aea.html
            projection_string="+proj=aea +lat_1=" + str(lat_1) + " +lat_2=" + str(lat_2) + " +lat_0=" + str(lat_0) + " +lon_0=" + str(lon_0) 
            lon, lat = zip(*co['coordinates'][0])
        
            pa = Proj(projection_string)
            x, y = pa(lon, lat)
            cop = {"type": "Polygon", "coordinates": [zip(x, y)]}
        
            area = (shape(cop).area/1000000)

        
        # set coord to center of grid cell
        coord = j
        
        # map area to weights dictionary
        weights[coord] = area
    
    # convert area into proportion with area/max.area:
    max_area = max(weights.values())
    
    for key in weights.keys():
        
        weights[key] = weights[key]/max_area
        
    return weights
def weighted_r(df, col_1, col_2, col_lat, weights, r2=False):
    
    # map weights to dataframe
    df['Weights'] = df[col_lat].map(weights)
        
    # calculate weighted correlation 
    # https://www.statsmodels.org/stable/generated/statsmodels.stats.weightstats.DescrStatsW.html
    import statsmodels.api as statmod
    
    d1 = statmod.stats.DescrStatsW( df[[col_1, col_2]], df['Weights'] )
        
    corr = d1.corrcoef[0][1]
    
    # optional
    # calculate r2
    if r2 == True:
        corr = corr**2
    
    return corr
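
Before using the weights in the loop below, a quick sanity check (a sketch): grid cells shrink towards the poles, so the relative weights returned by lat_weights() should drop from roughly 1 near the equator to small values at high latitudes.

# latitude centres of a 2 degree grid and their relative area weights
test_weights = lat_weights(np.arange(-89, 90, 2), deg=2)

for lat in [1, 31, 61, 87]:
    print(lat, round(test_weights[lat], 3))
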
# get trait names
filename="grid_means_" + str(2) + "_deg.csv"
raster_means = pd.read_csv(filename)
trait = raster_means["Trait"].unique()
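
Before looping over all resolutions, a single-trait example at 2 degrees (a sketch, using Plant Height) illustrates how the latitude-weighted r compares with the unweighted Pearson correlation; weighted_r() can also return r squared by passing r2=True.

# example: weighted vs. unweighted correlation for Plant Height at 2 degrees
sub = raster_means[raster_means['Trait'] == 'Plant Height'].copy()
sub = sub.replace([np.inf, -np.inf], np.nan).dropna()

w = lat_weights(sub['y_bin'].unique(), deg=2)

r_weighted = weighted_r(sub, 'TraitValue_sPlot', 'TraitValue_iNat', 'y_bin', w)
r_unweighted = sub['TraitValue_sPlot'].corr(sub['TraitValue_iNat'])

print(round(r_weighted, 3), round(r_unweighted, 3))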

Calculate r for all traits and a range of resolutions:

# resolutions we want to calculate r for
deg = [4, 2, 1, 0.5, 0.25, 0.125, 0.0625]

r_all = pd.DataFrame(columns=trait)

for i in deg:
    
    # open saved raster mean files
    filename = "grid_means_" + str(i) + "_deg.csv"
    raster_means = pd.read_csv(filename)
    
    # drop rows with missing or infinite values
    raster_means = raster_means[~raster_means.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
    
    # determine weights per grid cell based on latitude
    lat_unique = raster_means['y_bin'].unique()
    weights = lat_weights(lat_unique, deg=i)
    
    # initiate  
    r_grid = []  
    
    for t in trait:
        # subset only one trait
        raster_means_trait = raster_means[raster_means['Trait']==t]
        
        # drop nan's
        raster_means_trait = raster_means_trait.dropna()
        
        # calculate weighted r
        r_trait = weighted_r(raster_means_trait, "TraitValue_sPlot", "TraitValue_iNat", "y_bin", weights)
        
        # add to trait r's
        r_grid.append(r_trait)
    
    
    s = pd.Series(r_grid, index=r_all.columns)
    
    # add new series of r at a certain resolution to df 
    r_all = pd.concat([r_all, s.to_frame().T], ignore_index=True)

    
# add resolution to r-df
r_all['Resolution'] = [4, 2, 1, 0.5, 0.25, 0.13, 0.06]
r_all
Dispersal unit length Leaf Area SLA Leaf C LDMC Leaf fresh mass Leaf N per area Leaf N per mass Leaf delta15N Leaf N P ratio Leaf P Plant Height Seed mass Seed length Seeds per rep. unit Stem conduit density SSD Conduit element length Resolution
0 -0.064129 0.648441 0.541325 0.313925 0.395695 0.675771 0.530652 0.410528 0.335074 0.402132 0.417266 0.573948 0.520051 0.227821 0.033105 0.694801 0.639448 0.360548 4.00
1 -0.040254 0.591867 0.559321 0.214475 0.337698 0.593019 0.592591 0.36652 0.261048 0.459257 0.3891 0.58217 0.48431 0.26135 0.061491 0.680519 0.629418 0.319216 2.00
2 -0.008955 0.528993 0.545211 0.281464 0.352232 0.518514 0.542192 0.412746 0.266304 0.425005 0.421088 0.579327 0.460602 0.272997 0.122855 0.635591 0.628607 0.313588 1.00
3 0.015484 0.443391 0.511926 0.223498 0.343284 0.426179 0.533832 0.369761 0.203682 0.356999 0.40571 0.508895 0.368398 0.250143 0.146879 0.57069 0.556945 0.278735 0.50
4 0.012678 0.386012 0.464901 0.208586 0.325511 0.313482 0.490444 0.350628 0.168444 0.292185 0.371618 0.4243 0.298729 0.196469 0.121625 0.494574 0.484192 0.240417 0.25
5 0.004708 0.325776 0.423198 0.195532 0.302716 0.2446 0.446798 0.344023 0.173674 0.272116 0.346877 0.361929 0.239897 0.172903 0.109652 0.436362 0.443396 0.200705 0.13
6 -0.006774 0.309105 0.415142 0.164298 0.308849 0.208823 0.424765 0.365924 0.195232 0.252934 0.356953 0.352254 0.236357 0.152757 0.089049 0.425663 0.415955 0.186163 0.06

Save result to .csv

r_all.to_csv("r_all.csv", index=False)

Visualize r at different resolutions as a line graph (the straight lines connecting the dots are only there to aid readability):

r_all = pd.read_csv("r_all.csv")

Median correlation at 2 degree resolution

np.median(r_all.iloc[1])
0.4592574614778774
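
Note that iloc[1] selects the 2 degree row but includes the Resolution column itself in the median. If the intent is the median r across the 18 traits only, the column can be dropped first (a sketch):

# median r across the 18 traits at 2 degree resolution, excluding the Resolution column
r_2deg = r_all.loc[r_all['Resolution'] == 2].drop(columns='Resolution')
print(r_2deg.median(axis=1).iloc[0])
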
#### Plot 
# https://stackoverflow.com/questions/44941082/plot-multiple-columns-of-pandas-dataframe-using-seaborn

trait_names = ['Dispersal unit len.', 'Leaf Area', 'SLA', 'Leaf C', 'LDMC',
       'Leaf fresh mass', 'Leaf N per area', 'Leaf N per mass',
       'Leaf delta15N', 'Leaf N P ratio', 'Leaf P', 'Plant Height',
       'Seed mass', 'Seed length', 'Seeds per rep. unit',
       'Stem conduit \ndensity', 'SSD', 'Conduit element len.']

# data 
data_dropnan = r_all.dropna(axis=1, how='all')
data_melt= pd.melt(data_dropnan, ['Resolution'], value_name="r")

# change Resolution to string, so it is not interpreted as a number
data_melt = data_melt.astype({"Resolution": str}, errors='raise') 

# set plotting parameters
sns.set(rc={'figure.figsize':(6,10)}) # size
sns.set(font_scale = 2) # make font larger
sns.set_theme(style="white") # plain white plot, no grids

# initiate plot
fig, ax = plt.subplots()

# plot all lines into one plot

sns.lineplot(x='Resolution', 
            y='r', 
            hue='variable', 
            data=data_melt,
            ax=ax,
            marker='o',
            linewidth=0.6,
            legend=None)

label_pos=[]

# Add the text--for each line, find the end, annotate it with a label
# https://lost-stats.github.io/Presentation/Figures/line_graph_with_labels_at_the_beginning_or_end.html

for line, variable in zip(ax.lines, trait_names):
    y = line.get_ydata()[0]
    x = line.get_xdata()[0]
    if not np.isfinite(y):
        y=next(reversed(line.get_ydata()[~line.get_ydata().mask]),float("nan"))
    if not np.isfinite(y) or not np.isfinite(x):
        continue 
    x=round(x)
    y=round(y,2)
    xy=(x-0.1, y)
    if xy in label_pos:
        xy=(x-0.1, y-0.01)
    if xy in label_pos:
        xy=(x-0.1, y+0.015)
    if variable=="Seed Mass":
        xy=(x-0.1, y-0.02)
    if variable=="Leaf Area":
        xy=(x-0.1, y+0.005)
    label_pos.append(xy)
    text = ax.annotate(variable,
                       xy=(xy),
                       xytext=(0, 0),
                       color=line.get_color(),
                       xycoords=(ax.get_xaxis_transform(),
                                 ax.get_yaxis_transform()),
                       textcoords="offset points")
    text_width = (text.get_window_extent(
    fig.canvas.get_renderer()).transformed(ax.transData.inverted()).width)

# Format the axes to be prettier.
sns.despine()
plt.xlabel("Resolution in degrees",  fontsize = 12)
plt.ylabel("r",  fontsize = 14) 

ax.invert_xaxis()

plt.tight_layout()

# save figure as pdf
plt.savefig('../Figures/r_vs_resolution_long.pdf', bbox_inches='tight')  
_images/Chapter_6_Compare_trait_maps_sPlot_iNat_48_0.png

Determine slope of the correlation#

from pylr2 import regress2

Load pixel trait means at 2 degrees:

deg = 2
filename="grid_means_" + str(deg) + "_deg.csv"
raster_means = pd.read_csv(filename)

# remove missing and infinite values
raster_means = raster_means[~raster_means.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

# filter only for trait means, we don't need the counts per pixel
raster_means = raster_means[raster_means['Trait'].isin(trait)]
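
As a quick overview of how much data enters each correlation at this resolution, we can count the number of shared grid cells per trait (a small sketch):

# number of 2 degree grid cells with values in both datasets, per trait
print(raster_means.groupby('Trait').size().sort_values(ascending=False))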

Check method for one trait:

# subset only one trait
raster_means_trait = raster_means[raster_means['Trait']=="Plant Height"]
        
# drop nan's
raster_means_trait = raster_means_trait.dropna(subset = ['TraitValue_iNat', 'TraitValue_sPlot'])
x = raster_means_trait["TraitValue_sPlot"]
y = raster_means_trait["TraitValue_iNat"]
results = regress2(x, y, _method_type_2="reduced major axis")
results
{'slope': 0.7606001314977557,
 'intercept': -0.15827807085124396,
 'r': 0.5985443572644118,
 'std_slope': 0.021256591178264863,
 'std_intercept': 0.026223999235003997,
 'predict': 10519   -1.216659
 10520   -0.959619
 10521   -0.800042
 10522   -0.109246
 10523   -0.047238
            ...   
 11544   -0.731485
 11545   -0.584747
 11546   -0.868299
 11547   -0.820544
 11548   -0.791688
 Name: TraitValue_sPlot, Length: 1030, dtype: float64}
slope_boot = []

for i in range(0,3000):
    sample_df = raster_means_trait.sample(n=len(raster_means_trait), replace=True, axis=0)
    
    x = sample_df["TraitValue_sPlot"]
    y = sample_df["TraitValue_iNat"]

    results = regress2(x, y, _method_type_2="reduced major axis")
    slope_boot.append(results["slope"])
    
    
np.quantile(slope_boot,0.05)
0.7271391098609535
np.quantile(slope_boot,0.95)
0.7941371924153917

For all traits:

slopes = pd.DataFrame(index=['slope','intercept','conf_int'])

for t in trait:
    # subset only one trait
    raster_means_trait = raster_means[raster_means['Trait']==t]
        
    # drop nan's
    raster_means_trait = raster_means_trait.dropna(subset = ['TraitValue_iNat', 'TraitValue_sPlot'])
    
    # filter out outliers
    cols = ['TraitValue_iNat', 'TraitValue_sPlot'] # one or more
    
    Q1 = raster_means_trait[cols].quantile(0.01)
    Q3 = raster_means_trait[cols].quantile(0.99)
    IQR = Q3 - Q1

    raster_means_trait = raster_means_trait[~((raster_means_trait[cols] < (Q1 - 1.5 * IQR)) |(raster_means_trait[cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
    
    
    x = raster_means_trait["TraitValue_sPlot"]
    y = raster_means_trait["TraitValue_iNat"]
    
    results = regress2(x, y, _method_type_2="reduced major axis")
        
    slope = results["slope"]
    intercept = results["intercept"]
    
    # get bootstrap confidence interval for the slope (0.05 and 0.95 quantiles)
    
    slope_boot = []
    
    for i in range(0,3000):
        sample_df = raster_means_trait.sample(n=len(raster_means_trait), replace=True, axis=0)
    
        x = sample_df["TraitValue_sPlot"]
        y = sample_df["TraitValue_iNat"]

        results = regress2(x, y, _method_type_2="reduced major axis")
        slope_boot.append(results["slope"])
    
    conf_int = [round(np.quantile(slope_boot,0.05),2), round(np.quantile(slope_boot,0.95),2)] # interval between the 0.05 and 0.95 bootstrap quantiles
    
    slopes[t] = [slope, intercept, conf_int]
slopes
Dispersal unit length Leaf Area SLA Leaf C LDMC Leaf fresh mass Leaf N per area Leaf N per mass Leaf delta15N Leaf N P ratio Leaf P Plant Height Seed mass Seed length Seeds per rep. unit Stem conduit density SSD Conduit element length
slope 0.882832 1.027592 1.414497 1.238462 1.0617 1.129783 1.392226 1.0775 2.007867 1.147387 1.290807 0.7606 0.841936 1.279172 1.351475 1.435743 2.574361 0.987289
intercept -0.111272 0.160015 -0.934332 -1.486539 -0.207463 0.67872 -0.126172 -0.082166 -0.464251 -0.375518 0.027418 -0.158278 0.417664 -0.642623 0.0792 -1.459013 0.914904 -0.002103
conf_int [-0.94, 0.97] [0.97, 1.08] [1.34, 1.49] [1.12, 1.37] [0.99, 1.14] [1.07, 1.19] [1.32, 1.47] [1.01, 1.15] [1.89, 2.14] [1.08, 1.22] [1.2, 1.38] [0.73, 0.79] [0.79, 0.9] [1.14, 1.42] [1.18, 1.53] [1.38, 1.5] [2.47, 2.69] [0.92, 1.07]

Calculate SMA stats in R:

library(smatr)

iNat <- read.csv("grid_means_2_deg.csv")

traits <- unique(iNat$Trait)

test <- data.frame()

for (t in 1:length(traits)){
      sub <- iNat[iNat$Trait == traits[[t]],]
      results <- sma(TraitValue_iNat~TraitValue_sPlot, data=sub, slope.test=1, elev.test=0)
      coef <- data.frame(results$coef)
      ci <- data.frame(traits[[t]], coef$coef.SMA.[1], coef$coef.SMA.[2], coef$lower.limit[2], coef$upper.limit[2], results$slopetest[[1]]$p)
      test <- rbind(test,ci)}

write.csv(test, "sma_stats.csv")

slopes = pd.read_csv("sma_stats.csv")
slopes.set_index(slopes["traits..t.."], inplace=True)
slopes.drop(["Unnamed: 0", "traits..t.."], axis=1, inplace=True)
slopes = slopes.transpose()
slopes.index = list(["intercept", "slope", "ci_lower", "ci_upper", "p"])
slopes
traits..t.. Dispersal unit length Leaf Area SLA Leaf C LDMC Leaf fresh mass Leaf N per area Leaf N per mass Leaf delta15N Leaf N P ratio Leaf P Plant Height Seed mass Seed length Seeds per rep. unit Stem conduit density SSD Conduit element length
intercept 2.501463 0.160015 -0.934332 -1.486539e+00 -0.207463 0.678720 -0.126172 -0.082166 -0.464251 -3.755179e-01 0.027418 -0.158278 4.176638e-01 -6.426235e-01 0.079200 -1.459013 0.914904 -0.002103
slope -1.137526 1.027592 1.414497 1.238462e+00 1.061700 1.129783 1.392226 1.077500 2.007867 1.147387e+00 1.290807 0.760600 8.419362e-01 1.279172e+00 1.351475 1.435743 2.574361 0.987289
ci_lower -1.215182 0.976077 1.343843 1.165017e+00 1.000283 1.067211 1.324024 1.017220 1.885989 1.086091e+00 1.219092 0.724223 7.991787e-01 1.200008e+00 1.265306 1.369064 2.452730 0.924341
ci_upper -1.064833 1.081825 1.488866 1.316536e+00 1.126888 1.196024 1.463942 1.141351 2.137621 1.212142e+00 1.366740 0.798804 8.869814e-01 1.363558e+00 1.443513 1.505670 2.702024 1.054524
p 0.000133 0.299447 0.000000 8.592571e-12 0.048924 0.000028 0.000000 0.011068 0.000000 9.796284e-07 0.000000 0.000000 1.199024e-10 5.906386e-14 0.000000 0.000000 0.000000 0.703398

Plot correlation plots for all traits at 2 degree resolution#

We now take a closer look at the scatterplots. The previous plot shows high correlations for a number of traits at a resolution of 2 degrees, so we use this resolution in the following, more detailed analyses.

Calculate axes ranges#

# calculate max-min ranges

def min__max_ranges(df, col_1, col_2, variable_col, variables):

    range_all =[]

    for i in variables:
        df_sub = df[df[variable_col]==i]
        df_sub = df_sub.dropna(subset = [col_1, col_2])
    
        xmin = df_sub[col_1].quantile(0.01)
        xmax = df_sub[col_1].quantile(0.99)
    
        ymin = df_sub[col_2].quantile(0.01)
        ymax = df_sub[col_2].quantile(0.99)
    
    
        # use the smaller of the two lower bounds (if finite)
        if np.isfinite(ymin) and ymin < xmin:
            xmin = ymin
    
        # use the larger of the two upper bounds
        if ymax > xmax:
            xmax = ymax
        
        range_sub = [xmin, xmax]
    
        range_all.append(range_sub)
        
    ranges  = pd.DataFrame()
    ranges['variable'] = variables
    ranges['min'] = [i[0] for i in range_all]
    ranges['max'] = [i[1] for i in range_all]
    ranges = ranges.set_index('variable')

    return ranges

We calculate data ranges for all traits, so that we can plot symmetrical scatterplots:

ranges = min__max_ranges(raster_means, 'TraitValue_sPlot', 'TraitValue_iNat', 
                         variable_col='Trait', variables=trait)
ranges
min max
variable
Dispersal unit length -0.227319 2.460527
Leaf Area 3.609251 9.130945
SLA 1.285192 3.254503
Leaf C 5.980399 6.249313
LDMC -1.831323 -0.627124
Leaf fresh mass -4.256507 1.441722
Leaf N per area -0.043052 1.378253
Leaf N per mass 2.360681 3.430899
Leaf delta15N -0.969118 2.051529
Leaf N P ratio 2.098447 3.175909
Leaf P -0.736326 1.001090
Plant Height -2.034971 3.131525
Seed mass -1.488567 5.511339
Seed length -0.274437 2.744297
Seeds per rep. unit 1.736470 12.390493
Stem conduit density 1.945910 7.149330
SSD -1.921280 -0.072944
Conduit element length 5.159063 6.681987

Now we can plot a scatter plot for each trait in one large panel:

def plot_scatterplots(df, x, y, col, variables, labels, label_x, label_y, col_wrap):
    sns.set_theme(style="white", font_scale=1.5)
    
    # The relplot function can plot multiple variables in one large figure
    g = sns.relplot(
        data=df,
        x=x, y=y,
        col=col, 
        kind="scatter", 
        col_wrap=col_wrap, 
        linewidth=0,
        alpha=0.5,
        color="midnightblue",
        palette='crest',
        facet_kws={'sharey': False, 'sharex': False}
    )

    plt.subplots_adjust(hspace=0.35)
    index=0
    two = "2"
    

    # Iterate over each subplot to customize further
    for variables, ax in g.axes_dict.items():

        # Plot the 1:1 line
        ax.axline([0, 0], [1, 1], color= "black", alpha=0.6, ls = ":")
        
        # Plot the SMA regression slope
        m = slopes[str(variables)].iloc[1]
        b = slopes[str(variables)].iloc[0]
        ci = [round(slopes[str(variables)].iloc[2],2), round(slopes[str(variables)].iloc[3],2)]
        # get min and max values 
        x_1 = ranges.loc[variables, "min"]
        #x_2 = ranges.loc[variables, "max"]
        ax.axline([x_1, ((m*x_1)+b)], slope=m, color='red')
        
        
        # Add the title as an annotation within the plot
        if b < 0:
            sub_title= str(labels[index])+ "\n" + "r = " + str(round(r_all.loc[1, variables], 2)) + "\n" + "y = " + str(round(m,2))   + "x - " + str(abs(round(b,2)))+ "\n95CI(m) = " + str(ci)
        else:
            sub_title= str(labels[index])+ "\n" + "r = " + str(round(r_all.loc[1, variables], 2) ) +  "\n" + "y = " + str(round(m,2)) +  "x + " + str(round(b,2))+ "\n95CI(m) = " + str(ci)
        ax.text(.05, 1, sub_title, transform=ax.transAxes)
        

        index+=1
        
    for variables, ax in g.axes_dict.items():
    # Set the axis ranges
        space = (ranges.loc[variables, "max"] - ranges.loc[variables, "min"]) * 0.2
        ax.set_xlim(ranges.loc[variables, "min"] - abs(space), ranges.loc[variables, "max"] + abs(space))
        ax.set_ylim(ranges.loc[variables, "min"] - abs(space), ranges.loc[variables, "max"] + abs(space))


    # Edit some supporting aspects of the plot
    g.set_titles("")
    g.set_axis_labels(label_x, label_y)
    g.tight_layout()

Plot scatterplots for all traits#

trait_names = ['Dispersal unit length (ln mm)', 'Leaf Area (ln mm$^2$)', 'SLA (ln m$^2$/kg)', 'Leaf C (ln mg/g)', 'LDMC (ln g/g)',
       'Leaf fresh mass (ln g)', 'Leaf N per area (ln g/m$^2$)', 'Leaf N per mass (ln mg/g)',
       'Leaf delta15N (ln ppm)', 'Leaf N P ratio (ln g/g)', 'Leaf P (ln mg/g)', 'Plant Height (ln m)',
       'Seed mass (ln mg)', 'Seed length (ln mm)', 'Seeds per rep. unit (ln)',
       'Stem conduit density (ln mm$^-$$^2$)', 'SSD (ln g/cm$^3$)', 'Conduit element length (ln $\mu$m)']


plot_scatterplots(raster_means, "TraitValue_sPlot", "TraitValue_iNat", col="Trait", 
                  variables=trait, labels=trait_names, 
                  label_x = "sPlotOpen grid", label_y = "iNaturalist grid",
                 col_wrap=4)

# Save figure
plt.savefig('../Figures/corr_plots_all_r.pdf', bbox_inches='tight')
_images/Chapter_6_Compare_trait_maps_sPlot_iNat_80_0.png
raster_means
x_bin y_bin Trait TraitValue_sPlot TraitValue_iNat
0 -148.6565 69.4065 Dispersal unit length 0.752731 1.337428
1 -144.3435 69.4690 Dispersal unit length 1.089437 0.298622
2 -136.3435 62.2810 Dispersal unit length 1.198785 0.901360
3 -148.7810 70.2810 Dispersal unit length 0.730777 0.903318
4 -148.6565 70.2810 Dispersal unit length 0.745368 0.000000
... ... ... ... ... ...
207105 12.9065 46.5935 Conduit element length 6.106800 5.954530
207106 23.5310 56.9690 Conduit element length 6.306639 6.109988
207107 23.4690 56.9690 Conduit element length 6.349083 5.981163
207108 23.3435 57.0310 Conduit element length 5.473169 5.876802
207109 22.5935 57.0310 Conduit element length 5.945436 6.742487

207110 rows × 5 columns

Plot top traits only#

# select top traits
top_trait = ['Leaf Area', 'SLA', 'Leaf fresh mass', 
               'Leaf N per area','Plant Height', 'Stem conduit density']

trait_names = ['Leaf Area (ln mm$^2$)', 'SLA (ln m$^2$/kg)', 'Leaf fresh mass (ln g)', 
               'Leaf N per area (ln g/m$^2$)','Plant Height (ln m)', 'Stem cond. density (ln mm$^-$$^2$)']

raster_means_top = raster_means[raster_means['Trait'].isin(top_trait)]
plot_scatterplots(raster_means_top, "TraitValue_sPlot", "TraitValue_iNat", col="Trait", 
                  variables=top_trait, labels=trait_names, 
                  label_x = "sPlotOpen grid", label_y = "iNaturalist grid", col_wrap =2)
# Save figure
plt.savefig('../Figures/corr_plots_top6.pdf', bbox_inches='tight')
_images/Chapter_6_Compare_trait_maps_sPlot_iNat_84_0.png