Flower Statistical Data Analysis¶

In [1]:
# Importing Neccessary Libraries
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
from matplotlib import pyplot as plt
#!pip install pingouin
import pingouin as pg
In [2]:
# Reading the data into pandas dataframe
df = pd.read_excel('Daten.xlsx')
In [3]:
# Viewing the first 5 rows of the the dataset
df.head()
Out[3]:
Variante Wiederholung Beigabemenge [Vol.-%] Material pH g KCl / l Substrat mg NO3-N / l Substrat mg NH4-N / l Substrat mg P2O5 / l Substrat mg K2O / l Substrat ... Frischmasse [g] Blütenanzahl Wurzelbonitur [1-5] Wurzelbonitur [1-5]i Pflanzenhöhe [cm] Pflanzenhöhe [cm]i Pflanzenbreite [cm] Pflanzenbreite [cm]i Chlorophyllgehalt Trockenmasseanteil [%]
0 1 1 0 Torf 5.8 0.26 3.704760 3.280000 14.270560 16.400 ... 95.300000 42 4.75 4.500000 13 22.166667 20.714286 46 38.566667 11.235955
1 1 2 0 Torf 5.7 0.33 3.659580 3.240000 16.684848 17.172 ... 86.150000 37 4.75 5.000000 11 23.000000 20.785714 44 37.866667 11.475410
2 1 3 0 Torf 5.7 0.27 3.834508 3.643685 15.016605 19.440 ... 86.933333 36 4.75 5.000000 13 28.166667 21.357143 41 39.633333 11.111111
3 1 4 0 Torf 5.7 0.31 3.659580 3.240000 12.106260 16.200 ... 89.683333 34 4.25 4.666667 14 26.833333 19.928571 43 37.166667 11.949686
4 2 1 30 Holzfasern 6.0 0.46 2.891520 5.016666 18.884092 32.256 ... 70.816667 40 5.00 5.000000 12 23.833333 19.357143 36 35.766667 11.206897

5 rows × 25 columns

In [4]:
# Checking the datatypes of each column
df.dtypes
Out[4]:
Variante                    int64
Wiederholung                int64
Beigabemenge [Vol.-%]       int64
Material                   object
pH                        float64
g KCl / l Substrat        float64
mg NO3-N / l Substrat     float64
mg NH4-N / l Substrat     float64
mg P2O5 / l Substrat      float64
mg K2O / l Substrat       float64
Gesamt-N [% TS]           float64
P [% TS]                  float64
K [% TS]                  float64
Ca [% TS]                 float64
Mg [% TS]                 float64
Frischmasse [g]           float64
Blütenanzahl                int64
Wurzelbonitur [1-5]       float64
Wurzelbonitur [1-5]i      float64
Pflanzenhöhe [cm]           int64
Pflanzenhöhe [cm]i        float64
Pflanzenbreite [cm]       float64
Pflanzenbreite [cm]i        int64
Chlorophyllgehalt         float64
Trockenmasseanteil [%]    float64
dtype: object
In [5]:
# Changing the datatype of columns as per requirements for analysis
df['pH'] = pd.to_numeric(df['pH'],errors = 'coerce')
df['P [% TS]'] = pd.to_numeric(df['P [% TS]'],errors = 'coerce')
df['Beigabemenge [Vol.-%]'] = df['Beigabemenge [Vol.-%]'].astype('string') 
In [6]:
# Creating a label column to create boxplot
df['label'] = df['Beigabemenge [Vol.-%]'] + '% ' + df['Material']
# Sorting the dataframe by label
df = df.sort_values(by='label')
In [7]:
# Creating boxplot between varient and pH
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="pH",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs pH', fontsize =20)
plt.show()
In [8]:
# Creating boxplot between Variente and g KCl / l Substrat
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="g KCl / l Substrat",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs g KCl / l Substrat', fontsize =20)
plt.show()
In [9]:
# Creating boxplot between variante and mg NO3-N / l Substrat
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="mg NO3-N / l Substrat",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs mg NO3-N / l Substrat', fontsize =20)
plt.show()
In [10]:
# Creating boxplot between variente and mg NH4-N / l Substrat
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="mg NH4-N / l Substrat",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs mg NH4-N / l Substrat', fontsize =20)
plt.show()
In [11]:
# Creating boxplot for Variente vs mg P2O5 / l Substrat
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="mg P2O5 / l Substrat",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs mg P2O5 / l Substrat', fontsize =20)
plt.show()
In [12]:
# Creating boxplot for Variente vs mg K2O / l Substrat
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="mg K2O / l Substrat",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs mg K2O / l Substrat', fontsize =20)
plt.show()
In [13]:
# Creating boxplot for Variente vs P [% TS]
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="P [% TS]",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs P [% TS]', fontsize =20)
plt.show()
In [14]:
# Creating boxplot for Variente vs K [% TS]
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="K [% TS]",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs K [% TS]', fontsize =20)
plt.show()
In [15]:
# Creating boxplot for Variente vs Ca [% TS]
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="Ca [% TS]",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs Ca [% TS]', fontsize =20)
plt.show()
In [16]:
# Creating boxplot for Variente vs Mg [% TS]
sns.set(rc = {'figure.figsize':(30,15)})
ax = sns.boxplot(x="label",y="Mg [% TS]",data=df)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Variente vs Mg [% TS]', fontsize =20)
plt.show()
In [17]:
# List of Dependant Variables to execute the loop
DV = ['pH', 'g KCl / l Substrat', 'mg NO3-N / l Substrat', 'mg NH4-N / l Substrat', 'mg P2O5 / l Substrat', 'mg K2O / l Substrat',
       'Gesamt-N [% TS]', 'P [% TS]', 'K [% TS]', 'Ca [% TS]', 'Mg [% TS]', 'Frischmasse [g]', 'Blütenanzahl', 'Wurzelbonitur [1-5]', 'Wurzelbonitur [1-5]i',
       'Pflanzenhöhe [cm]', 'Pflanzenhöhe [cm]i', 'Pflanzenbreite [cm]',
       'Pflanzenbreite [cm]i', 'Chlorophyllgehalt', 'Trockenmasseanteil [%]']
In [18]:
# Doing Anova Analysis

list_results = []
for dv in DV:
  # run anova and create dv variable to identify dependent variable
  aov = pg.anova(data=df, dv=dv, between='Variante', detailed=True)\
    .assign(dv=dv)
  # append to list of results
  list_results.append(aov)

# concat all results into a DataFrame
df_results = pd.concat(list_results, axis=0)
# Dropping Within Rows
df_results.drop(df_results.index[df_results['Source'] == 'Within'], inplace=True)
# Export to Excel
df_results.to_excel('Anova.xlsx')
df_results
Out[18]:
Source SS DF MS F p-unc np2 dv
0 Variante 1.315000e+00 10 0.131500 19.074725 5.706652e-11 0.852512 pH
0 Variante 3.406064e+00 10 0.340606 17.898105 1.342357e-10 0.844326 g KCl / l Substrat
0 Variante 2.928979e+03 10 292.897886 2.055398 5.871264e-02 0.383799 mg NO3-N / l Substrat
0 Variante 8.194135e+03 10 819.413528 29.275912 1.413117e-13 0.898698 mg NH4-N / l Substrat
0 Variante 7.110291e+04 10 7110.291326 30.411283 8.144041e-14 0.902110 mg P2O5 / l Substrat
0 Variante 3.600441e+06 10 360044.083265 52.798553 2.140272e-17 0.941175 mg K2O / l Substrat
0 Variante 2.026364e+00 10 0.202636 4.487919 4.974350e-04 0.576267 Gesamt-N [% TS]
0 Variante 8.042137e-02 10 0.008042 5.475764 1.039140e-04 0.631156 P [% TS]
0 Variante 5.384432e+01 10 5.384432 111.753635 5.665210e-22 0.972163 K [% TS]
0 Variante 2.014378e-01 10 0.020144 4.997903 2.270779e-04 0.609656 Ca [% TS]
0 Variante 2.027775e-01 10 0.020278 71.291246 5.524975e-19 0.957042 Mg [% TS]
0 Variante 1.362700e+04 10 1362.699505 39.392323 1.799251e-15 0.922703 Frischmasse [g]
0 Variante 3.034045e+03 10 303.404545 5.349906 1.134698e-04 0.618493 Blütenanzahl
0 Variante 3.475000e+01 10 3.475000 20.850000 1.697146e-11 0.863354 Wurzelbonitur [1-5]
0 Variante 5.315025e+01 10 5.315025 31.492519 4.898803e-14 0.905152 Wurzelbonitur [1-5]i
0 Variante 8.163636e+01 10 8.163636 8.353488 1.420132e-06 0.716823 Pflanzenhöhe [cm]
0 Variante 2.370694e+02 10 23.706944 10.234887 1.463084e-07 0.756186 Pflanzenhöhe [cm]i
0 Variante 3.659474e+02 10 36.594736 31.640511 4.575051e-14 0.905554 Pflanzenbreite [cm]
0 Variante 1.442227e+03 10 144.222727 28.541829 2.038334e-13 0.896363 Pflanzenbreite [cm]i
0 Variante 4.438153e+01 10 4.438153 2.537912 2.153880e-02 0.434729 Chlorophyllgehalt
0 Variante 1.103886e+01 10 1.103886 0.906046 5.387308e-01 0.215415 Trockenmasseanteil [%]
In [19]:
# Doing Post-Hoc Analysis

list_results = []
for dv in DV:
  # run anova and create dv variable to identify dependent variable
  posthocs = pg.pairwise_ttests(data=df, dv=dv, between='Variante')\
    .assign(dv=dv)
  # append to list of results
  list_results.append(posthocs)

# concat all results into a DataFrame
df_results2 = pd.concat(list_results, axis=0)

# Export to Excel
df_results2.to_excel('Post_Hoc.xlsx')
df_results2
C:\Users\shahz\anaconda3\lib\site-packages\pingouin\bayesian.py:145: IntegrationWarning: The integral is probably divergent, or slowly convergent.
  integr = quad(fun, 0, np.inf, args=(t, n, r, df))[0]
Out[19]:
Contrast A B Paired Parametric T dof alternative p-unc BF10 hedges dv
0 Variante 1 2 False True -3.655631 6.0 two-sided 0.010635 4.984 -2.247758 pH
1 Variante 1 3 False True -2.777460 6.0 two-sided 0.032104 2.49 -1.707792 pH
2 Variante 1 4 False True -5.656854 6.0 two-sided 0.001311 21.217 -3.478261 pH
3 Variante 1 5 False True -7.549834 6.0 two-sided 0.000280 67.374 -4.642208 pH
4 Variante 1 6 False True -1.805788 6.0 two-sided 0.120979 1.171 -1.110335 pH
... ... ... ... ... ... ... ... ... ... ... ... ...
50 Variante 8 10 False True -1.537128 6.0 two-sided 0.175175 0.967 -0.945142 Trockenmasseanteil [%]
51 Variante 8 11 False True -1.054266 6.0 two-sided 0.332353 0.716 -0.648242 Trockenmasseanteil [%]
52 Variante 9 10 False True -0.800652 6.0 two-sided 0.453861 0.631 -0.492301 Trockenmasseanteil [%]
53 Variante 9 11 False True -0.450991 6.0 two-sided 0.667836 0.556 -0.277303 Trockenmasseanteil [%]
54 Variante 10 11 False True 0.291481 6.0 two-sided 0.780498 0.537 0.179224 Trockenmasseanteil [%]

1155 rows × 12 columns