A Guide to Normalizing Data for Different Treatments in R

November 12, 2023 JK

I have data, as shown below, regarding iron contents in soil and the plant uptake of iron at different growth stages in winter wheat. I want to analyze the relationship between the iron content in the soil and the plant uptake of iron at different growth stages in winter wheat.

library(readr)
github="https://raw.githubusercontent.com/agronomy4future/raw_data_practice/main/wheat_grain_Fe_uptake.csv"
dataA= data.frame(read_csv(url(github), show_col_types= FALSE))

    Location Season Genotype Reps Iron_ton_ha      Stage   Fe
1       East   2021      CV1    1     21.7127 Vegetative 0.44
2       East   2021      CV1    2      8.7340 Vegetative 0.30
3       East   2021      CV1    3      9.5003 Vegetative 0.31
4       East   2021      CV1    4      5.9481 Vegetative 0.37
5       East   2021      CV1    5      7.4608 Vegetative 0.30
6       East   2021      CV1    6     20.5326 Vegetative 0.33
7       East   2021      CV1    7     19.8532 Vegetative 0.29
8       East   2021      CV1    8      7.9718 Vegetative 0.35
9       East   2021      CV1    9     15.0087 Vegetative 0.38
10      East   2021      CV1   10      5.6608 Vegetative 0.40
.
.
.

We can simply draw a regression graph. However, before doing that, we need to reshape the data. I’ll transpose the data from rows to columns based on the variables in the Stage column.

library(dplyr)
library(tidyr)
dataB= data.frame(dataA %>%
  group_by(Location, Season, Genotype, Iron_ton_ha, Stage) %>%
  spread(key=Stage, value = Fe))

dataB
    Location Season Genotype Iron_ton_ha Maturity Reproductive Vegetative
1       East   2021      CV1      5.6608     0.14         0.31       0.40
2       East   2021      CV1      5.9481     0.18         0.30       0.37
3       East   2021      CV1      6.2757     0.08         0.29       0.38
4       East   2021      CV1      7.4608     0.12         0.34       0.30
5       East   2021      CV1      7.5574     0.15         0.28       0.36
6       East   2021      CV1      7.8064     0.15         0.29       0.34
7       East   2021      CV1      7.9718     0.17         0.31       0.35
8       East   2021      CV1      8.0175     0.15         0.30       0.45
9       East   2021      CV1      8.2875     0.14         0.31       0.36
10      East   2021      CV1      8.6052     0.11         0.32       0.38
.
.
.

Now, I will analyze the relationship between the iron (Fe) content in the soil and the uptake of iron by plants at maturity.

library(ggplot2)
FIGA= ggplot(data=dataB, aes(x=Iron_ton_ha, y=Reproductive))+
  geom_point(aes(fill=as.factor(Season), shape=as.factor(Season)),color="black", size=5) +
  scale_fill_manual(values= c("grey15","grey35","grey55")) +
  scale_shape_manual(values= c(21,22,24)) +
  scale_x_continuous(breaks=seq(0,60,20),limits=c(0,60)) + 
  scale_y_continuous(breaks=seq(0,0.5,0.1),limits=c(0,0.5)) +  
  facet_wrap(~ Location) +
  annotate("segment", x=20, xend=40, y=Inf, yend=Inf, color="black",lwd=1)+
  labs(x="Fe (Mg/ha) in soil", y="Plant Fe uptake (%) at maturity") +
  theme_classic(base_size=18, base_family="serif") +
  theme(legend.position=c(0.90,0.17),
        legend.title=element_blank(),
        legend.key=element_rect(color="white", fill="white"),
        legend.text=element_text(family="serif", face="plain", size=13, color="black"),
        legend.background= element_rect(fill="white"),
        strip.background=element_rect(color="white", linewidth=0.5, linetype="solid"),
        axis.line = element_line(linewidth = 0.5, colour="black"))
FIGA + windows(width=9, height=5)
ggsave("C:/Users/dream/Desktop/R_OUTPUT/FIGA.jpg", 
       FIGA, width=9*2.54, height=5*2.54, units="cm", dpi=1000)

Now, I’ll normalize the data for each treatment combination (i.e., East, 2021, CV1) using the mutate() function. For easier normalization calculations, a vertically organized data format is preferable. Therefore, I’ll be using the dataA dataset.

dataC= data.frame(dataA %>%
       group_by(Location, Season, Genotype) %>%
       mutate(
              Normalized_Fe_plant=(Fe-mean(Fe))/sd(Fe),
              Normalized_Fe_soil=(Iron_ton_ha-mean(Iron_ton_ha))/sd(Iron_ton_ha)
              ))

dataC
    Location Season Genotype Reps Iron_ton_ha      Stage   Fe Normalized_Fe_plant Normalized_Fe_soil
1       East   2021      CV1    1     21.7127 Vegetative 0.44         1.682394399         1.85849093
2       East   2021      CV1    2      8.7340 Vegetative 0.30         0.246204058        -0.48486395
3       East   2021      CV1    3      9.5003 Vegetative 0.31         0.348789083        -0.34650550
4       East   2021      CV1    4      5.9481 Vegetative 0.37         0.964299229        -0.98786906
5       East   2021      CV1    5      7.4608 Vegetative 0.30         0.246204058        -0.71474518

Then, let’s reshape the data from row to column regarding the Stage.

library(dplyr)
library(tidyr)
dataD= data.frame(dataC %>%
                    group_by(Location, Season, Genotype, Normalized_Fe_soil, Stage) %>%
                    spread(key=Stage, value= Normalized_Fe_plant))

dataD
    Location Season Genotype Normalized_Fe_soil    Maturity Reproductive   Vegetative
1       East   2021      CV1        -1.03974219 -1.39515633  0.348789083  1.272054302
2       East   2021      CV1        -0.98786906 -0.98481623  0.246204058  0.964299229
3       East   2021      CV1        -0.92871960 -2.01066648  0.143619034  1.066884253
4       East   2021      CV1        -0.71474518 -1.60032638  0.656544156  0.246204058
5       East   2021      CV1        -0.69730367 -1.29257131  0.041034010  0.861714204

Let’s draw the graph again.

library(ggplot2)
FIGB= ggplot(data=dataD, aes(x=Normalized_Fe_soil, y=Maturity))+
  geom_point(aes(fill=as.factor(Season), shape=as.factor(Season)),color="black", size=5) +
  scale_fill_manual(values= c("grey15","grey35","grey55")) +
  scale_shape_manual(values= c(21,22,24)) +
  scale_x_continuous(breaks=seq(-5,5,2.5),limits=c(-5,5)) + 
  scale_y_continuous(breaks=seq(-5,5,2.5),limits=c(-5,5)) +  
  geom_vline(xintercept=0, linetype="dashed", color="black") +
  geom_hline(yintercept=0, linetype="dashed", color= "black") +
  facet_wrap(~ Location) +
  annotate("segment", x=20, xend=40, y=Inf, yend=Inf, color="black",lwd=1)+
  labs(x="Fe (Mg/ha) in soil", y="Plant Fe uptake (%) at maturity") +
  theme_classic(base_size=18, base_family="serif") +
  theme(legend.position=c(0.90,0.17),
        legend.title=element_blank(),
        legend.key=element_rect(color="white", fill="white"),
        legend.text=element_text(family="serif", face="plain", size=13, color="black"),
        legend.background= element_rect(fill="white"),
        strip.background=element_rect(color="white", linewidth=0.5, linetype="solid"),
        axis.line = element_line(linewidth = 0.5, colour="black"))
FIGB + windows(width=9, height=5)
ggsave("C:/Users/dream/Desktop/R_OUTPUT/FIGB.jpg", 
       FIGB, width=9*2.54, height=5*2.54, units="cm", dpi=1000)

### full code 
https://github.com/agronomy4future/r_code/blob/main/data_normalization

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

#data upload
github="https://raw.githubusercontent.com/agronomy4future/raw_data_practice/main/wheat_Fe_uptake.csv"
dataA= data.frame(read_csv(url(github), show_col_types= FALSE))

# data normalization
dataB= data.frame(dataA %>%
                  group_by(Location, Season, Genotype) %>%
                  mutate(
                  Normalized_Fe_plant=(Fe-mean(Fe))/sd(Fe),
                  Normalized_Fe_soil=(Iron_ton_ha-mean(Iron_ton_ha))/sd(Iron_ton_ha)
                    ))

# data reshape
dataC= data.frame(dataB %>%
                  group_by(Location, Season, Genotype, Normalized_Fe_soil, Stage) %>%
                  spread(key=Stage, value= Normalized_Fe_plant))

# graph
FIGB= ggplot(data=dataC, aes(x=Normalized_Fe_soil, y=Maturity))+
  geom_point(aes(fill=as.factor(Season), shape=as.factor(Season)),color="black", size=5) +
  scale_fill_manual(values= c("grey15","grey35","grey55")) +
  scale_shape_manual(values= c(21,22,24)) +
  scale_x_continuous(breaks=seq(-5,5,2.5),limits=c(-5,5)) + 
  scale_y_continuous(breaks=seq(-5,5,2.5),limits=c(-5,5)) +  
  geom_vline(xintercept=0, linetype="dashed", color="black") +
  geom_hline(yintercept=0, linetype="dashed", color= "black") +
  facet_wrap(~ Location) +
  annotate("segment", x=20, xend=40, y=Inf, yend=Inf, color="black",lwd=1)+
  labs(x="Fe (Mg/ha) in soil", y="Plant Fe uptake (%) at maturity") +
  theme_classic(base_size=18, base_family="serif") +
  theme(legend.position=c(0.90,0.17),
        legend.title=element_blank(),
        legend.key=element_rect(color="white", fill="white"),
        legend.text=element_text(family="serif", face="plain", size=13, color="black"),
        legend.background= element_rect(fill="white"),
        strip.background=element_rect(color="white", linewidth=0.5, linetype="solid"),
        axis.line = element_line(linewidth = 0.5, colour="black"))
FIGB + windows(width=9, height=5)

Is R calculation correct?

I have just used the code and now I’m wondering whether the code I used correctly calculates normalization. Therefore, I will manually calculate normalization and compare the values.

I pooled data from two seasons and normalized yield based on location and fertilizer. Now, I’ll check whether this normalized data matches what R calculated.

library(readr)
github="https://raw.githubusercontent.com/agronomy4future/raw_data_practice/main/normalization_practice.csv"
dataA= data.frame(read_csv(url(github), show_col_types= FALSE))

dataB= data.frame(dataA %>%
                    group_by(location, fertilizer) %>%
                    mutate(
                      Normalized_R=(normalized-mean(normalized))/sd(normalized),
                    ))

dataB$Normalized_R=round(dataB$Normalized_R,digits=2)

dataB
   location surphur_amount season fertilizer yield  mean Stdev normalized Normalized_R
1     North              0   2020    Control 100.5 122.5  15.1      -1.45        -1.45
2     North             60   2020    Control 120.3 122.5  15.1      -0.14        -0.14
3     North            120   2020    Control 135.9 122.5  15.1       0.89         0.89
4     North              0   2021    Control 110.3 122.5  15.1      -0.81        -0.81
5     North             60   2021    Control 128.3 122.5  15.1       0.39         0.39
6     North            120   2021    Control 139.5 122.5  15.1       1.13         1.13
7     North              0   2020       Fast 149.0 145.5  19.9       0.17         0.17
8     North             60   2020       Fast 166.3 145.5  19.9       1.04         1.04
9     North            120   2020       Fast 169.9 145.5  19.9       1.23         1.23
10    North              0   2021       Fast 119.2 145.5  19.9      -1.32        -1.32
11    North             60   2021       Fast 133.0 145.5  19.9      -0.63        -0.63
12    North            120   2021       Fast 135.9 145.5  19.9      -0.48        -0.48
13    North              0   2020       Slow 171.3 167.3  22.9       0.17         0.17
14    North             60   2020       Slow 191.1 167.3  22.9       1.04         1.04
15    North            120   2020       Slow 195.3 167.3  22.9       1.23         1.23
16    North              0   2021       Slow 137.0 167.3  22.9      -1.32        -1.32
17    North             60   2021       Slow 152.9 167.3  22.9      -0.63        -0.63
18    North            120   2021       Slow 156.3 167.3  22.9      -0.48        -0.48
19    South              0   2020    Control 109.6 121.4  15.5      -0.76        -0.76
20    South             60   2020    Control 122.3 121.4  15.5       0.06         0.06
21    South            120   2020    Control 125.0 121.4  15.5       0.24         0.24
22    South              0   2021    Control 100.0 121.4  15.5      -1.38        -1.38
23    South             60   2021    Control 125.9 121.4  15.5       0.29         0.29
24    South            120   2021    Control 145.3 121.4  15.5       1.54         1.54
25    South              0   2020       Fast 163.9 160.1  21.9       0.17         0.17
26    South             60   2020       Fast 182.9 160.1  21.9       1.04         1.04
27    South            120   2020       Fast 186.9 160.1  21.9       1.23         1.23
28    South              0   2021       Fast 131.1 160.1  21.9      -1.32        -1.32
29    South             60   2021       Fast 146.3 160.1  21.9      -0.63        -0.63
30    South            120   2021       Fast 149.5 160.1  21.9      -0.48        -0.48
31    South              0   2020       Slow 167.8 164.0  22.4       0.17         0.17
32    South             60   2020       Slow 187.3 164.0  22.4       1.04         1.04
33    South            120   2020       Slow 191.4 164.0  22.4       1.23         1.23
34    South              0   2021       Slow 134.3 164.0  22.4      -1.32        -1.32
35    South             60   2021       Slow 149.8 164.0  22.4      -0.63        -0.63
36    South            120   2021       Slow 153.1 164.0  22.4      -0.48        -0.48

My calculation matches the R calculation, so I can trust the code.

Agronomy4future

Stories about cereals and statistics (plus coding). We aim to develop open-source code for agronomy.

A Guide to Normalizing Data for Different Treatments in R

November 12, 2023 JK

Is R calculation correct?