Practices in Data Normalization using normtools() in R
■ [R package] Normalization Methods for Data Scaling (Feat. normtools)
In my previous post, I introduced the R package normtools(), which I developed to normalize data using various methods. This time, I’ll demonstrate how to use normtools() for data normalization in practice.
1. Data upload
# Install readr on first use, then attach it so read_csv() is available.
if (!require(readr)) {
  install.packages("readr")
}
library(readr)

# Raw CSV with the corn yield-component practice data, hosted on GitHub.
github <- "https://raw.githubusercontent.com/agronomy4future/raw_data_practice/refs/heads/main/corn_yield_components.csv"

# Read the remote file and store the result as a plain data frame.
df <- data.frame(
  read_csv(url(github), show_col_types = FALSE)
)

# Preview the first five rows.
head(df, 5)
year season variety population location ear AGW KN GY
2022 Long Season CV1 High D Site 1 1 297.5 3550 10439
2022 Long Season CV1 High D Site 1 2 283.9 3408 9562
2022 Long Season CV1 High D Site 1 3 270.8 3834 10264
2022 Long Season CV1 High D Site 1 4 328.1 3408 11053
2022 Long Season CV1 High D Site 1 5 298.1 3692 10878
.
.
.
This data includes kernel number (KN), average kernel weight (AGW), and grain yield (GY) for different corn varieties across various years, populations, and locations.
2. Data normalization
# remotes is needed to install normtools from GitHub.
if (!require(remotes)) {
  install.packages("remotes")
}

# Install normtools only when it is not already available.
if (!requireNamespace("normtools", quietly = TRUE)) {
  remotes::install_github("agronomy4future/normtools")
}

library(remotes)
library(normtools)

# Normalize AGW, KN and GY. The second argument (year, season, location) is
# presumably the set of grouping columns -- confirm against the normtools docs.
z_test <- normtools(
  df,
  c("year", "season", "location"),
  c("AGW", "KN", "GY"),
  method = 1 # 1 or "z_test"
)
The code above installs and loads the normtools package and then calls normtools(). First, I normalize the data using the z-test method (method = 1, or equivalently "z_test"). Next, I’ll summarize the normalized data.
# Install dplyr on first use, then attach it for group_by()/summarize()/across().
if (!require(dplyr)) {
  install.packages("dplyr")
}
library(dplyr)

# Summarize the normalized traits for every year x season x population x
# variety combination. For each trait this yields four columns named
# <trait>_Mean, <trait>_SD, <trait>_n and <trait>_se.
df1 <- data.frame(
  z_test %>%
    group_by(year, season, population, variety) %>%
    dplyr::summarize(
      across(
        c(Normalized_AGW, Normalized_KN, Normalized_GY),
        .fns = list(
          Mean = ~ mean(., na.rm = TRUE),
          SD = ~ sd(., na.rm = TRUE),
          # Count only non-missing observations so that n matches the values
          # actually used by mean()/sd() when na.rm = TRUE.
          n = ~ sum(!is.na(.)),
          # Standard error: SD divided by the square root of the non-missing
          # count (length() would also count NA and understate the error).
          se = ~ sd(., na.rm = TRUE) / sqrt(sum(!is.na(.)))
        )
      ),
      # Return an ungrouped result and silence the regrouping message.
      .groups = "drop"
    )
)
3. Creating graphs
Then, let’s create a graph.
# Install ggplot2 on first use. The guard must test ggplot2 itself; testing
# another package would skip the install whenever that package is present.
if (!require(ggplot2)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Scatter plot of mean normalized grain yield against mean normalized kernel
# number, with +/- one standard error bars in both directions, one panel per
# season.
KN_GY <- ggplot(
  data = df1,
  aes(x = Normalized_KN_Mean, y = Normalized_GY_Mean)
) +
  # Horizontal error bars: SE of kernel number.
  geom_errorbar(
    aes(
      xmin = Normalized_KN_Mean - Normalized_KN_se,
      xmax = Normalized_KN_Mean + Normalized_KN_se
    ),
    position = position_dodge(0.9), width = 0.5
  ) +
  # Vertical error bars: SE of grain yield.
  geom_errorbar(
    aes(
      ymin = Normalized_GY_Mean - Normalized_GY_se,
      ymax = Normalized_GY_Mean + Normalized_GY_se
    ),
    position = position_dodge(0.9), width = 0.5
  ) +
  geom_point(aes(fill = season, shape = season), color = "black", size = 4.5) +
  scale_fill_manual(values = c("darkred", "orange")) +
  # Shape 21 is a fillable circle, used for both seasons.
  scale_shape_manual(values = c(21, 21)) +
  scale_x_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  scale_y_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  # Dashed reference lines through the origin of the normalized scale.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  # geom_abline(slope = 1, linetype = "dashed", color = "grey55", linewidth = 0.5) +
  facet_wrap(~season) +
  # Short rule drawn along the top edge of each panel (y = Inf).
  annotate("segment", x = -2.5, xend = 2.5, y = Inf, yend = Inf, color = "black", lwd = 1) +
  labs(x = "Normalized kernel number", y = "Normalized grain yield") +
  theme_classic(base_size = 18, base_family = "serif") +
  theme(
    legend.position = "none",
    legend.title = element_blank(),
    legend.key = element_rect(color = "white", fill = "white"),
    legend.text = element_text(family = "serif", face = "plain", size = 13, color = "black"),
    legend.background = element_rect(fill = "white"),
    strip.background = element_rect(color = "white", linewidth = 0.5, linetype = "solid"),
    axis.line = element_line(linewidth = 0.5, colour = "black")
  )

# Open a 5.5 x 5 inch graphics device in a platform-independent way
# (windows() exists only on Windows) and draw the plot explicitly.
dev.new(width = 5.5, height = 5)
print(KN_GY)

# Save a high-resolution JPEG; the target directory must already exist.
ggsave(
  "C:/Users/kimjk/Desktop/Coding_Output/KN_GY.jpg",
  KN_GY,
  width = 10 * 2.54, height = 7 * 2.54, units = "cm", dpi = 1000
)
These data indicate that kernel number is correlated with grain yield.
Next, I’ll analyze the relationship between kernel weight and grain yield.
# Install ggplot2 on first use. The guard must test ggplot2 itself; testing
# another package would skip the install whenever that package is present.
if (!require(ggplot2)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Scatter plot of mean normalized grain yield against mean normalized kernel
# weight, with +/- one standard error bars in both directions, one panel per
# season.
AGW_GY <- ggplot(
  data = df1,
  aes(x = Normalized_AGW_Mean, y = Normalized_GY_Mean)
) +
  # Horizontal error bars: SE of kernel weight.
  geom_errorbar(
    aes(
      xmin = Normalized_AGW_Mean - Normalized_AGW_se,
      xmax = Normalized_AGW_Mean + Normalized_AGW_se
    ),
    position = position_dodge(0.9), width = 0.5
  ) +
  # Vertical error bars: SE of grain yield.
  geom_errorbar(
    aes(
      ymin = Normalized_GY_Mean - Normalized_GY_se,
      ymax = Normalized_GY_Mean + Normalized_GY_se
    ),
    position = position_dodge(0.9), width = 0.5
  ) +
  geom_point(aes(fill = season, shape = season), color = "black", size = 4.5) +
  scale_fill_manual(values = c("darkred", "orange")) +
  # Shape 21 is a fillable circle, used for both seasons.
  scale_shape_manual(values = c(21, 21)) +
  scale_x_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  scale_y_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  # Dashed reference lines through the origin of the normalized scale.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  # geom_abline(slope = 1, linetype = "dashed", color = "grey55", linewidth = 0.5) +
  facet_wrap(~season) +
  # Short rule drawn along the top edge of each panel (y = Inf).
  annotate("segment", x = -2.5, xend = 2.5, y = Inf, yend = Inf, color = "black", lwd = 1) +
  labs(x = "Normalized kernel weight", y = "Normalized grain yield") +
  theme_classic(base_size = 18, base_family = "serif") +
  theme(
    legend.position = "none",
    legend.title = element_blank(),
    legend.key = element_rect(color = "white", fill = "white"),
    legend.text = element_text(family = "serif", face = "plain", size = 13, color = "black"),
    legend.background = element_rect(fill = "white"),
    strip.background = element_rect(color = "white", linewidth = 0.5, linetype = "solid"),
    axis.line = element_line(linewidth = 0.5, colour = "black")
  )

# Open a 6 x 5 inch graphics device in a platform-independent way
# (windows() exists only on Windows) and draw the plot explicitly.
dev.new(width = 6, height = 5)
print(AGW_GY)

# Save a high-resolution JPEG; the target directory must already exist.
ggsave(
  "C:/Users/kimjk/Desktop/Coding_Output/AGW_GY.jpg",
  AGW_GY,
  width = 10 * 2.54, height = 7 * 2.54, units = "cm", dpi = 1000
)
Finally, I’ll analyze the relationship between kernel number and kernel weight.
# Install ggplot2 on first use. The guard must test ggplot2 itself; testing
# another package would skip the install whenever that package is present.
if (!require(ggplot2)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Scatter plot of mean normalized kernel weight against mean normalized kernel
# number, with +/- one standard error bars in both directions, one panel per
# season.
KN_AGW <- ggplot(
  data = df1,
  aes(x = Normalized_KN_Mean, y = Normalized_AGW_Mean)
) +
  # Horizontal error bars: SE of kernel number.
  geom_errorbar(
    aes(
      xmin = Normalized_KN_Mean - Normalized_KN_se,
      xmax = Normalized_KN_Mean + Normalized_KN_se
    ),
    position = position_dodge(0.9), width = 0.5
  ) +
  # Vertical error bars: SE of kernel weight.
  geom_errorbar(
    aes(
      ymin = Normalized_AGW_Mean - Normalized_AGW_se,
      ymax = Normalized_AGW_Mean + Normalized_AGW_se
    ),
    position = position_dodge(0.9), width = 0.5
  ) +
  geom_point(aes(fill = season, shape = season), color = "black", size = 4.5) +
  scale_fill_manual(values = c("darkred", "orange")) +
  # Shape 21 is a fillable circle, used for both seasons.
  scale_shape_manual(values = c(21, 21)) +
  scale_x_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  scale_y_continuous(breaks = seq(-5, 5, 2.5), limits = c(-5, 5)) +
  # Dashed reference lines through the origin of the normalized scale.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  # geom_abline(slope = 1, linetype = "dashed", color = "grey55", linewidth = 0.5) +
  facet_wrap(~season) +
  # Short rule drawn along the top edge of each panel (y = Inf).
  annotate("segment", x = -2.5, xend = 2.5, y = Inf, yend = Inf, color = "black", lwd = 1) +
  labs(x = "Normalized kernel number", y = "Normalized kernel weight") +
  theme_classic(base_size = 18, base_family = "serif") +
  theme(
    legend.position = "none",
    legend.title = element_blank(),
    legend.key = element_rect(color = "white", fill = "white"),
    legend.text = element_text(family = "serif", face = "plain", size = 13, color = "black"),
    legend.background = element_rect(fill = "white"),
    strip.background = element_rect(color = "white", linewidth = 0.5, linetype = "solid"),
    axis.line = element_line(linewidth = 0.5, colour = "black")
  )

# Open a 6 x 5 inch graphics device in a platform-independent way
# (windows() exists only on Windows) and draw the plot explicitly.
dev.new(width = 6, height = 5)
print(KN_AGW)

# Save a high-resolution JPEG; the target directory must already exist.
ggsave(
  "C:/Users/kimjk/Desktop/Coding_Output/KN_AGW.jpg",
  KN_AGW,
  width = 10 * 2.54, height = 7 * 2.54, units = "cm", dpi = 1000
)
<Full code>
https://github.com/agronomy4future/r_code/blob/main/Practices_in_Data_Normalization_using_normtools_in_R.ipynb
© 2022 – 2023 https://agronomy4future.com