Step-by-Step Guide to Calculating and Adding Variable Means in R
Here is one dataset.
# Example data: five treatments (A-E), each with three replicates (I-III).
# `times` is written out in full instead of relying on partial matching of
# `time`. The vector named `rep` does not break later rep() calls, because R
# skips non-function objects when it looks up a function name.
treatment <- rep(c("A", "B", "C", "D", "E"), each = 3)
rep <- rep(c("I", "II", "III"), times = 5)
yield <- c(10, 11, 21, 13, 23, 23, 13, 13, 5, 33, 21, 13, 42, 12, 13)
dataA <- data.frame(treatment, rep, yield)
dataA
treatment rep yield
1 A I 10
2 A II 11
3 A III 21
4 B I 13
5 B II 23
6 B III 23
7 C I 13
8 C II 13
9 C III 5
10 D I 33
11 D II 21
12 D III 13
13 E I 42
14 E II 12
15 E III 13
I want to add the mean of each treatment to a new column, and I am using the following code.
# Start with an all-NA column, then fill it in one treatment at a time:
# for each treatment, select its rows and store the mean yield of those rows.
dataA$mean <- NA
dataA[dataA$treatment == "A", "mean"] <-
  mean(dataA[dataA$treatment == "A", "yield"], na.rm = TRUE)
dataA[dataA$treatment == "B", "mean"] <-
  mean(dataA[dataA$treatment == "B", "yield"], na.rm = TRUE)
dataA[dataA$treatment == "C", "mean"] <-
  mean(dataA[dataA$treatment == "C", "yield"], na.rm = TRUE)
dataA[dataA$treatment == "D", "mean"] <-
  mean(dataA[dataA$treatment == "D", "yield"], na.rm = TRUE)
dataA[dataA$treatment == "E", "mean"] <-
  mean(dataA[dataA$treatment == "E", "yield"], na.rm = TRUE)
dataA
treatment rep yield mean
1 A I 10 14.00000
2 A II 11 14.00000
3 A III 21 14.00000
4 B I 13 19.66667
5 B II 23 19.66667
6 B III 23 19.66667
7 C I 13 10.33333
8 C II 13 10.33333
9 C III 5 10.33333
10 D I 33 22.33333
11 D II 21 22.33333
12 D III 13 22.33333
13 E I 42 22.33333
14 E II 12 22.33333
15 E III 13 22.33333
However, this code is quite lengthy because it needs a separate statement for each treatment. Let’s simplify it using tapply().
# No library() call is needed: tapply() is part of base R, which is always
# attached. tapply() returns one mean per treatment, named by treatment;
# indexing that result by each row's treatment spreads the group mean back
# over every row of the data frame.
dataA$mean2 <- tapply(
  dataA$yield, dataA$treatment, mean, na.rm = TRUE
)[dataA$treatment]
dataA
treatment rep yield mean mean2
1 A I 10 14.00000 14.00000
2 A II 11 14.00000 14.00000
3 A III 21 14.00000 14.00000
4 B I 13 19.66667 19.66667
5 B II 23 19.66667 19.66667
6 B III 23 19.66667 19.66667
7 C I 13 10.33333 10.33333
8 C II 13 10.33333 10.33333
9 C III 5 10.33333 10.33333
10 D I 33 22.33333 22.33333
11 D II 21 22.33333 22.33333
12 D III 13 22.33333 22.33333
13 E I 42 22.33333 22.33333
14 E II 12 22.33333 22.33333
15 E III 13 22.33333 22.33333
What if there are more grouping variables?
# Extended data: the 15-row treatment/replicate layout repeated twice (30 rows),
# plus an environment label assigned in consecutive runs of 10 rows.
# `times` is written out in full instead of relying on partial matching of
# `time`. Rebuilding dataA from scratch discards any columns added earlier.
treatment <- rep(rep(c("A", "B", "C", "D", "E"), each = 3), times = 2)
rep <- rep(rep(c("I", "II", "III"), times = 5), times = 2)
environment <- rep(c("East", "West", "North"), each = 10)
yield <- c(
  10, 11, 21, 13, 23, 23, 13, 13, 5, 33, 21, 13, 42, 12, 13,
  10, 11, 54, 45, 39, 33, 29, 43, 55, 33, 24, 32, 42, 28, 43
)
dataA <- data.frame(treatment, rep, environment, yield)
dataA
treatment rep environment yield
1 A I East 10
2 A II East 11
3 A III East 21
4 B I East 13
5 B II East 23
6 B III East 23
7 C I East 13
8 C II East 13
9 C III East 5
10 D I East 33
11 D II West 21
12 D III West 13
13 E I West 42
14 E II West 12
15 E III West 13
16 A I West 10
17 A II West 11
18 A III West 54
19 B I West 45
20 B II West 39
21 B III North 33
22 C I North 29
23 C II North 43
24 C III North 55
25 D I North 33
26 D II North 24
27 D III North 32
28 E I North 42
29 E II North 28
30 E III North 43
Now, I want to add the mean of each combination of treatment and environment as a new column.
# No library() call is needed: tapply() is part of base R, which is always
# attached. Grouping by two variables makes tapply() return a
# treatment-by-environment matrix of means. Indexing that matrix with a
# two-column matrix of (treatment, environment) labels, one row per
# observation, picks out the matching cell for every row of dataA.
dataA$mean <- tapply(
  dataA$yield,
  list(dataA$treatment, dataA$environment),
  mean,
  na.rm = TRUE
)[cbind(dataA$treatment, dataA$environment)]
dataA
treatment rep environment yield mean
1 A I East 10 14.00000
2 A II East 11 14.00000
3 A III East 21 14.00000
4 B I East 13 19.66667
5 B II East 23 19.66667
6 B III East 23 19.66667
7 C I East 13 10.33333
8 C II East 13 10.33333
9 C III East 5 10.33333
10 D I East 33 33.00000
11 D II West 21 17.00000
12 D III West 13 17.00000
13 E I West 42 22.33333
14 E II West 12 22.33333
15 E III West 13 22.33333
16 A I West 10 25.00000
17 A II West 11 25.00000
18 A III West 54 25.00000
19 B I West 45 42.00000
20 B II West 39 42.00000
21 B III North 33 33.00000
22 C I North 29 42.33333
23 C II North 43 42.33333
24 C III North 55 42.33333
25 D I North 33 29.66667
26 D II North 24 29.66667
27 D III North 32 29.66667
28 E I North 42 37.66667
29 E II North 28 37.66667
30 E III North 43 37.66667
To verify the result, I want to calculate the mean for the combination of treatment A and environment West directly.
# Direct check: mean yield over the rows where treatment is A and
# environment is West.
with(
  dataA,
  mean(yield[treatment == "A" & environment == "West"], na.rm = TRUE)
)
# 25
This value matches the one in the mean column for rows 16–18 (treatment A, environment West).
© 2022 – 2023 https://agronomy4future.com