Step-by-Step Guide to Calculating and Adding Variable Means in R

Step-by-Step Guide to Calculating and Adding Variable Means in R


Here is one dataset.

# Example data: five treatments (A-E), each measured in three replicates (I-III).
treatment <- rep(c("A", "B", "C", "D", "E"), each = 3)
# `times` is written out in full instead of relying on partial argument
# matching. The vector is called `rep` because that becomes the column name;
# calls to rep() still resolve to the function.
rep <- rep(c("I", "II", "III"), times = 5)
yield <- c(10, 11, 21, 13, 23, 23, 13, 13, 5, 33, 21, 13, 42, 12, 13)
# Column names are taken from the variable names: treatment, rep, yield.
dataA <- data.frame(treatment, rep, yield)

dataA
   treatment rep yield
1          A   I    10
2          A  II    11
3          A III    21
4          B   I    13
5          B  II    23
6          B III    23
7          C   I    13
8          C  II    13
9          C III     5
10         D   I    33
11         D  II    21
12         D III    13
13         E   I    42
14         E  II    12
15         E III    13

I want to add the mean of each treatment to a new column, and I am using the following code.

# Start with an empty (all-NA) column, then fill it in one treatment at a time.
dataA$mean <- NA

# For each treatment: pick that treatment's rows and store the mean of their
# yields (missing values ignored) in every one of those rows.
dataA$mean[dataA$treatment == "A"] <-
  with(dataA, mean(yield[treatment == "A"], na.rm = TRUE))
dataA$mean[dataA$treatment == "B"] <-
  with(dataA, mean(yield[treatment == "B"], na.rm = TRUE))
dataA$mean[dataA$treatment == "C"] <-
  with(dataA, mean(yield[treatment == "C"], na.rm = TRUE))
dataA$mean[dataA$treatment == "D"] <-
  with(dataA, mean(yield[treatment == "D"], na.rm = TRUE))
dataA$mean[dataA$treatment == "E"] <-
  with(dataA, mean(yield[treatment == "E"], na.rm = TRUE))

dataA
   treatment rep yield     mean
1          A   I    10 14.00000
2          A  II    11 14.00000
3          A III    21 14.00000
4          B   I    13 19.66667
5          B  II    23 19.66667
6          B III    23 19.66667
7          C   I    13 10.33333
8          C  II    13 10.33333
9          C III     5 10.33333
10         D   I    33 22.33333
11         D  II    21 22.33333
12         D III    13 22.33333
13         E   I    42 22.33333
14         E  II    12 22.33333
15         E III    13 22.33333


However, this code is quite lengthy because it needs one statement per treatment. Let’s simplify it using tapply().

# tapply() is part of base R, which is always attached, so no library() call
# is needed.
# Step 1: one mean yield per treatment, labelled by treatment name.
treatment_means <- tapply(dataA$yield, dataA$treatment, mean, na.rm = TRUE)
# Step 2: look up each row's treatment by name. as.character() keeps the
# lookup name-based even if `treatment` is stored as a factor, and as.vector()
# stores the result as a plain numeric column.
dataA$mean2 <- as.vector(treatment_means[as.character(dataA$treatment)])

dataA
   treatment rep yield     mean    mean2
1          A   I    10 14.00000 14.00000
2          A  II    11 14.00000 14.00000
3          A III    21 14.00000 14.00000
4          B   I    13 19.66667 19.66667
5          B  II    23 19.66667 19.66667
6          B III    23 19.66667 19.66667
7          C   I    13 10.33333 10.33333
8          C  II    13 10.33333 10.33333
9          C III     5 10.33333 10.33333
10         D   I    33 22.33333 22.33333
11         D  II    21 22.33333 22.33333
12         D III    13 22.33333 22.33333
13         E   I    42 22.33333 22.33333
14         E  II    12 22.33333 22.33333
15         E III    13 22.33333 22.33333


What if there are more grouping variables?

# Larger example: the 15-row treatment x replicate layout repeated twice
# (30 rows), with an environment label attached in runs of 10 rows.
treatment <- rep(rep(c("A", "B", "C", "D", "E"), each = 3), 2)
# `times` is written out in full instead of relying on partial argument
# matching.
rep <- rep(rep(c("I", "II", "III"), times = 5), 2)
environment <- rep(c("East", "West", "North"), each = 10)
yield <- c(
  10, 11, 21, 13, 23, 23, 13, 13, 5, 33,
  21, 13, 42, 12, 13, 10, 11, 54, 45, 39,
  33, 29, 43, 55, 33, 24, 32, 42, 28, 43
)
# Column names are taken from the variable names.
dataA <- data.frame(treatment, rep, environment, yield)

dataA
   treatment rep environment yield
1          A   I        East    10
2          A  II        East    11
3          A III        East    21
4          B   I        East    13
5          B  II        East    23
6          B III        East    23
7          C   I        East    13
8          C  II        East    13
9          C III        East     5
10         D   I        East    33
11         D  II        West    21
12         D III        West    13
13         E   I        West    42
14         E  II        West    12
15         E III        West    13
16         A   I        West    10
17         A  II        West    11
18         A III        West    54
19         B   I        West    45
20         B  II        West    39
21         B III       North    33
22         C   I       North    29
23         C  II       North    43
24         C III       North    55
25         D   I       North    33
26         D  II       North    24
27         D III       North    32
28         E   I       North    42
29         E  II       North    28
30         E III       North    43

Now, I want to add the mean of the combination of treatment and environment.

# tapply() is part of base R, which is always attached, so no library() call
# is needed.
# Step 1: a treatment-by-environment table of mean yields (rows are
# treatments, columns are environments).
cell_means <- tapply(
  dataA$yield,
  list(dataA$treatment, dataA$environment),
  mean,
  na.rm = TRUE
)
# Step 2: a two-column character matrix indexes that table by
# (row name, column name), returning one mean per row of dataA.
# as.character() keeps the lookup name-based even if the columns are factors.
cell_keys <- cbind(
  as.character(dataA$treatment),
  as.character(dataA$environment)
)
dataA$mean <- cell_means[cell_keys]

dataA
   treatment rep environment yield     mean
1          A   I        East    10 14.00000
2          A  II        East    11 14.00000
3          A III        East    21 14.00000
4          B   I        East    13 19.66667
5          B  II        East    23 19.66667
6          B III        East    23 19.66667
7          C   I        East    13 10.33333
8          C  II        East    13 10.33333
9          C III        East     5 10.33333
10         D   I        East    33 33.00000
11         D  II        West    21 17.00000
12         D III        West    13 17.00000
13         E   I        West    42 22.33333
14         E  II        West    12 22.33333
15         E III        West    13 22.33333
16         A   I        West    10 25.00000
17         A  II        West    11 25.00000
18         A III        West    54 25.00000
19         B   I        West    45 42.00000
20         B  II        West    39 42.00000
21         B III       North    33 33.00000
22         C   I       North    29 42.33333
23         C  II       North    43 42.33333
24         C III       North    55 42.33333
25         D   I       North    33 29.66667
26         D  II       North    24 29.66667
27         D III       North    32 29.66667
28         E   I       North    42 37.66667
29         E  II       North    28 37.66667
30         E III       North    43 37.66667

To check the result, I calculate the mean for the combination of treatment A and environment West directly.

# Spot check of a single cell: mean yield over the rows where treatment is A
# and environment is West, ignoring missing values.
with(
  dataA,
  mean(yield[treatment == "A" & environment == "West"], na.rm = TRUE)
)
# 25

This value is the same as the one in the mean column for the rows where treatment is A and environment is West.



Comments are closed.