[R] average and median values for each of the class

Sun Apr 27 11:01:51 CEST 2014

On 27-04-2014, at 02:37, Nico Met <nicomet80 at gmail.com> wrote:

> Dear all,
> 
> 
> 
> I have a matrix (dimension, 16 x 12) where  2nd column represents class
> (1,1,1,1,1,2,2,2, etc) information. I want to estimate average  and median
> values for each of the class and add this information as a row at end of
> the each classes.
> 
> 
> for example:
> 
> dput(dat)
> 
> structure(list(class = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
> 
> 3L, 3L, 3L, 4L, 4L, 4L, 5L), name1 = c(2.554923977, 2.371586762,
> 
> 2.497293431, 2.464827875, 2.981934845, 2.228995664, 2.099640729,
> 
> 1.900314302, 2.630005966, 2.632590262, 2.581887814, 2.408797563,
> 
> 2.098761103, 3.070460716, 1.436980716, 1.645121806), name2 = c(1.297412278,
> 
> 1.104804244, 1.30621114, 1.126009533, 1.466740841, 1.012041118,
> 
> 0.923466541, 0.840575023, 1.285530176, 1.041909333, 1.194917856,
> 
> 1.085015826, 1.047492703, 1.587558217, 0.593340012, 0.723630088
> 
> ), name3 = c(0.587160798, 0.596127884, 0.623760721, 0.549016135,
> 
> 0.686642084, 0.487523394, 0.458620467, 0.397974913, 0.615928976,
> 
> 0.546005649, 0.657383069, 0.546613129, 0.476503461, 0.749062102,
> 
> 0.304160587, 0.29037358), name4 = c(2.833441759, 2.713374426,
> 
> 2.532626548, 2.409093102, 3.014912721, 2.113507947, 2.017291324,
> 
> 1.667744912, 2.602560666, 2.31649643, 2.761204809, 2.433963493,
> 
> 2.229911767, 3.191646399, 1.269919241, 1.387479858), name5 = c(2.172365295,
> 
> 1.955695471, 2.141072829, 1.975743278, 2.377018372, 1.791300389,
> 
> 1.669079382, 1.500209628, 2.164401874, 1.830038378, 2.106750025,
> 
> 1.92888294, 1.707217549, 2.585082653, 1.114841754, 1.315712452
> 
> ), name6 = c(0.715129844, 0.688186262, 0.70133748, 0.709362008,
> 
> 0.712145174, 0.563593885, 0.532109761, 0.472197304, 0.690165016,
> 
> 0.65635473, 0.615835066, 0.64310098, 0.562974891, 0.900622255,
> 
> 0.408546784, 0.416284408), name7 = c(1.995505133, 1.860095899,
> 
> 1.843151597, 1.709861774, 2.155993511, 1.506409746, 1.315405587,
> 
> 1.234544153, 1.96629927, 1.74879757, 1.93994009, 1.660173854,
> 
> 1.556735295, 2.355723318, 0.866634243, 1.013367677), name8 = c(0.275484997,
> 
> 0.233856392, 0.294021245, 0.315504347, 0.251906585, 0.250263636,
> 
> 0.348599173, 0.273806933, 0.32067937, 0.278581115, 0.293726291,
> 
> 0.308350808, 0.201297444, 0.351927886, 0.204230625, 0.185681471
> 
> ), name9 = c(2.461066627, 2.210756164, 2.289047888, 2.253988252,
> 
> 2.668184733, 1.911697836, 1.793443775, 1.560027186, 2.36941155,
> 
> 1.961911111, 2.391501376, 2.002215107, 1.932144233, 2.73705052,
> 
> 1.15580754, 1.807697999), name10 = c(0.723025351, 0.613147422,
> 
> 0.805399925, 0.65651577, 0.779389048, 0.54260459, 0.492283542,
> 
> 0.507969501, 0.749700016, 0.644231327, 0.810319215, 0.620331891,
> 
> 0.600240557, 0.884775748, 0.40006142, 0.391661912), name11 = c(0.308565619,
> 
> 0.453808281, 0.363716904, 0.376332596, 0.324998876, 0.361013073,
> 
> 0.430744786, 0.468818055, 0.166072668, 0.369262627, 0.297666411,
> 
> 0.256091173, 0.123021464, 0.308188684, 0.646436241, 0.722972632
> 
> )), .Names = c("class", "name1", "name2", "name3", "name4", "name5",
> 
> "name6", "name7", "name8", "name9", "name10", "name11"), class = "data.frame",
> row.names = c("ara1",
> 
> "ara2", "ara3", "ara4", "ara5", "ara6", "ara7", "ara8", "ara9",
> 
> "ara10", "ara11", "ara12", "ara13", "ara14", "ara15", "ara16"
> 
> ))
> 
> 
> I wrote this:
> 
> 
> 
> avg<-as.data.frame(aggregate(dat[,2:dim(dat)[2]], dat["class"],
> function(x) mean(x,na.rm=T)) )
> 
> 
> med<-as.data.frame(aggregate(dat[,2:dim(dat)[2]], dat["class"], function(x)
> median(x,na.rm=T)) )
> 
> 
> # avg
> 
> #  class    name1     name2     name3    name4    name5     name6    name7
>    name#8    name9    name10    name11
> 
> #1     1 2.574113 1.2602356 0.6085415 2.700690 2.124379 0.7052322 1.912922
> #0.2741547 2.376609 0.7154955 0.3654845
> 
> #2     2 2.214739 1.0154032 0.4900119 2.100276 1.781248 0.5645165 1.505665
> #0.2983373 1.908645 0.5731394 0.3566621
> 
> #3     3 2.541092 1.1072810 0.5833339 2.503888 1.955224 0.6384303 1.782971
> #0.2935527 2.118543 0.6916275 0.3076734
> 
> #4     4 2.202068 1.0761303 0.5099087 2.230492 1.802381 0.6240480 1.593031
> #0.2524853 1.941667 0.6283592 0.3592155
> 
> #5     5 1.645122 0.7236301 0.2903736 1.387480 1.315712 0.4162844 1.013368
> #0.1856815 1.807698 0.3916619 0.7229726
> 
> #> med
> 
> #  class    name1     name2     name3    name4    name5     name6    name7
>    name#8    name9    name10    name11
> 
> #1     1 2.497293 1.2974123 0.5961279 2.713374 2.141073 0.7093620 1.860096
> #0.2754850 2.289048 0.7230254 0.3637169
> 
> #2     2 2.164318 0.9677538 0.4730719 2.065400 1.730190 0.5478518 1.410908
> #0.2972432 1.852571 0.5252870 0.3958789
> 
> #3     3 2.581888 1.0850158 0.5466131 2.433963 1.928883 0.6431010 1.748798
> #0.2937263 2.002215 0.6442313 0.2976664
> 
> #4     4 2.098761 1.0474927 0.4765035 2.229912 1.707218 0.5629749 1.556735
> #0.2042306 1.932144 0.6002406 0.3081887
> 
> #5     5 1.645122 0.7236301 0.2903736 1.387480 1.315712 0.4162844 1.013368
> #0.1856815 1.807698 0.3916619 0.7229726
> 
> 
> 
> 
> But I do not know how can I add this information in the original data?
> 
> 
> For example, for class 1, the output will look like this:
> 
> dput(res1)
> 
> structure(list(class = c(1L, 1L, 1L, 1L, 1L, 1L, 1L), name1 =
> c(2.554923977,
> 
> 2.371586762, 2.497293431, 2.464827875, 2.981934845, 2.574113378,
> 
> 2.497293431), name2 = c(1.297412278, 1.104804244, 1.30621114,
> 
> 1.126009533, 1.466740841, 1.260235607, 1.297412278), name3 = c(0.587160798,
> 
> 0.596127884, 0.623760721, 0.549016135, 0.686642084, 0.608541525,
> 
> 0.596127884), name4 = c(2.833441759, 2.713374426, 2.532626548,
> 
> 2.409093102, 3.014912721, 2.700689711, 2.713374426), name5 = c(2.172365295,
> 
> 1.955695471, 2.141072829, 1.975743278, 2.377018372, 2.124379049,
> 
> 2.141072829), name6 = c(0.715129844, 0.688186262, 0.70133748,
> 
> 0.709362008, 0.712145174, 0.705232154, 0.709362008), name7 = c(1.995505133,
> 
> 1.860095899, 1.843151597, 1.709861774, 2.155993511, 1.912921583,
> 
> 1.860095899), name8 = c(0.275484997, 0.233856392, 0.294021245,
> 
> 0.315504347, 0.251906585, 0.274154713, 0.275484997), name9 = c(2.461066627,
> 
> 2.210756164, 2.289047888, 2.253988252, 2.668184733, 2.376608733,
> 
> 2.289047888), name10 = c(0.723025351, 0.613147422, 0.805399925,
> 
> 0.65651577, 0.779389048, 0.715495503, 0.723025351), name11 = c(0.308565619,
> 
> 0.453808281, 0.363716904, 0.376332596, 0.324998876, 0.365484455,
> 
> 0.363716904)), .Names = c("class", "name1", "name2", "name3",
> 
> "name4", "name5", "name6", "name7", "name8", "name9", "name10",
> 
> "name11"), class = "data.frame", row.names = c("ara1", "ara2",
> 
> "ara3", "ara4", "ara5", "Avg", "Med"))
> 
> 
> 
> And same will be for other classes.

Please do not post in HTML, as requested by Posting Guide.
It tends to mess things up and makes your code and results unreadable.

You cannot use “Avg” and “Med” unmodified as rownames, because the row names of a data frame must be unique.
For each “class” (“group” would be a better name) you must append something different, e.g. the “class” number.

Try this:

library(plyr)
# Append per-class mean and median rows to a data frame.
#
# dat: data frame whose first column is named "class" (the grouping key);
#      every remaining column is summarised with mean() and median(),
#      ignoring NA values.
# Returns `dat` followed by one mean row per class and then one median row
# per class (exactly one of each when `dat` holds a single class, which is
# how ddply() calls this function).
g <- function(dat) {
  # dat[-1] stays a data frame even when only one value column is left.
  # dat[, 2:ncol(dat)] would collapse to a bare vector in that case,
  # aggregate() would name its result column "x", and rbind() would fail
  # because the column names no longer match those of `dat`.
  values <- dat[-1]
  by_class <- dat["class"]
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  avg <- aggregate(values, by_class, FUN = mean, na.rm = TRUE)
  med <- aggregate(values, by_class, FUN = median, na.rm = TRUE)
  rbind(dat, avg, med)
}
# Apply g() to each class so that every group is followed by its mean and
# median rows.
DAT1 <- ddply(dat, .(class), .fun = g)

# Rebuild the row names group by group: the original names of the group,
# then "Avg<class>" and "Med<class>" so that every row name stays unique.
rownames(DAT1) <- unlist(
  lapply(
    split(dat, dat["class"]),
    function(piece) {
      class_id <- piece[["class"]][1]
      c(rownames(piece), paste0("Avg", class_id), paste0("Med", class_id))
    }
  ),
  use.names = FALSE
)
DAT1

Convoluted but it works. Maybe someone else can come up with something shorter and more elegant.

Berend