[R] Code is too slow: mean-centering variables in a data framebysubgroup

Wed Apr 7 22:14:50 CEST 2010

Yes, Tom's solution is indeed the fastest!
On my PC it took .17-.22 seconds while using ave() took .23-.27 seconds.
And of course - the last two methods I mentioned took 1.3 SECONDS, not
MINUTES (it was a typo).

All that is left to me is to understand what .SD stands for.
:-)

Dimitri

On Wed, Apr 7, 2010 at 4:04 PM, Rob Forler <rforler at uchicago.edu> wrote:
> Leave it up to Tom to solve things wickedly fast :)
>
> Just as an fyi Dimitri, Tom is one of the developers of data.table.
>
> -Rob
>
> On Wed, Apr 7, 2010 at 2:51 PM, Dimitri Liakhovitski <ld7631 at gmail.com>
> wrote:
>>
>> Wow, thank you, Tom!
>>
>> On Wed, Apr 7, 2010 at 3:46 PM, Tom Short <tshort.rlists at gmail.com> wrote:
>> > Here's how I would have done the data.table method. It's a bit faster
>> > than the ave approach on my machine:
>> >
>> >> # install.packages("data.table",repos="http://R-Forge.R-project.org")
>> >> library(data.table)
>> >>
>> >> f3 <- function(frame) {
>> > +   frame <- as.data.table(frame)
>> > +   frame[, lapply(.SD[,2:ncol(.SD), with = FALSE],
>> > +                  function(x) x / mean(x, na.rm = TRUE)),
>> > +         by = "group"]
>> > + }
>> >>
>> >> system.time(new.frame2 <- f2(frame)) # ave
>> >   user  system elapsed
>> >   0.50    0.08    1.24
>> >> system.time(new.frame3 <- f3(frame)) # data.table
>> >   user  system elapsed
>> >   0.25    0.01    0.30
>> >
>> > - Tom
>> >
>> > Tom Short
>> >
>> >
>> > On Wed, Apr 7, 2010 at 12:46 PM, Dimitri Liakhovitski <ld7631 at gmail.com>
>> > wrote:
>> >> I would like to thank once more everyone who helped me with this
>> >> question.
>> >> I compared the speed for different approaches. Below are the results
>> >> of my comparisons - in case anyone is interested:
>> >>
>> >> ### Building an EXAMPLE FRAME with N rows - with groups and a lot of
>> >> NAs:
>> >> N<-100000
>> >> set.seed(1234)
>> >>
>> >> frame<-data.frame(group=rep(paste("group",1:10),N/10),a=rnorm(1:N),b=rnorm(1:N),c=rnorm(1:N),d=rnorm(1:N),e=rnorm(1:N),f=rnorm(1:N),g=rnorm(1:N))
>> >> frame<-frame[order(frame$group),]
>> >>
>> >> ## Introducing 60% NAs:
>> >> names.used<-names(frame)[2:length(frame)]
>> >> set.seed(1234)
>> >> for(i in names.used){
>> >>      i.for.NA<-sample(1:N,round((N*.6),0))
>> >>      frame[[i]][i.for.NA]<-NA
>> >> }
>> >> lapply(frame[2:8], function(x) length(x[is.na(x)])) # Checking that it
>> >> worked
>> >> ORIGframe<-frame ## placeholder for the unchanged original frame
>> >>
>> >> ####### Objective of the code - divide each value by its group mean
>> >> ####
>> >>
>> >> ### METHOD 1 - the FASTEST - using ave():##############################
>> >> frame<-ORIGframe
>> >> f2 <- function(frame) {
>> >>  for(i in 2:ncol(frame)) {
>> >>     frame[,i] <- ave(frame[,i], frame[,1],
>> >> FUN=function(x)x/mean(x,na.rm=TRUE))
>> >>  }
>> >>  frame
>> >> }
>> >> system.time({new.frame<-f2(frame)})
>> >> # Took me 0.23-0.27 sec
>> >> #######################################
>> >>
>> >> ### METHOD 2 - fast, just a bit slower - using data.table:
>> >> ##############################
>> >>
>> >> # If you don't have it - install the package - NOT from CRAN:
>> >> install.packages("data.table",repos="http://R-Forge.R-project.org")
>> >> library(data.table)
>> >> frame<-ORIGframe
>> >> system.time({
>> >> table<-data.table(frame)
>> >> colMeanFunction<-function(data,key){
>> >>  data[[key]]=NULL
>> >>
>> >>  ret=as.matrix(data)/matrix(rep(as.numeric(colMeans(as.data.frame(data),na.rm=T)),nrow(data)),nrow=nrow(data),ncol=ncol(data),byrow=T)
>> >>  return(ret)
>> >> }
>> >> groupedMeans = table[,colMeanFunction(.SD, "group"), by="group"]
>> >> names.to.use<-names(groupedMeans)
>> >> for(i in
>> >> 1:length(groupedMeans)){groupedMeans[[i]]<-as.data.frame(groupedMeans[[i]])}
>> >> groupedMeans<-do.call(cbind, groupedMeans)
>> >> names(groupedMeans)<-names.to.use
>> >> })
>> >> # Took me 0.37-.45 sec
>> >> #######################################
>> >>
>> >> ### METHOD 3 - fast, a tad slower (using model.matrix & matrix
>> >> multiplication):##############################
>> >> frame<-ORIGframe
>> >> system.time({
>> >> mat <- as.matrix(frame[,-1])
>> >> mm <- model.matrix(~0+group,frame)
>> >> col.grp.N <- crossprod( !is.na(mat), mm ) # Use this line if don't
>> >> want to use NAs for mean calculations
>> >> # col.grp.N <- crossprod( mat != 0 , mm ) # Use this line if don't
>> >> want to use zeros for mean calculations
>> >> mat[is.na(mat)] <- 0.0
>> >> col.grp.sum <- crossprod( mat, mm )
>> >> mat <- mat / ( t(col.grp.sum/col.grp.N)[ frame$group,] )
>> >> is.na(mat) <- is.na(frame[,-1])
>> >> mat<-as.data.frame(mat)
>> >> })
>> >> # Took me 0.44-0.50 sec
>> >> #######################################
>> >>
>> >> ### METHOD 5-  much slower - it's the one I started
>> >> with:##############################
>> >> frame<-ORIGframe
>> >> system.time({
>> >> frame <- do.call(cbind, lapply(names.used, function(x){
>> >>        unlist(by(frame, frame$group, function(y) y[,x] /
>> >> mean(y[,x],na.rm=T)))
>> >>        }))
>> >> })
>> >> # Took me 1.25-1.32 min
>> >> #######################################
>> >>
>> >> ### METHOD 6 -  the slowest; using "plyr" and
>> >> "ddply":##############################
>> >> frame<-ORIGframe
>> >> library(plyr)
>> >> function3 <- function(x) x / mean(x, na.rm = TRUE)
>> >> system.time({
>> >> grouping.factor<-"group"
>> >> myvariables<-names(frame)[2:8]
>> >> frame3<-ddply(frame, grouping.factor, colwise(function3, myvariables))
>> >> })
>> >> # Took me 1.36-1.47 min
>> >> #######################################
>> >>
>> >>
>> >> Thanks again!
>> >> Dimitri
>> >>
>> >>
>> >> On Wed, Mar 31, 2010 at 8:29 PM, William Dunlap <wdunlap at tibco.com>
>> >> wrote:
>> >>> Dimitri,
>> >>>
>> >>> You might try applying ave() to each column.  E.g., use
>> >>>
>> >>> f2 <- function(frame) {
>> >>>   for(i in 2:ncol(frame)) {
>> >>>      frame[,i] <- ave(frame[,i], frame[,1],
>> >>> FUN=function(x)x/mean(x,na.rm=TRUE))
>> >>>   }
>> >>>   frame
>> >>> }
>> >>>
>> >>> Note that this returns a data.frame and retains the
>> >>> grouping column (the first) while your original
>> >>> code returns a matrix without the grouping column.
>> >>>
>> >>> Bill Dunlap
>> >>> Spotfire, TIBCO Software
>> >>> wdunlap tibco.com
>> >>>
>> >>>> -----Original Message-----
>> >>>> From: r-help-bounces at r-project.org
>> >>>> [mailto:r-help-bounces at r-project.org] On Behalf Of Bert Gunter
>> >>>> Sent: Tuesday, March 30, 2010 10:52 AM
>> >>>> To: 'Dimitri Liakhovitski'; 'r-help'
>> >>>> Subject: Re: [R] Code is too slow: mean-centering variables
>> >>>> in a data framebysubgroup
>> >>>>
>> >>>> ?scale
>> >>>>
>> >>>> Bert Gunter
>> >>>> Genentech Nonclinical Biostatistics
>> >>>>
>> >>>>
>> >>>>
>> >>>> -----Original Message-----
>> >>>> From: r-help-bounces at r-project.org
>> >>>> [mailto:r-help-bounces at r-project.org] On
>> >>>> Behalf Of Dimitri Liakhovitski
>> >>>> Sent: Tuesday, March 30, 2010 8:05 AM
>> >>>> To: r-help
>> >>>> Subject: [R] Code is too slow: mean-centering variables in a
>> >>>> data frame
>> >>>> bysubgroup
>> >>>>
>> >>>> Dear R-ers,
>> >>>>
>> >>>> I have  a large data frame (several thousands of rows and about 2.5
>> >>>> thousand columns). One variable ("group") is a grouping variable with
>> >>>> over 30 levels. And I have a lot of NAs.
>> >>>> For each variable, I need to divide each value by variable mean - by
>> >>>> subgroup. I have the code but it's way too slow - takes me about 1.5
>> >>>> hours.
>> >>>> Below is a data example and my code that is too slow. Is there a
>> >>>> different, faster way of doing the same thing?
>> >>>> Thanks a lot for your advice!
>> >>>>
>> >>>> Dimitri
>> >>>>
>> >>>>
>> >>>> # Building an example frame - with groups and a lot of NAs:
>> >>>> set.seed(1234)
>> >>>> frame<-data.frame(group=rep(paste("group",1:10),10),a=rnorm(1:
>> >>> 100),b=rnorm(1
>> >>>> :100),c=rnorm(1:100),d=rnorm(1:100),e=rnorm(1:100),f=rnorm(1:1
>> >>>> 00),g=rnorm(1:
>> >>>> 100))
>> >>>> frame<-frame[order(frame$group),]
>> >>>> names.used<-names(frame)[2:length(frame)]
>> >>>> set.seed(1234)
>> >>>> for(i in names.used){
>> >>>>        i.for.NA<-sample(1:100,60)
>> >>>>        frame[[i]][i.for.NA]<-NA
>> >>>> }
>> >>>> frame
>> >>>>
>> >>>> ### Code that does what's needed but is too slow:
>> >>>> Start<-Sys.time()
>> >>>> frame <- do.call(cbind, lapply(names.used, function(x){
>> >>>>   unlist(by(frame, frame$group, function(y) y[,x] /
>> >>>> mean(y[,x],na.rm=T)))
>> >>>> }))
>> >>>> Finish<-Sys.time()
>> >>>> print(Finish-Start) # Takes too long
>> >>>>
>> >>>> --
>> >>>> Dimitri Liakhovitski
>> >>>> Ninah.com
>> >>>> Dimitri.Liakhovitski at ninah.com
>> >>>>
>> >>>> ______________________________________________
>> >>>> R-help at r-project.org mailing list
>> >>>> https://stat.ethz.ch/mailman/listinfo/r-help
>> >>>> PLEASE do read the posting guide
>> >>>> http://www.R-project.org/posting-guide.html
>> >>>> and provide commented, minimal, self-contained, reproducible code.
>> >>>>
>> >>>> ______________________________________________
>> >>>> R-help at r-project.org mailing list
>> >>>> https://stat.ethz.ch/mailman/listinfo/r-help
>> >>>> PLEASE do read the posting guide
>> >>>> http://www.R-project.org/posting-guide.html
>> >>>> and provide commented, minimal, self-contained, reproducible code.
>> >>>>
>> >>>
>> >>
>> >>
>> >>
>> >> --
>> >> Dimitri Liakhovitski
>> >> Ninah.com
>> >> Dimitri.Liakhovitski at ninah.com
>> >>
>> >> ______________________________________________
>> >> R-help at r-project.org mailing list
>> >> https://stat.ethz.ch/mailman/listinfo/r-help
>> >> PLEASE do read the posting guide
>> >> http://www.R-project.org/posting-guide.html
>> >> and provide commented, minimal, self-contained, reproducible code.
>> >>
>> >
>>
>>
>>
>> --
>> Dimitri Liakhovitski
>> Ninah.com
>> Dimitri.Liakhovitski at ninah.com
>>
>> ______________________________________________
>> R-help at r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide
>> http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>
>

-- 
Dimitri Liakhovitski
Ninah.com
Dimitri.Liakhovitski at ninah.com