[R] Speeding up R code - Apply a function to each row of a matrix using the dplyr package
Jeff Newmiller
jdnewm|| @end|ng |rom dcn@d@v|@@c@@u@
Fri Nov 2 00:06:23 CET 2018
As Don suggests, looking for ways to do the whole calculation at once is a
big efficiency booster. Also, avoiding unnecessary calculations (e.g. mean
of 1:n is (n+1)/2 and mean(x+a) where a is a constant is mean(x)+a.
Reproducible example:
####################
#library(tictoc)
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from
'package:stats':
#>
#> filter, lag
#> The following objects are masked from
'package:base':
#>
#> intersect, setdiff, setequal, union
library(purrr)
func1 <- function( coord, A, B, C ) {
X1 <- as.vector( coord[ 1 ] )
Y1 <- as.vector( coord[ 2 ] )
X2 <- as.vector( coord[ 3 ] )
Y2 <- as.vector( coord[ 4 ] )
if( C == 0 ) {
res1 <- mean( c( ( X1 - A ) : ( X1 - 1 )
, ( Y1 + 1 ) : ( Y1 + 40 )
)
)
res2 <- mean( c( ( X2 - A ) : ( X2 - 1 )
, ( Y2 + 1 ) : ( Y2 + 40 )
)
)
res <- matrix( c( res1, res2 )
, ncol=2
, nrow=1
)
} else {
res1 <- mean( c( ( X1 - A ) : ( X1 - 1 )
, ( Y1 + 1 ) : ( Y1 + 40 )
)
)*B
res2 <- mean( c( ( X2 - A ) : ( X2 - 1 )
, ( Y2 + 1 ) : ( Y2 + 40 )
)
)*B
res <- matrix( c( res1, res2 )
, ncol=2
, nrow=1
)
}
res
}
#' @param coord is a one-row data frame
func2 <- function( coord, A, B, C ) {
X1 <- coord[[ 1 ]]
Y1 <- coord[[ 2 ]]
X2 <- coord[[ 3 ]]
Y2 <- coord[[ 4 ]]
res <- matrix( c( mean( c( X1, Y1 ) )
, mean( c( X2, Y2 ) )
)
, ncol=2
, nrow=1
) + ( 40 - A ) / 2
if ( C != 0 ) {
res <- res * B
}
setNames( as.data.frame( res ), c( "V1", "V2" ) )
}
#' @param coord is a numeric vector of length 4
#' @return Numeric vector of length 2
func3 <- function( coord, A, B, C ) {
res <- ( c( ( coord[ 1 ] + coord[ 2 ] )
, ( coord[ 3 ] + coord[ 4 ] )
)
+ ( 40 - A )
) / 2
if ( C != 0 ) {
res <- res * B
}
res
}
#' @param coord is a matrix with four columns
func4 <- function( coord, A, B, C ) {
res <- ( cbind( ( coord[ , 1 ] + coord[ , 2 ] )
, ( coord[ , 3 ] + coord[ , 4 ] )
)
+ ( 40 - A )
) / 2
if ( length( C ) == nrow( coord ) || length( C ) == 1 ) {
idx <- C == 1
res[ idx, ] <- res[ idx, ] * B
}
res
}
## Apply the function
set.seed( 1 )
n <- 1000
N <- 100
Nseq <- seq.int( N )
# Using T instead of TRUE is asking to get an
unexpected result someday
tabDF <- data.frame( x1 = sample( Nseq, n, replace = TRUE )
, y1 = sample( Nseq, n, replace = TRUE )
, x2 = sample( Nseq, n, replace = TRUE )
, y2 = sample( Nseq, n, replace = TRUE )
)
tab <- as.matrix( tabDF )
fTest1 <- function() {
test <- tab %>%
split( 1:nrow(tab) ) %>%
map(~ func1(.x, 40, 5, 1) ) %>%
do.call( "rbind", . )
}
fTest2 <- function() {
# conventional dplyr approach
test <- tabDF %>%
rowwise %>%
do({
func2( ., 40, 5, 1 )
}) %>%
ungroup
}
fTest3 <- function() {
t( apply( tab, 1, func3, A=40, B=5, C=1 ) )
}
fTest4 <- function() {
func4( tabDF, A=40, B=5, C=1 )
}
microbenchmark( result1 <- fTest1()
, result2 <- fTest2()
, result3 <- fTest3()
, result4 <- fTest4()
)
#> Unit: microseconds
#> expr min lq mean median
#> result1 <- fTest1() 20305.562 23384.359 26939.6559 26262.8495
#> result2 <- fTest2() 255441.229 276794.201 290628.3221 286046.6385
#> result3 <- fTest3() 4869.288 5772.462 7242.2194 6615.7900
#> result4 <- fTest4() 52.862 94.962 216.3508 105.7235
#> uq max neval
#> 29324.2775 46207.632 100
#> 294248.0795 473898.379 100
#> 7874.6455 21288.783 100
#> 127.0565 9253.006 100
stopifnot( result1[ , 1 ] == result2[[ 1 ]] )
stopifnot( result1[ , 2 ] == result2[[ 2 ]] )
stopifnot( result1 == result3 )
stopifnot( result1 == result4 )
####################
On Thu, 1 Nov 2018, MacQueen, Don via R-help wrote:
> Without more study, I can only give some general pointers.
>
> The as.vector() in X1 <- as.vector(coord[1]) is almost certainly not needed. It will add a little bit to your execution time.
> Converting the output of func() to a one row matrix is almost certainly not needed. Just return c(res1, res2).
>
> Your data frame appears to be entirely numeric, in which case you don't need to ever use a data frame.
>
> Try
> apply( tab, 1, func, a=40, b=5, c=1 )
> instead of all that dplyr stuff.
>
>
> Your function can be redefined as
>
> func <- function(coord, a, b, c){
>
> X1 <- as.vector(coord[1])
> Y1 <- as.vector(coord[2])
> X2 <- as.vector(coord[3])
> Y2 <- as.vector(coord[4])
>
> res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))
> res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))
>
> if (c==0) c(res1, res2) else c(res1, res2)*b
> }
>
> I suspect you can operate on the entire matrix, without looping (which both the apply() method, and the split/rbind method do, in effect), and if so it will be much faster. But I can't say for sure without more study.
>
> --
> Don MacQueen
> Lawrence Livermore National Laboratory
> 7000 East Ave., L-627
> Livermore, CA 94550
> 925-423-1062
> Lab cell 925-724-7509
>
>
>
> On 11/1/18, 12:35 PM, "R-help on behalf of Nelly Reduan" <r-help-bounces using r-project.org on behalf of nell.redu using hotmail.fr> wrote:
>
> Hello,
>
> I have a input data frame with multiple rows. For each row, I want to apply a function. The input data frame has 1,000,000+ rows. How can I speed up my code ? I would like to keep the function "func".
>
> Here is a reproducible example with a simple function:
>
> library(tictoc)
> library(dplyr)
>
> func <- function(coord, a, b, c){
>
> X1 <- as.vector(coord[1])
> Y1 <- as.vector(coord[2])
> X2 <- as.vector(coord[3])
> Y2 <- as.vector(coord[4])
>
> if(c == 0) {
>
> res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))
> res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))
> res <- matrix(c(res1, res2), ncol=2, nrow=1)
>
> } else {
>
> res1 <- mean(c((X1 - a) : (X1 - 1), (Y1 + 1) : (Y1 + 40)))*b
> res2 <- mean(c((X2 - a) : (X2 - 1), (Y2 + 1) : (Y2 + 40)))*b
> res <- matrix(c(res1, res2), ncol=2, nrow=1)
>
> }
>
> return(res)
> }
>
> ## Apply the function
> set.seed(1)
> n = 10000000
> tab <- as.matrix(data.frame(x1 = sample(1:100, n, replace = T), y1 = sample(1:100, n, replace = T), x2 = sample(1:100, n, replace = T), y2 = sample(1:100, n, replace = T)))
>
>
> tic("test 1")
> test <- tab %>%
> split(1:nrow(tab)) %>%
> map(~ func(.x, 40, 5, 1)) %>%
> do.call("rbind", .)
> toc()
>
> test 1: 599.2 sec elapsed
>
> Thanks very much for your time
> Have a nice day
> Nell
>
> [[alternative HTML version deleted]]
>
> ______________________________________________
> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
>
> ______________________________________________
> R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
---------------------------------------------------------------------------
Jeff Newmiller The ..... ..... Go Live...
DCN:<jdnewmil using dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go...
Live: OO#.. Dead: OO#.. Playing
Research Engineer (Solar/Batteries O.O#. #.O#. with
/Software/Embedded Controllers) .OO#. .OO#. rocks...1k
---------------------------------------------------------------------------
More information about the R-help
mailing list