[R] segfault when using data.table package in conjunction with foreach
Uwe Ligges
ligges at statistik.tu-dortmund.de
Fri Feb 24 16:29:47 CET 2012
0. Read the posting guide! It tells you to:
1. Do not cross-post!
2. Do try a recent version of R and of all the packages you use.
3. If it still fails, send a reproducible example (a minimal sketch of
such an example is given below).
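For this particular case, a self-contained example in that spirit needs
no 24 gb input files at all. Something like the sketch below (all sizes
and values are invented for illustration) would let others try to
reproduce the crash:

    library(foreach)
    library(doMC)
    registerDoMC(cores = 2)

    res <- foreach(i = 1:2) %dopar% {
        require(data.table)
        ## synthetic stand-in for one chunk of the real input
        DT <- data.table(ID1     = sample(100L, 1e5, replace = TRUE),
                         ID2     = sample(100L, 1e5, replace = TRUE),
                         sharing = runif(1e5))
        setkey(DT, ID1, ID2)
        DT[, sum(sharing), by = "ID1,ID2"]
    }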
Uwe Ligges
On 23.02.2012 18:04, Matthew Keller wrote:
> Hi all,
>
> I'm trying to use the data.table package within a foreach loop. I'm
> reading 500 million rows of data at a time from two different files
> with read.table and then doing an aggregate()/tapply()-like grouping
> in data.table afterwards. I had planned on running the foreach loop
> over all 39 files at once, but obviously that won't work until I
> figure out why the segfault is occurring. The sessionInfo, code, and
> error are pasted below. If you have any ideas, I would love to hear
> them. (I have no control over the version of R, 2.13.0, being used.) Best,
>
> Matt
>
>
> SESSION INFO:
>
>> sessionInfo()
> R version 2.13.0 (2011-04-13)
> Platform: x86_64-unknown-linux-gnu (64-bit)
>
> locale:
>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=C
>  [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] stats graphics grDevices utils datasets methods base
>
> other attached packages:
> [1] data.table_1.7.10 doMC_1.2.2 multicore_0.1-5
> foreach_1.3.2 codetools_0.2-8 iterators_1.0.3
>
>
>
> MY CODE:
>
> computeAllPairSums <- function(filename, nbindiv, nrows.to.read)
> {
>     con <- file(filename, open = "r")
>     on.exit(close(con))
>     ans <- matrix(0, nrow = nbindiv, ncol = nbindiv)
>     chunk <- 0L
>     while (TRUE) {
>         ## read.table is faster than scan here; it signals an error
>         ## once the connection is exhausted, so treat that as EOF
>         df0 <- tryCatch(read.table(con,
>                             col.names = c("ID1", "ID2", "ignored", "sharing"),
>                             colClasses = c("integer", "integer", "NULL", "numeric"),
>                             nrows = nrows.to.read, comment.char = ""),
>                         error = function(e) NULL)
>
>         ## stop before handing an empty chunk to data.table
>         if (is.null(df0) || nrow(df0) == 0L)
>             break
>
>         chunk <- chunk + 1L
>         cat("Processing chunk", chunk, "... ")
>
>         DT <- data.table(df0)
>         setkey(DT, ID1, ID2)
>         ss <- DT[, sum(sharing), by = "ID1,ID2"]
>
>         ## accumulate the per-group sums via two-column matrix indexing
>         idd <- as.matrix(subset(ss, select = 1:2))
>         newvec <- as.vector(as.matrix(subset(ss, select = 3)))
>         ans[idd] <- ans[idd] + newvec
>
>         cat("OK\n")
>     }
>     ans
> }
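>
> (The accumulation step above works because R accepts a two-column
> integer matrix as an index, addressing one (row, column) cell per
> index row; a tiny standalone illustration, with invented values:)
>
> m <- matrix(0, nrow = 3, ncol = 3)
> idx <- cbind(c(1L, 2L), c(3L, 1L))  # two (row, column) pairs
> m[idx] <- m[idx] + c(10, 20)        # m[1, 3] becomes 10, m[2, 1] becomes 20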
>
>
>
> require(foreach)
> require(doMC)
> registerDoMC(cores = 2)
>
> num <- 8891
> nr <- 500000000L  # 500 million rows at a time
>
> MMM <- foreach(IT = 1:2) %dopar% {
>     require(data.table)
>     if (IT == 1) {  # run it on the regular file (PID 6489, 24 gb)
>         x <- system.time(
>             computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr))
>     }
>     if (IT == 2) {  # run it on the gz file (PID 6490, 24 gb)
>         z <- system.time(
>             computeAllPairSums.gz(paste(GERMLINE, "bc.chr22.q.20.gz", sep = ""), num, nr))
>     }
> }
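>
> (To sanity-check the reader before pointing it at the 24 gb files, it
> can be run against a small synthetic file first; the temp file, sizes,
> and seed below are all invented for illustration:)
>
> set.seed(1)
> tmp <- tempfile()
> write.table(data.frame(ID1     = sample(50L, 1000, replace = TRUE),
>                        ID2     = sample(50L, 1000, replace = TRUE),
>                        ignored = 0L,
>                        sharing = runif(1000)),
>             file = tmp, row.names = FALSE, col.names = FALSE)
> small <- computeAllPairSums(tmp, nbindiv = 50, nrows.to.read = 300L)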
>
>
> MY R OUTPUT/ERROR:
>
> MMM <- foreach(IT = 1:2) %dopar% {
> +     require(data.table)
> +     if (IT == 1) {  # run it on the regular file (PID 6053, 5.9 gb)
> +         x <- system.time(
> +             computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr))
> +     }
> +     if (IT == 2) {  # run it on the gz file (PID 6054, 4 gb)
> +         z <- system.time(
> +             computeAllPairSums.gz(paste(GERMLINE, "bc.chr22.q.20.gz", sep = ""), num, nr))
> +     }
> + }
>
> Loading required package: data.table
> Loading required package: data.table
> data.table 1.7.10 For help type: help("data.table")
> data.table 1.7.10 For help type: help("data.table")
>
> *** caught segfault ***
> address 0x2ae93df90000, cause 'memory not mapped'
>
> Traceback:
> 1: .Call("dogroups", x, xcols, o__, f__, len__, jsub, SDenv, testj,
> byretn, byval, i, as.integer(icols), i[1, ivars, with = FALSE],
> if (length(ivars)) paste("i.", ivars, sep = ""), is.na(nomatch),
> verbose, PACKAGE = "data.table")
> 2: `[.data.table`(DT, , sum(sharing), by = "ID1,ID2")
> 3: DT[, sum(sharing), by = "ID1,ID2"]
> 4: computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr)
> 5: system.time({ computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr) })
> 6: eval(expr, envir, enclos)
> 7: eval(c.expr, envir = args, enclos = envir)
> 8: doTryCatch(return(expr), name, parentenv, handler)
> 9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
> 10: tryCatchList(expr, classes, parentenv, handlers)
> 11: tryCatch(eval(c.expr, envir = args, enclos = envir), error = function(e) e)
> 12: FUN(X[[1L]], ...)
> 13: lapply(S, FUN, ...)
> 14: doTryCatch(return(expr), name, parentenv, handler)
> 15: tryCatchOne(expr, names, parentenv, handlers[[1L]])
> 16: tryCatchList(expr, classes, parentenv, handlers)
> 17: tryCatch(expr, error = function(e) {
>         call <- conditionCall(e)
>         if (!is.null(call)) {
>             if (identical(call[[1L]], quote(doTryCatch)))
>                 call <- sys.call(-4L)
>             dcall <- deparse(call)[1L]
>             prefix <- paste("Error in", dcall, ": ")
>             LONG <- 75L
>             msg <- conditionMessage(e)
>             sm <- strsplit(msg, "\n")[[1L]]
>             w <- 14L + nchar(dcall, type = "w") + nchar(sm[1L], type = "w")
>             if (is.na(w))
>                 w <- 14L + nchar(dcall, type = "b") + nchar(sm[1L], type = "b")
>             if (w > LONG)
>                 prefix <- paste(prefix, "\n  ", sep = "")
>         }
>         else prefix <- "Error : "
>         msg <- paste(prefix, conditionMessage(e), "\n", sep = "")
>         .Internal(seterrmessage(msg[1L]))
>         if (!silent && identical(getOption("show.error.messages"), TRUE)) {
>             cat(msg, file = stderr())
>             .Internal(printDeferredWarnings())
>         }
>         invisible(structure(msg, class = "try-error"))
>     })
> 18: try(lapply(S, FUN, ...), silent = TRUE)
> 19: sendMaster(try(lapply(S, FUN, ...), silent = TRUE))
> 20: FUN(1:2[[1L]], ...)
> 21: lapply(1:cores, inner.do)
> 22: mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed
> = set.seed, mc.silent = silent, mc.cores = cores)
> 23: e$fun(obj, substitute(ex), parent.frame(), e$data)
> 24: foreach(IT = 1:2) %dopar% {
>         require(data.table)
>         if (IT == 1) {
>             x <- system.time({
>                 computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr)
>             })
>         }
>         if (IT == 2) {
>             z <- system.time({
>                 computeAllPairSums.gz(paste(GERMLINE, "bc.chr22.q.20.gz", sep = ""), num, nr)
>             })
>         }
>     }
>
> Possible actions:
> 1: abort (with core dump, if enabled)
> 2: normal R exit
> 3: exit R without saving workspace
> 4: exit R saving workspace
>
>
>