[R] : Quantile and rowMean from multiple files in a folder
arun
smartpink111 at yahoo.com
Tue Apr 15 05:48:17 CEST 2014
Hi,
Q1 solution already sent.
Regarding Q2, one of the files in the new Observed folder doesn't have any data (just the Year column alone).
That may be the reason for the problem.
### Q1: working directory: Observed #Only one file per Site. Assuming this is the
### case for the full dataset, then I guess there is no need to average
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))
lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
lines1 <- readLines(x2)
header1 <- lines1[1:2]
dat1 <- read.table(text = lines1, header = FALSE, sep = ",", stringsAsFactors = FALSE,
skip = 2)
colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))
lst3 <- lst2[sapply(seq_along(lst2),function(i){lstN <- sapply(lst2[[i]],function(x) is.integer(ncol(x)))})]
#difference in column number
sapply(seq_along(lst3), function(i) {
sapply(lst3[[i]], function(x) ncol(x))
})
#
#[1] 157 258 258 98 157 258 256 258 250 258 258 147 157 250 250 256 249 240
# [19] 181 188 256 146 117 258 153 256 255 246 255 256 258 257 145 258 258 255
# [37] 258 157 164 144 265 258 254 258 258 157 258 176 258 256 257 258 258 258
# [55] 248 258 156 258 157 157 258 258 258 258 258 148 258 258 258 258 257 258
# [73] 258 258 157 154 153 258 248 255 257 256 258 258 157 256 256 257 257 250
# [91] 257 139 155 256 256 257 257 256 258 258 257 258 258 258 258 157 157 157
#[109] 258 258 258 258 256 258 157 258 258 256 258
library(plyr)
library(stringr)
lst4 <- setNames(lapply(seq_along(lst3), function(i) {
lapply(lst3[[i]], function(x) {
names(x)[-1] <- paste(names(x)[-1], names(lst1)[i], sep = "_")
names(x) <- str_trim(names(x))
x
})[[1]]
}), names(lst3))
df1 <- join_all(lst4, by = "Year")
dim(df1)
# [1] 9 27311
sapply(split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1])), function(x) {
df2 <- df1[, x]
df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), numcolwise(function(y) quantile(y,
seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
ncol(df3)
})
#
#G100 G101 G102 G103 G104 G105 G106 G107 G108 G109 G110 G111 G112 G113 G114 G115
# 157 258 258 98 157 258 256 258 250 258 258 147 157 250 250 256
#G116 G117 G118 G119 G120 GG10 GG11 GG12 GG13 GG14 GG15 GG16 GG17 GG18 GG19 GG20
# 249 240 181 188 256 146 117 258 153 256 255 246 255 256 258 257
#GG21 GG22 GG23 GG24 GG25 GG26 GG27 GG28 GG29 GG30 GG31 GG32 GG33 GG34 GG35 GG36
# 145 258 258 255 258 157 164 144 265 258 254 258 258 157 258 176
#GG37 GG38 GG39 GG40 GG41 GG42 GG43 GG44 GG45 GG46 GG47 GG48 GG49 GG50 GG51 GG52
# 258 256 257 258 258 258 248 258 156 258 157 157 258 258 258 258
#GG53 GG54 GG55 GG56 GG57 GG58 GG59 GG60 GG61 GG62 GG63 GG64 GG65 GG66 GG67 GG68
# 258 148 258 258 258 258 257 258 258 258 157 154 153 258 248 255
#GG69 GG70 GG71 GG72 GG73 GG74 GG75 GG76 GG77 GG78 GG79 GG80 GG81 GG82 GG83 GG84
# 257 256 258 258 157 256 256 257 257 250 257 139 155 256 256 257
#GG85 GG86 GG87 GG88 GG89 GG90 GG91 GG92 GG93 GG94 GG95 GG96 GG97 GG98 GG99 GGG1
# 257 256 258 258 257 258 258 258 258 157 157 157 258 258 258 258
#GGG2 GGG3 GGG4 GGG5 GGG6 GGG7 GGG8
# 256 258 157 258 258 256 258
lst5 <- split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1]))
lapply(seq_along(lst5), function(i) {
df2 <- df1[, lst5[[i]]]
df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), numcolwise(function(y) quantile(y,
seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
df3[1:3, 1:3]
write.csv(df3, paste0(paste(getwd(), "final", paste(names(lst4)[[i]], "Quantile",
sep = "_"), sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})
ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))],
function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
sapply(ReadOut1, dim)[,1:3]
# [,1] [,2] [,3]
#[1,] 101 101 101
#[2,] 157 258 258
lapply(ReadOut1, function(x) x[1:2, 1:3])[1:3]
#[[1]]
# Percentiles pav.DJF_G100 pav.MAM_G100
#1 0% 0 0.640500
#2 1% 0 0.664604
#
#[[2]]
# Percentiles txav.DJF_G101 txav.MAM_G101
#1 0% -13.8756 4.742400
#2 1% -13.8140 4.817184
#
#[[3]]
# Percentiles txav.DJF_G102 txav.MAM_G102
#1 0% -15.05000 4.520700
#2 1% -14.96833 4.543828
### Q2: Observed data
dir.create("Indices")
names1 <- unlist(lapply(ReadOut1, function(x) names(x)[-1]))
names2 <- gsub("\\_.*", "", names1)
names3 <- unique(gsub("[.]", " ", names2))
res <- do.call(rbind, lapply(seq_along(lst5), function(i) {
df2 <- df1[, lst5[[i]]]
vec1 <- colMeans(df2, na.rm = TRUE)
vec2 <- rep(NA, length(names3))
names(vec2) <- paste(names3, names(lst5)[[i]], sep = "_")
vec2[names(vec2) %in% names(vec1)] <- vec1
names(vec2) <- gsub("\\_.*", "", names(vec2))
vec2
}))
dim(res)
#[1] 119 264
lapply(seq_len(ncol(res)), function(i) {
mat1 <- t(res[, i, drop = FALSE])
colnames(mat1) <- names(lst4)
write.csv(mat1, paste0(paste(getwd(), "Indices", gsub(" ", "_", rownames(mat1)),
sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})
## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))],
function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 264
list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))][1]
# [1] 'Indices/pav_ANN.csv'
res[, "pav ANN", drop = FALSE]
ReadOut2[[1]]
Attached is the updated Quantilecode2.txt.
A.K.
On Monday, April 14, 2014 10:41 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
Q1) Please try to correct the error using the larger data set (Sample.zip). The issue is that once you write the codes and restrict it to smaller data sets, I find it difficult to generalize it to larger data sets.
Q2) From the Quantilecode2.txt you just sent, you forgot to do the following section using the Observed.zip file. I tried to run the code to section Q1 in Quantilecode2.txt using a larger data set and received the same error :Error in 2:nrow(lstNew) : argument of length 0. I have attached a larger data set too for you to generalize the code to suit the larger data set. Please do not forget to include the code below in the final code of Q2.
Once you fix these two, I should be able to fix the rest following these examples.
Thanks AK. Sorry for overloading you with much work.
Atem.
#==============================================================================================================
dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1) lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i], length(lst1)), sep = "_"))
write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})
## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))], function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257
head(ReadOut2[[1]], 2)
#==============================================================================================================
=================================================
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: Quantilecode2.txt
URL: <https://stat.ethz.ch/pipermail/r-help/attachments/20140414/c97a58e5/attachment-0002.txt>
More information about the R-help
mailing list