[BioC] GO enrichment significance
Grimes Mark
mark.grimes at mso.umt.edu
Tue Jul 10 19:52:42 CEST 2012
Dear People
Paul Shannon suggested I submit the following question to this list.
I am investigating clustering techniques and would like to know
whether GO term enrichment can be used as an external evaluation to
determine whether clusters are any different from a random selection
of genes from my data set. I used the following code to determine the
'background' retrieval of GO terms from a set of random gene clusters.
My conclusion is that a random sample of 11 to 34 genes from this gene
set returns an average of 0.35 enriched GO terms per gene. Does this
make sense to use as 'background' retrieval of GO terms?
Thanks for your help,
Mark Grimes
University of Montana
#
# AUTHOR: MARK GRIMES (mark.grimes at mso.umt.edu)
#
# Question: what is the background for retrieval of GO terms from my
data set?
# data set is 2482 genes from proteomic data from lung cancer samples
(data frame is cyf; genes are in GENE SYMBOL format)
# I won't include the data here but this could be in principle
tested with any gene list
# set gene'universe' to this data set
lcgenes <<- as.character (sapply (cyf$Gene.Name, function (id)
org.Hs.egSYMBOL2EG [[id]]))
#
# random genes: 11 to 34 from this list:
randgene.list <- as.list(NULL)
rand.d=data.frame(NULL)
for(i in (11:34)) {
rand.temp=data.frame(NULL)
rand.temp <- data.frame(as.character(cyf[sample(2482, i),
"Gene.Name"]))
rand.temp$group <- i
names(rand.temp) <- c("Gene.Name", "genes")
rand.d <- rbind(rand.d, rand.temp)
}
randgene.list <- dlply(rand.d, .(genes)) # GROUP LIST
# Now fetch GO terms using a function written by Paul Shannon (pasted
below)
# Use randgenes.list to get gene IDs
randgo.list <- as.list(NULL)
randgo.df=data.frame(NULL)
for(k in 1:length(randgenes.list)) {
genes <- as.character (sapply (as.character(randgenes.list[[k]]
$Gene.Name), function (id) org.Hs.egSYMBOL2EG [[id]]))
tbl.go <<- tabular.go.enrichment (genes, ontology='BP',
pvalue=0.01, geneUniverse=lcgenes)
# If there is any enrichment, the number of genes for significant
terms should be > 2
tbl.go2 <- tbl.go[tbl.go$Count>=2,]
randgo.list[[k]] <- tbl.go2
randgo.df[k,1] <- length(unique(genes))
randgo.df[k,2] <- length(unique(tbl.go2$Term))
randgo.df[k,3] <- mean(unique(tbl.go2$Count))
randgo.df[k,4] <- mean(unique(tbl.go2$Count)/
unique(tbl.go2$Expected))
#
}
names(randgo.df) <- c("genes", "GO.terms", "mean.count",
"mean.count.over.expected")
randgo <- randgo.df
randgo$mean.count[is.nan(randgo$mean.count)] <- 0
randgo$mean.count.over.expected[is.nan(randgo
$mean.count.over.expected)] <- 0
mean(randgo$GO.terms) # 7.76
sd(randgo$GO.terms) # 8.747762
mean(randgo$mean.count) # 3.498667
sd(randgo$mean.count) # 2.06647
mean(randgo$mean.count.over.expected) # 18.34359
sd(randgo$mean.count.over.expected) # 10.89013
randgo$GOterms.per.gene <- randgo$GO.terms/randgo$genes
mean(randgo$GOterms.per.gene) # 0.3478717
sd(randgo$GOterms.per.gene) # 0.3464063
#
# Conclusion: a random sample of this gene set returns an average of
0.35 enriched GO terms per gene
# Question Is this a resonable background number to test whether
clusters that have real interactions are meaninful?
# _______________________________________________________________
# functions thanks to Paul Shannon:
tabular.go.enrichment <- function (genes, ontology, pvalue,
geneUniverse=character (0))
{
write (sprintf ('about to call go.enrichment with %d genes, ontolog=
%s, pvalue=%s', length (genes), ontology, pvalue), stderr ())
hgr = go.enrichment (genes, ontology, geneUniverse, pvalue)
write (sprintf ('back from call to go.enrichment'), stderr ())
tbl <<- summary (hgr)
tbl <<- tbl [, c (7, 1, 2, 4, 5, 6)]
colnames (tbl) [2] <<- 'GOID'
rownames (tbl) <<- NULL
virgin.tbl = TRUE
sample.row = list (Term='bogus', GOID='bogus', Pvalue=0.0,
Expected=0, Count=0, Size=0, ID='bogus', Gene='bogus') #, Depth=99)
df = data.frame (sample.row, stringsAsFactors=FALSE)
for (r in 1:nrow (tbl)) {
#write (sprintf ('assembling gene-based table, summary row %d',
r), stderr ())
# print (tbl [r,])
goTerm = tbl [r, 2]
gene.ids = geneIdsByCategory (hgr) [[goTerm]]
for (gene.id in gene.ids) {
old.row = tbl [r,]
rownames (old.row) = NULL
gene.symbol = geneSymbol (gene.id)
#depth = go.depth (goTerm)
new.row = cbind (old.row, Id=gene.id, zz=gene.symbol,
stringsAsFactors=FALSE) # Depth=depth, stringsAsFactors=FALSE)
colnames (new.row) = c ('Term', 'GOID', 'Pvalue', 'Expected',
'Count', 'Size', 'ID', 'Gene') #, 'Depth')
if (virgin.tbl) {
df [1,] = new.row
virgin.tbl = FALSE
}
else
df = rbind (df, new.row)
} # for gene.id
} # for r
return (df)
}
#
go.enrichment = function (genes, ontology, universe=character(0),
pvalue=0.05, annotation='org.Hs.eg.db', conditionalSearch=TRUE)
{
params = new ("GOHyperGParams", geneIds=genes, ontology=ontology,
annotation=annotation,
universeGeneIds=universe, pvalueCutoff = pvalue,
conditional=conditionalSearch,
testDirection = "over")
hgr <<- hyperGTest (params)
return (hgr)
} # go.enrichment
sessionInfo()
R version 2.15.1 (2012-06-22)
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid splines stats graphics grDevices utils
datasets methods base
other attached packages:
[1] GOstats_2.22.0 Category_2.22.0 GO.db_2.7.1
rgl_0.92.879 gplots_2.10.1 KernSmooth_2.23-7
caTools_1.13
[8] bitops_1.0-4.1 gdata_2.8.2 gtools_2.6.2
tsne_0.1-2 vegan_2.0-3 permute_0.7-0
RUnit_0.4.26
[15] bio3d_1.1-3 RCytoscape_1.6.3 XMLRPC_0.2-4
graph_1.34.0 org.Hs.eg.db_2.7.1 RSQLite_0.11.1 DBI_0.2-5
[22] AnnotationDbi_1.18.0 Biobase_2.16.0 BiocGenerics_0.2.0
adegenet_1.3-4 ade4_1.4-17 MASS_7.3-18
cluster_1.14.2
[29] Hmisc_3.9-3 survival_2.36-14 plyr_1.7.1
loaded via a namespace (and not attached):
[1] annotate_1.34.0 genefilter_1.38.0 GSEABase_1.18.0
IRanges_1.14.2 lattice_0.20-6 RBGL_1.32.0
RCurl_1.91-1 stats4_2.15.1
[9] tools_2.15.1 XML_3.9-4 xtable_1.7-0
More information about the Bioconductor
mailing list