[BioC] GO enrichment significance

Tue Jul 10 19:52:42 CEST 2012

Dear People

Paul Shannon suggested I submit the following question to this list.   
I am investigating clustering techniques and would like to know  
whether GO term enrichment can be used as an external evaluation to  
determine whether clusters are any different from a random selection  
of genes from my data set. I used the following code to determine the  
'background' retrieval of GO terms from a set of random gene clusters.  
My conclusion is that a random sample of 11 to 34 genes from this gene  
set returns an average of 0.35 enriched GO terms per gene.  Does this  
make sense to use as  'background' retrieval of GO terms?

Thanks for your help,

Mark Grimes
University of Montana

#
#	AUTHOR: MARK GRIMES (mark.grimes at mso.umt.edu)
#	
# Question: what is the background for retrieval of GO terms from my  
data set?
# data set is 2482 genes from proteomic data from lung cancer samples  
(data frame is cyf; genes are in GENE SYMBOL format)
	#  I won't include the data here but this could be in principle  
tested with any gene list
	# set gene'universe' to this data set
	lcgenes <<- as.character (sapply (cyf$Gene.Name, function (id)  
org.Hs.egSYMBOL2EG [[id]]))
	#
	# random genes: 11 to 34 from this list:
		randgene.list <- as.list(NULL)
		rand.d=data.frame(NULL)
	for(i in (11:34))		{
		rand.temp=data.frame(NULL)
		rand.temp <- data.frame(as.character(cyf[sample(2482, i),  
"Gene.Name"]))
		rand.temp$group <- i
		names(rand.temp) <- c("Gene.Name", "genes")
		rand.d <- rbind(rand.d, rand.temp)
		}
	randgene.list <- dlply(rand.d, .(genes))  # GROUP LIST
# Now fetch GO terms using a function written by Paul Shannon (pasted  
below)	
			#	Use randgenes.list to get gene IDs
		randgo.list <- as.list(NULL)
		randgo.df=data.frame(NULL)
			for(k in 1:length(randgenes.list)) {
				genes <- as.character (sapply (as.character(randgenes.list[[k]] 
$Gene.Name), function (id) org.Hs.egSYMBOL2EG [[id]]))
				tbl.go <<- tabular.go.enrichment (genes, ontology='BP',  
pvalue=0.01, geneUniverse=lcgenes)
	 	#	If there is any enrichment, the number of genes for significant  
terms should be > 2
				tbl.go2 <- tbl.go[tbl.go$Count>=2,]
				randgo.list[[k]] <- tbl.go2
				randgo.df[k,1] <- length(unique(genes))
				randgo.df[k,2] <- length(unique(tbl.go2$Term))
				randgo.df[k,3] <- mean(unique(tbl.go2$Count))
				randgo.df[k,4] <- mean(unique(tbl.go2$Count)/ 
unique(tbl.go2$Expected))
				 #
			}
			names(randgo.df) <- c("genes", "GO.terms", "mean.count",  
"mean.count.over.expected")
			randgo <- randgo.df
			randgo$mean.count[is.nan(randgo$mean.count)] <- 0
			randgo$mean.count.over.expected[is.nan(randgo 
$mean.count.over.expected)] <- 0
			mean(randgo$GO.terms)  # 7.76
			sd(randgo$GO.terms)  #  8.747762
			mean(randgo$mean.count)  #  3.498667
			sd(randgo$mean.count)  #  2.06647
			mean(randgo$mean.count.over.expected)  # 18.34359
			sd(randgo$mean.count.over.expected)  # 10.89013
			randgo$GOterms.per.gene <- randgo$GO.terms/randgo$genes
			mean(randgo$GOterms.per.gene)  # 0.3478717
			sd(randgo$GOterms.per.gene)  # 0.3464063
#
#  Conclusion: a random sample of this gene set returns an average of  
0.35 enriched GO terms per gene
#  	Question	   Is this a resonable background number to test whether  
clusters that have real interactions are meaninful?
# _______________________________________________________________
# functions thanks to Paul Shannon:
tabular.go.enrichment <- function (genes, ontology, pvalue,  
geneUniverse=character (0))
{
   write (sprintf ('about to call go.enrichment with %d genes, ontolog= 
%s, pvalue=%s', length (genes), ontology, pvalue), stderr ())
   hgr = go.enrichment (genes, ontology, geneUniverse, pvalue)
   write (sprintf ('back from call to go.enrichment'), stderr ())
   tbl <<- summary (hgr)
   tbl <<- tbl [, c (7, 1, 2, 4, 5, 6)]
   colnames (tbl) [2] <<- 'GOID'
   rownames (tbl) <<- NULL
   virgin.tbl = TRUE
   sample.row =  list (Term='bogus', GOID='bogus', Pvalue=0.0,  
Expected=0, Count=0, Size=0, ID='bogus', Gene='bogus') #, Depth=99)
   df = data.frame (sample.row, stringsAsFactors=FALSE)

   for (r in 1:nrow (tbl)) {
     #write (sprintf ('assembling gene-based table, summary row %d',  
r), stderr ())
     # print (tbl [r,])
     goTerm = tbl [r, 2]
     gene.ids = geneIdsByCategory (hgr) [[goTerm]]
     for (gene.id in gene.ids) {
       old.row = tbl [r,]
       rownames (old.row) = NULL
       gene.symbol = geneSymbol (gene.id)
       #depth = go.depth (goTerm)
       new.row = cbind (old.row, Id=gene.id, zz=gene.symbol,  
stringsAsFactors=FALSE) # Depth=depth, stringsAsFactors=FALSE)
       colnames (new.row) = c ('Term', 'GOID', 'Pvalue', 'Expected',  
'Count', 'Size', 'ID', 'Gene') #, 'Depth')
       if (virgin.tbl) {
         df [1,] = new.row
         virgin.tbl = FALSE
         }
       else
         df = rbind (df, new.row)
       } # for gene.id
     } # for r

   return (df)

}
#
go.enrichment = function (genes, ontology, universe=character(0),  
pvalue=0.05, annotation='org.Hs.eg.db', conditionalSearch=TRUE)
{
    params = new ("GOHyperGParams", geneIds=genes,  ontology=ontology,  
annotation=annotation,
                  universeGeneIds=universe, pvalueCutoff = pvalue,  
conditional=conditionalSearch,
                  testDirection = "over")
    hgr <<- hyperGTest (params)
    return (hgr)

} # go.enrichment

sessionInfo()
R version 2.15.1 (2012-06-22)
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      splines   stats     graphics  grDevices utils      
datasets  methods   base

other attached packages:
  [1] GOstats_2.22.0       Category_2.22.0      GO.db_2.7.1           
rgl_0.92.879         gplots_2.10.1        KernSmooth_2.23-7     
caTools_1.13
  [8] bitops_1.0-4.1       gdata_2.8.2          gtools_2.6.2          
tsne_0.1-2           vegan_2.0-3          permute_0.7-0         
RUnit_0.4.26
[15] bio3d_1.1-3          RCytoscape_1.6.3     XMLRPC_0.2-4          
graph_1.34.0         org.Hs.eg.db_2.7.1   RSQLite_0.11.1       DBI_0.2-5
[22] AnnotationDbi_1.18.0 Biobase_2.16.0       BiocGenerics_0.2.0    
adegenet_1.3-4       ade4_1.4-17          MASS_7.3-18           
cluster_1.14.2
[29] Hmisc_3.9-3          survival_2.36-14     plyr_1.7.1

loaded via a namespace (and not attached):
  [1] annotate_1.34.0   genefilter_1.38.0 GSEABase_1.18.0    
IRanges_1.14.2    lattice_0.20-6    RBGL_1.32.0        
RCurl_1.91-1      stats4_2.15.1
  [9] tools_2.15.1      XML_3.9-4         xtable_1.7-0