I asked a question on the superb biostar stackexchange site. It’s here: http://biostar.stackexchange.com/questions/1054/homology-bioconductor
It’s about finding genome-wide homologies using Bioconductor. It turns out that Bioconductor has a package called biomaRt which allows you to query the Ensembl databases with ease. (Ensembl stores gene information for a bunch of different organisms.)
I thought I’d write down my solution here, as a sort of extended answer to my question on biostar, in case anyone stumbles upon the question there and would like a more complete answer. You’ll need to read the question before any of this code makes sense!
library(biomaRt)

# Fetch, for one Ensembl gene dataset, the Affymetrix probes in `affyids`
# whose genes have a homolog in the other species.
#
# Two getBM() queries are issued with identical filters -- probe -> gene and
# gene -> homolog -- and joined on "ensembl_gene_id".
# NOTE(review): presumably two queries are needed because BioMart will not
# return probe attributes and homolog attributes in one request -- confirm
# against the biomaRt documentation.
#
# Args:
#   affyids:        character vector of Affymetrix probe IDs.
#   dataset:        Ensembl dataset name passed to useMart().
#   probe_att:      name of the probe filter, also used as the probe attribute.
#   homolog_filter: name of the boolean "has a homolog" filter.
#   homolog_att:    name of the attribute holding the homolog's Ensembl ID.
#
# Returns:
#   A data.frame with columns "ensembl_gene_id", `probe_att`, `homolog_att`.
fetch_probe_homologs <- function(affyids, dataset, probe_att, homolog_filter,
                                 homolog_att) {
  mart <- useMart("ensembl", dataset = dataset)
  query_filters <- c(probe_att, homolog_filter)
  # The names in this list are arbitrary; the entries line up with
  # `query_filters` by position.
  query_values <- list(affyids = affyids, with_homolog = TRUE)

  probe_to_gene <- getBM(
    attributes = c(probe_att, "ensembl_gene_id"),
    filters = query_filters,
    values = query_values,
    mart = mart
  )
  gene_to_homolog <- getBM(
    attributes = c("ensembl_gene_id", homolog_att),
    filters = query_filters,
    values = query_values,
    mart = mart
  )

  merge(probe_to_gene, gene_to_homolog, by = "ensembl_gene_id")
}

# NOTE(review): the filter and attribute names used below are specific to the
# Ensembl release this was written against; if a query fails, check them with
# listFilters() / listAttributes().

# Map human HG-U133A probe IDs to human Ensembl genes and their mouse
# orthologues.
#
# Args:
#   affyids: character vector of "affy_hg_u133a" probe IDs.
#
# Returns:
#   A data.frame with columns "ensembl_gene_id" (human), "affy_hg_u133a" and
#   "mouse_ensembl_gene".
gen_hs2mm <- function(affyids) {
  fetch_probe_homologs(
    affyids,
    dataset = "hsapiens_gene_ensembl",
    probe_att = "affy_hg_u133a",
    homolog_filter = "with_mmusculus_homolog",
    homolog_att = "mouse_ensembl_gene"
  )
}

# Map mouse MoGene 1.0 ST probe IDs to mouse Ensembl genes and their human
# orthologues.
#
# Args:
#   affyids: character vector of "affy_mogene_1_0_st_v1" probe IDs.
#
# Returns:
#   A data.frame with columns "ensembl_gene_id" (mouse),
#   "affy_mogene_1_0_st_v1" and "human_ensembl_gene".
gen_mm2hs <- function(affyids) {
  fetch_probe_homologs(
    affyids,
    dataset = "mmusculus_gene_ensembl",
    probe_att = "affy_mogene_1_0_st_v1",
    homolog_filter = "with_hsapiens_homolog",
    homolog_att = "human_ensembl_gene"
  )
}

source("load_data.r")

# Here immgen and cd4T are different expression set objects from Bioconductor.
# immgen is mouse data (from the Immunological Genome Project) and cd4T is
# human data. cd4T can be found on GEO using the accession ID GDS785.
# See ref [1].
immgen <- load_immgen()
cd4T <- load_GDS785()

hs2mm <- gen_hs2mm(rownames(exprs(cd4T)))
mm2hs <- gen_mm2hs(rownames(exprs(immgen)))

# Each table's own gene ID column is called "ensembl_gene_id"; rename it (by
# name rather than by position) so it matches the homolog column of the other
# table.
names(hs2mm)[names(hs2mm) == "ensembl_gene_id"] <- "human_ensembl_gene"
names(mm2hs)[names(mm2hs) == "ensembl_gene_id"] <- "mouse_ensembl_gene"

# The final thing is to merge the two tables to make a single table
# containing all the probes that are homologous, along with their respective
# Ensembl IDs.
homol <- merge(
  hs2mm,
  mm2hs,
  by = c("human_ensembl_gene", "mouse_ensembl_gene")
)
[1] Gene expression profiles during human CD4+ T cell differentiation. Int Immunol. 2004 Aug;16(8):1109-24. PMID: 15210650
