I asked a question on the superb biostar stackexchange site. It’s here: http://biostar.stackexchange.com/questions/1054/homology-bioconductor
It’s about finding genome-wide homologies using Bioconductor. It turns out that Bioconductor has a package called biomaRt which allows you to query the Ensembl databases with ease. (Ensembl stores gene information for a bunch of different organisms.)
I thought I’d write down my solution here, as a sort of extended answer to my question on biostar, in case anyone stumbles upon the question there and would like a more complete answer. You’ll need to read the question before any of this code makes sense!
library(biomaRt)
# Map human Affymetrix HG-U133A probe IDs to human Ensembl genes and their
# mouse homologs.
#
# Runs two getBM() queries against the Ensembl "hsapiens_gene_ensembl"
# dataset with identical filters (the given probe IDs, restricted to genes
# flagged as having a mouse homolog) and joins the results on
# "ensembl_gene_id".
#
# Args:
#   affyids: vector of probe IDs used as the value of the "affy_hg_u133a"
#            filter.
#
# Returns:
#   The data frame obtained by merging the probe -> gene table with the
#   gene -> mouse homolog table on "ensembl_gene_id" (visibly returned).
#   Assumes getBM() names its result columns after the requested attributes
#   (its default behaviour) -- TODO confirm for the biomaRt version in use.
gen_hs2mm <- function(affyids) {
  ensembl_hs <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

  hs2mm_filters <- c("affy_hg_u133a", "with_mmusculus_homolog")
  hs2mm_gene_atts <- c("affy_hg_u133a", "ensembl_gene_id")
  hs2mm_homo_atts <- c("ensembl_gene_id", "mouse_ensembl_gene")

  # The names in this list are arbitrary; the elements line up with
  # hs2mm_filters by position.
  hs2mm_values <- list(affyids = affyids, with_homolog = TRUE)

  # Get the human genes and mouse orthologues.
  # NOTE(review): presumably two queries are needed because BioMart does not
  # allow probe (feature) attributes and homolog attributes in one request --
  # confirm before collapsing them into a single call.
  hs2mm_gene <- getBM(
    attributes = hs2mm_gene_atts,
    filters = hs2mm_filters,
    values = hs2mm_values,
    mart = ensembl_hs
  )
  hs2mm_homo <- getBM(
    attributes = hs2mm_homo_atts,
    filters = hs2mm_filters,
    values = hs2mm_values,
    mart = ensembl_hs
  )

  # Merge the two tables on the one column they share.
  merge(hs2mm_gene, hs2mm_homo, by = "ensembl_gene_id")
}
# Map mouse Affymetrix MoGene 1.0 ST probe IDs to mouse Ensembl genes and
# their human homologs.
#
# Runs two getBM() queries against the Ensembl "mmusculus_gene_ensembl"
# dataset with identical filters (the given probe IDs, restricted to genes
# flagged as having a human homolog) and joins the results on
# "ensembl_gene_id".
#
# Args:
#   affyids: vector of probe IDs used as the value of the
#            "affy_mogene_1_0_st_v1" filter.
#
# Returns:
#   The data frame obtained by merging the probe -> gene table with the
#   gene -> human homolog table on "ensembl_gene_id" (visibly returned).
#   Assumes getBM() names its result columns after the requested attributes
#   (its default behaviour) -- TODO confirm for the biomaRt version in use.
gen_mm2hs <- function(affyids) {
  ensembl_mm <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")

  mm2hs_filters <- c("affy_mogene_1_0_st_v1", "with_hsapiens_homolog")
  mm2hs_gene_atts <- c("affy_mogene_1_0_st_v1", "ensembl_gene_id")
  mm2hs_homo_atts <- c("ensembl_gene_id", "human_ensembl_gene")

  # The names in this list are arbitrary; the elements line up with
  # mm2hs_filters by position.
  mm2hs_values <- list(affyids = affyids, with_homolog = TRUE)

  # Get the mouse genes and human orthologues.
  # NOTE(review): presumably two queries are needed because BioMart does not
  # allow probe (feature) attributes and homolog attributes in one request --
  # confirm before collapsing them into a single call.
  mm2hs_gene <- getBM(
    attributes = mm2hs_gene_atts,
    filters = mm2hs_filters,
    values = mm2hs_values,
    mart = ensembl_mm
  )
  mm2hs_homo <- getBM(
    attributes = mm2hs_homo_atts,
    filters = mm2hs_filters,
    values = mm2hs_values,
    mart = ensembl_mm
  )

  # Merge the two tables on the one column they share.
  merge(mm2hs_gene, mm2hs_homo, by = "ensembl_gene_id")
}
source("load_data.r")

# Here immgen and cd4T are different expression set objects
# from Bioconductor.
# immgen is mouse data (from the Immunological Genome Project)
# and cd4T is human data.
# cd4T can be found on GEO using the accession ID GDS785.
# See ref[1]
immgen <- load_immgen()
cd4T <- load_GDS785()

# The row names of each expression matrix are passed in as the probe IDs.
hs2mm <- gen_hs2mm(rownames(exprs(cd4T)))
mm2hs <- gen_mm2hs(rownames(exprs(immgen)))

# In each table "ensembl_gene_id" holds the gene of the species that was
# queried. Rename it by name (not by position) so that it lines up with the
# homolog column of the other table regardless of column order.
names(hs2mm)[names(hs2mm) == "ensembl_gene_id"] <- "human_ensembl_gene"
names(mm2hs)[names(mm2hs) == "ensembl_gene_id"] <- "mouse_ensembl_gene"

# The final thing is to merge the two tables to make a single
# table containing all the probes that are homologous, along
# with their respective Ensembl IDs. Joining on both gene columns keeps
# only the human/mouse gene pairs that appear in both tables.
homol <- merge(
  hs2mm,
  mm2hs,
  by = c("human_ensembl_gene", "mouse_ensembl_gene")
)
[1] Gene expression profiles during human CD4+ T cell differentiation. Int Immunol 2004 Aug;16(8):1109-24. PMID: 15210650