#################################################################################
####### GEO/ArrayExpress Data Collection (Normalized) #######
#################################################################################
### Illumina HumanHT-12 V4.0 Expression Beadchip as an example
library(dplyr)
library(tibble)
### helper function: select the most informative probe, MAX IQR
selectProbeFun <- function(expr) {
expr$IQR <- apply(expr[,-which(colnames(expr)=='ID')], 1, IQR)
expr <- expr %>% group_by(ID) %>%
filter(row_number() == which.max(IQR)) %>%
column_to_rownames('ID')
expr <- expr[,-which(colnames(expr)=='IQR')]
return(expr)
}
pcadb.anno <- readRDS('data/Homo_Sapiens_Gene_Annotation_ENSEMBL_HGNC_ENTRE.RDS')
# BiocManager::install('illuminaHumanv3.db')
library(illuminaHumanv4.db)
#columns(illuminaHumanv4.db)
library(illuminaHumanv3.db)
#columns(illuminaHumanv3.db)
### helper function: map the probe ids with Ensembl ids
IlluminaFun <- function(exprData, anno='illuminaHumanv4.db') {
if (!exists('pcadb.anno')) {
pcadb.anno <- readRDS('data/Homo_Sapiens_Gene_Annotation_ENSEMBL_HGNC_ENTRE.RDS')
}
if (anno=='illuminaHumanv4.db') {
illumina.anno <- AnnotationDbi::select(illuminaHumanv4.db,
keys = rownames(exprData),
columns=c('ENSEMBL', 'GENETYPE', "SYMBOL","ENTREZID"), # "SYMBOL","ENTREZID",
keytype="PROBEID")
} else if (anno=='illuminaHumanv3.db') {
illumina.anno <- AnnotationDbi::select(illuminaHumanv3.db,
keys = rownames(exprData),
columns=c('ENSEMBL', 'GENETYPE', "SYMBOL","ENTREZID"), # "SYMBOL","ENTREZID",
keytype="PROBEID")
}
idx <- which(startsWith(illumina.anno$ENSEMBL, 'ENSG'))
illumina.anno <- illumina.anno[idx,]
filter <- which(!illumina.anno$ENSEMBL %in% pcadb.anno$ensembl_id)
illumina.anno <- illumina.anno[-filter,]
illumina.anno <- illumina.anno %>% group_by(PROBEID) %>%
filter(if (length(GENETYPE)>1) GENETYPE == 'protein-coding' else !is.na(GENETYPE))
### optional (if a probe mapped to multiple genes with some of them are novel proteins)
illumina.anno$PROBE_ENTREZ_ID <- paste0(illumina.anno$PROBEID, '_', illumina.anno$ENTREZID)
dup.probe.entrez <- illumina.anno$PROBE_ENTREZ_ID[which(duplicated(illumina.anno$PROBE_ENTREZ_ID))]
idx <- which(illumina.anno$PROBE_ENTREZ_ID %in% dup.probe.entrez)
illumina.anno$ENSEMBL_ENTREZ_ID <- paste0(illumina.anno$ENSEMBL, '_', illumina.anno$ENTREZID)
i <- match(illumina.anno$ENSEMBL_ENTREZ_ID[idx], paste0(pcadb.anno$ensembl_id, '_', pcadb.anno$entrez_id))
i <- i[which(!is.na(pcadb.anno$tax_id[i]))]
filter <- idx[which(!illumina.anno$ENSEMBL_ENTREZ_ID[idx] %in% paste0(pcadb.anno$ensembl_id, '_', pcadb.anno$entrez_id)[i])]
illumina.anno <- illumina.anno[-filter,]
#### for probes mapped to multiple genes
probes <- illumina.anno$PROBEID[which(duplicated(illumina.anno$PROBEID))]
filter <- which(illumina.anno$PROBEID %in% probes)
illumina.anno <- illumina.anno[-filter,]
###
exprData <- data.frame(exprData[illumina.anno$PROBEID,])
exprData$ID <- illumina.anno$ENSEMBL
idx <- which(!is.na(rowSums(exprData[,-ncol(exprData)])))
exprData <- selectProbeFun(exprData[idx,])
if (max(exprData) > 100) {
exprData <- log2(exprData)
}
rownames(exprData) <- unlist(lapply(rownames(exprData), function(x) strsplit(x, '_|\\.', )[[1]][1]))
return (exprData)
}
gse <- 'GSE28680' # Illumina HumanHT-12 V4.0 Expression Beadchip
seriesMatrix <- getGEO(gse, AnnotGPL = FALSE, getGPL = FALSE, GSEMatrix = TRUE, destdir = 'data/fromGEO/') # AnnotGPL = TRUE
i.matrix <- 1
exprData <- exprs(seriesMatrix[[i.matrix]])
exprData <- IlluminaFun(exprData, anno = 'illuminaHumanv4.db')
saveRDS(exprData, file=paste0('data/rData/', gse, '_Expression.RDS'))