#################################################################################
#######            GEO/ArrayExpress Data Collection (Affymetrix)          #######
#################################################################################

library(GEOquery)
library(oligo)

### Annotation package

# http://brainarray.mbni.med.umich.edu/Brainarray/Database/CustomCDF/CDF_download.asp
# Version 24
# R Source Package: O

# Affymetrix Human Exon 1.0 ST Array: pd.huex10st.hs.gencodeg
# Affymetrix Human Gene 2.0 ST Array: pd.hugene20st.hs.gencodeg
# Affymetrix Human Transcriptome Array 2.0: pd.hta20.hs.gencodeg
# Affymetrix Human Genome U133A Array: pd.hgu133a.hs.gencodeg
# Affymetrix Human Genome U133 Plus 2.0 Array: pd.hgu133plus2.hs.gencodeg
# Affymetrix Human Genome U133A 2.0 Array: pd.hgu133a2.hs.gencodeg

# library(pd.hg.u133.plus.2) # from Bioconductor (NOT IN USE)

#install.packages("http://mbni.org/customcdf/24.0.0/gencodeg.download/pd.huex10st.hs.gencodeg_24.0.0.tar.gz",
#                repos = NULL, type = "source")

library(pd.huex10st.hs.gencodeg)

# Dataset configuration: GEO series accession and the annotation (platform
# design) package name passed to read.celfiles() below.
gse <- 'GSE12378'
anno <- 'pd.huex10st.hs.gencodeg'

# Paths are built once so the download, untar and CEL-listing steps cannot
# drift apart.
geoDir <- 'data/fromGEO'
rawTar <- file.path(geoDir, paste0(gse, '_RAW.tar'))
rawDir <- file.path(geoDir, paste0(gse, '_RAW'))

# With makeDirectory = FALSE the supplementary files are expected to land
# directly in baseDir (NOTE(review): confirm against the GEOquery docs), so
# make sure that directory exists before downloading.
dir.create(geoDir, recursive = TRUE, showWarnings = FALSE)

# Download only the supplementary files whose name matches 'RAW'.
filePaths <- getGEOSuppFiles(gse, baseDir = geoDir, makeDirectory = FALSE,
                             filter_regex = 'RAW')
if (!file.exists(rawTar)) {
  stop('Raw archive not found after download: ', rawTar, call. = FALSE)
}
untar(rawTar, exdir = rawDir)

# Collect the (possibly gzipped) CEL files and fail early if there are none,
# instead of letting read.celfiles() fail on an empty input.
celFiles <- list.celfiles(rawDir, full.names = TRUE, listGzipped = TRUE)
if (length(celFiles) == 0) {
  stop('No CEL files found in ', rawDir, call. = FALSE)
}

# Read raw intensities with the chosen annotation package, run RMA, and pull
# out the expression matrix.
rawData <- read.celfiles(celFiles, pkgname = anno)
probesetData <- oligo::rma(rawData)
exprData <- exprs(probesetData)

# Keep only the part of each identifier that precedes the first '_' or '.'.
# vapply() guarantees a character vector of the same length as the input
# (unlist(lapply(...)) can silently change shape on unusual input).
firstToken <- function(ids) {
  vapply(strsplit(ids, '_|\\.'), `[`, character(1), 1L, USE.NAMES = FALSE)
}

# Row names: presumably probe-set IDs such as 'ENSG...<suffix>' -- TODO confirm.
# Column names: presumably CEL file names starting with the sample accession.
rownames(exprData) <- firstToken(rownames(exprData))
colnames(exprData) <- firstToken(colnames(exprData))

# Drop every row whose identifier does not start with 'ENSG'.
# The length guard matters: exprData[-integer(0), ] would select zero rows.
# drop = FALSE keeps the result a matrix even if a single row survives.
filter <- which(!startsWith(rownames(exprData), 'ENSG'))
if (length(filter) > 0) {
  exprData <- exprData[-filter, , drop = FALSE]
}

# Make sure the output directory exists; saveRDS() does not create it.
outDir <- 'data/rData'
dir.create(outDir, recursive = TRUE, showWarnings = FALSE)
saveRDS(exprData, file = file.path(outDir, paste0(gse, '_Expression.RDS')))


########## ArrayExpress ##########

### E-TABM-26 dataset as an example
# wget https://www.ebi.ac.uk/arrayexpress/files/E-TABM-26/E-TABM-26.sdrf.txt
# wget https://www.ebi.ac.uk/arrayexpress/files/E-TABM-26/E-TABM-26.raw.1.zip
# wget https://www.ebi.ac.uk/arrayexpress/files/E-TABM-26/E-TABM-26.raw.2.zip
# wget https://www.ebi.ac.uk/arrayexpress/files/E-TABM-26/E-TABM-26.raw.3.zip
# unzip E-TABM-26.raw.1.zip -d E-TABM-26/
# unzip E-TABM-26.raw.2.zip -d E-TABM-26/
# unzip E-TABM-26.raw.3.zip -d E-TABM-26/
# 
# mkdir E-TABM-26/U133A E-TABM-26/U133B
# mv E-TABM-26/^B*CEL E-TABM-26/U133B/
# mv E-TABM-26/*B.CEL E-TABM-26/U133B/
# mv E-TABM-26/*.CEL E-TABM-26/U133A/