####################################################################
#######               cBioPortal Data Collection             #######
####################################################################

### helper function: select the most informative probe, MAX IQR
library(dplyr)

selectProbeFun <- function(expr) {
  expr$IQR <- apply(expr[,-which(colnames(expr)=='ID')], 1, IQR)
  
  expr <- expr %>% group_by(ID) %>% 
    filter(row_number() == which.max(IQR)) %>%
    column_to_rownames('ID')
  
  expr <- expr[,-which(colnames(expr)=='IQR')]
  
  return(expr)
  
}
### DKFZ dataset as an example
wget https://cbioportal-datahub.s3.amazonaws.com/prostate_dkfz_2018.tar.gz -P data/cBioPortal/
tar -xvzf prostate_dkfz_2018.tar.gz
dataset <- 'DKFZ'

exprData <- read.table('data/cBioPortal/prostate_dkfz_2018/data_mrna_seq_rpkm.txt',
                       sep='\t', header = T, stringsAsFactors = F)

## ENSEMBL 62; ftp://ftp.ensembl.org/pub/release-62/gtf/homo_sapiens/
gtf <- readGFF('data/Annotation/Homo_sapiens.GRCh37.62.gtf.gz', version=2L)
filter <- which(duplicated(gtf$gene_id))
gtf <- gtf[-filter,]

# exprData <- add_column(.data = exprData, .before = 3, Ensembl=NA)
# exprData$Ensembl <- gtf$gene_id[match(exprData$Hugo_Symbol, gtf$gene_name)]
# exprData[1:5,1:5]
# 
# filter <- which(duplicated(exprData$Ensembl))
# filter

exprData$ID <- gtf$gene_id[match(exprData$Hugo_Symbol, gtf$gene_name)]

idx <- which(!is.na(rowSums(exprData[,-ncol(exprData)])))
exprData <- selectProbeFun(exprData[idx,])
  
if (max(exprData) > 100) {
  exprData <- log2(exprData)
}
  
rownames(exprData) <- unlist(lapply(rownames(exprData), function(x) strsplit(x, '_|\\.', )[[1]][1]))

saveRDS(exprData, file=paste0('data/rData/', dataset, '_Expression.RDS'))