####################################################################
#######                TCGA-PRAD Data Collection             #######
####################################################################

library(GDCRNATools)
library(edgeR)
library(limma)

project <- 'TCGA-PRAD'
rnadir <- paste('data', project, 'RNAseq', sep='/')

### Download RNAseq data
gdcRNADownload(project.id     = project, 
               data.type      = 'RNAseq', 
               write.manifest = FALSE,
               method         = 'gdc-client',
               directory      = rnadir)

### Parse RNAseq metadata
meta.rna <- gdcParseMetadata(project.id = project,
                                   data.type  = 'RNAseq', 
                                   write.meta = FALSE)

### Filter duplicated samples
meta.rna <- gdcFilterDuplicate(meta.rna)

### Filter non-Primary Tumor and non-Solid Tissue Normal samples
meta.rna <- gdcFilterSampleType(meta.rna)


### Merge RNAseq data
rnaCounts <- gdcRNAMerge(metadata  = meta.rna, 
                         path      = rnadir, # the folder in which the data stored
                         organized = FALSE, # if the data are in separate folders
                         data.type = 'RNAseq')

### TMM normalization
dge <-  DGEList(counts = rnaCounts)
dge = calcNormFactors(dge, method = 'TMM')

exprData <- edgeR::cpm(dge,log = TRUE)

saveRDS(exprData, file=paste0('data/rData/', project, '_Expression.RDS'))

##### For downstream analysis
### Filter out low-expression genes (logCPM < 1 in more than 50% of the samples)
# keep <- rowSums(edgeR::cpm(dge) > 1) >= 0.5*ncol(rnaCounts)
# sum(keep)
# dge <- dge[keep,,keep.lib.sizes = TRUE]

### Voom normalization
#v <- voom(dge, design=NULL, plot = FALSE)