metadata.knit

###########################################################################
#######                Harmonization of Sample Metadata             #######
###########################################################################

#**********************************#
#         Helper Functions         #
#**********************************#

library(stringr)

generatePhenoFun <- function(samples) {
  ### Harmonized metadata
  traits <- c('sample_id','patient_id','tissue','batch','platform','sample_type',
              'age_at_diagnosis','ethnicity','race','clinical_stage','clinical_t_stage',
              'clinical_n_stage','clinical_m_stage','pathological_stage','pathological_t_stage',
              'pathological_n_stage','pathological_m_stage','preop_psa','gleason_primary_pattern',
              'gleason_secondary_pattern','gleason_tertiary_pattern','gleason_group','gleason_score',
              'time_to_death','os_status','time_to_bcr','bcr_status','time_to_metastasis',
              'metastasis_status','risk_group','treatment','additional_treatment')
  
  phenoData <- data.frame(matrix(NA, nrow=length(samples), ncol=length(traits)), stringsAsFactors = F)
  
  colnames(phenoData) <- traits
  rownames(phenoData) <- samples
  
  return (phenoData)
}

#*******************************************#
#         TCGA-PRAD Sample Metadata         #
#*******************************************#

library(GDCRNATools)

project <- 'TCGA-PRAD'

### Parse RNAseq metadata
meta.rna <- gdcParseMetadata(project.id = project, 
                             data.type  = 'RNAseq', 
                             write.meta = FALSE)

### Filter duplicated samples
meta.rna <- gdcFilterDuplicate(meta.rna)

### Filter non-Primary Tumor and non-Solid Tissue Normal samples
meta.rna <- gdcFilterSampleType(meta.rna)

### Download clinical data
clinical.dir <- paste('data/Clinical', sep='/')
gdcClinicalDownload(project.id     = project, 
                    write.manifest = FALSE,
                    method         = 'gdc-client',
                    directory      = clinical.dir)

### Merge clinical data
clinical.data <- gdcClinicalMerge(path = clinicaldir, key.info = TRUE)
clinical.data[1:6,5:10]

### Meatadata
rownames(meta.rna) <- substr(rownames(meta.rna),start = 1, stop = 15)

pheno1 <- clinical.data
rownames(pheno1) <- paste0(rownames(pheno1), '-01')

pheno1 <- pheno1[match(rownames(meta.rna), rownames(pheno1)),]
rownames(pheno1) <- rownames(meta.rna)

pheno2 <- readRDS('data/rData/Clinical_TCGA_PRAD_With_PreopPSA_and_BCR.RDS')
pheno2 <- pheno2[match(rownames(meta.rna), rownames(pheno2)),]

rownames(pheno2) <- rownames(meta.rna)

#######
samples <- rownames(pheno1)
phenoData <- generatePhenoFun(samples)

phenoData$sample_id <- rownames(pheno1)
phenoData$patient_id <- substr(phenoData$sample_id, start = 1, stop = 12)
phenoData$platform <- 'Illumina HiSeq 2000'
phenoData$sample_type <- ifelse(grepl('-11', phenoData$sample_id), 'Normal', 'Primary')
phenoData$age_at_diagnosis <- pheno1$age_at_initial_pathologic_diagnosis
phenoData$ethnicity <- pheno1$ethnicity
phenoData$race <- pheno1$race
phenoData$clinical_t_stage <- pheno1$clinical_T
phenoData$clinical_n_stage <- pheno1$clinical_N
phenoData$clinical_m_stage <- pheno1$clinical_M
phenoData$pathological_t_stage <- pheno1$pathologic_T
phenoData$pathological_n_stage <- pheno1$pathologic_N
phenoData$pathological_m_stage <- pheno1$pathologic_M
phenoData$preop_psa <- pheno2$preop_psa
phenoData$gleason_primary_pattern <- pheno2$primary_pattern
phenoData$gleason_secondary_pattern <- pheno2$secondary_pattern
phenoData$gleason_tertiary_pattern <- pheno2$tertiary_pattern
phenoData$gleason_score <- phenoData$gleason_primary_pattern + phenoData$gleason_secondary_pattern
phenoData$gleason_group <- ifelse(is.na(phenoData$gleason_score), NA, paste(phenoData$gleason_primary_pattern, phenoData$gleason_secondary_pattern, sep='+'))

phenoData$time_to_death <- ifelse(!is.na(pheno2$days_to_death), 
                                  round(as.numeric(pheno2$days_to_death/365*12),2), 
                                  round(as.numeric(pheno2$days_to_last_followup/365*12),2))
phenoData$os_status <- ifelse(!is.na(pheno2$days_to_death), 1, 0)

phenoData$time_to_bcr <- ifelse(!is.na(pheno2$days_to_first_biochemical_recurrence),
                                round(as.numeric(pheno2$days_to_first_biochemical_recurrence/365*12),2), 
                                round(as.numeric(pheno2$days_to_last_followup/365*12),2))
phenoData$bcr_status <- ifelse(!is.na(pheno2$days_to_first_biochemical_recurrence), 1, 0)

phenoData$pcadb_group <- phenoData$sample_type

saveRDS(phenoData, file=paste0('data/rData/', project, '_Metadata.RDS'))

#*************************************#
#         GEO Sample Metadata         #
#*************************************#

library(GEOquery)

# GSE2443
gse <- 'GSE2443'
seriesMatrix <- getGEO(gse, AnnotGPL = FALSE, getGPL = FALSE, 
                       GSEMatrix = TRUE, destdir = 'data/fromGEO/') # AnnotGPL = TRUE

pheno <- pData(seriesMatrix[[1]])

# if downloaded manually from WebGEO (http://bioinfo.jialab-ucr.org/WebGEO/)
#pheno <- read.csv(paste0('data/Metadata/', gse, '.csv'), header = T, row.names = 1, 
#                  stringsAsFactors = F)
#View(pheno)

#######
samples <- rownames(pheno)
phenoData <- generatePhenoFun(samples)

phenoData$sample_id <- pheno$title
phenoData$tissue <- ifelse(grepl('AD', pheno$title), 
                           'Fresh frozen RP', 'Snap-frozen biopsy')
phenoData$platform <- 'Affymetrix Human Genome U133A Array'
phenoData$sample_type <- 'Primary'
phenoData$gleason_score <- as.numeric(gsub('=\\s+', '', str_extract(pheno$description, '=\\s+\\d')))

phenoData$pcadb_group <- ifelse(grepl('AD', pheno$title), 
                                'Androgen dependent primary PCa', 'Androgen independent primary PCa')

saveRDS(phenoData, file=paste0('data/rData/', gse, '_Metadata.RDS'))

#*************************************#
#         SRA Sample Metadata         #
#*************************************#

# SRP119917 (https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP119917&o=acc_s%3Aa)
sra <- 'SRP119917'
pheno <- read.csv(paste0('data/Metadata/', sra, '.txt'), header = T, row.names = 1, 
                  stringsAsFactors = F)

https://www.ncbi.nlm.nih.gov/Traces/solr-proxy-be/solr-proxy-be.cgi?core=run_sel_index
#######
samples <- rownames(pheno)
phenoData <- generatePhenoFun(samples)

phenoData$sample_id <- pheno$Sample.Name
phenoData$patient_id <- pheno$Isolate
phenoData$platform <- 'Illumina HiSeq 2500'

phenoData$sample_type[pheno$tissue=='higher grade prostate tumor'] <- 'Primary'
phenoData$sample_type[pheno$tissue=='lower grade prostate tumor'] <- 'Primary'
phenoData$sample_type[pheno$tissue=='normal prostate tissue'] <- 'Normal'

phenoData$age_at_diagnosis <- pheno$AGE

phenoData$pcadb_group[pheno$tissue=='higher grade prostate tumor'] <- 'Higher grade tumor'
phenoData$pcadb_group[pheno$tissue=='lower grade prostate tumor'] <- 'Lower grade tumor'
phenoData$pcadb_group[pheno$tissue=='normal prostate tissue'] <- 'Normal'

saveRDS(phenoData, file=paste0('data/rData/', sra, '_Metadata.RDS'))

#********************************************#
#         cBioPortal Sample Metadata         #
#********************************************#

# DFKZ (https://www.cbioportal.org/study/clinicalData?id=prostate_dkfz_2018)
# wget https://cbioportal-datahub.s3.amazonaws.com/prostate_dkfz_2018.tar.gz -P data/cBioPortal/
# tar -xvzf prostate_dkfz_2018.tar.gz

dt <- 'DFKZ'
pheno <- read.table('data/cBioPortal/prostate_dkfz_2018/prostate_dkfz_2018_clinical_data.tsv', 
                    sep='\t', header=T, stringsAsFactors = F)

rownames(pheno) <- pheno$Sample.ID

#######
samples <- rownames(pheno)
phenoData <- generatePhenoFun(samples)

phenoData$sample_id <- pheno$Sample.ID
phenoData$patient_id <- pheno$Patient.ID
phenoData$platform <- 'Illumina HiSeq 2000'
phenoData$sample_type <- 'Primary'
phenoData$age_at_diagnosis <- pheno$Diagnosis.Age
phenoData$pathological_stage <- pheno$Stage
phenoData$preop_psa <- pheno$Preop.PSA
phenoData$gleason_primary_pattern <- as.numeric(lapply(pheno$Radical.Prostatectomy.Gleason.Score.for.Prostate.Cancer, function(x) strsplit(x, '+', fixed=T)[[1]][1]))
phenoData$gleason_secondary_pattern <- as.numeric(lapply(pheno$Radical.Prostatectomy.Gleason.Score.for.Prostate.Cancer, function(x) strsplit(x, '+', fixed=T)[[1]][2]))
phenoData$gleason_score <- phenoData$gleason_primary_pattern + phenoData$gleason_secondary_pattern
phenoData$gleason_group <- ifelse(is.na(phenoData$gleason_score), NA, paste(phenoData$gleason_primary_pattern, phenoData$gleason_secondary_pattern, sep='+'))
phenoData$time_to_bcr <- as.numeric(pheno$Time.from.Surgery.to.BCR.Last.Follow.Up)
phenoData$bcr_status <- as.numeric(pheno$BCR.Status)

phenoData$pcadb_group <- phenoData$sample_type

saveRDS(phenoData, file=paste0('data/rData/', dt, '_Metadata.RDS'))

#**********************************************#
#         ArrayExpress Sample Metadata         #
#**********************************************#

# E-MTAB-5021 ERP017433 (https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-5021/E-MTAB-5021.sdrf.txt)
dt <- 'E-MTAB-5021'

url <- paste0('https://www.ebi.ac.uk/arrayexpress/files/', dt, '/', dt, '.sdrf.txt')
cmd <- paste0 ('wget ', url, ' -P data/Metadata/')
system(cmd)

pheno <- read.table(paste0('data/Metadata/', dt, '.sdrf.txt'), header = T, sep = '\t',
                  stringsAsFactors = F)

filter <- which(duplicated(pheno$Comment.ENA_RUN.))
pheno <- pheno[-filter,]
rownames(pheno) <- pheno$Comment.ENA_RUN.

#######
samples <- rownames(pheno)
phenoData <- generatePhenoFun(samples)

phenoData$sample_id <- pheno$Source.Name
phenoData$patient_id <- pheno$Characteristics.individual.
phenoData$tissue <- 'FFPE RP'
phenoData$platform <- 'Illumina HiSeq 2500'

phenoData$sample_type <- ifelse(pheno$Characteristics.clinical.information.=='morphologically normal tissue',
                                'Normal', 'Normal adjacent to tumor')

phenoData$pcadb_group <- paste0(phenoData$sample_type,
                                ifelse(pheno$Characteristics.sampling.site.=='peripheral zone of prostate',
                                ' (Peripheral zone)', ' (Transition zone)'))

saveRDS(phenoData, file=paste0('data/rData/', dt, '_Metadata.RDS'))