####################################################################
#######                 Gene Annotation Data                 #######
####################################################################

library(rtracklayer)
library(tibble)
library(biomaRt)
library(stringr)
library(devtools)
#install_github("js229/Vennerable")
library(Vennerable)

################## Step 1: Data Collection ##################

# GENCODE V32
getGENCODEAnnotation <- function(species='human', release='32', type='gene', gtf.file=NULL) {
  
  if (is.null(gtf.file)) {
    baseurl <- 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/'
    gtf.file <- paste0(baseurl, 'Gencode_', species, '/release_', release, '/gencode.v', 
                       release, '.annotation.gtf.gz')
  }
  
  gtf <- readGFF(gtf.file, version=2L)
  
  if (type!='all') {
    gtf <- gtf[gtf$type==type,]
    ensembl <- sapply(gtf$gene_id, function(x) strsplit(x, '.', fixed=T)[[1]][1])
    gtf <- add_column(gtf, ensembl, .before = 'gene_id')
  }
  
  return(gtf)
}

# The gene annotation in "pseudoautosomal regions" (PAR) of chromosome Y is identical between chromosomes X and Y
# Gene names are usually HGNC or MGI-approved gene symbols mapped to the GENCODE genes by the Ensembl xref pipeline. 
# Sometimes, when there is no official gene symbol, the Havana clone-based name is used.
gencode <- getGENCODEAnnotation(species='human', release='32', type='gene')
rownames(gencode) <- ifelse(duplicated(gencode$ensembl), gencode$gene_id, gencode$ensembl) # PAR_Y, or filter out
gencode <- add_column(.data = gencode, .after = 'end', length=gencode$end-gencode$start+1)
#saveRDS(gencode, file='data/Annotation/GENCODE_Annotation_Human_V32.RDS')

idx <- which(duplicated(gencode$gene_name))
View(gencode[idx,])
sort(unique(gencode[idx,]$gene_name)) # 113 genes


# ENSEMBL V98
getENSEMBLAnnotation <- function(attributes=NULL) {
  
  if (is.null(attributes)) {
    attributes <- c('ensembl_gene_id','chromosome_name','start_position','end_position','strand',
                    'entrezgene_id','hgnc_id','hgnc_symbol','external_gene_name','description','gene_biotype')
  }
  
  #listMarts()
  
  ensembl=useMart("ensembl")
  datasets <- listDatasets(ensembl)

  ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
  #attributes <- ensembl@attributes
  #View(attributes)
  
  #affyids=c("202763_at","209310_s_at","207500_at")
  annotation <- getBM(attributes=attributes,
                      #filters = 'affy_hg_u133_plus_2', 
                      #values = affyids, 
                      mart = ensembl)
  
  return(annotation)
}

ensembl <- getENSEMBLAnnotation()
#saveRDS(ensembl, file='data/Annotation/ENSEMBL_Annotation_Human_V98.RDS')

ensembl[ensembl==''] <- NA

idx <- which(ensembl$hgnc_symbol != ensembl$external_gene_name)
ensembl[idx,]
ensembl$external_gene_name[idx] <- ensembl$hgnc_symbol[idx]

keep <- which(ensembl$chromosome_name %in% c(1:22, 'MT', 'X', 'Y'))
ensembl <- ensembl[keep,]

#filter <- which(duplicated(ensembl$ensembl_gene_id))
#ensembl <- ensembl[-filter,]
#dim(ensembl)


# HGNC

# wget ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt -O hgnc_complete_set_20191230.txt
# wget ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/withdrawn.txt -O hgnc_withdrawn_20191230.txt

hgnc <- read.table(file='data/Annotation/hgnc_complete_set_20191230.txt', sep='\t', quote='', 
                   comment.char='', header = T, stringsAsFactors = F)

hgnc.withdrawn <- read.table(file='data/Annotation/hgnc_withdrawn_20191230.txt', sep='\t', quote='', 
                             comment.char='', header = T, stringsAsFactors = F)

hgnc[hgnc==''] <- NA
hgnc.withdrawn[hgnc.withdrawn==''] <- NA

hgnc.withdrawn$CURRENT_ID <- unlist(lapply(hgnc.withdrawn$MERGED_INTO_REPORT.S...i.e.HGNC_ID.SYMBOL.STATUS.,
                                           function(x) strsplit(x,'|', fixed=T)[[1]][1]))
hgnc.withdrawn$CURRENT_SYMBOL <- unlist(lapply(hgnc.withdrawn$MERGED_INTO_REPORT.S...i.e.HGNC_ID.SYMBOL.STATUS.,
                                           function(x) strsplit(x,'|', fixed=T)[[1]][2]))
colnames(hgnc)
View(hgnc)
hgnc <- hgnc[,c(1:7,9:12,19:29)]
hgnc
hgnc <- hgnc[which(hgnc$status!='Entry Withdrawn'),]


# NCBI
ncbi.gene.info <- read.table('data/Annotation/NCBI_Homo_sapiens.gene_info.gz', header = T,
                             stringsAsFactors = F, sep='\t', quote='', comment.char = '')

ncbi.gene.info[ncbi.gene.info=='-'] <- NA

ncbi.gene.info$ensembl_gene_id <- str_extract(ncbi.gene.info$dbXrefs, 'ENSG\\d+')
ncbi.gene.info$hgnc_id <- str_extract(ncbi.gene.info$dbXrefs, 'HGNC:\\d+')

ncbi.gene.history <- read.table(file='data/Annotation/NCBI_Homo_sapiens.gene_history.txt', sep='\t',
                                header=T, stringsAsFactors = F, quote = '', comment.char = '')

ncbi.gene.history[ncbi.gene.history=='-'] <- NA
ncbi.gene.history$GeneID <- as.numeric(ncbi.gene.history$GeneID)
ncbi.gene.history$Discontinued_GeneID <- as.numeric(ncbi.gene.history$Discontinued_GeneID)

ncbi.gene.info
colnames(ncbi.gene.info)


################## Step 2: Update IDs ##################

# ENSEMBL

idx <- which(!ensembl$hgnc_id %in% c(hgnc$hgnc_id,NA))
idx
tmp <- hgnc.withdrawn[match(ensembl[idx,]$hgnc_id, hgnc.withdrawn$HGNC_ID),]
tmp

ensembl[idx,]$hgnc_id <- tmp$CURRENT_ID
ensembl[idx,]$hgnc_symbol <- tmp$CURRENT_SYMBOL
ensembl[idx,]$description <- NA

#which(ensembl$entrezgene_id %in% ncbi.gene.history$Discontinued_GeneID)
#which(!ensembl$entrezgene_id[which(!is.na(ensembl$entrezgene_id))] %in% ncbi.gene.info$GeneID)

idx <- which(ensembl$entrezgene_id %in% ncbi.gene.history$Discontinued_GeneID)
ensembl[idx,]$entrezgene_id

newids <- ncbi.gene.history$GeneID[match(ensembl[idx,]$entrezgene_id, ncbi.gene.history$Discontinued_GeneID)]
newids

ensembl[idx,]$entrezgene_id <- newids

#dupids <- ensembl$entrezgene_id[duplicated(ensembl$entrezgene_id, incomparables = NA)]
#dupids
#View(ensembl[ensembl$entrezgene_id %in% dupids,])

#ensembl$entrezgene_id[ensembl$entrezgene_id %in% dupids] <- NA


#dupids <- ensembl$hgnc_id[duplicated(ensembl$hgnc_id, incomparables = NA)]
#dupids
#View(ensembl[ensembl$entrezgene_id %in% dupids,])

#ensembl$hgnc_id[ensembl$hgnc_id %in% dupids] <- NA


# HGNC

idx <- which(!hgnc$ensembl_gene_id %in% c(ensembl$ensembl_gene_id,NA))
idx

View(hgnc[idx,])

hgnc$ensembl_gene_id[idx] <- NA

which(hgnc$entrez_id[!is.na(hgnc$entrez_id)] %in% ncbi.gene.history$Discontinued_GeneID)
idx <- which(!hgnc$entrez_id %in% c(ncbi.gene.info$GeneID,NA))
idx

#newids <- ncbi.gene.history$GeneID[match(hgnc[idx,]$entrez_id, ncbi.gene.history$Discontinued_GeneID)]
#newids
#hgnc[idx,]$entrez_id <- newids

hgnc$entrez_id[idx] <- NA

dupids <- hgnc$entrez_id[duplicated(hgnc$entrez_id, incomparables = NA)]
dupids
View(hgnc[hgnc$entrez_id %in% dupids,])

dupids <- hgnc$ensembl_gene_id[duplicated(hgnc$ensembl_gene_id, incomparables = NA)]
dupids
View(hgnc[hgnc$ensembl_gene_id %in% dupids,])


# NCBI

idx <- which(!ncbi.gene.info$ensembl_gene_id %in% c(ensembl$ensembl_gene_id,NA))
idx

dim(ncbi.gene.info)
sum(ncbi.gene.info$ensembl_gene_id %in% c(ensembl$ensembl_gene_id,NA))

ncbi.gene.info$ensembl_gene_id[idx] <- NA

#ensembl[which(ensembl$hgnc_id %in% hgnc.withdrawn$HGNC_ID),]

#ovlp <- intersect(hgnc$entrez_id, ncbi.gene.info$GeneID)
#ovlp

#gene1 <- hgnc$symbol[match(ovlp, hgnc$entrez_id)]
#gene2 <- ncbi.gene.info$Symbol[match(ovlp, ncbi.gene.info$GeneID)]

#data.frame(gene1, gene2)[which(gene1!=gene2),]

idx <- which(!ncbi.gene.info$hgnc_id %in% c(hgnc$hgnc_id,NA))
idx
#ids <- hgnc.withdrawn[match(ncbi.gene.info[idx,]$hgnc_id, hgnc.withdrawn$HGNC_ID),]
#ids
ncbi.gene.info[idx,]

ncbi.gene.info[idx,]$hgnc_id <- NA #ids$CURRENT_ID
ncbi.gene.info[idx,]$Symbol <- NA #ids$CURRENT_SYMBOL
ncbi.gene.info[idx,]$description <- NA


dupids <- ncbi.gene.info$ensembl_gene_id[which(duplicated(ncbi.gene.info$ensembl_gene_id, incomparables = NA))]
dupids

View(ncbi.gene.info[which(ncbi.gene.info$ensembl_gene_id%in%dupids),])

dupids <- ncbi.gene.info$hgnc_id[which(duplicated(ncbi.gene.info$hgnc_id, incomparables = NA))]
dupids

View(ncbi.gene.info[which(ncbi.gene.info$hgnc_id%in%dupids),])


View(ncbi.gene.info)
View(ncbi.gene.history)


# Check

set1 <- ensembl$ensembl_gene_id[which(!is.na(ensembl$hgnc_id))]
set2 <- hgnc$ensembl_gene_id[which(!is.na(hgnc$ensembl_gene_id))]

set1 <- ensembl$ensembl_gene_id[which(!is.na(ensembl$entrezgene_id))]
set2 <- ncbi.gene.info$ensembl_gene_id[which(!is.na(ncbi.gene.info$ensembl_gene_id))]

set1 <- hgnc$entrez_id[which(!is.na(hgnc$entrez_id))]
set2 <- ncbi.gene.info$GeneID[which(!is.na(ncbi.gene.info$hgnc_id))]


set.list <- list(set1, set2)
vennData <- Venn(set.list)
vennData


################## Step 3: Map IDs ##################

# ENSEMBL

idx<- which(is.na(ensembl$hgnc_id))
ensembl$hgnc_id[idx] <- hgnc$hgnc_id[match(ensembl$ensembl_gene_id[idx], hgnc$ensembl_gene_id)]

idx<- which(is.na(ensembl$entrezgene_id))
ensembl$entrezgene_id[idx] <- ncbi.gene.info$GeneID[match(ensembl$ensembl_gene_id[idx], ncbi.gene.info$ensembl_gene_id)]


# HGNC

idx<- which(is.na(hgnc$ensembl_gene_id))
idx
hgnc$ensembl_gene_id[idx] <- ensembl$ensembl_gene_id[match(hgnc$hgnc_id[idx], ensembl$hgnc_id)]

idx<- which(is.na(hgnc$entrez_id))
hgnc$entrez_id[idx] <-ncbi.gene.info$GeneID[match(hgnc$hgnc_id[idx], ncbi.gene.info$hgnc_id)]


# NCBI

idx<- which(is.na(ncbi.gene.info$ensembl_gene_id))
ncbi.gene.info$ensembl_gene_id[idx] <- ensembl$ensembl_gene_id[match(ncbi.gene.info$GeneID[idx], ensembl$entrezgene_id)]

idx<- which(is.na(ncbi.gene.info$hgnc_id))
ncbi.gene.info$hgnc_id[idx] <- hgnc$hgnc_id[match(ncbi.gene.info$GeneID[idx], hgnc$entrez_id)]


sum(!is.na(ensembl$hgnc_id)) # 38614
sum(!is.na(ensembl$entrezgene_id)) # 26479

sum(!is.na(hgnc$ensembl_gene_id)) # 38418
sum(!is.na(hgnc$entrez_id)) # 41668

sum(!is.na(ncbi.gene.info$ensembl_gene_id)) # 26411
sum(!is.na(ncbi.gene.info$hgnc_id)) # 41671


gene1 <- hgnc$ensembl_gene_id[which(!is.na(hgnc$ensembl_gene_id))]
gene1

gene2 <- ensembl$ensembl_gene_id[which(!is.na(ensembl$hgnc_id))]
gene2

gene2[which(!gene2 %in% gene1)]


dupids <- ncbi.gene.info$ensembl_gene_id[which(duplicated(ncbi.gene.info$ensembl_gene_id, incomparables = NA))]
View(ncbi.gene.info[which(ncbi.gene.info$ensembl_gene_id %in% dupids),])

dupids <- ncbi.gene.info$hgnc_id[which(duplicated(ncbi.gene.info$hgnc_id, incomparables = NA))]
dupids


################## Step 4: Merge Annotations ##################

# HGNC + NCBI

hgnc.tmp <- hgnc[match(ncbi.gene.info$GeneID, hgnc$entrez_id),]
dim(hgnc.tmp)
dim(ncbi.gene.info)

idx <- which(!hgnc$hgnc_id %in% hgnc.tmp$hgnc_id)
idx

hgnc.tmp <- rbind(hgnc.tmp, hgnc[idx,])
dim(hgnc.tmp)
View(hgnc.tmp)

ncbi <- ncbi.gene.info

ncbi[(nrow(ncbi)+1):nrow(hgnc.tmp),] <- NA
dim(ncbi)

ncbi.hgnc <- data.frame(ncbi, hgnc.tmp)

sum(!is.na(ncbi.hgnc$GeneID))
sum(!is.na(ncbi.hgnc$entrez_id))

idx <- which(is.na(ncbi.hgnc$GeneID))
sum(!is.na(ncbi.hgnc$entrez_id[idx]))

idx <- which(is.na(ncbi.hgnc$hgnc_id.1))
sum(!is.na(ncbi.hgnc$hgnc_id[idx]))
ncbi.hgnc[idx,][which(!is.na(ncbi.hgnc$hgnc_id[idx])),]


which(ncbi.hgnc$GeneID!=ncbi.hgnc$entrez_id)
which(ncbi.hgnc$hgnc_id!=ncbi.hgnc$hgnc_id.1)
which(ncbi.hgnc$ensembl_gene_id!=ncbi.hgnc$ensembl_gene_id.1)


idx <- which(ncbi.hgnc$hgnc_id!=ncbi.hgnc$hgnc_id.1)
idx
View(ncbi.hgnc[idx,])


idx <- which(is.na(ncbi.hgnc$ensembl_gene_id))
idx
ncbi.hgnc$ensembl_gene_id[idx] <- ncbi.hgnc$ensembl_gene_id.1[idx]

idx <- which(ncbi.hgnc$ensembl_gene_id!=ncbi.hgnc$ensembl_gene_id.1)
idx
ncbi.hgnc$ensembl_gene_id[idx] <- ncbi.hgnc$ensembl_gene_id.1[idx]


# HGNC + NCBI + ENSEMBL

ensembl.tmp <- ensembl[match(ncbi.hgnc$ensembl_gene_id, ensembl$ensembl_gene_id),]
dim(ensembl.tmp)

idx <- which(!ensembl$ensembl_gene_id %in% ensembl.tmp$ensembl_gene_id)
idx

ensembl.tmp <- rbind(ensembl.tmp, ensembl[idx,])
dim(ensembl.tmp)

ncbi.hgnc[(nrow(ncbi.hgnc)+1):nrow(ensembl.tmp),] <- NA
dim(ncbi.hgnc)

ncbi.hgnc.ensembl <- data.frame(ncbi.hgnc, ensembl.tmp)
View(ncbi.hgnc.ensembl)


###
idx <- which(is.na(ncbi.hgnc.ensembl$ensembl_gene_id))
idx
ncbi.hgnc.ensembl$ensembl_gene_id[idx] <- ncbi.hgnc.ensembl$ensembl_gene_id.2[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$ensembl_gene_id))
idx
ncbi.hgnc.ensembl$ensembl_gene_id[idx] <- ncbi.hgnc.ensembl$ensembl_gene_id.1[idx]


###
idx <- which(is.na(ncbi.hgnc.ensembl$GeneID))
idx
ncbi.hgnc.ensembl$GeneID[idx] <- ncbi.hgnc.ensembl$entrez_id[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$GeneID))
idx
ncbi.hgnc.ensembl$GeneID[idx] <- ncbi.hgnc.ensembl$entrezgene_id[idx]


###
idx <- which(is.na(ncbi.hgnc.ensembl$hgnc_id))
idx
ncbi.hgnc.ensembl$hgnc_id[idx] <- ncbi.hgnc.ensembl$hgnc_id.1[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$hgnc_id))
idx
ncbi.hgnc.ensembl$hgnc_id[idx] <- ncbi.hgnc.ensembl$hgnc_id.2[idx]


################## Step 5: Final Check ##################

dim(ensembl)
unique(ensembl$ensembl_gene_id)
which(!is.na(ncbi.hgnc.ensembl$ensembl_gene_id))

dim(hgnc)
unique(hgnc$hgnc_id)
which(!is.na(ncbi.hgnc.ensembl$hgnc_id))


dim(ncbi.gene.info)
unique(ncbi.gene.info$GeneID)
which(!is.na(ncbi.hgnc.ensembl$GeneID))


# ENSEMBL ID
test <- data.frame(ncbi.hgnc.ensembl$ensembl_gene_id, ncbi.hgnc.ensembl$ensembl_gene_id.1, 
                   ncbi.hgnc.ensembl$ensembl_gene_id.2, stringsAsFactors = F)
which(test$ncbi.hgnc.ensembl.ensembl_gene_id!=test$ncbi.hgnc.ensembl.ensembl_gene_id.1)
which(test$ncbi.hgnc.ensembl.ensembl_gene_id!=test$ncbi.hgnc.ensembl.ensembl_gene_id.2) # good
which(test$ncbi.hgnc.ensembl.ensembl_gene_id.1!=test$ncbi.hgnc.ensembl.ensembl_gene_id.2) # good


# HGNC ID
test <- data.frame(ncbi.hgnc.ensembl$hgnc_id, ncbi.hgnc.ensembl$hgnc_id.1, 
                   ncbi.hgnc.ensembl$hgnc_id.2, stringsAsFactors = F)
which(test$ncbi.hgnc.ensembl.hgnc_id!=test$ncbi.hgnc.ensembl.hgnc_id.1) # good
which(test$ncbi.hgnc.ensembl.hgnc_id!=test$ncbi.hgnc.ensembl.hgnc_id.2)
which(test$ncbi.hgnc.ensembl.hgnc_id.1!=test$ncbi.hgnc.ensembl.hgnc_id.2)


idx <- which(test$ncbi.hgnc.ensembl.hgnc_id!=test$ncbi.hgnc.ensembl.hgnc_id.2)
#View(ncbi.hgnc.ensembl[idx,c(2,3,5,6,17:21,37:38,70,75:78)])
#View(ncbi.hgnc.ensembl[,c(2,3,5,6,17:21,37:38,70,75:78)])

colnames(ncbi.hgnc.ensembl)
ncbi.hgnc.ensembl[idx,41:51] <- NA


# ENTREZ ID
test <- data.frame(ncbi.hgnc.ensembl$GeneID, ncbi.hgnc.ensembl$entrez_id, ncbi.hgnc.ensembl$entrez_id)
which(test$ncbi.hgnc.ensembl.GeneID!=test$ncbi.hgnc.ensembl.entrez_id) # good
which(test$ncbi.hgnc.ensembl.GeneID!=test$ncbi.hgnc.ensembl.entrez_id.1) # good
which(test$ncbi.hgnc.ensembl.entrez_id!=test$ncbi.hgnc.ensembl.entrez_id.1) # good



###

which(duplicated(ncbi.hgnc.ensembl$GeneID, incomparables = NA))
which(duplicated(ncbi.hgnc.ensembl$hgnc_id, incomparables = NA))
which(duplicated(ncbi.hgnc.ensembl$ensembl_gene_id, incomparables = NA))

idx <- which(duplicated(ncbi.hgnc.ensembl$hgnc_id, incomparables = NA))
dupids <- ncbi.hgnc.ensembl$hgnc_id[idx]
dupids

View(ncbi.hgnc.ensembl[ncbi.hgnc.ensembl$hgnc_id %in% dupids,])


############################

idx <- which(is.na(ncbi.hgnc.ensembl$ensembl_gene_id))
sum(!is.na(ncbi.hgnc.ensembl$ensembl_gene_id.1[idx]))
sum(!is.na(ncbi.hgnc.ensembl$ensembl_gene_id.2[idx]))

idx <- which(is.na(ncbi.hgnc.ensembl$GeneID))
sum(!is.na(ncbi.hgnc.ensembl$entrez_id[idx]))
sum(!is.na(ncbi.hgnc.ensembl$entrezgene_id[idx]))

idx <- which(is.na(ncbi.hgnc.ensembl$hgnc_id))
sum(!is.na(ncbi.hgnc.ensembl$hgnc_id.1[idx]))
sum(!is.na(ncbi.hgnc.ensembl$hgnc_id.2[idx]))

colnames(ncbi.hgnc.ensembl)

idx <- which(is.na(ncbi.hgnc.ensembl$Symbol))
idx
ncbi.hgnc.ensembl$Symbol[idx] <- ncbi.hgnc.ensembl$symbol[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$Symbol))
idx
ncbi.hgnc.ensembl$Symbol[idx] <- ncbi.hgnc.ensembl$external_gene_name[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$Symbol))
idx
ncbi.hgnc.ensembl$Symbol[idx] <- ncbi.hgnc.ensembl$Symbol_from_nomenclature_authority[idx]


idx <- which(is.na(ncbi.hgnc.ensembl$description))
idx
ncbi.hgnc.ensembl$description[idx] <- ncbi.hgnc.ensembl$name[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$description))
idx
ncbi.hgnc.ensembl$description[idx] <- ncbi.hgnc.ensembl$description.1[idx]

idx <- which(is.na(ncbi.hgnc.ensembl$description))
idx
ncbi.hgnc.ensembl$description[idx] <- ncbi.hgnc.ensembl$Full_name_from_nomenclature_authority[idx]


sum(ncbi.hgnc.ensembl$Symbol != ncbi.hgnc.ensembl$symbol, na.rm=T)

idx <- which(ncbi.hgnc.ensembl$Symbol != ncbi.hgnc.ensembl$symbol)
View(with(ncbi.hgnc.ensembl[idx,], data.frame(Symbol, symbol, external_gene_name)))
ncbi.hgnc.ensembl$Symbol[idx] <- ncbi.hgnc.ensembl$symbol[idx]

## keep both
idx <- which(ncbi.hgnc.ensembl$Symbol != ncbi.hgnc.ensembl$external_gene_name)
View(with(ncbi.hgnc.ensembl[idx,], data.frame(Symbol, symbol, external_gene_name)))


View(ncbi.hgnc.ensembl)
colnames(ncbi.hgnc.ensembl)
table(ncbi.hgnc.ensembl$locusTag)
final.annotation <- with(ncbi.hgnc.ensembl, 
                         data.frame(tax_id=X.tax_id,ensembl_id=ensembl_gene_id,entrez_id=GeneID,
                                    hgnc_id=hgnc_id,gene_name=Symbol,external_gene_name=external_gene_name,
                                    description, synonyms=Synonyms, alias_symbol, alias_name,
                                    previous_symbol=prev_symbol, previous_name=prev_name, type_of_gene,
                                    gene_biotype,
                                    map_location,chromosome=chromosome_name,start_position, end_position, strand,
                                    vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,mgd_id,rgd_id),
                                    stringsAsFactors = F)

final.annotation <- final.annotation[order(final.annotation$ensembl_id),]
View(final.annotation)

saveRDS(final.annotation, file='data/Annotation/Homo_Sapiens_Gene_Annotation_ENSEMBL_HGNC_ENTREZ.RDS')


### for PCaDB
gene.annotation <- readRDS('data/Annotation/Homo_Sapiens_Gene_Annotation_ENSEMBL_HGNC_ENTREZ.RDS')
gene.annotation$alias_symbol <- gsub('"', '', gene.annotation$alias_symbol, fixed=T)

filter <- which(duplicated(gene.annotation$ensembl_id) | is.na(gene.annotation$ensembl_id))
gene.annotation <- gene.annotation[-filter,]

gene.annotation$ensembl_id <- as.character(gene.annotation$ensembl_id)
gene.annotation$entrez_id <- as.character(gene.annotation$entrez_id)
gene.annotation$hgnc_id <- as.character(gene.annotation$hgnc_id)
gene.annotation$gene_name <- as.character(gene.annotation$gene_name)
gene.annotation$description <- as.character(gene.annotation$description)
gene.annotation$alias_symbol <- gsub('|', ', ', as.character(gene.annotation$alias_symbol), fixed = T)

rownames(gene.annotation) <- gene.annotation$ensembl_id

gene.annotation <- gene.annotation[order(gene.annotation$gene_name, decreasing = F),]

saveRDS(gene.annotation, file='data/Annotation/PCaDB_Gene_Annotation.RDS')