RNA-seq processing for control-LT-LTtr-K331A

gene_x 0 like s 376 view s

Tags: RNA-seq

#nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Denise_tx_epi_MCPyV/Raw_Data_RNAseq_K331A/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name

#under jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq/Raw_Data_RNAseq_K331A_d8_SPANDx$
#Replace "p600" with "control", "p602" with "LT", "p605" with "LTtr", and "p783" with "K331A". Please note that the RNAseq data from the LT_K331A_d8 replicates were sequenced alongside the ChIPseq batch.
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf928/1_NHDF_Donor_2_p600_S23_R1_001.fastq.gz control-d8-DII_re.fastq.gz
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf929/2_NHDF_Donor_2_p783_S24_R1_001.fastq.gz LT-K331A-d8-DII_re.fastq.gz

(rnaseq_2021) jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq$ nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name

nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --star_index /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/ --bed12 /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name

rsync -a -P jhuang@hamm:~/REFs/Homo_sapiens/UCSC/hg38 .

#cut -f2- merged_gene_counts.txt > merged_gene_counts_2.txt

library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("/media/jhuang/Elements1/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/featureCounts_together_d8_2")

d.raw<- read.delim2("merged_gene_counts_2.txt",sep="\t", header=TRUE, row.names=1)

 [1] K331A_DonorIAligned.sortedByCoord.out.bam           
 [2] V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam 
 [3] V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam
 [4] V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
 [5] V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam 
 [6] V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam 
 [7] K331A_DonorIIAligned.sortedByCoord.out.bam          
 [8] V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam 
 [9] V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam 
[10] V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
[11] V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam
[12] V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam
[13] V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
[14] V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam

#replace p600 to control, p602 to LT, p605 to LTtr


colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "LT_d8","LT_d8",  "LTtr_d8","LTtr_d8",  "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII"))
#NOTE_3D_PCA (IMPORTANT): should always keep the name 'DI' and 'DII' as donor names; they are important for 3D PCA drawing!
#NOTE2_3D_PCA: correct the condition names: control_d8-->control d8, K331A_d8-->K331A d8, LT_d8-->LT d8, LTtr_d8-->LTtr d8.
donor = as.factor(c("DI","DII",  "DI","DII",  "DI","DII",  "DI","DII"))
#Note that we need selected_reordered.raw
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

> sizeFactors(dds)
NULL
> dds <- estimateSizeFactors(dds)
> sizeFactors(dds)
 ctrl_d3_DonorI ctrl_d3_DonorII  *ctrl_d8_DonorI *ctrl_d8_DonorII    LT_d3_DonorI 
      1.2221026       0.8126614       1.0631329       0.6271887       0.8553245 
  LT_d3_DonorII    *LT_d8_DonorI   *LT_d8_DonorII  LTtr_d3_DonorI LTtr_d3_DonorII 
      1.0386420       1.0324488       1.3568818       1.3070621       0.8283129 
 *LTtr_d8_DonorI *LTtr_d8_DonorII    *K331A_DonorI   *K331A_DonorII 
      1.6206130       0.8352236       1.6908424       0.6012788

ctrl_d8_DonorI ctrl_d8_DonorII    LT_d8_DonorI   LT_d8_DonorII  LTtr_d8_DonorI   LTtr_d8_DonorII    K331A_DonorI   K331A_DonorII 
     1.0429778       0.6131069       1.0162992       1.3406000       1.5984592       0.8268460       1.6634451       0.5943143 
> sizeFactors(dds)
  ctrl_d8_DonorI  ctrl_d8_DonorII     LT_d8_DonorI    LT_d8_DonorII 
       1.0429778        0.6131069        1.0162992        1.3406000 
  LTtr_d8_DonorI  LTtr_d8_DonorII  K331A_d8_DonorI K331A_d8_DonorII 
       1.5984592        0.8268460        1.6634451        0.5943143

./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam

./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam

./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam

./STAR/K331A_DonorIAligned.sortedByCoord.out.bam
./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam


bamCoverage --bam ./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.8182619037059573 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.230524791752137 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9406161731990421 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.5944164810367278 --effectiveGenomeSize 2913022398

bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorI_norm.bw --binSize 10 --scaleFactor 1.1691469144166922 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorII_norm.bw --binSize 10 --scaleFactor 0.9627956504743693 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9685710322875091 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorII_norm.bw --binSize 10 --scaleFactor 0.7369838699288324 --effectiveGenomeSize 2913022398

bamCoverage --bam ./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.7650745897995206 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.2072732417906324 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.617050461769713 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.1972841763570858 --effectiveGenomeSize 2913022398

bamCoverage --bam ./STAR/K331A_DonorIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorI_norm.bw --binSize 10 --scaleFactor 0.5914211756222816 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorII_norm.bw --binSize 10 --scaleFactor 1.6631219993121327 --effectiveGenomeSize 2913022398
#END


rld <- rlogTransformation(dds)


library(ggplot2)
svg("pca6.svg")
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
  geom_point(size=3) +
  scale_color_manual(values = c("control" = "grey",
                                "K331A"="#1f78b4")) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed()
dev.off()

png("pca.png", width=800, height=800)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
  geom_point(size=3) +
  scale_color_manual(values = c("control" = "grey",
                                "K331A"="#1f78b4")) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed()
dev.off()

png("pca2.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()




library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
## define the desired order of row names
#desired_order <- rownames(data)
## sort the data frame by the desired order of row names
#df <- df[match(desired_order, rownames(df_pc)), ]
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL  # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
#results in 26PCs: merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
write.csv(merged_df, file="merged_df_8PCs.csv")
#> summary(pc)    #0.5657 0.2195 0.1347  --> 0.57 0.22 0.13
#Importance of components:
#                           PC1     PC2    PC3    PC4     PC5     PC6     PC7
#Standard deviation     19.4051 12.0878 9.4683 4.5569 4.01016 3.00610 2.71918
#Proportion of Variance  0.5657  0.2195 0.1347 0.0312 0.02416 0.01358 0.01111
#Cumulative Proportion   0.5657  0.7853 0.9200 0.9512 0.97531 0.98889 1.00000





#---- * to untreated ----
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8", "LT_d8_vs_control_d8", "LTtr_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#cd ${comp}_output; \
#~/Tools/csv2xls-0.4/csv_to_xls.py upregulated_filtered downregulated_filtered background -d$',' -o ../${comp}_degenes.xls; \
#cd ..; \


~/Tools/csv2xls-0.4/csv_to_xls.py \
K331A_d8_vs_control_d8-all.txt \
K331A_d8_vs_control_d8-up.txt \
K331A_d8_vs_control_d8-down.txt \
-d$',' -o K331A_d8_vs_control_d8.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
LT_d8_vs_control_d8-all.txt \
LT_d8_vs_control_d8-up.txt \
LT_d8_vs_control_d8-down.txt \
-d$',' -o LT_d8_vs_control_d8.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
LTtr_d8_vs_control_d8-all.txt \
LTtr_d8_vs_control_d8-up.txt \
LTtr_d8_vs_control_d8-down.txt \
-d$',' -o LTtr_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt
#   371 LT_d8_vs_control_d8-up.txt
#   292 LTtr_d8_vs_control_d8-up.txt
#  1382 total
#  323 K331A_d8_vs_control_d8-down.txt
#  141 LT_d8_vs_control_d8-down.txt
#   80 LTtr_d8_vs_control_d8-down.txt
#  544 total




library(ggrepel)

geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()



up1 <-read.csv(file="./K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="./LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="./LTtr_d8_vs_control_d8-up.txt")

down1 <-read.csv(file="./K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="./LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="./LTtr_d8_vs_control_d8-down.txt")

RNASeq.NoCellLine <- assay(rld)
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL,  up3$SYMBOL,down3$SYMBOL,  up5$SYMBOL,down5$SYMBOL)  #1920
all_genes <- unique(all_genes)   #1223
RNASeq.NoCellLine_  <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")


##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.4) #1.08: 3 clusters; 
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75), 
            RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()


#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
write.csv(names(subset(mycl, mycl == '4')),file='cluster4_DARKMAGENTA.txt')  
write.csv(names(subset(mycl, mycl == '5')),file='cluster5_DARKCYAN.txt')

~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
cluster4_DARKMAGENTA.txt \
cluster5_DARKCYAN.txt \
-d$',' -o gene_clusters.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;






# ------------  LT (p602) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "LT_d8","LT_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII"))
donor = as.factor(c("DI","DII",  "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII    LT_d8_DonorI   LT_d8_DonorII 
#      1.0879047       0.6411992       1.0519079       1.3937541

rld <- rlogTransformation(dds)

setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LT_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}


#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LT_d8_vs_control_d8-all.txt \
#LT_d8_vs_control_d8-up.txt \
#LT_d8_vs_control_d8-down.txt \
#-d$',' -o LT_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt
#   371 LT_d8_vs_control_d8-up.txt --> 399
#   292 LTtr_d8_vs_control_d8-up.txt
#  323 K331A_d8_vs_control_d8-down.txt
#  141 LT_d8_vs_control_d8-down.txt --> 184
#   80 LTtr_d8_vs_control_d8-down.txt

library(ggrepel)

geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()



# ------------ LTtr (p605) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "LTtr_d8","LTtr_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII"))
donor = as.factor(c("DI","DII",  "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII  LTtr_d8_DonorI LTtr_d8_DonorII 
#      1.0990996       0.6476647       1.6536010       0.8632049

rld <- rlogTransformation(dds)

setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LTtr_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LTtr_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LTtr_d8_vs_control_d8-all.txt \
#LTtr_d8_vs_control_d8-up.txt \
#LTtr_d8_vs_control_d8-down.txt \
#-d$',' -o LTtr_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt
#   371 LT_d8_vs_control_d8-up.txt --> 399
#   292 LTtr_d8_vs_control_d8-up.txt --> 380
#  323 K331A_d8_vs_control_d8-down.txt
#  141 LT_d8_vs_control_d8-down.txt --> 184
#   80 LTtr_d8_vs_control_d8-down.txt --> 136

library(ggrepel)

geness_res <- read.csv(file = paste("LTtr_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("LTtr_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()





# ------------ LT K331A (p783) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII"))
donor = as.factor(c("DI","DII",  "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
#  ctrl_d8_DonorI  ctrl_d8_DonorII  K331A_d8_DonorI K331A_d8_DonorII 
#       1.1747544        0.6871283        1.8656332        0.6817289

rld <- rlogTransformation(dds)

setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_K331A_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#~/Tools/csv2xls-0.4/csv_to_xls.py \
#K331A_d8_vs_control_d8-all.txt \
#K331A_d8_vs_control_d8-up.txt \
#K331A_d8_vs_control_d8-down.txt \
#-d$',' -o K331A_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt --> 315
#   371 LT_d8_vs_control_d8-up.txt --> 399
#   292 LTtr_d8_vs_control_d8-up.txt --> 380
#  323 K331A_d8_vs_control_d8-down.txt --> 277
#  141 LT_d8_vs_control_d8-down.txt --> 184
#   80 LTtr_d8_vs_control_d8-down.txt --> 136

library(ggrepel)

geness_res <- read.csv(file = paste("K331A_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("K331A_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()



# ------------------- heatmap for separate analyses ----------------
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/heatmap_separate_analyses")
up1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-up.txt")

down1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-down.txt")

RNASeq.NoCellLine <- assay(rld)  # using the overview analysis, only choose the gene ids between conditions seperately. NOT DONE!
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL,  up3$SYMBOL,down3$SYMBOL,  up5$SYMBOL,down5$SYMBOL)  #1685
all_genes <- unique(all_genes)   #1685-->1016
RNASeq.NoCellLine_  <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")


##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.08)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75), 
            RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()


#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')

~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
-d$',' -o gene_culsters.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum