RNA-seq of skin organoids on GRCh38+chrHsv1

  1. import data and PCA plot

     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon")
     # Define paths to your Salmon output quantification files
     files <- c("control_r1" = "./control_r1/quant.sf",
               "control_r2" = "./control_r2/quant.sf",
               "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
               "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
               "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
               "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
               "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
               "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
               "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
               "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
     # Define the replicates and condition of the samples
     replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
     condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
     # Define the colData for DESeq2
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     # Create DESeqDataSet object
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     # Pre-filtering is optional here: when using tximport + DESeq2, DESeq2 applies its own
     # independent filtering of low-count genes during the differential expression step.
     # Optional pre-filter: keep only genes with counts > 3 in more than 2 samples
     # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
     # Output raw count data to a CSV file
     write.csv(counts(dds), file="transcript_counts.csv")
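     # Optional sketch: tximport also returns the Salmon TPM values in txi$abundance, which can
     # be exported alongside the raw transcript counts (hypothetical output file name).
     write.csv(txi$abundance, file="transcript_tpm.csv")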
    
     # -- gene-level count data --
     # Read in the tx2gene map from salmon_tx2gene.tsv
     #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
     tx2gene <- read.table("salmon_tx2gene.tsv", sep="\t", header=FALSE, stringsAsFactors=FALSE)
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    
     write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
     #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
     #TODO: why were so many reads removed as too_short during alignment?
     #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
     dim(counts(dds))
     head(counts(dds), 10)
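     # Optional sketch (assumption: host gene IDs start with "ENSG" and all other rows are
     # chrHsv1 features): fraction of counts assigned to viral genes per sample, useful for
     # judging the normalization issues discussed in the DE section below.
     viral <- !grepl("^ENSG", rownames(counts(dds)))
     round(colSums(counts(dds)[viral, , drop=FALSE]) / colSums(counts(dds)), 3)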
    
     #DEBUG: DESeq() should not be used here!?
     #TODO_NEXT_WEEK: rerun without calling DESeq(dds) first, to check whether the results are the same as in process_1
     #dds <- DESeq(dds)
     rld <- rlogTransformation(dds)
    
     # draw simple pca and heatmap
     library(gplots) 
     library("RColorBrewer")
     #mat <- assay(rld)
     #mm <- model.matrix(~condition, colData(rld))
     #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
     #assay(rld) <- mat
     # -- pca --
     png("pca.png", 1200, 800)
     plotPCA(rld, intgroup=c("condition"))
     dev.off()
     # -- heatmap --
     png("heatmap.png", 1200, 800)
     distsRL <- dist(t(assay(rld)))
     mat <- as.matrix(distsRL)
     hc <- hclust(distsRL)
     hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
     heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
     dev.off()
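     # Optional sketch: for larger data sets, vst() is a faster alternative to
     # rlogTransformation() and gives comparable results for PCA/heatmap visualisation
     # (hypothetical output file pca_vst.png).
     vsd <- vst(dds, blind=TRUE)
     png("pca_vst.png", 1200, 800)
     plotPCA(vsd, intgroup=c("condition"))
     dev.off()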
  2. draw 3D PCA plots.

     library(gplots) 
     library("RColorBrewer")
    
     library(ggplot2)
     data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
     write.csv(data, file="plotPCA_data.csv")
     # calculate all PCs (including PC3) with the following code
     library(genefilter)
     ntop <- 500
     rv <- rowVars(assay(rld))
     select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
     mat <- t( assay(rld)[select, ] )
     pc <- prcomp(mat)
     pc$x[,1:3]
     #df_pc <- data.frame(pc$x[,1:3])
     df_pc <- data.frame(pc$x)
     identical(rownames(data), rownames(df_pc)) #-->TRUE
    
     data$PC1 <- NULL
     data$PC2 <- NULL
     merged_df <- merge(data, df_pc, by = "row.names")
     #merged_df <- merged_df[, -1]
     row.names(merged_df) <- merged_df$Row.names
     merged_df$Row.names <- NULL  # remove the "name" column
     merged_df$name <- NULL
     merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
     write.csv(merged_df, file="merged_df_10PCs.csv")
     summary(pc)  
     #0.5333  0.2125 0.06852
     #0.8026 0.09042 0.06578
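     # Optional sketch: extract the per-PC variance proportions so the axis labels in the
     # 3D plot below can use the real values instead of hard-coded percentages.
     percentVar <- round(100 * summary(pc)$importance["Proportion of Variance", 1:3], 2)
     percentVar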
    
     draw_3D.py
    
     # adjust the axis proportions to the real variance values in the following plot
     import plotly.graph_objects as go
     import pandas as pd
     from sklearn.decomposition import PCA
     import numpy as np
     # Read in data as a pandas dataframe
     #df = pd.DataFrame({
     #    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
     #    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
     #    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
     #    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
     #    'replicate': ['DI', 'DII', 'DI', 'DII', 'DI']
     #})
     df = pd.read_csv('merged_df_10PCs.csv', index_col=0, header=0)
     df['condition'] = df['condition'].replace("control", "control")
     df['condition'] = df['condition'].replace("HSV.d2", "day 2")
     df['condition'] = df['condition'].replace("HSV.d4", "day 4")
     df['condition'] = df['condition'].replace("HSV.d6", "day 6")
     df['condition'] = df['condition'].replace("HSV.d8", "day 8")
     # Fit PCA model to reduce data dimensions to 3
     pca = PCA(n_components=3)
     pca.fit(df.iloc[:, :-3])
     X_reduced = pca.transform(df.iloc[:, :-3])
    
     # Get variance ratios
     explained_variance_ratio = pca.explained_variance_ratio_
    
     # Add reduced data back to dataframe
     df['PC1'] = X_reduced[:, 0]
     df['PC2'] = X_reduced[:, 1]
     df['PC3'] = X_reduced[:, 2]
     # Create PCA plot with 3D scatter
     fig = go.Figure()
    
     ##ff7f00
     condition_color_map = {
         'control': 'rgb(100, 100, 100)',
         'day 2': '#33a02c',
         'day 4': '#1f78b4',
         'day 6': '#e31a1c',
         'day 8': 'magenta'
     }
     replicate_symbol_map = {'r1': 'circle', 'r2': 'diamond'}
     for replicate, replicate_symbol in replicate_symbol_map.items():
         for condition, condition_color in condition_color_map.items():
             mask = (df['condition'] == condition) & (df['replicate'] == replicate)
             fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                         mode='markers',
                                         name=f'{condition}' if replicate == 'r1' else None,
                                         legendgroup=f'{condition}',
                                         showlegend=True if replicate == 'r1' else False,
                                         marker=dict(size=6 if replicate_symbol in ['diamond'] else 10, opacity=0.8, color=condition_color, symbol=replicate_symbol)))
     for replicate, replicate_symbol in replicate_symbol_map.items():
         fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                                     mode='markers',
                                     name=replicate,
                                     legendgroup=f'{replicate}',
                                     showlegend=True,
                                     marker=dict(size=10, opacity=1, color='black', symbol=replicate_symbol),
                                     hoverinfo='none'))
     # Annotations for the legend blocks
      #TODO: compute the PC variance percentages instead of hard-coding them in the axis titles.
      #TODO: adjust the axis lengths according to the actual data ranges!
     fig.update_layout(
         annotations=[
             dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                   text='Condition', font=dict(size=15)),
             dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                   text='Replicate', font=dict(size=15))
         ],
         scene=dict(
             #aspectmode='cube',
             #xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', scaleratio=0.53),
             #yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', scaleratio=0.21),
             #zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', scaleratio=0.07),
             aspectmode='manual',
             aspectratio=dict(x=explained_variance_ratio[0], y=explained_variance_ratio[1], z=explained_variance_ratio[2]),
             xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', range=[min(df['PC1']), max(df['PC1'])]),
             yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', range=[min(df['PC2']), max(df['PC2'])]),
             zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', range=[min(df['PC3']), max(df['PC3'])]),
    
             bgcolor='white'
         ),
         margin=dict(l=5, r=5, b=5, t=0)  # Adjust the margins to prevent clipping of axis titles
     )
     #fig.show()
     fig.write_image("fig1.svg")
    
     fig.update_layout(
         annotations=[
             dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                 text='Condition', font=dict(size=15)),
              dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                  text='Replicate', font=dict(size=15)),
             dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
                 text=f'PC3: {explained_variance_ratio[2]*100:.2f}% v.', font=dict(size=15), textangle=-90)
         ],
         scene=dict(
             aspectmode='manual',
             aspectratio=dict(x=explained_variance_ratio[0]*2, y=explained_variance_ratio[1]*2, z=explained_variance_ratio[2]*2),
             #, range=[min(df['PC1']), max(df['PC1'])]
             #, range=[min(df['PC2']), max(df['PC2'])]
             #, range=[min(df['PC3']), max(df['PC3'])]
             xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC1: {explained_variance_ratio[0]*100:.2f}% variance'),
             yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC2: {explained_variance_ratio[1]*100:.2f}% v.'),
             zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=''),
             bgcolor='white'
         ),
         margin=dict(l=0, r=0, b=0, t=0)  # Adjust the margins to prevent clipping of axis titles
     )
     fig.write_image("PCA_3D.svg")
     #/usr/bin/convert g235.png -crop 3250x1680+1+750 PCA_3D_.png
  3. (optional) estimate size factors

     > head(dds)
     class: DESeqDataSet 
     dim: 6 10 
     metadata(1): version
     assays(6): counts avgTxLength ... H cooks
     rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
       ENSG00000000938
     rowData names(34): baseMean baseVar ... deviance maxCooks
     colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
     colData names(2): condition replicate
    
     # convert BAM to bigWig with deepTools, using the inverse of DESeq2's size factors as scale factors
     sizeFactors(dds)
     #NULL
     dds <- estimateSizeFactors(dds)
     > sizeFactors(dds)
    
     raw_counts <- counts(dds)
     normalized_counts <- counts(dds, normalized=TRUE)
     #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA)
     #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    
     # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
     nm <- assays(dds)[["avgTxLength"]]
     sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
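     # Sketch (hypothetical file name): write the inverse of these matrix-based size factors to a
     # table, so they can be pasted into the bamCoverage --scaleFactor calls at the end of this step.
     write.csv(data.frame(sample=names(sf), scaleFactor=1/sf),
               file="deeptools_scale_factors.csv", row.names=FALSE)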
    
     assays(dds)$counts  # for count data
     assays(dds)$avgTxLength  # for average transcript length, etc.
     assays(dds)$normalizationFactors
    
     Under normal circumstances the size factors are stored in the DESeqDataSet object itself, not in its assays, so they cannot be retrieved via assays(). Here sizeFactors(dds) stays NULL because tximport supplies avgTxLength and DESeq2 stores gene- and sample-specific normalization factors instead, as the assay listing confirms:
     > assays(dds)
     List of length 6
     names(6): counts avgTxLength normalizationFactors mu H cooks
    
     To compute the size factors manually, DESeq2's median-of-ratios method can be applied directly to the count matrix. A very simplified version:
    
     geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
     # per-sample median of the count / geometric-mean ratios (all-zero rows drop out as NaN)
     sizeFactors(dds) <- apply(sweep(assays(dds)$counts, 1, geoMeans, "/"), 2, median, na.rm=TRUE)
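     # Sanity check (sketch): these manually derived factors should be in the same range as the
     # matrix-based estimate sf obtained above with estimateSizeFactorsForMatrix(); they will not
     # be identical because avgTxLength is ignored here.
     round(sizeFactors(dds), 4)
     round(sf, 4)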
    
     # ---- DEBUG END ----
    
     # in the console
     #  control_r1  ...
     # 1/0.9978755  ... 
    
     # example output (note: from a different data set, hence the HeLa sample names):
     > sizeFactors(dds)
                         HeLa_TO_r1                      HeLa_TO_r2 
                           0.9978755                       1.1092227 
    
     1/0.9978755=1.002129023
     1/1.1092227=0.901532217
    
     #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220
  4. compare the normalization methods

     # ---- draw normalization before and after ----
     ### Let's implement such a function
     ### cds is a DESeqDataSet (count data set)
     estimSf <- function (cds){
         # Get the count matrix
         cts <- counts(cds)
    
         # Compute the geometric mean
         geomMean <- function(x) prod(x)^(1/length(x))
    
         # Compute the geometric mean of each row
         gm.mean  <-  apply(cts, 1, geomMean)
    
         # Zero values are set to NA (avoids subsequent division by 0)
         gm.mean[gm.mean == 0] <- NA
    
         # Divide each row by its corresponding geometric mean
         # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
         # MARGIN: 1 or 2 (rows or columns)
         # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
         # FUN: the function to be applied
         cts <- sweep(cts, 1, gm.mean, FUN="/")
    
         # Compute the median over the columns
         med <- apply(cts, 2, median, na.rm=TRUE)
    
         # Return the scaling factor
         return(med)
     }
     #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
     #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
     #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
     #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
     #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
     #DESeq2’s median of ratios [1]
     #EdgeR’s trimmed mean of M values (TMM) [2]
     #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html  #very good website!
     test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
     sum(test_normcount != normalized_counts)
    
     head(estimSf(dds))
     all(round(estimSf(dds),6) == round(sizeFactors(dds), 6))
     ## Checking the normalization
     png("normalization.png", width=800, height=600)
     epsilon <- 1 # pseudo-count to avoid problems with log(0)
     par(mfrow=c(1,2),cex.lab=0.7)
     boxplot(log2(raw_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(raw counts)", horizontal=TRUE, main="Raw counts")
     boxplot(log2(normalized_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") 
     #boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
     #plotDensity(log2(counts(dds.norm)+epsilon),  col=col.pheno.selected, 
     #            xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) 
     #plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=col.pheno.selected, 
     #            xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) 
     dev.off()
    
     # Since we perform gene-level differential expression analysis with DESeq2, splicing plays no role in this analysis.
     # With Nanopore data one could compare transcript length distributions; cell-line transcripts may be long while extracellular vesicle (EV) transcripts may be short. (See the sketch below.)
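     # Sketch: compare the (average) transcript length distributions per sample using the length
     # matrix returned by tximport (gene-level avgTxLength here; with txOut=TRUE in step 1 the same
     # plot shows transcript-level effective lengths). Hypothetical output file name.
     png("avgTxLength_boxplot.png", width=800, height=600)
     boxplot(log10(txi$length + 1), las=2, cex.axis=0.7,
             ylab="log10(average transcript length + 1)", main="Transcript length per sample")
     dev.off()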
    
     library(ggplot2)
     library(gridExtra)
     library(reshape2)
     library(mixOmics)
     library(RColorBrewer)
     library(DESeq2)
     library(edgeR)
     library(VennDiagram)
     library(devtools)
     raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ]
     dim(raw_counts_wn)
    
     #--Raw counts--
     pseudo_counts <- log2(raw_counts_wn + 1)
     head(pseudo_counts)
     df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
     names(df_raw)[1:2]<- c("id", "sample")
     df_raw$method <- rep("Raw counts", nrow(df_raw))  
     head(df_raw)
    
     #--DESeq--
     cData <- data.frame(row.names=colnames(raw_counts_wn), condition=condition, replicate=replicate)
     dge <- DESeqDataSetFromMatrix(countData=raw_counts_wn, colData=cData, design=~condition)
     dge <- estimateSizeFactors(dge)
     sizeFactors(dge)
     deseq_normcount <- counts(dge, normalized = TRUE)
     test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dge), "/")
     sum(test_normcount != deseq_normcount)
     pseudo_deseq <- log2(deseq_normcount + 1)
     df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
     names(df_deseq)[1:2]<- c("id", "sample")
     df_deseq$method <- rep("DESeq (RLE)", nrow(df_deseq))  
    
     #--edgeR--
     dge2 <- DGEList(raw_counts_wn)
     dge2
     dge2$samples
    
     #--Total count--
     pseudo_TC <- log2(cpm(dge2) + 1)
     df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
     names(df_TC)[1:2] <- c ("id", "sample")
     df_TC$method <- rep("TC", nrow(df_TC))
    
     ##--RPKM--
     #gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0]
     #pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1)
     #df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
     #names(df_RPKM)[1:2] <- c ("id", "sample")
     #df_RPKM$method <- rep("RPKM", nrow(df_RPKM))
    
     #--Upper quartile--
     dge2 <- calcNormFactors(dge2, method = "upperquartile")
     dge2$samples
     test_normcount <- sweep(dge2$counts, 2,
                             dge2$samples$lib.size*dge2$samples$norm.factors / 10^6,
                             "/")
     range(as.vector(test_normcount - cpm(dge2)))
     pseudo_UQ <- log2(cpm(dge2) + 1)
    
     df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
     names(df_UQ)[1:2] <- c ("id", "sample")
     df_UQ$method <- rep("UQ", nrow(df_UQ))
    
     #--TMM--
     dge2 <- calcNormFactors(dge2, method = "TMM")
     dge2$samples
     pseudo_TMM <- log2(cpm(dge2) + 1)
     df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
     names(df_TMM)[1:2] <- c ("id", "sample")
     #MODIFIED: the TMM-normalized counts are relabelled as "DESeq (RLE)" (originally "TMM") and are the ones used in the comparison plot below
     df_TMM$method <- rep("DESeq (RLE)", nrow(df_TMM))
    
     #--Comparison--
     png("normalization.png", width=800, height=600)
     #df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_UQ, df_TMM)
     #df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)", "TC",  "TMM", "UQ"))
     df_allnorm <- rbind(df_raw, df_TMM)
     df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)"))
     p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
     p <- p + geom_boxplot()  
     p <- p + theme_bw()
     p <- p + ggtitle("Boxplots of normalized pseudo counts\nfor all samples by normalization methods")
     p <- p + facet_grid(. ~ method) 
     p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
     p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(), 
                   axis.ticks.x = element_blank())
     print(p)
     dev.off()
  5. select the differentially expressed genes

     #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
     #https://www.biostars.org/p/282295/
     #https://www.biostars.org/p/335751/
    
     #> condition
     # [1] control control HSV.d2  HSV.d2  HSV.d4  HSV.d4  HSV.d6  HSV.d6  HSV.d8  HSV.d8 
     #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    
     #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes
     setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes")
     #---- relevel to control ----
     dds$condition <- relevel(dds$condition, "control")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
    
     dds$condition <- relevel(dds$condition, "HSV.d2")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
    
     dds$condition <- relevel(dds$condition, "HSV.d4")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
    
     dds$condition <- relevel(dds$condition, "HSV.d6")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d8_vs_HSV.d6")
    
     for (i in clist) {
       contrast = paste("condition", i, sep="_")
       res = results(dds, name=contrast)
       res <- res[!is.na(res$log2FoldChange),]
       res_df <- as.data.frame(res)
    
       write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
       up <- subset(res_df, padj<=0.05 & log2FoldChange>=1)
       down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1)
       write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
       write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
     }
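     # Optional sketch: summarize the number of up-/down-regulated genes per contrast, reading back
     # the *-up.txt / *-down.txt files written by the loop above for the contrasts currently in clist.
     deg_summary <- t(sapply(clist, function(i) {
       c(up   = nrow(read.csv(paste(i, "up.txt",   sep="-"), row.names=1)),
         down = nrow(read.csv(paste(i, "down.txt", sep="-"), row.names=1)))
     }))
     deg_summary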
    
       echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
       echo "res = results(dds, name=contrast)"
       #echo "res <- res[!is.na(res$log2FoldChange),]"
       echo "res <- na.omit(res)"
    
     ##https://github.com/kevinblighe/EnhancedVolcano
     #BiocManager::install("EnhancedVolcano")
     library("EnhancedVolcano")
     for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
     #for i in HSV.d8_vs_control; do
       echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
       echo "res_df <- as.data.frame(res)"
       echo "png(\"${i}.png\",width=800, height=600)"
       #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
       #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
       echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
       echo "dev.off()"
     done
    
     #DEBUG: why are some genes up-regulated in HSV.d8 vs control? --> This is a normalization artifact: the raw read totals should be kept in the normalization (which would leave essentially all host genes down-regulated) rather than using the default normalization method! See the sketch below.
     #res <- read.csv(file = paste("HSV.d8_vs_control", "all.txt", sep="-"), row.names = 1)
     #res_df <- as.data.frame(res)
     #png("HSV.d8_vs_control.png",width=800, height=600)
     #EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("HSV.d8 versus control"))
     #dev.off()
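     # Sketch of the idea in the DEBUG note above (an assumption, not what was run here): derive
     # scale factors from the total raw library sizes, so the global drop of host transcription at
     # late infection is not rescaled away by the default median-of-ratios normalization.
     libsize    <- colSums(counts(dds))
     libsize_sf <- libsize / exp(mean(log(libsize)))   # geometric-mean-centred library sizes
     round(libsize_sf, 3)
     # these could be assigned to sizeFactors(dds) before re-running DESeq(), if desired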
    
     # in the degenes directory, run in the console:
     for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
  6. cluster the genes and draw a heatmap

     for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    
     cat *.id | sort -u > ids
     # add a "Gene_Id" header line to the ids file and remove the quotation marks
     GOI <- read.csv("ids")$Gene_Id
     RNASeq.NoCellLine <- assay(rld)
    
     #install.packages("gplots")
     library("gplots")
    
     #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
     #datamat = RNASeq.NoCellLine[GOI, ]
     datamat = RNASeq.NoCellLine
     write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
     constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
     if(any(constant_rows)) {
       cat("Removing", sum(constant_rows), "constant rows.\n")
       datamat <- datamat[!constant_rows, ]
     }
     hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
     mycl = cutree(hr, h=max(hr$height)/1.5)
     mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN");
     mycol = mycol[as.vector(mycl)]
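     # Optional: check how many genes fall into each cluster before writing the member lists below.
     table(mycl)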
    
     #png("DEGs_heatmap.png", width=900, height=800)
     #cex.lab=10, labRow="",
     png("DEGs_heatmap.png", width=900, height=1000)
     heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                 scale='row',trace='none',col=bluered(75), 
                 RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
     #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
     dev.off()
    
     #### cluster members #####
     write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
     write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
     write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
     #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    
     ~/Tools/csv2xls-0.4/csv_to_xls.py \
     significant_gene_expressions.txt \
     -d',' -o DEGs_heatmap_expression_data.xls;
