RNA-seq of skin organoids on GRCh38+chrHsv1

  1. import data and PCA plot

     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon")
     # Define paths to your Salmon output quantification files
     files <- c("control_r1" = "./control_r1/quant.sf",
               "control_r2" = "./control_r2/quant.sf",
               "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
               "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
               "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
               "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
               "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
               "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
               "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
               "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
     # Define the replicates and condition of the samples
     replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
     condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
     # Define the colData for DESeq2
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     # Create DESeqDataSet object
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     # Pre-filtering is optional here: when using tximport + DESeq2, DESeq2 applies its own
     # independent filtering of low-count genes during the differential expression step.
     # Optional pre-filter: keep only genes with counts > 3 in more than 2 samples
     # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
     # Output raw count data to a CSV file
     write.csv(counts(dds), file="transcript_counts.csv")
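     # Optional sketch: tximport also returns the Salmon TPM values in txi$abundance, which can
     # be exported alongside the raw transcript counts (hypothetical output file name).
     write.csv(txi$abundance, file="transcript_tpm.csv")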
    
     # -- gene-level count data --
     # Read in the tx2gene map from salmon_tx2gene.tsv
     #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
     tx2gene <- read.table("salmon_tx2gene.tsv", sep="\t", header=FALSE, stringsAsFactors=FALSE)
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    
     write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
     #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
     #TODO: why were so many reads removed as too_short during alignment?
     #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
     dim(counts(dds))
     head(counts(dds), 10)
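     # Optional sketch (assumption: host gene IDs start with "ENSG" and all other rows are
     # chrHsv1 features): fraction of counts assigned to viral genes per sample, useful for
     # judging the normalization issues discussed in the DE section below.
     viral <- !grepl("^ENSG", rownames(counts(dds)))
     round(colSums(counts(dds)[viral, , drop=FALSE]) / colSums(counts(dds)), 3)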
    
     #DEBUG: DESeq() should not be used here!?
     #TODO_NEXT_WEEK: rerun without calling DESeq(dds) first, to check whether the results are the same as in process_1
     #dds <- DESeq(dds)
     rld <- rlogTransformation(dds)
    
     # draw simple pca and heatmap
     library(gplots) 
     library("RColorBrewer")
     #mat <- assay(rld)
     #mm <- model.matrix(~condition, colData(rld))
     #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
     #assay(rld) <- mat
     # -- pca --
     png("pca.png", 1200, 800)
     plotPCA(rld, intgroup=c("condition"))
     dev.off()
     # -- heatmap --
     png("heatmap.png", 1200, 800)
     distsRL <- dist(t(assay(rld)))
     mat <- as.matrix(distsRL)
     hc <- hclust(distsRL)
     hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
     heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
     dev.off()
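     # Optional sketch: for larger data sets, vst() is a faster alternative to
     # rlogTransformation() and gives comparable results for PCA/heatmap visualisation
     # (hypothetical output file pca_vst.png).
     vsd <- vst(dds, blind=TRUE)
     png("pca_vst.png", 1200, 800)
     plotPCA(vsd, intgroup=c("condition"))
     dev.off()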
  2. draw 3D PCA plots.

     library(gplots) 
     library("RColorBrewer")
    
     library(ggplot2)
     data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
     write.csv(data, file="plotPCA_data.csv")
     # calculate all PCs (including PC3) with the following code
     library(genefilter)
     ntop <- 500
     rv <- rowVars(assay(rld))
     select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
     mat <- t( assay(rld)[select, ] )
     pc <- prcomp(mat)
     pc$x[,1:3]
     #df_pc <- data.frame(pc$x[,1:3])
     df_pc <- data.frame(pc$x)
     identical(rownames(data), rownames(df_pc)) #-->TRUE
    
     data$PC1 <- NULL
     data$PC2 <- NULL
     merged_df <- merge(data, df_pc, by = "row.names")
     #merged_df <- merged_df[, -1]
     row.names(merged_df) <- merged_df$Row.names
     merged_df$Row.names <- NULL  # remove the "name" column
     merged_df$name <- NULL
     merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
     write.csv(merged_df, file="merged_df_10PCs.csv")
     summary(pc)  
     #0.5333  0.2125 0.06852
     #0.8026 0.09042 0.06578
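     # Optional sketch: extract the per-PC variance proportions so the axis labels in the
     # 3D plot below can use the real values instead of hard-coded percentages.
     percentVar <- round(100 * summary(pc)$importance["Proportion of Variance", 1:3], 2)
     percentVar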
    
     draw_3D.py
    
     # adjust the axis proportions to the real variance values in the following plot
     import plotly.graph_objects as go
     import pandas as pd
     from sklearn.decomposition import PCA
     import numpy as np
     # Read in data as a pandas dataframe
     #df = pd.DataFrame({
     #    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
     #    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
     #    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
     #    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
     #    'replicate': ['DI', 'DII', 'DI', 'DII', 'DI']
     #})
     df = pd.read_csv('merged_df_10PCs.csv', index_col=0, header=0)
     df['condition'] = df['condition'].replace("control", "control")
     df['condition'] = df['condition'].replace("HSV.d2", "day 2")
     df['condition'] = df['condition'].replace("HSV.d4", "day 4")
     df['condition'] = df['condition'].replace("HSV.d6", "day 6")
     df['condition'] = df['condition'].replace("HSV.d8", "day 8")
     # Fit PCA model to reduce data dimensions to 3
     pca = PCA(n_components=3)
     pca.fit(df.iloc[:, :-3])
     X_reduced = pca.transform(df.iloc[:, :-3])
    
     # Get variance ratios
     explained_variance_ratio = pca.explained_variance_ratio_
    
     # Add reduced data back to dataframe
     df['PC1'] = X_reduced[:, 0]
     df['PC2'] = X_reduced[:, 1]
     df['PC3'] = X_reduced[:, 2]
     # Create PCA plot with 3D scatter
     fig = go.Figure()
    
     ##ff7f00
     condition_color_map = {
         'control': 'rgb(100, 100, 100)',
         'day 2': '#33a02c',
         'day 4': '#1f78b4',
         'day 6': '#e31a1c',
         'day 8': 'magenta'
     }
     replicate_symbol_map = {'r1': 'circle', 'r2': 'diamond'}
     for replicate, replicate_symbol in replicate_symbol_map.items():
         for condition, condition_color in condition_color_map.items():
             mask = (df['condition'] == condition) & (df['replicate'] == replicate)
             fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                         mode='markers',
                                         name=f'{condition}' if replicate == 'r1' else None,
                                         legendgroup=f'{condition}',
                                         showlegend=True if replicate == 'r1' else False,
                                         marker=dict(size=6 if replicate_symbol in ['diamond'] else 10, opacity=0.8, color=condition_color, symbol=replicate_symbol)))
     for replicate, replicate_symbol in replicate_symbol_map.items():
         fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                                     mode='markers',
                                     name=replicate,
                                     legendgroup=f'{replicate}',
                                     showlegend=True,
                                     marker=dict(size=10, opacity=1, color='black', symbol=replicate_symbol),
                                     hoverinfo='none'))
     # Annotations for the legend blocks
      #TODO: compute the PC variance percentages instead of hard-coding them in the axis titles.
      #TODO: adjust the axis lengths according to the actual data ranges!
     fig.update_layout(
         annotations=[
             dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                   text='Condition', font=dict(size=15)),
             dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                   text='Replicate', font=dict(size=15))
         ],
         scene=dict(
             #aspectmode='cube',
             #xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', scaleratio=0.53),
             #yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', scaleratio=0.21),
             #zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', scaleratio=0.07),
             aspectmode='manual',
             aspectratio=dict(x=explained_variance_ratio[0], y=explained_variance_ratio[1], z=explained_variance_ratio[2]),
             xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', range=[min(df['PC1']), max(df['PC1'])]),
             yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', range=[min(df['PC2']), max(df['PC2'])]),
             zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', range=[min(df['PC3']), max(df['PC3'])]),
    
             bgcolor='white'
         ),
         margin=dict(l=5, r=5, b=5, t=0)  # Adjust the margins to prevent clipping of axis titles
     )
     #fig.show()
     fig.write_image("fig1.svg")
    
     fig.update_layout(
         annotations=[
             dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                 text='Condition', font=dict(size=15)),
              dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                  text='Replicate', font=dict(size=15)),
             dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
                 text=f'PC3: {explained_variance_ratio[2]*100:.2f}% v.', font=dict(size=15), textangle=-90)
         ],
         scene=dict(
             aspectmode='manual',
             aspectratio=dict(x=explained_variance_ratio[0]*2, y=explained_variance_ratio[1]*2, z=explained_variance_ratio[2]*2),
             #, range=[min(df['PC1']), max(df['PC1'])]
             #, range=[min(df['PC2']), max(df['PC2'])]
             #, range=[min(df['PC3']), max(df['PC3'])]
             xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC1: {explained_variance_ratio[0]*100:.2f}% variance'),
             yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC2: {explained_variance_ratio[1]*100:.2f}% v.'),
             zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=''),
             bgcolor='white'
         ),
         margin=dict(l=0, r=0, b=0, t=0)  # Adjust the margins to prevent clipping of axis titles
     )
     fig.write_image("PCA_3D.svg")
     #/usr/bin/convert g235.png -crop 3250x1680+1+750 PCA_3D_.png
  3. (optional) estimate size factors

     > head(dds)
     class: DESeqDataSet 
     dim: 6 10 
     metadata(1): version
     assays(6): counts avgTxLength ... H cooks
     rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
       ENSG00000000938
     rowData names(34): baseMean baseVar ... deviance maxCooks
     colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
     colData names(2): condition replicate
    
     # convert BAM to bigWig with deepTools, using the inverse of DESeq2's size factors as scale factors
     sizeFactors(dds)
     #NULL
     dds <- estimateSizeFactors(dds)
     > sizeFactors(dds)
    
     raw_counts <- counts(dds)
     normalized_counts <- counts(dds, normalized=TRUE)
     #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA)
     #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    
     # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
     nm <- assays(dds)[["avgTxLength"]]
     sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
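     # Sketch (hypothetical file name): write the inverse of these matrix-based size factors to a
     # table, so they can be pasted into the bamCoverage --scaleFactor calls at the end of this step.
     write.csv(data.frame(sample=names(sf), scaleFactor=1/sf),
               file="deeptools_scale_factors.csv", row.names=FALSE)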
    
     assays(dds)$counts  # for count data
     assays(dds)$avgTxLength  # for average transcript length, etc.
     assays(dds)$normalizationFactors
    
     Under normal circumstances the size factors are stored in the DESeqDataSet object itself, not in its assays, so they cannot be retrieved via assays(). Here sizeFactors(dds) stays NULL because tximport supplies avgTxLength and DESeq2 stores gene- and sample-specific normalization factors instead, as the assay listing confirms:
     > assays(dds)
     List of length 6
     names(6): counts avgTxLength normalizationFactors mu H cooks
    
     To compute the size factors manually, DESeq2's median-of-ratios method can be applied directly to the count matrix. A very simplified version:
    
     geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
     # per-sample median of the count / geometric-mean ratios (all-zero rows drop out as NaN)
     sizeFactors(dds) <- apply(sweep(assays(dds)$counts, 1, geoMeans, "/"), 2, median, na.rm=TRUE)
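     # Sanity check (sketch): these manually derived factors should be in the same range as the
     # matrix-based estimate sf obtained above with estimateSizeFactorsForMatrix(); they will not
     # be identical because avgTxLength is ignored here.
     round(sizeFactors(dds), 4)
     round(sf, 4)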
    
     # ---- DEBUG END ----
    
     # in the console
     #  control_r1  ...
     # 1/0.9978755  ... 
    
     # example output (note: from a different data set, hence the HeLa sample names):
     > sizeFactors(dds)
                         HeLa_TO_r1                      HeLa_TO_r2 
                           0.9978755                       1.1092227 
    
     1/0.9978755=1.002129023
     1/1.1092227=0.901532217
    
     #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220
  4. compare the normalization methods

     # ---- draw normalization before and after ----
     ### Let's implement such a function
     ### cds is a DESeqDataSet (count data set)
     estimSf <- function (cds){
         # Get the count matrix
         cts <- counts(cds)
    
         # Compute the geometric mean
         geomMean <- function(x) prod(x)^(1/length(x))
    
         # Compute the geometric mean of each row
         gm.mean  <-  apply(cts, 1, geomMean)
    
         # Zero values are set to NA (avoids subsequent division by 0)
         gm.mean[gm.mean == 0] <- NA
    
         # Divide each row by its corresponding geometric mean
         # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
         # MARGIN: 1 or 2 (rows or columns)
         # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
         # FUN: the function to be applied
         cts <- sweep(cts, 1, gm.mean, FUN="/")
    
         # Compute the median over the columns
         med <- apply(cts, 2, median, na.rm=TRUE)
    
         # Return the scaling factor
         return(med)
     }
     #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
     #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
     #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
     #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
     #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
     #DESeq2’s median of ratios [1]
     #EdgeR’s trimmed mean of M values (TMM) [2]
     #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html  #very good website!
     test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
     sum(test_normcount != normalized_counts)
    
     head(estimSf(dds))
     all(round(estimSf(dds),6) == round(sizeFactors(dds), 6))
     ## Checking the normalization
     png("normalization.png", width=800, height=600)
     epsilon <- 1 # pseudo-count to avoid problems with log(0)
     par(mfrow=c(1,2),cex.lab=0.7)
     boxplot(log2(raw_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(raw counts)", horizontal=TRUE, main="Raw counts")
     boxplot(log2(normalized_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") 
     #boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
     #plotDensity(log2(counts(dds.norm)+epsilon),  col=col.pheno.selected, 
     #            xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) 
     #plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=col.pheno.selected, 
     #            xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) 
     dev.off()
    
     # Since we perform gene-level differential expression analysis with DESeq2, splicing plays no role in this analysis.
     # With Nanopore data one could compare transcript length distributions; cell-line transcripts may be long while extracellular vesicle (EV) transcripts may be short. (See the sketch below.)
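     # Sketch: compare the (average) transcript length distributions per sample using the length
     # matrix returned by tximport (gene-level avgTxLength here; with txOut=TRUE in step 1 the same
     # plot shows transcript-level effective lengths). Hypothetical output file name.
     png("avgTxLength_boxplot.png", width=800, height=600)
     boxplot(log10(txi$length + 1), las=2, cex.axis=0.7,
             ylab="log10(average transcript length + 1)", main="Transcript length per sample")
     dev.off()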
    
     library(ggplot2)
     library(gridExtra)
     library(reshape2)
     library(mixOmics)
     library(RColorBrewer)
     library(DESeq2)
     library(edgeR)
     library(VennDiagram)
     library(devtools)
     raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ]
     dim(raw_counts_wn)
    
     #--Raw counts--
     pseudo_counts <- log2(raw_counts_wn + 1)
     head(pseudo_counts)
     df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
     names(df_raw)[1:2]<- c("id", "sample")
     df_raw$method <- rep("Raw counts", nrow(df_raw))  
     head(df_raw)
    
     #--DESeq--
     cData <- data.frame(row.names=colnames(raw_counts_wn), condition=condition, replicate=replicate)
     dge <- DESeqDataSetFromMatrix(countData=raw_counts_wn, colData=cData, design=~condition)
     dge <- estimateSizeFactors(dge)
     sizeFactors(dge)
     deseq_normcount <- counts(dge, normalized = TRUE)
     test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dge), "/")
     sum(test_normcount != deseq_normcount)
     pseudo_deseq <- log2(deseq_normcount + 1)
     df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
     names(df_deseq)[1:2]<- c("id", "sample")
     df_deseq$method <- rep("DESeq (RLE)", nrow(df_deseq))  
    
     #--edgeR--
     dge2 <- DGEList(raw_counts_wn)
     dge2
     dge2$samples
    
     #--Total count--
     pseudo_TC <- log2(cpm(dge2) + 1)
     df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
     names(df_TC)[1:2] <- c ("id", "sample")
     df_TC$method <- rep("TC", nrow(df_TC))
    
     ##--RPKM--
     #gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0]
     #pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1)
     #df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
     #names(df_RPKM)[1:2] <- c ("id", "sample")
     #df_RPKM$method <- rep("RPKM", nrow(df_RPKM))
    
     #--Upper quartile--
     dge2 <- calcNormFactors(dge2, method = "upperquartile")
     dge2$samples
     test_normcount <- sweep(dge2$counts, 2,
                             dge2$samples$lib.size*dge2$samples$norm.factors / 10^6,
                             "/")
     range(as.vector(test_normcount - cpm(dge2)))
     pseudo_UQ <- log2(cpm(dge2) + 1)
    
     df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
     names(df_UQ)[1:2] <- c ("id", "sample")
     df_UQ$method <- rep("UQ", nrow(df_UQ))
    
     #--TMM--
     dge2 <- calcNormFactors(dge2, method = "TMM")
     dge2$samples
     pseudo_TMM <- log2(cpm(dge2) + 1)
     df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
     names(df_TMM)[1:2] <- c ("id", "sample")
     #MODIFIED: the TMM-normalized counts are relabelled as "DESeq (RLE)" (originally "TMM") and are the ones used in the comparison plot below
     df_TMM$method <- rep("DESeq (RLE)", nrow(df_TMM))
    
     #--Comparison--
     png("normalization.png", width=800, height=600)
     #df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_UQ, df_TMM)
     #df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)", "TC",  "TMM", "UQ"))
     df_allnorm <- rbind(df_raw, df_TMM)
     df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)"))
     p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
     p <- p + geom_boxplot()  
     p <- p + theme_bw()
     p <- p + ggtitle("Boxplots of normalized pseudo counts\nfor all samples by normalization methods")
     p <- p + facet_grid(. ~ method) 
     p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
     p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(), 
                   axis.ticks.x = element_blank())
     print(p)
     dev.off()
  5. select the differentially expressed genes

     #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
     #https://www.biostars.org/p/282295/
     #https://www.biostars.org/p/335751/
    
     #> condition
     # [1] control control HSV.d2  HSV.d2  HSV.d4  HSV.d4  HSV.d6  HSV.d6  HSV.d8  HSV.d8 
     #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    
     #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes
     setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes")
     #---- relevel to control ----
     dds$condition <- relevel(dds$condition, "control")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
    
     dds$condition <- relevel(dds$condition, "HSV.d2")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
    
     dds$condition <- relevel(dds$condition, "HSV.d4")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
    
     dds$condition <- relevel(dds$condition, "HSV.d6")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("HSV.d8_vs_HSV.d6")
    
     for (i in clist) {
       contrast = paste("condition", i, sep="_")
       res = results(dds, name=contrast)
       res <- res[!is.na(res$log2FoldChange),]
       res_df <- as.data.frame(res)
    
       write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
       up <- subset(res_df, padj<=0.05 & log2FoldChange>=1)
       down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1)
       write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
       write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
     }
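     # Optional sketch: summarize the number of up-/down-regulated genes per contrast, reading back
     # the *-up.txt / *-down.txt files written by the loop above for the contrasts currently in clist.
     deg_summary <- t(sapply(clist, function(i) {
       c(up   = nrow(read.csv(paste(i, "up.txt",   sep="-"), row.names=1)),
         down = nrow(read.csv(paste(i, "down.txt", sep="-"), row.names=1)))
     }))
     deg_summary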
    
       echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
       echo "res = results(dds, name=contrast)"
       #echo "res <- res[!is.na(res$log2FoldChange),]"
       echo "res <- na.omit(res)"
    
     ##https://github.com/kevinblighe/EnhancedVolcano
     #BiocManager::install("EnhancedVolcano")
     library("EnhancedVolcano")
     for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
     #for i in HSV.d8_vs_control; do
       echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
       echo "res_df <- as.data.frame(res)"
       echo "png(\"${i}.png\",width=800, height=600)"
       #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
       #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
       echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
       echo "dev.off()"
     done
    
     #DEBUG: why are some genes up-regulated in HSV.d8 vs control? --> This is a normalization artifact: the raw read totals should be kept in the normalization (which would leave essentially all host genes down-regulated) rather than using the default normalization method! See the sketch below.
     #res <- read.csv(file = paste("HSV.d8_vs_control", "all.txt", sep="-"), row.names = 1)
     #res_df <- as.data.frame(res)
     #png("HSV.d8_vs_control.png",width=800, height=600)
     #EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("HSV.d8 versus control"))
     #dev.off()
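     # Sketch of the idea in the DEBUG note above (an assumption, not what was run here): derive
     # scale factors from the total raw library sizes, so the global drop of host transcription at
     # late infection is not rescaled away by the default median-of-ratios normalization.
     libsize    <- colSums(counts(dds))
     libsize_sf <- libsize / exp(mean(log(libsize)))   # geometric-mean-centred library sizes
     round(libsize_sf, 3)
     # these could be assigned to sizeFactors(dds) before re-running DESeq(), if desired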
    
     # in the degenes directory, run in the console:
     for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
  6. cluster the genes and draw a heatmap

     for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    
     cat *.id | sort -u > ids
     # add a "Gene_Id" header line to the ids file and remove the quotation marks
     GOI <- read.csv("ids")$Gene_Id
     RNASeq.NoCellLine <- assay(rld)
    
     #install.packages("gplots")
     library("gplots")
    
     #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
     #datamat = RNASeq.NoCellLine[GOI, ]
     datamat = RNASeq.NoCellLine
     write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
     constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
     if(any(constant_rows)) {
       cat("Removing", sum(constant_rows), "constant rows.\n")
       datamat <- datamat[!constant_rows, ]
     }
     hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
     mycl = cutree(hr, h=max(hr$height)/1.5)
     mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN");
     mycol = mycol[as.vector(mycl)]
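     # Optional: check how many genes fall into each cluster before writing the member lists below.
     table(mycl)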
    
     #png("DEGs_heatmap.png", width=900, height=800)
     #cex.lab=10, labRow="",
     png("DEGs_heatmap.png", width=900, height=1000)
     heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                 scale='row',trace='none',col=bluered(75), 
                 RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
     #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
     dev.off()
    
     #### cluster members #####
     write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
     write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
     write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
     #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    
     ~/Tools/csv2xls-0.4/csv_to_xls.py \
     significant_gene_expressions.txt \
     -d',' -o DEGs_heatmap_expression_data.xls;
