RNA-seq data analysis (R-part) for S. epidermidis 1585

  1. Data input and generate PCA plot

     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon")
     # Define paths to your Salmon output quantification files
     files <- c("15m_A" = "./EX_15_min_A/quant.sf",
               "15m_B" = "./EX_15_min_B/quant.sf",
               "15m_C" = "./EX_15_min_C/quant.sf",
               "1h_A" = "./EX_1h_A/quant.sf",
               "1h_B" = "./EX_1h_B/quant.sf",
               "1h_C" = "./EX_1h_C/quant.sf",
               "2h_A" = "./EX_2h_A/quant.sf",
               "2h_B" = "./EX_2h_B/quant.sf",
               "2h_C" = "./EX_2h_C/quant.sf",
               "4h_A" = "./EX_4h_A/quant.sf",
               "4h_B" = "./EX_4h_B/quant.sf",
               "4h_C" = "./EX_4h_C/quant.sf",
               "6h_A" = "./EX_6h_A/quant.sf",
               "6h_B" = "./EX_6h_B/quant.sf",
               "6h_C" = "./EX_6h_C/quant.sf",
               "4d_A" = "./EX_Day_4_A/quant.sf",
               "4d_B" = "./EX_Day_4_B/quant.sf",
               "4d_C" = "./EX_Day_4_C/quant.sf")
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
     # Define the replicates and condition of the samples
     replicate <- factor(c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"))
     condition <- factor(c("15m", "15m", "15m", "1h", "1h", "1h", "2h", "2h", "2h", "4h", "4h", "4h", "6h", "6h", "6h", "4d", "4d", "4d"))
     # Define the colData for DESeq2
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     # Create DESeqDataSet object
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
     # Filter data to retain only genes with more than 2 counts > 3 across all samples
     # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
     # Output raw count data to a CSV file
     write.csv(counts(dds), file="transcript_counts.csv")
     # -- gene-level count data --
     # Read in the tx2gene map from salmon_tx2gene.tsv
     #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
     tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
     write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
     #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
    
     #TODO: why a lot of reads were removed due to the too_short?
     #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
     dim(counts(dds))
     head(counts(dds), 10)
     #DEBUG: DESeq should not used here!?
     #TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 
     #dds <- DESeq(dds)
     rld <- rlogTransformation(dds)
    
     # draw simple pca and heatmap
     library(gplots) 
     library("RColorBrewer")
     #mat <- assay(rld)
     #mm <- model.matrix(~condition, colData(rld))
     #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
     #assay(rld) <- mat
     # -- pca --
     png("pca.png", 1200, 800)
     plotPCA(rld, intgroup=c("condition"))
     dev.off()
     # -- heatmap --
     png("heatmap.png", 1200, 800)
     distsRL <- dist(t(assay(rld)))
     mat <- as.matrix(distsRL)
     hc <- hclust(distsRL)
     hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
     heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
     dev.off()
  2. Analysis on differentially expressed genes

     setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon/degenes")
    
     dds$condition <- relevel(dds$condition, "15m")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("1h_vs_15m", "2h_vs_15m", "4h_vs_15m", "6h_vs_15m",  "4d_vs_15m")
    
     for (i in clist) {
       contrast = paste("condition", i, sep="_")
       res = results(dds, name=contrast)
       res <- res[!is.na(res$log2FoldChange),]
       res_df <- as.data.frame(res)
       write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
       up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
       down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
       write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
       write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
     }
    
     library("EnhancedVolcano")
     for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
       echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
       echo "res_df <- as.data.frame(res)"
       echo "png(\"${i}.png\",width=800, height=600)"
       #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
       #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
       echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
       echo "dev.off()"
     done
    
     res <- read.csv(file = paste("1h_vs_15m", "all.txt", sep="-"), row.names = 1)
     res_df <- as.data.frame(res)
     png("1h_vs_15m.png",width=1000, height=800)
     EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 1), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("1h versus 15m"))
     dev.off()
    
     res <- read.csv(file = paste("2h_vs_15m", "all.txt", sep="-"), row.names = 1)
     res_df <- as.data.frame(res)
     png("2h_vs_15m.png",width=1000, height=1000)
     EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 4), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("2h versus 15m"))
     dev.off()
    
     res <- read.csv(file = paste("4h_vs_15m", "all.txt", sep="-"), row.names = 1)
     res_df <- as.data.frame(res)
     png("4h_vs_15m.png",width=1000, height=1200)
     EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 5.5), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4h versus 15m"))
     dev.off()
    
     res <- read.csv(file = paste("6h_vs_15m", "all.txt", sep="-"), row.names = 1)
     res_df <- as.data.frame(res)
     png("6h_vs_15m.png",width=1000, height=1600)
     EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 17), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("6h versus 15m"))
     dev.off()
    
     res <- read.csv(file = paste("4d_vs_15m", "all.txt", sep="-"), row.names = 1)
     res_df <- as.data.frame(res)
     png("4d_vs_15m.png",width=1000, height=1600)
     EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4d versus 15m"))
     dev.off()
    
     #under DIR degenes under KONSOLE
     for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do 
     echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; 
     done
  3. Clustering the genes and draw heatmap

     for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
     echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
     cat *.id | sort -u > ids
     #add Gene_Id in the first line, delete the ""
     GOI <- read.csv("ids")$Gene_Id
     RNASeq.NoCellLine <- assay(rld)
     #install.packages("gplots")
     library("gplots")
     #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
     datamat = RNASeq.NoCellLine[GOI, ]
     #datamat = RNASeq.NoCellLine
     write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
     constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
     if(any(constant_rows)) {
       cat("Removing", sum(constant_rows), "constant rows.\n")
       datamat <- datamat[!constant_rows, ]
     }
     hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
     mycl = cutree(hr, h=max(hr$height)/1.4)
     mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "GREEN", "RED", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTGREEN"); #"LIGHTRED", 
     mycol = mycol[as.vector(mycl)]
     #png("DEGs_heatmap.png", width=900, height=800)
     #cex.lab=10, labRow="",
     png("DEGs_heatmap.png", width=900, height=1000)
     #heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
     #            scale='row',trace='none',col=bluered(75), 
     #            RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
     # cexCol, cexRow
     heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,2), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=40, lhei=c(1,7), cexCol=2)
     dev.off()
    
     #### cluster members #####
     write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
     write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
     write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
     write.csv(names(subset(mycl, mycl == '4')),file='cluster4_MAGENTA.txt')
     write.csv(names(subset(mycl, mycl == '5')),file='cluster5_CYAN.txt')
     write.csv(names(subset(mycl, mycl == '6')),file='cluster6_GREEN.txt')
     ~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
     #~/Tools/csv2xls-0.4/csv_to_xls.py significant_gene_expressions.txt -d',' -o DEGs_heatmap_expression_data.xls;
  4. Generate Excel files from Genbank-file

     python3 /home/jhuang/Scripts/gb_to_excel.py 1585.gb 1585.xlsx
     #insert C./P. (Chrom or Plasmid1, Plasmid2, Plasmid3) as the second chrom in the 1585.xlsx.

Leave a Reply

Your email address will not be published. Required fields are marked *