RNA-seq data analysis (R-part) for S. epidermidis 1585

Data input and generate PCA plot

 # Import the required libraries
 library("AnnotationDbi")
 library("clusterProfiler")
 library("ReactomePA")
 library(gplots)
 library(tximport)
 library(DESeq2)
 setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon")
 # Define paths to your Salmon output quantification files
 files <- c("15m_A" = "./EX_15_min_A/quant.sf",
           "15m_B" = "./EX_15_min_B/quant.sf",
           "15m_C" = "./EX_15_min_C/quant.sf",
           "1h_A" = "./EX_1h_A/quant.sf",
           "1h_B" = "./EX_1h_B/quant.sf",
           "1h_C" = "./EX_1h_C/quant.sf",
           "2h_A" = "./EX_2h_A/quant.sf",
           "2h_B" = "./EX_2h_B/quant.sf",
           "2h_C" = "./EX_2h_C/quant.sf",
           "4h_A" = "./EX_4h_A/quant.sf",
           "4h_B" = "./EX_4h_B/quant.sf",
           "4h_C" = "./EX_4h_C/quant.sf",
           "6h_A" = "./EX_6h_A/quant.sf",
           "6h_B" = "./EX_6h_B/quant.sf",
           "6h_C" = "./EX_6h_C/quant.sf",
           "4d_A" = "./EX_Day_4_A/quant.sf",
           "4d_B" = "./EX_Day_4_B/quant.sf",
           "4d_C" = "./EX_Day_4_C/quant.sf")
 # Import the transcript abundance data with tximport
 txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
 # Define the replicates and condition of the samples
 replicate <- factor(c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"))
 condition <- factor(c("15m", "15m", "15m", "1h", "1h", "1h", "2h", "2h", "2h", "4h", "4h", "4h", "6h", "6h", "6h", "4d", "4d", "4d"))
 # Define the colData for DESeq2
 colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
 # Create DESeqDataSet object
 dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
 # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
 # Filter data to retain only genes with more than 2 counts > 3 across all samples
 # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
 # Output raw count data to a CSV file
 write.csv(counts(dds), file="transcript_counts.csv")
 # -- gene-level count data --
 # Read in the tx2gene map from salmon_tx2gene.tsv
 #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
 tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
 # Set the column names
 colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
 # Remove the gene_name column if not needed
 tx2gene <- tx2gene[,1:2]
 # Import and summarize the Salmon data with tximport
 txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
 # Continue with the DESeq2 workflow as before...
 colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
 dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
 #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
 write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
 #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls

 #TODO: why a lot of reads were removed due to the too_short?
 #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
 dim(counts(dds))
 head(counts(dds), 10)
 #DEBUG: DESeq should not used here!?
 #TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 
 #dds <- DESeq(dds)
 rld <- rlogTransformation(dds)

 # draw simple pca and heatmap
 library(gplots) 
 library("RColorBrewer")
 #mat <- assay(rld)
 #mm <- model.matrix(~condition, colData(rld))
 #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
 #assay(rld) <- mat
 # -- pca --
 png("pca.png", 1200, 800)
 plotPCA(rld, intgroup=c("condition"))
 dev.off()
 # -- heatmap --
 png("heatmap.png", 1200, 800)
 distsRL <- dist(t(assay(rld)))
 mat <- as.matrix(distsRL)
 hc <- hclust(distsRL)
 hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
 heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
 dev.off()

Analysis on differentially expressed genes

 setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon/degenes")

 dds$condition <- relevel(dds$condition, "15m")
 dds = DESeq(dds, betaPrior=FALSE)
 resultsNames(dds)
 clist <- c("1h_vs_15m", "2h_vs_15m", "4h_vs_15m", "6h_vs_15m",  "4d_vs_15m")

 for (i in clist) {
   contrast = paste("condition", i, sep="_")
   res = results(dds, name=contrast)
   res <- res[!is.na(res$log2FoldChange),]
   res_df <- as.data.frame(res)
   write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
   up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
   down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
   write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
   write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
 }

 library("EnhancedVolcano")
 for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
   echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
   echo "res_df <- as.data.frame(res)"
   echo "png(\"${i}.png\",width=800, height=600)"
   #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
   #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
   echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
   echo "dev.off()"
 done

 res <- read.csv(file = paste("1h_vs_15m", "all.txt", sep="-"), row.names = 1)
 res_df <- as.data.frame(res)
 png("1h_vs_15m.png",width=1000, height=800)
 EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 1), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("1h versus 15m"))
 dev.off()

 res <- read.csv(file = paste("2h_vs_15m", "all.txt", sep="-"), row.names = 1)
 res_df <- as.data.frame(res)
 png("2h_vs_15m.png",width=1000, height=1000)
 EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 4), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("2h versus 15m"))
 dev.off()

 res <- read.csv(file = paste("4h_vs_15m", "all.txt", sep="-"), row.names = 1)
 res_df <- as.data.frame(res)
 png("4h_vs_15m.png",width=1000, height=1200)
 EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 5.5), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4h versus 15m"))
 dev.off()

 res <- read.csv(file = paste("6h_vs_15m", "all.txt", sep="-"), row.names = 1)
 res_df <- as.data.frame(res)
 png("6h_vs_15m.png",width=1000, height=1600)
 EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 17), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("6h versus 15m"))
 dev.off()

 res <- read.csv(file = paste("4d_vs_15m", "all.txt", sep="-"), row.names = 1)
 res_df <- as.data.frame(res)
 png("4d_vs_15m.png",width=1000, height=1600)
 EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4d versus 15m"))
 dev.off()

 #under DIR degenes under KONSOLE
 for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do 
 echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; 
 done

Clustering the genes and draw heatmap

 for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
 echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
 cat *.id | sort -u > ids
 #add Gene_Id in the first line, delete the ""
 GOI <- read.csv("ids")$Gene_Id
 RNASeq.NoCellLine <- assay(rld)
 #install.packages("gplots")
 library("gplots")
 #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
 datamat = RNASeq.NoCellLine[GOI, ]
 #datamat = RNASeq.NoCellLine
 write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
 constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
 if(any(constant_rows)) {
   cat("Removing", sum(constant_rows), "constant rows.\n")
   datamat <- datamat[!constant_rows, ]
 }
 hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
 hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
 mycl = cutree(hr, h=max(hr$height)/1.4)
 mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "GREEN", "RED", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTGREEN"); #"LIGHTRED", 
 mycol = mycol[as.vector(mycl)]
 #png("DEGs_heatmap.png", width=900, height=800)
 #cex.lab=10, labRow="",
 png("DEGs_heatmap.png", width=900, height=1000)
 #heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
 #            scale='row',trace='none',col=bluered(75), 
 #            RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
 # cexCol, cexRow
 heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,2), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=40, lhei=c(1,7), cexCol=2)
 dev.off()

 #### cluster members #####
 write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
 write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
 write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
 write.csv(names(subset(mycl, mycl == '4')),file='cluster4_MAGENTA.txt')
 write.csv(names(subset(mycl, mycl == '5')),file='cluster5_CYAN.txt')
 write.csv(names(subset(mycl, mycl == '6')),file='cluster6_GREEN.txt')
 ~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
 #~/Tools/csv2xls-0.4/csv_to_xls.py significant_gene_expressions.txt -d',' -o DEGs_heatmap_expression_data.xls;

Generate Excel files from Genbank-file

 python3 /home/jhuang/Scripts/gb_to_excel.py 1585.gb 1585.xlsx
 #insert C./P. (Chrom or Plasmid1, Plasmid2, Plasmid3) as the second chrom in the 1585.xlsx.

M	T	W	T	F	S	S
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28	29
30	31

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

RNA-seq data analysis (R-part) for S. epidermidis 1585

Leave a Reply Cancel reply