-
Data input and generate PCA plot
# Import the required libraries library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library(gplots) library(tximport) library(DESeq2) setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon") # Define paths to your Salmon output quantification files files <- c("15m_A" = "./EX_15_min_A/quant.sf", "15m_B" = "./EX_15_min_B/quant.sf", "15m_C" = "./EX_15_min_C/quant.sf", "1h_A" = "./EX_1h_A/quant.sf", "1h_B" = "./EX_1h_B/quant.sf", "1h_C" = "./EX_1h_C/quant.sf", "2h_A" = "./EX_2h_A/quant.sf", "2h_B" = "./EX_2h_B/quant.sf", "2h_C" = "./EX_2h_C/quant.sf", "4h_A" = "./EX_4h_A/quant.sf", "4h_B" = "./EX_4h_B/quant.sf", "4h_C" = "./EX_4h_C/quant.sf", "6h_A" = "./EX_6h_A/quant.sf", "6h_B" = "./EX_6h_B/quant.sf", "6h_C" = "./EX_6h_C/quant.sf", "4d_A" = "./EX_Day_4_A/quant.sf", "4d_B" = "./EX_Day_4_B/quant.sf", "4d_C" = "./EX_Day_4_C/quant.sf") # Import the transcript abundance data with tximport txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE) # Define the replicates and condition of the samples replicate <- factor(c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C")) condition <- factor(c("15m", "15m", "15m", "1h", "1h", "1h", "2h", "2h", "2h", "4h", "4h", "4h", "6h", "6h", "6h", "4d", "4d", "4d")) # Define the colData for DESeq2 colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files)) # Create DESeqDataSet object dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition) # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps. # Filter data to retain only genes with more than 2 counts > 3 across all samples # dds <- dds[rowSums(counts(dds) > 3) > 2, ] # Output raw count data to a CSV file write.csv(counts(dds), file="transcript_counts.csv") # -- gene-level count data -- # Read in the tx2gene map from salmon_tx2gene.tsv #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE) tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE) # Set the column names colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name") # Remove the gene_name column if not needed tx2gene <- tx2gene[,1:2] # Import and summarize the Salmon data with tximport txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE) # Continue with the DESeq2 workflow as before... colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files)) dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition) #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543 write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv") #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls #TODO: why a lot of reads were removed due to the too_short? #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output dim(counts(dds)) head(counts(dds), 10) #DEBUG: DESeq should not used here!? #TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 #dds <- DESeq(dds) rld <- rlogTransformation(dds) # draw simple pca and heatmap library(gplots) library("RColorBrewer") #mat <- assay(rld) #mm <- model.matrix(~condition, colData(rld)) #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm) #assay(rld) <- mat # -- pca -- png("pca.png", 1200, 800) plotPCA(rld, intgroup=c("condition")) dev.off() # -- heatmap -- png("heatmap.png", 1200, 800) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) hc <- hclust(distsRL) hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100) heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13)) dev.off()
-
Analysis on differentially expressed genes
setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon/degenes") dds$condition <- relevel(dds$condition, "15m") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("1h_vs_15m", "2h_vs_15m", "4h_vs_15m", "6h_vs_15m", "4d_vs_15m") for (i in clist) { contrast = paste("condition", i, sep="_") res = results(dds, name=contrast) res <- res[!is.na(res$log2FoldChange),] res_df <- as.data.frame(res) write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-")) up <- subset(res_df, padj<=0.05 & log2FoldChange>=2) down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2) write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-")) write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-")) } library("EnhancedVolcano") for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)" echo "res_df <- as.data.frame(res)" echo "png(\"${i}.png\",width=800, height=600)" #legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE, #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))" echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))" echo "dev.off()" done res <- read.csv(file = paste("1h_vs_15m", "all.txt", sep="-"), row.names = 1) res_df <- as.data.frame(res) png("1h_vs_15m.png",width=1000, height=800) EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 1), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("1h versus 15m")) dev.off() res <- read.csv(file = paste("2h_vs_15m", "all.txt", sep="-"), row.names = 1) res_df <- as.data.frame(res) png("2h_vs_15m.png",width=1000, height=1000) EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 4), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("2h versus 15m")) dev.off() res <- read.csv(file = paste("4h_vs_15m", "all.txt", sep="-"), row.names = 1) res_df <- as.data.frame(res) png("4h_vs_15m.png",width=1000, height=1200) EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 5.5), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4h versus 15m")) dev.off() res <- read.csv(file = paste("6h_vs_15m", "all.txt", sep="-"), row.names = 1) res_df <- as.data.frame(res) png("6h_vs_15m.png",width=1000, height=1600) EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 17), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("6h versus 15m")) dev.off() res <- read.csv(file = paste("4d_vs_15m", "all.txt", sep="-"), row.names = 1) res_df <- as.data.frame(res) png("4d_vs_15m.png",width=1000, height=1600) EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4d versus 15m")) dev.off() #under DIR degenes under KONSOLE for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
-
Clustering the genes and draw heatmap
for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done cat *.id | sort -u > ids #add Gene_Id in the first line, delete the "" GOI <- read.csv("ids")$Gene_Id RNASeq.NoCellLine <- assay(rld) #install.packages("gplots") library("gplots") #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman datamat = RNASeq.NoCellLine[GOI, ] #datamat = RNASeq.NoCellLine write.csv(as.data.frame(datamat), file ="gene_expressions.txt") constant_rows <- apply(datamat, 1, function(row) var(row) == 0) if(any(constant_rows)) { cat("Removing", sum(constant_rows), "constant rows.\n") datamat <- datamat[!constant_rows, ] } hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete") hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete") mycl = cutree(hr, h=max(hr$height)/1.4) mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "GREEN", "RED", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTGREEN"); #"LIGHTRED", mycol = mycol[as.vector(mycl)] #png("DEGs_heatmap.png", width=900, height=800) #cex.lab=10, labRow="", png("DEGs_heatmap.png", width=900, height=1000) #heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row', # scale='row',trace='none',col=bluered(75), # RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8)) #rownames(datamat) # cexCol, cexRow heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,2), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=40, lhei=c(1,7), cexCol=2) dev.off() #### cluster members ##### write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt') write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt') write.csv(names(subset(mycl, mycl == '4')),file='cluster4_MAGENTA.txt') write.csv(names(subset(mycl, mycl == '5')),file='cluster5_CYAN.txt') write.csv(names(subset(mycl, mycl == '6')),file='cluster6_GREEN.txt') ~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls #~/Tools/csv2xls-0.4/csv_to_xls.py significant_gene_expressions.txt -d',' -o DEGs_heatmap_expression_data.xls;
-
Generate Excel files from Genbank-file
python3 /home/jhuang/Scripts/gb_to_excel.py 1585.gb 1585.xlsx #insert C./P. (Chrom or Plasmid1, Plasmid2, Plasmid3) as the second chrom in the 1585.xlsx.
RNA-seq data analysis (R-part) for S. epidermidis 1585
Leave a reply