-
nextflow
(rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_align_intermeds --save_unaligned --aligner 'star_salmon' --skip_multiqc ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml multiqc -f --config multiqc_config.yaml . 2>&1 rm multiqc_config.yaml
-
construct DESeqDataSet
# Import the required libraries library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library(gplots) library(tximport) library(DESeq2) setwd("~/DATA/Data_Denise_LT_RNASeq/results_GRCh38/star_salmon") # Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts. files <- c("control_DI" = "./control_d8_DI/quant.sf", "control_DII" = "./control_d8_DII/quant.sf", #"control_DII" = "./control_d8_DII_re/quant.sf", "LT_DI" = "./LT_d8_DI/quant.sf", "LT_DII" = "./LT_d8_DII/quant.sf", "LTtr_DI" = "./LTtr_d8_DI/quant.sf", "LTtr_DII" = "./LTtr_d8_DII/quant.sf", "LT_K331A_DI" = "./LT_K331A_d8_DI/quant.sf", #"LT_K331A_DII" = "./LT_K331A_d8_DII/quant.sf", "LT_K331A_DII" = "./LT_K331A_d8_DII_re/quant.sf") # ---- tx-level count data ---- # Import the transcript abundance data with tximport txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE) # Define the replicates (or donors) and condition of the samples #donor <- factor(c("DI", "DII", "DII", "DI", "DII", "DI", "DII", "DI", "DII", "DII")) #batch <- factor(c("batch1", "batch1", "batch2", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2")) #condition <- factor(c("control", "control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A", "LT_K331A")) donor <- factor(c("DI", "DII", "DI", "DII", "DI", "DII", "DI", "DII")) batch <- factor(c("batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2")) condition <- factor(c("control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A")) # Output raw count data to a CSV file write.csv(counts(dds), file="transcript_counts.csv") # ---- gene-level count data ---- # Read in the tx2gene map from salmon_tx2gene.tsv #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE) tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE) # Set the column names colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name") # Remove the gene_name column if not needed tx2gene <- tx2gene[,1:2] # Import and summarize the Salmon data with tximport txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE) # Continue with the DESeq2 workflow as before... colData <- data.frame(donor=donor, batch=batch, condition=condition, row.names=names(files)) dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~donor+condition) #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543 dds <- DESeq(dds) rld <- rlogTransformation(dds) write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv") dim(counts(dds)) head(counts(dds), 10)
-
draw 3D PCA plots.
library(gplots) library("RColorBrewer") library(ggplot2) data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE) write.csv(data, file="plotPCA_data.csv") #calculate all PCs including PC3 with the following codes library(genefilter) ntop <- 500 rv <- rowVars(assay(rld)) select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] mat <- t( assay(rld)[select, ] ) pc <- prcomp(mat) pc$x[,1:3] #df_pc <- data.frame(pc$x[,1:3]) df_pc <- data.frame(pc$x) identical(rownames(data), rownames(df_pc)) #-->TRUE data$PC1 <- NULL data$PC2 <- NULL merged_df <- merge(data, df_pc, by = "row.names") #merged_df <- merged_df[, -1] row.names(merged_df) <- merged_df$Row.names merged_df$Row.names <- NULL # remove the "name" column merged_df$name <- NULL merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")] write.csv(merged_df, file="merged_df_8PCs.csv") summary(pc) #0.3311 0.2376 0.1247 #0.3637 0.2564 0.1184 --> 0.36, 026, 0,12 draw_3D.py
-
draw_3D.py
import plotly.graph_objects as go import pandas as pd from sklearn.decomposition import PCA import numpy as np from scipy.linalg import eigh, sqrtm # Read in data as a pandas dataframe #df = pd.DataFrame({ # 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215], # 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993], # 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358], # 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'], # 'donor': ['DI', 'DII', 'DI', 'DII', 'DI'] #}) df = pd.read_csv('merged_df_8PCs.csv', index_col=0, header=0) df['condition'] = df['condition'].replace("control", "ctrl d8") df['condition'] = df['condition'].replace("LTtr", "LTtr d8") df['condition'] = df['condition'].replace("LT", "LT d8") df['condition'] = df['condition'].replace("LT_K331A", "LT K331A d8") # Fit PCA model to reduce data dimensions to 3 pca = PCA(n_components=3) pca.fit(df.iloc[:, :-3]) X_reduced = pca.transform(df.iloc[:, :-3]) # Add reduced data back to dataframe df['PC1'] = X_reduced[:, 0] df['PC2'] = X_reduced[:, 1] df['PC3'] = X_reduced[:, 2] # Create PCA plot with 3D scatter fig = go.Figure() #['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x'] # if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol) #decrease diamond size to 6 while keep the circle as size 10 in the following code: #'rgb(128, 150, 128)' #I need three families of colors, always from light to deep, the first one should close to grey. #the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12' #the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8' #the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3' condition_color_map_untreated = {'untreated':'black'} donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'} #condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'} condition_color_map = { 'ctrl d8': '#A9A9A9', 'LT d8': '#a6cee3', 'LT K331A d8': '#1f78b4', 'LTtr d8': '#e31a1c' } donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'} for donor, donor_symbol in donor_symbol_map_untreated.items(): for condition, condition_color in condition_color_map_untreated.items(): mask = (df['condition'] == condition) & (df['donor'] == donor) fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'], mode='markers', name=f'{condition}' if donor == 'DI' else None, legendgroup=f'{condition}', showlegend=True if donor == 'DI' else False, marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol))) for donor, donor_symbol in donor_symbol_map.items(): for condition, condition_color in condition_color_map.items(): mask = (df['condition'] == condition) & (df['donor'] == donor) fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'], mode='markers', name=f'{condition}' if donor == 'DI' else None, legendgroup=f'{condition}', showlegend=True if donor == 'DI' else False, marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol))) for donor, donor_symbol in donor_symbol_map.items(): fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', name=donor, legendgroup=f'{donor}', showlegend=True, marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol), hoverinfo='none')) # Annotations for the legend blocks fig.update_layout( annotations=[ dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False, text='Condition', font=dict(size=15)), dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False, text='Donor', font=dict(size=15)), dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False, text='PC3: 12% variance', font=dict(size=15), textangle=-90) ], scene=dict( xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'), yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 26% v.'), zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title=''), bgcolor='white' ), margin=dict(l=0, r=0, b=0, t=0) # Adjust the margins to prevent clipping of axis titles ) #0.3311 0.2376 0.1247 #summary(pc) #--> Proportion of Variance 0.3647 0.1731 0.1515 #percentVar <- round(100 * attr(data, "percentVar")) #percentVar <- c(36,17,15) #fig.show() fig.write_image("PCA_3D.svg")
-
draw 2D PCA plots.
library(gplots) library("RColorBrewer") #mat <- assay(rld) #mm <- model.matrix(~condition, colData(rld)) #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm) #assay(rld) <- mat # -- pca -- png("pca.png", 1200, 800) plotPCA(rld, intgroup=c("condition")) dev.off() # -- heatmap -- png("heatmap.png", 1200, 800) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) hc <- hclust(distsRL) hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100) heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13)) dev.off()
-
differential expressions
dds$condition <- relevel(dds$condition, "control") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("LT_K331A_vs_control", "LT_vs_control", "LTtr_vs_control") dds$condition <- relevel(dds$condition, "LT") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("LT_K331A_vs_LT") library(dplyr) library(tidyverse) library(biomaRt) listEnsembl() listMarts() ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104") datasets <- listDatasets(ensembl) attributes = listAttributes(ensembl) attributes[1:25,] for (i in clist) { contrast = paste("condition", i, sep="_") res = results(dds, name=contrast) res <- res[!is.na(res$log2FoldChange),] geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'), filters = 'ensembl_gene_id', values = rownames(res), mart = ensembl) geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE) res$ENSEMBL = rownames(res) identical(rownames(res), geness_uniq$ensembl_gene_id) res_df <- as.data.frame(res) geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL") dim(geness_res) rownames(geness_res) <- geness_res$ensembl_gene_id geness_res$ensembl_gene_id <- NULL write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-")) up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2) down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2) write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-")) write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-")) }
-
volcano plots with automatically finding top_g
#sorted_geness_res <- geness_res %>% arrange(desc(log2FoldChange)) #CSF3, ADORA2A, RPS26P6, EHF, TNFRSF6B #sorted_geness_res <- geness_res %>% arrange(log2FoldChange) library(ggrepel) geness_res <- read.csv(file = "LT_K331A_vs_control-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:1000], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:1000])) # Define the original and compressed ranges original_range <- c(18, 30) compressed_range <- c(18.0, 22.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Calculate breaks for the y-axis y_breaks_below <- seq(0, 15, by=5) y_breaks_compressed <- c(18.0, 22.0) y_breaks_above <- c(27.0) y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above) y_labels_below <- seq(0, 15, by=5) y_labels_compressed <- c(18, 30) y_labels_above <- c(35) y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above) # Create the plot png("LT_K331A_vs_control.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") + annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) + annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) + scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels)) dev.off() geness_res <- read.csv(file = "LT_vs_control-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400])) # Define the original and compressed ranges original_range <- c(18, 38) compressed_range <- c(18.0, 22.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Calculate breaks for the y-axis y_breaks_below <- seq(0, 15, by=5) y_breaks_compressed <- c(18.0, 22.0) y_breaks_above <- c(27.0) y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above) y_labels_below <- seq(0, 15, by=5) y_labels_compressed <- c(18, 38) y_labels_above <- c(43) y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above) # Create the plot png("LT_vs_control.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") + annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) + annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) + scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels)) dev.off() geness_res <- read.csv(file = "LTtr_vs_control-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400])) # Define the original and compressed ranges original_range <- c(15, 25) compressed_range <- c(15.0, 18.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Calculate breaks for the y-axis y_breaks_below <- seq(0, 10, by=5) y_breaks_compressed <- c(15.0, 18.0) y_breaks_above <- c(23.0) y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above) y_labels_below <- seq(0, 10, by=5) y_labels_compressed <- c(15, 25) y_labels_above <- c(30) y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above) # Create the plot png("LTtr_vs_control.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") + annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) + annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) + scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels)) dev.off() geness_res <- read.csv(file = "LT_K331A_vs_LT-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400])) # Define the original and compressed ranges original_range <- c(10, 11) compressed_range <- c(10.0, 11.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Create the plot png("LT_K331A_vs_LT.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") dev.off() for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;" done
-
clustering the genes and draw heatmap
install.packages("gplots") library("gplots") for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id" echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id" done # 307 LT_K331A_vs_control-down.id # 667 LT_K331A_vs_control-up.id # 66 LT_K331A_vs_LT-down.id # 71 LT_K331A_vs_LT-up.id # 157 LTtr_vs_control-down.id # 484 LTtr_vs_control-up.id # 379 LT_vs_control-down.id # 749 LT_vs_control-up.id # 2880 total cat *.id | sort -u > ids #add Gene_Id in the first line, delete the "" GOI <- read.csv("ids")$Gene_Id #570 genes RNASeq.NoCellLine <- assay(rld) # Defining the custom order column_order <- c( "control_DI", "control_DII", "LTtr_DI", "LTtr_DII", "LT_DI", "LT_DII", "LT_K331A_DI", "LT_K331A_DII" ) RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order] head(RNASeq.NoCellLine_reordered) #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman datamat = RNASeq.NoCellLine_reordered[GOI, ] write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv") hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete") hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete") mycl = cutree(hr, h=max(hr$height)/1.05) mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN"); mycol = mycol[as.vector(mycl)] png("DEGs_heatmap.png", width=900, height=1010) heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row', scale='row',trace='none',col=bluered(75), RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4) dev.off() # Extract rows from datamat where the row names match the identifiers in subset_1 #### cluster members ##### subset_1<-names(subset(mycl, mycl == '1')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ]) #579 subset_2<-names(subset(mycl, mycl == '2')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ]) #399 subset_3<-names(subset(mycl, mycl == '3')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ]) #70 subset_4<-names(subset(mycl, mycl == '4')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_4, ]) #433 subset_5<-names(subset(mycl, mycl == '5')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_5, ]) #140 # Initialize an empty data frame for the annotated data annotated_data <- data.frame() # Determine total number of genes total_genes <- length(rownames(data)) # Loop through each gene to annotate for (i in 1:total_genes) { gene <- rownames(data)[i] result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'), filters = 'ensembl_gene_id', values = gene, mart = ensembl) # If multiple rows are returned, take the first one if (nrow(result) > 1) { result <- result[1, ] } # Check if the result is empty if (nrow(result) == 0) { result <- data.frame(ensembl_gene_id = gene, external_gene_name = NA, gene_biotype = NA, entrezgene_id = NA, chromosome_name = NA, start_position = NA, end_position = NA, strand = NA, description = NA) } # Transpose expression values expression_values <- t(data.frame(t(data[gene, ]))) colnames(expression_values) <- colnames(data) # Combine gene information and expression data combined_result <- cbind(result, expression_values) # Append to the final dataframe annotated_data <- rbind(annotated_data, combined_result) # Print progress every 100 genes if (i %% 100 == 0) { cat(sprintf("Processed gene %d out of %d\n", i, total_genes)) } } # Save the annotated data to a new CSV file write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE) write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE) write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE) write.csv(annotated_data, "cluster4_DARKMAGENTA.csv", row.names=FALSE) write.csv(annotated_data, "cluster5_DARKCYAN.csv", row.names=FALSE) #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls
RNAseq Analysis LT-K331A-LTtr-control
Leave a reply