RNAseq Analysis LT-K331A-LTtr-control

  1. nextflow

     (rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full  -resume   --max_memory 256.GB --max_time 2400.h    --save_align_intermeds --save_unaligned    --aligner 'star_salmon' --skip_multiqc
    
     ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml
     multiqc -f --config multiqc_config.yaml . 2>&1
     rm multiqc_config.yaml
  2. construct DESeqDataSet

     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     setwd("~/DATA/Data_Denise_LT_RNASeq/results_GRCh38/star_salmon")
     # Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts.
     files <- c("control_DI" = "./control_d8_DI/quant.sf",
               "control_DII" = "./control_d8_DII/quant.sf",
               #"control_DII" = "./control_d8_DII_re/quant.sf",
               "LT_DI" = "./LT_d8_DI/quant.sf",
               "LT_DII" = "./LT_d8_DII/quant.sf",
               "LTtr_DI" = "./LTtr_d8_DI/quant.sf",
               "LTtr_DII" = "./LTtr_d8_DII/quant.sf",
               "LT_K331A_DI" = "./LT_K331A_d8_DI/quant.sf",
               #"LT_K331A_DII" = "./LT_K331A_d8_DII/quant.sf",
               "LT_K331A_DII" = "./LT_K331A_d8_DII_re/quant.sf")
     # ---- tx-level count data ----
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
     # Define the replicates (or donors) and condition of the samples
     #donor <- factor(c("DI", "DII", "DII", "DI", "DII", "DI", "DII", "DI", "DII", "DII"))
     #batch <- factor(c("batch1", "batch1", "batch2", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
     #condition <- factor(c("control", "control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A", "LT_K331A"))
     donor <- factor(c("DI", "DII", "DI", "DII", "DI", "DII", "DI", "DII"))
     batch <- factor(c("batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
     condition <- factor(c("control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A"))
    
     # Output raw count data to a CSV file
     write.csv(counts(dds), file="transcript_counts.csv")
    
     # ---- gene-level count data ----
     # Read in the tx2gene map from salmon_tx2gene.tsv
     #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
     tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(donor=donor, batch=batch, condition=condition, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~donor+condition)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
     dds <- DESeq(dds)
     rld <- rlogTransformation(dds)
     write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
     dim(counts(dds))
     head(counts(dds), 10)
  3. draw 3D PCA plots.

     library(gplots) 
     library("RColorBrewer")
     library(ggplot2)
     data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
     write.csv(data, file="plotPCA_data.csv")
     #calculate all PCs including PC3 with the following codes
     library(genefilter)
     ntop <- 500
     rv <- rowVars(assay(rld))
     select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
     mat <- t( assay(rld)[select, ] )
     pc <- prcomp(mat)
     pc$x[,1:3]
     #df_pc <- data.frame(pc$x[,1:3])
     df_pc <- data.frame(pc$x)
     identical(rownames(data), rownames(df_pc)) #-->TRUE
     data$PC1 <- NULL
     data$PC2 <- NULL
     merged_df <- merge(data, df_pc, by = "row.names")
     #merged_df <- merged_df[, -1]
     row.names(merged_df) <- merged_df$Row.names
     merged_df$Row.names <- NULL  # remove the "name" column
     merged_df$name <- NULL
     merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
     write.csv(merged_df, file="merged_df_8PCs.csv")
     summary(pc)  
     #0.3311  0.2376 0.1247
     #0.3637  0.2564 0.1184 --> 0.36, 026, 0,12
     draw_3D.py
  4. draw_3D.py

     import plotly.graph_objects as go
     import pandas as pd
     from sklearn.decomposition import PCA
     import numpy as np
     from scipy.linalg import eigh, sqrtm
    
     # Read in data as a pandas dataframe
     #df = pd.DataFrame({
     #    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
     #    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
     #    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
     #    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
     #    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
     #})
     df = pd.read_csv('merged_df_8PCs.csv', index_col=0, header=0)
     df['condition'] = df['condition'].replace("control", "ctrl d8")
     df['condition'] = df['condition'].replace("LTtr", "LTtr d8")
     df['condition'] = df['condition'].replace("LT", "LT d8")
     df['condition'] = df['condition'].replace("LT_K331A", "LT K331A d8")
    
     # Fit PCA model to reduce data dimensions to 3
     pca = PCA(n_components=3)
     pca.fit(df.iloc[:, :-3])
     X_reduced = pca.transform(df.iloc[:, :-3])
    
     # Add reduced data back to dataframe
     df['PC1'] = X_reduced[:, 0]
     df['PC2'] = X_reduced[:, 1]
     df['PC3'] = X_reduced[:, 2]
    
     # Create PCA plot with 3D scatter
     fig = go.Figure()
    
     #['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
     # if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol)
    
     #decrease diamond size to 6 while keep the circle as size 10 in the following code:
     #'rgb(128, 150, 128)'
     #I need three families of colors, always from light to deep, the first one should close to grey.
     #the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12' 
     #the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8'
     #the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3'
    
     condition_color_map_untreated = {'untreated':'black'}
     donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'}
     #condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
     condition_color_map = {
         'ctrl d8': '#A9A9A9',
         'LT d8': '#a6cee3',
         'LT K331A d8': '#1f78b4',
         'LTtr d8': '#e31a1c'
     }
     donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'}
    
     for donor, donor_symbol in donor_symbol_map_untreated.items():
         for condition, condition_color in condition_color_map_untreated.items():
             mask = (df['condition'] == condition) & (df['donor'] == donor)
             fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                       mode='markers',
                                       name=f'{condition}' if donor == 'DI' else None,
                                       legendgroup=f'{condition}',
                                       showlegend=True if donor == 'DI' else False,
                                       marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
    
     for donor, donor_symbol in donor_symbol_map.items():
         for condition, condition_color in condition_color_map.items():
             mask = (df['condition'] == condition) & (df['donor'] == donor)
             fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                       mode='markers',
                                       name=f'{condition}' if donor == 'DI' else None,
                                       legendgroup=f'{condition}',
                                       showlegend=True if donor == 'DI' else False,
                                       marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
    
     for donor, donor_symbol in donor_symbol_map.items():
         fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                                   mode='markers',
                                   name=donor,
                                   legendgroup=f'{donor}',
                                   showlegend=True,
                                   marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol),
                                   hoverinfo='none'))
    
     # Annotations for the legend blocks
     fig.update_layout(
         annotations=[
             dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                 text='Condition', font=dict(size=15)),
             dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                 text='Donor', font=dict(size=15)),
             dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
                 text='PC3: 12% variance', font=dict(size=15), textangle=-90)
         ],
         scene=dict(
             xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'),
             yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 26% v.'),
             zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title=''),
             bgcolor='white'
         ),
         margin=dict(l=0, r=0, b=0, t=0)  # Adjust the margins to prevent clipping of axis titles
     )
     #0.3311  0.2376 0.1247
     #summary(pc) #--> Proportion of Variance  0.3647 0.1731 0.1515
     #percentVar <- round(100 * attr(data, "percentVar"))
     #percentVar <- c(36,17,15)
    
     #fig.show()
     fig.write_image("PCA_3D.svg")
  5. draw 2D PCA plots.

     library(gplots) 
     library("RColorBrewer")
    
     #mat <- assay(rld)
     #mm <- model.matrix(~condition, colData(rld))
     #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
     #assay(rld) <- mat
    
     # -- pca --
     png("pca.png", 1200, 800)
     plotPCA(rld, intgroup=c("condition"))
     dev.off()
     # -- heatmap --
     png("heatmap.png", 1200, 800)
     distsRL <- dist(t(assay(rld)))
     mat <- as.matrix(distsRL)
     hc <- hclust(distsRL)
     hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
     heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
     dev.off()
  6. differential expressions

     dds$condition <- relevel(dds$condition, "control")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("LT_K331A_vs_control", "LT_vs_control", "LTtr_vs_control")
    
     dds$condition <- relevel(dds$condition, "LT")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("LT_K331A_vs_LT")
    
     library(dplyr)
     library(tidyverse)
     library(biomaRt)
     listEnsembl()
     listMarts()
     ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
     datasets <- listDatasets(ensembl)
     attributes = listAttributes(ensembl)
     attributes[1:25,]
    
     for (i in clist) {
       contrast = paste("condition", i, sep="_")
       res = results(dds, name=contrast)
       res <- res[!is.na(res$log2FoldChange),]
       geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
           filters = 'ensembl_gene_id',
           values = rownames(res), 
           mart = ensembl)
       geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
       res$ENSEMBL = rownames(res)
       identical(rownames(res), geness_uniq$ensembl_gene_id)
       res_df <- as.data.frame(res)
       geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
       dim(geness_res)
       rownames(geness_res) <- geness_res$ensembl_gene_id
       geness_res$ensembl_gene_id <- NULL
       write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
       up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
       down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
       write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
       write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
     }
  7. volcano plots with automatically finding top_g

     #sorted_geness_res <- geness_res %>% arrange(desc(log2FoldChange))
     #CSF3, ADORA2A, RPS26P6, EHF, TNFRSF6B
     #sorted_geness_res <- geness_res %>% arrange(log2FoldChange)
    
     library(ggrepel)
     geness_res <- read.csv(file = "LT_K331A_vs_control-all.txt", sep=",", row.names=1)
     # Color setting
     geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                               ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
     # Predefined genes colored in green
     predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
     geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
     geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
     top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:1000],
                     geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:1000]))
     # Define the original and compressed ranges
     original_range <- c(18, 30)
     compressed_range <- c(18.0, 22.0)
     # Adjust the p-values based on the ranges
     geness_res$adjusted_pvalue <- with(geness_res, 
                                       ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                               ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                               ifelse(-log10(padj) > original_range[2], 
                                                     -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                     -log10(padj))))
     # Calculate breaks for the y-axis
     y_breaks_below <- seq(0, 15, by=5)
     y_breaks_compressed <- c(18.0, 22.0) 
     y_breaks_above <- c(27.0)
     y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
     y_labels_below <- seq(0, 15, by=5)
     y_labels_compressed <- c(18, 30)
     y_labels_above <- c(35)
     y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
    
     # Create the plot
     png("LT_K331A_vs_control.png", width=1000, height=1000)
     ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
       geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
       geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
       geom_point(size = 3) +
       labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
       scale_color_identity() +
       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                       size = 6, 
                       point.padding = 0.15, 
                       color = "black", 
                       min.segment.length = .1, 
                       box.padding = .2, 
                       lwd = 2) + 
       theme_bw(base_size = 22) +
       theme(legend.position = "bottom") +
       annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
       annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
       annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
       annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
       scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
     dev.off()
    
     geness_res <- read.csv(file = "LT_vs_control-all.txt", sep=",", row.names=1)
     # Color setting
     geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                               ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
     # Predefined genes colored in green
     predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
     geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
     geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
     top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                     geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
     # Define the original and compressed ranges
     original_range <- c(18, 38)
     compressed_range <- c(18.0, 22.0)
     # Adjust the p-values based on the ranges
     geness_res$adjusted_pvalue <- with(geness_res, 
                                       ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                               ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                               ifelse(-log10(padj) > original_range[2], 
                                                     -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                     -log10(padj))))
     # Calculate breaks for the y-axis
     y_breaks_below <- seq(0, 15, by=5)
     y_breaks_compressed <- c(18.0, 22.0) 
     y_breaks_above <- c(27.0)
     y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
     y_labels_below <- seq(0, 15, by=5)
     y_labels_compressed <- c(18, 38)
     y_labels_above <- c(43)
     y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
    
     # Create the plot
     png("LT_vs_control.png", width=1000, height=1000)
     ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
       geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
       geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
       geom_point(size = 3) +
       labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
       scale_color_identity() +
       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                       size = 6, 
                       point.padding = 0.15, 
                       color = "black", 
                       min.segment.length = .1, 
                       box.padding = .2, 
                       lwd = 2) + 
       theme_bw(base_size = 22) +
       theme(legend.position = "bottom") +
       annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
       annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
       annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
       annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
       scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
     dev.off()
    
     geness_res <- read.csv(file = "LTtr_vs_control-all.txt", sep=",", row.names=1)
     # Color setting
     geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                               ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
     # Predefined genes colored in green
     predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
     geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
     geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
     top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                     geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
     # Define the original and compressed ranges
     original_range <- c(15, 25)
     compressed_range <- c(15.0, 18.0)
     # Adjust the p-values based on the ranges
     geness_res$adjusted_pvalue <- with(geness_res, 
                                       ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                               ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                               ifelse(-log10(padj) > original_range[2], 
                                                     -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                     -log10(padj))))
     # Calculate breaks for the y-axis
     y_breaks_below <- seq(0, 10, by=5)
     y_breaks_compressed <- c(15.0, 18.0) 
     y_breaks_above <- c(23.0)
     y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
     y_labels_below <- seq(0, 10, by=5)
     y_labels_compressed <- c(15, 25)
     y_labels_above <- c(30)
     y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
    
     # Create the plot
     png("LTtr_vs_control.png", width=1000, height=1000)
     ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
       geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
       geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
       geom_point(size = 3) +
       labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
       scale_color_identity() +
       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                       size = 6, 
                       point.padding = 0.15, 
                       color = "black", 
                       min.segment.length = .1, 
                       box.padding = .2, 
                       lwd = 2) + 
       theme_bw(base_size = 22) +
       theme(legend.position = "bottom") +
       annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
       annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
       annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
       annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
       scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
     dev.off()
    
     geness_res <- read.csv(file = "LT_K331A_vs_LT-all.txt", sep=",", row.names=1)
     # Color setting
     geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                               ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
     # Predefined genes colored in green
     predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
     geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
     geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
     top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                     geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
     # Define the original and compressed ranges
     original_range <- c(10, 11)
     compressed_range <- c(10.0, 11.0)
     # Adjust the p-values based on the ranges
     geness_res$adjusted_pvalue <- with(geness_res, 
                                       ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                               ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                               ifelse(-log10(padj) > original_range[2], 
                                                     -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                     -log10(padj))))
     # Create the plot
     png("LT_K331A_vs_LT.png", width=1000, height=1000)
     ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
       geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
       geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
       geom_point(size = 3) +
       labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
       scale_color_identity() +
       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                       size = 6, 
                       point.padding = 0.15, 
                       color = "black", 
                       min.segment.length = .1, 
                       box.padding = .2, 
                       lwd = 2) + 
       theme_bw(base_size = 22) +
       theme(legend.position = "bottom")
     dev.off()
    
     for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
       echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
     done
  8. clustering the genes and draw heatmap

     install.packages("gplots")
     library("gplots")
     for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
       echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
       echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
     done
     # 307 LT_K331A_vs_control-down.id
     # 667 LT_K331A_vs_control-up.id
     # 66 LT_K331A_vs_LT-down.id
     # 71 LT_K331A_vs_LT-up.id
     # 157 LTtr_vs_control-down.id
     # 484 LTtr_vs_control-up.id
     # 379 LT_vs_control-down.id
     # 749 LT_vs_control-up.id
     # 2880 total
     cat *.id | sort -u > ids
     #add Gene_Id in the first line, delete the ""
     GOI <- read.csv("ids")$Gene_Id  #570 genes
     RNASeq.NoCellLine <- assay(rld)
     # Defining the custom order
     column_order <- c(
       "control_DI", "control_DII", "LTtr_DI", "LTtr_DII", "LT_DI", "LT_DII", "LT_K331A_DI", "LT_K331A_DII"
     )
     RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order]
     head(RNASeq.NoCellLine_reordered)
     #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
     datamat = RNASeq.NoCellLine_reordered[GOI, ]
     write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv")
     hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
     mycl = cutree(hr, h=max(hr$height)/1.05)
     mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
     mycol = mycol[as.vector(mycl)]
     png("DEGs_heatmap.png", width=900, height=1010)
     heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                 scale='row',trace='none',col=bluered(75),
                 RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4)
     dev.off()
     # Extract rows from datamat where the row names match the identifiers in subset_1
     #### cluster members #####
     subset_1<-names(subset(mycl, mycl == '1'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ])  #579
     subset_2<-names(subset(mycl, mycl == '2'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ])  #399
     subset_3<-names(subset(mycl, mycl == '3'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ])  #70
     subset_4<-names(subset(mycl, mycl == '4'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_4, ])  #433
     subset_5<-names(subset(mycl, mycl == '5'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_5, ])  #140  
    
     # Initialize an empty data frame for the annotated data
     annotated_data <- data.frame()
     # Determine total number of genes
     total_genes <- length(rownames(data))
     # Loop through each gene to annotate
     for (i in 1:total_genes) {
         gene <- rownames(data)[i]
         result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
                         filters = 'ensembl_gene_id',
                         values = gene,
                         mart = ensembl)
         # If multiple rows are returned, take the first one
         if (nrow(result) > 1) {
             result <- result[1, ]
         }
         # Check if the result is empty
         if (nrow(result) == 0) {
             result <- data.frame(ensembl_gene_id = gene,
                                 external_gene_name = NA,
                                 gene_biotype = NA,
                                 entrezgene_id = NA,
                                 chromosome_name = NA,
                                 start_position = NA,
                                 end_position = NA,
                                 strand = NA,
                                 description = NA)
         }
         # Transpose expression values
         expression_values <- t(data.frame(t(data[gene, ])))
         colnames(expression_values) <- colnames(data)
         # Combine gene information and expression data
         combined_result <- cbind(result, expression_values)
         # Append to the final dataframe
         annotated_data <- rbind(annotated_data, combined_result)
         # Print progress every 100 genes
         if (i %% 100 == 0) {
             cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
         }
     }
    
     # Save the annotated data to a new CSV file
     write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE)
     write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE)
     write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE)
     write.csv(annotated_data, "cluster4_DARKMAGENTA.csv", row.names=FALSE)
     write.csv(annotated_data, "cluster5_DARKCYAN.csv", row.names=FALSE)
     #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls

Leave a Reply

Your email address will not be published. Required fields are marked *