RNAseq Analysis LT-K331A-LTtr-control

nextflow

(rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full  -resume   --max_memory 256.GB --max_time 2400.h    --save_align_intermeds --save_unaligned    --aligner 'star_salmon' --skip_multiqc

ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml
multiqc -f --config multiqc_config.yaml . 2>&1
rm multiqc_config.yaml

construct DESeqDataSet

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
setwd("~/DATA/Data_Denise_LT_RNASeq/results_GRCh38/star_salmon")
# Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts.
files <- c("control_DI" = "./control_d8_DI/quant.sf",
          "control_DII" = "./control_d8_DII/quant.sf",
          #"control_DII" = "./control_d8_DII_re/quant.sf",
          "LT_DI" = "./LT_d8_DI/quant.sf",
          "LT_DII" = "./LT_d8_DII/quant.sf",
          "LTtr_DI" = "./LTtr_d8_DI/quant.sf",
          "LTtr_DII" = "./LTtr_d8_DII/quant.sf",
          "LT_K331A_DI" = "./LT_K331A_d8_DI/quant.sf",
          #"LT_K331A_DII" = "./LT_K331A_d8_DII/quant.sf",
          "LT_K331A_DII" = "./LT_K331A_d8_DII_re/quant.sf")
# ---- tx-level count data ----
# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
# Define the replicates (or donors) and condition of the samples
#donor <- factor(c("DI", "DII", "DII", "DI", "DII", "DI", "DII", "DI", "DII", "DII"))
#batch <- factor(c("batch1", "batch1", "batch2", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
#condition <- factor(c("control", "control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A", "LT_K331A"))
donor <- factor(c("DI", "DII", "DI", "DII", "DI", "DII", "DI", "DII"))
batch <- factor(c("batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
condition <- factor(c("control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A"))

# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")

# ---- gene-level count data ----
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]
# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
# Continue with the DESeq2 workflow as before...
colData <- data.frame(donor=donor, batch=batch, condition=condition, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~donor+condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
dds <- DESeq(dds)
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
dim(counts(dds))
head(counts(dds), 10)

draw 3D PCA plots.

library(gplots) 
library("RColorBrewer")
library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL  # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
write.csv(merged_df, file="merged_df_8PCs.csv")
summary(pc)  
#0.3311  0.2376 0.1247
#0.3637  0.2564 0.1184 --> 0.36, 026, 0,12
draw_3D.py

draw_3D.py

import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
from scipy.linalg import eigh, sqrtm

# Read in data as a pandas dataframe
#df = pd.DataFrame({
#    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
#    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
#    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
#    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
#    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
#})
df = pd.read_csv('merged_df_8PCs.csv', index_col=0, header=0)
df['condition'] = df['condition'].replace("control", "ctrl d8")
df['condition'] = df['condition'].replace("LTtr", "LTtr d8")
df['condition'] = df['condition'].replace("LT", "LT d8")
df['condition'] = df['condition'].replace("LT_K331A", "LT K331A d8")

# Fit PCA model to reduce data dimensions to 3
pca = PCA(n_components=3)
pca.fit(df.iloc[:, :-3])
X_reduced = pca.transform(df.iloc[:, :-3])

# Add reduced data back to dataframe
df['PC1'] = X_reduced[:, 0]
df['PC2'] = X_reduced[:, 1]
df['PC3'] = X_reduced[:, 2]

# Create PCA plot with 3D scatter
fig = go.Figure()

#['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
# if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol)

#decrease diamond size to 6 while keep the circle as size 10 in the following code:
#'rgb(128, 150, 128)'
#I need three families of colors, always from light to deep, the first one should close to grey.
#the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12' 
#the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8'
#the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3'

condition_color_map_untreated = {'untreated':'black'}
donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'}
#condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
condition_color_map = {
    'ctrl d8': '#A9A9A9',
    'LT d8': '#a6cee3',
    'LT K331A d8': '#1f78b4',
    'LTtr d8': '#e31a1c'
}
donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'}

for donor, donor_symbol in donor_symbol_map_untreated.items():
    for condition, condition_color in condition_color_map_untreated.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                  mode='markers',
                                  name=f'{condition}' if donor == 'DI' else None,
                                  legendgroup=f'{condition}',
                                  showlegend=True if donor == 'DI' else False,
                                  marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))

for donor, donor_symbol in donor_symbol_map.items():
    for condition, condition_color in condition_color_map.items():
        mask = (df['condition'] == condition) & (df['donor'] == donor)
        fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                  mode='markers',
                                  name=f'{condition}' if donor == 'DI' else None,
                                  legendgroup=f'{condition}',
                                  showlegend=True if donor == 'DI' else False,
                                  marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))

for donor, donor_symbol in donor_symbol_map.items():
    fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                              mode='markers',
                              name=donor,
                              legendgroup=f'{donor}',
                              showlegend=True,
                              marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol),
                              hoverinfo='none'))

# Annotations for the legend blocks
fig.update_layout(
    annotations=[
        dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
            text='Condition', font=dict(size=15)),
        dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
            text='Donor', font=dict(size=15)),
        dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
            text='PC3: 12% variance', font=dict(size=15), textangle=-90)
    ],
    scene=dict(
        xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'),
        yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 26% v.'),
        zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title=''),
        bgcolor='white'
    ),
    margin=dict(l=0, r=0, b=0, t=0)  # Adjust the margins to prevent clipping of axis titles
)
#0.3311  0.2376 0.1247
#summary(pc) #--> Proportion of Variance  0.3647 0.1731 0.1515
#percentVar <- round(100 * attr(data, "percentVar"))
#percentVar <- c(36,17,15)

#fig.show()
fig.write_image("PCA_3D.svg")

draw 2D PCA plots.

library(gplots) 
library("RColorBrewer")

#mat <- assay(rld)
#mm <- model.matrix(~condition, colData(rld))
#mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
#assay(rld) <- mat

# -- pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
# -- heatmap --
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

differential expressions

dds$condition <- relevel(dds$condition, "control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_K331A_vs_control", "LT_vs_control", "LTtr_vs_control")

dds$condition <- relevel(dds$condition, "LT")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_K331A_vs_LT")

library(dplyr)
library(tidyverse)
library(biomaRt)
listEnsembl()
listMarts()
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
datasets <- listDatasets(ensembl)
attributes = listAttributes(ensembl)
attributes[1:25,]

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
      filters = 'ensembl_gene_id',
      values = rownames(res), 
      mart = ensembl)
  geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
  res$ENSEMBL = rownames(res)
  identical(rownames(res), geness_uniq$ensembl_gene_id)
  res_df <- as.data.frame(res)
  geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
  dim(geness_res)
  rownames(geness_res) <- geness_res$ensembl_gene_id
  geness_res$ensembl_gene_id <- NULL
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

volcano plots with automatically finding top_g

#sorted_geness_res <- geness_res %>% arrange(desc(log2FoldChange))
#CSF3, ADORA2A, RPS26P6, EHF, TNFRSF6B
#sorted_geness_res <- geness_res %>% arrange(log2FoldChange)

library(ggrepel)
geness_res <- read.csv(file = "LT_K331A_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                          ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:1000],
                geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:1000]))
# Define the original and compressed ranges
original_range <- c(18, 30)
compressed_range <- c(18.0, 22.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res, 
                                  ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                          ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                          ifelse(-log10(padj) > original_range[2], 
                                                -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                -log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 15, by=5)
y_breaks_compressed <- c(18.0, 22.0) 
y_breaks_above <- c(27.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 15, by=5)
y_labels_compressed <- c(18, 30)
y_labels_above <- c(35)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)

# Create the plot
png("LT_K331A_vs_control.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
  geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
  geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
  geom_point(size = 3) +
  labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
  scale_color_identity() +
  geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                  size = 6, 
                  point.padding = 0.15, 
                  color = "black", 
                  min.segment.length = .1, 
                  box.padding = .2, 
                  lwd = 2) + 
  theme_bw(base_size = 22) +
  theme(legend.position = "bottom") +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
  annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
  annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
  scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()

geness_res <- read.csv(file = "LT_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                          ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
# Define the original and compressed ranges
original_range <- c(18, 38)
compressed_range <- c(18.0, 22.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res, 
                                  ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                          ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                          ifelse(-log10(padj) > original_range[2], 
                                                -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                -log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 15, by=5)
y_breaks_compressed <- c(18.0, 22.0) 
y_breaks_above <- c(27.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 15, by=5)
y_labels_compressed <- c(18, 38)
y_labels_above <- c(43)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)

# Create the plot
png("LT_vs_control.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
  geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
  geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
  geom_point(size = 3) +
  labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
  scale_color_identity() +
  geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                  size = 6, 
                  point.padding = 0.15, 
                  color = "black", 
                  min.segment.length = .1, 
                  box.padding = .2, 
                  lwd = 2) + 
  theme_bw(base_size = 22) +
  theme(legend.position = "bottom") +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
  annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
  annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
  scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()

geness_res <- read.csv(file = "LTtr_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                          ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
# Define the original and compressed ranges
original_range <- c(15, 25)
compressed_range <- c(15.0, 18.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res, 
                                  ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                          ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                          ifelse(-log10(padj) > original_range[2], 
                                                -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                -log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 10, by=5)
y_breaks_compressed <- c(15.0, 18.0) 
y_breaks_above <- c(23.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 10, by=5)
y_labels_compressed <- c(15, 25)
y_labels_above <- c(30)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)

# Create the plot
png("LTtr_vs_control.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
  geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
  geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
  geom_point(size = 3) +
  labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
  scale_color_identity() +
  geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                  size = 6, 
                  point.padding = 0.15, 
                  color = "black", 
                  min.segment.length = .1, 
                  box.padding = .2, 
                  lwd = 2) + 
  theme_bw(base_size = 22) +
  theme(legend.position = "bottom") +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
  annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
  annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
  scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()

geness_res <- read.csv(file = "LT_K331A_vs_LT-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                          ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
# Define the original and compressed ranges
original_range <- c(10, 11)
compressed_range <- c(10.0, 11.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res, 
                                  ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                          ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                          ifelse(-log10(padj) > original_range[2], 
                                                -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                -log10(padj))))
# Create the plot
png("LT_K331A_vs_LT.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
  geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
  geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
  geom_point(size = 3) +
  labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
  scale_color_identity() +
  geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                  size = 6, 
                  point.padding = 0.15, 
                  color = "black", 
                  min.segment.length = .1, 
                  box.padding = .2, 
                  lwd = 2) + 
  theme_bw(base_size = 22) +
  theme(legend.position = "bottom")
dev.off()

for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
  echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done

clustering the genes and draw heatmap

install.packages("gplots")
library("gplots")
for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
  echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
  echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done
# 307 LT_K331A_vs_control-down.id
# 667 LT_K331A_vs_control-up.id
# 66 LT_K331A_vs_LT-down.id
# 71 LT_K331A_vs_LT-up.id
# 157 LTtr_vs_control-down.id
# 484 LTtr_vs_control-up.id
# 379 LT_vs_control-down.id
# 749 LT_vs_control-up.id
# 2880 total
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id  #570 genes
RNASeq.NoCellLine <- assay(rld)
# Defining the custom order
column_order <- c(
  "control_DI", "control_DII", "LTtr_DI", "LTtr_DII", "LT_DI", "LT_DII", "LT_K331A_DI", "LT_K331A_DII"
)
RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order]
head(RNASeq.NoCellLine_reordered)
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine_reordered[GOI, ]
write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv")
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.05)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75),
            RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4)
dev.off()
# Extract rows from datamat where the row names match the identifiers in subset_1
#### cluster members #####
subset_1<-names(subset(mycl, mycl == '1'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ])  #579
subset_2<-names(subset(mycl, mycl == '2'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ])  #399
subset_3<-names(subset(mycl, mycl == '3'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ])  #70
subset_4<-names(subset(mycl, mycl == '4'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_4, ])  #433
subset_5<-names(subset(mycl, mycl == '5'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_5, ])  #140  

# Initialize an empty data frame for the annotated data
annotated_data <- data.frame()
# Determine total number of genes
total_genes <- length(rownames(data))
# Loop through each gene to annotate
for (i in 1:total_genes) {
    gene <- rownames(data)[i]
    result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
                    filters = 'ensembl_gene_id',
                    values = gene,
                    mart = ensembl)
    # If multiple rows are returned, take the first one
    if (nrow(result) > 1) {
        result <- result[1, ]
    }
    # Check if the result is empty
    if (nrow(result) == 0) {
        result <- data.frame(ensembl_gene_id = gene,
                            external_gene_name = NA,
                            gene_biotype = NA,
                            entrezgene_id = NA,
                            chromosome_name = NA,
                            start_position = NA,
                            end_position = NA,
                            strand = NA,
                            description = NA)
    }
    # Transpose expression values
    expression_values <- t(data.frame(t(data[gene, ])))
    colnames(expression_values) <- colnames(data)
    # Combine gene information and expression data
    combined_result <- cbind(result, expression_values)
    # Append to the final dataframe
    annotated_data <- rbind(annotated_data, combined_result)
    # Print progress every 100 genes
    if (i %% 100 == 0) {
        cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
    }
}

# Save the annotated data to a new CSV file
write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE)
write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE)
write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE)
write.csv(annotated_data, "cluster4_DARKMAGENTA.csv", row.names=FALSE)
write.csv(annotated_data, "cluster5_DARKCYAN.csv", row.names=FALSE)
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Leave a Reply Cancel reply