RNAseq Analysis LT-K331A-LTtr-control

gene_x 0 like s 689 view s

Tags: python, R, pipeline, RNA-seq

  1. nextflow

    (rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full  -resume   --max_memory 256.GB --max_time 2400.h    --save_align_intermeds --save_unaligned    --aligner 'star_salmon' --skip_multiqc
    
    ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml
    multiqc -f --config multiqc_config.yaml . 2>&1
    rm multiqc_config.yaml
    
  2. construct DESeqDataSet

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    library(tximport)
    library(DESeq2)
    setwd("~/DATA/Data_Denise_LT_RNASeq/results_GRCh38/star_salmon")
    # Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts.
    files <- c("control_DI" = "./control_d8_DI/quant.sf",
              "control_DII" = "./control_d8_DII/quant.sf",
              #"control_DII" = "./control_d8_DII_re/quant.sf",
              "LT_DI" = "./LT_d8_DI/quant.sf",
              "LT_DII" = "./LT_d8_DII/quant.sf",
              "LTtr_DI" = "./LTtr_d8_DI/quant.sf",
              "LTtr_DII" = "./LTtr_d8_DII/quant.sf",
              "LT_K331A_DI" = "./LT_K331A_d8_DI/quant.sf",
              #"LT_K331A_DII" = "./LT_K331A_d8_DII/quant.sf",
              "LT_K331A_DII" = "./LT_K331A_d8_DII_re/quant.sf")
    # ---- tx-level count data ----
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    # Define the replicates (or donors) and condition of the samples
    #donor <- factor(c("DI", "DII", "DII", "DI", "DII", "DI", "DII", "DI", "DII", "DII"))
    #batch <- factor(c("batch1", "batch1", "batch2", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
    #condition <- factor(c("control", "control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A", "LT_K331A"))
    donor <- factor(c("DI", "DII", "DI", "DII", "DI", "DII", "DI", "DII"))
    batch <- factor(c("batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
    condition <- factor(c("control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A"))
    
    # Output raw count data to a CSV file
    write.csv(counts(dds), file="transcript_counts.csv")
    
    # ---- gene-level count data ----
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    # Remove the gene_name column if not needed
    tx2gene <- tx2gene[,1:2]
    # Import and summarize the Salmon data with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    # Continue with the DESeq2 workflow as before...
    colData <- data.frame(donor=donor, batch=batch, condition=condition, row.names=names(files))
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~donor+condition)
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    dds <- DESeq(dds)
    rld <- rlogTransformation(dds)
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    dim(counts(dds))
    head(counts(dds), 10)
    
  3. draw 3D PCA plots.

    library(gplots) 
    library("RColorBrewer")
    library(ggplot2)
    data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
    write.csv(data, file="plotPCA_data.csv")
    #calculate all PCs including PC3 with the following codes
    library(genefilter)
    ntop <- 500
    rv <- rowVars(assay(rld))
    select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    mat <- t( assay(rld)[select, ] )
    pc <- prcomp(mat)
    pc$x[,1:3]
    #df_pc <- data.frame(pc$x[,1:3])
    df_pc <- data.frame(pc$x)
    identical(rownames(data), rownames(df_pc)) #-->TRUE
    data$PC1 <- NULL
    data$PC2 <- NULL
    merged_df <- merge(data, df_pc, by = "row.names")
    #merged_df <- merged_df[, -1]
    row.names(merged_df) <- merged_df$Row.names
    merged_df$Row.names <- NULL  # remove the "name" column
    merged_df$name <- NULL
    merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
    write.csv(merged_df, file="merged_df_8PCs.csv")
    summary(pc)  
    #0.3311  0.2376 0.1247
    #0.3637  0.2564 0.1184 --> 0.36, 026, 0,12
    draw_3D.py
    
  4. draw_3D.py

    import plotly.graph_objects as go
    import pandas as pd
    from sklearn.decomposition import PCA
    import numpy as np
    from scipy.linalg import eigh, sqrtm
    
    # Read in data as a pandas dataframe
    #df = pd.DataFrame({
    #    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
    #    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
    #    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
    #    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
    #    'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
    #})
    df = pd.read_csv('merged_df_8PCs.csv', index_col=0, header=0)
    df['condition'] = df['condition'].replace("control", "ctrl d8")
    df['condition'] = df['condition'].replace("LTtr", "LTtr d8")
    df['condition'] = df['condition'].replace("LT", "LT d8")
    df['condition'] = df['condition'].replace("LT_K331A", "LT K331A d8")
    
    # Fit PCA model to reduce data dimensions to 3
    pca = PCA(n_components=3)
    pca.fit(df.iloc[:, :-3])
    X_reduced = pca.transform(df.iloc[:, :-3])
    
    # Add reduced data back to dataframe
    df['PC1'] = X_reduced[:, 0]
    df['PC2'] = X_reduced[:, 1]
    df['PC3'] = X_reduced[:, 2]
    
    # Create PCA plot with 3D scatter
    fig = go.Figure()
    
    #['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
    # if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol)
    
    #decrease diamond size to 6 while keep the circle as size 10 in the following code:
    #'rgb(128, 150, 128)'
    #I need three families of colors, always from light to deep, the first one should close to grey.
    #the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12' 
    #the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8'
    #the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3'
    
    condition_color_map_untreated = {'untreated':'black'}
    donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'}
    #condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
    condition_color_map = {
        'ctrl d8': '#A9A9A9',
        'LT d8': '#a6cee3',
        'LT K331A d8': '#1f78b4',
        'LTtr d8': '#e31a1c'
    }
    donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'}
    
    for donor, donor_symbol in donor_symbol_map_untreated.items():
        for condition, condition_color in condition_color_map_untreated.items():
            mask = (df['condition'] == condition) & (df['donor'] == donor)
            fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                      mode='markers',
                                      name=f'{condition}' if donor == 'DI' else None,
                                      legendgroup=f'{condition}',
                                      showlegend=True if donor == 'DI' else False,
                                      marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
    
    for donor, donor_symbol in donor_symbol_map.items():
        for condition, condition_color in condition_color_map.items():
            mask = (df['condition'] == condition) & (df['donor'] == donor)
            fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                      mode='markers',
                                      name=f'{condition}' if donor == 'DI' else None,
                                      legendgroup=f'{condition}',
                                      showlegend=True if donor == 'DI' else False,
                                      marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
    
    for donor, donor_symbol in donor_symbol_map.items():
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                                  mode='markers',
                                  name=donor,
                                  legendgroup=f'{donor}',
                                  showlegend=True,
                                  marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol),
                                  hoverinfo='none'))
    
    # Annotations for the legend blocks
    fig.update_layout(
        annotations=[
            dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                text='Condition', font=dict(size=15)),
            dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                text='Donor', font=dict(size=15)),
            dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
                text='PC3: 12% variance', font=dict(size=15), textangle=-90)
        ],
        scene=dict(
            xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'),
            yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 26% v.'),
            zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title=''),
            bgcolor='white'
        ),
        margin=dict(l=0, r=0, b=0, t=0)  # Adjust the margins to prevent clipping of axis titles
    )
    #0.3311  0.2376 0.1247
    #summary(pc) #--> Proportion of Variance  0.3647 0.1731 0.1515
    #percentVar <- round(100 * attr(data, "percentVar"))
    #percentVar <- c(36,17,15)
    
    #fig.show()
    fig.write_image("PCA_3D.svg")
    
  5. draw 2D PCA plots.

    library(gplots) 
    library("RColorBrewer")
    
    #mat <- assay(rld)
    #mm <- model.matrix(~condition, colData(rld))
    #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    #assay(rld) <- mat
    
    # -- pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("condition"))
    dev.off()
    # -- heatmap --
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
  6. differential expressions

    dds$condition <- relevel(dds$condition, "control")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("LT_K331A_vs_control", "LT_vs_control", "LTtr_vs_control")
    
    dds$condition <- relevel(dds$condition, "LT")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("LT_K331A_vs_LT")
    
    library(dplyr)
    library(tidyverse)
    library(biomaRt)
    listEnsembl()
    listMarts()
    ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
    datasets <- listDatasets(ensembl)
    attributes = listAttributes(ensembl)
    attributes[1:25,]
    
    for (i in clist) {
      contrast = paste("condition", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
          filters = 'ensembl_gene_id',
          values = rownames(res), 
          mart = ensembl)
      geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
      res$ENSEMBL = rownames(res)
      identical(rownames(res), geness_uniq$ensembl_gene_id)
      res_df <- as.data.frame(res)
      geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
      dim(geness_res)
      rownames(geness_res) <- geness_res$ensembl_gene_id
      geness_res$ensembl_gene_id <- NULL
      write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
      up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
      down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
  7. volcano plots with automatically finding top_g

    #sorted_geness_res <- geness_res %>% arrange(desc(log2FoldChange))
    #CSF3, ADORA2A, RPS26P6, EHF, TNFRSF6B
    #sorted_geness_res <- geness_res %>% arrange(log2FoldChange)
    
    library(ggrepel)
    geness_res <- read.csv(file = "LT_K331A_vs_control-all.txt", sep=",", row.names=1)
    # Color setting
    geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                              ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
    # Predefined genes colored in green
    predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
    geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
    geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
    top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:1000],
                    geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:1000]))
    # Define the original and compressed ranges
    original_range <- c(18, 30)
    compressed_range <- c(18.0, 22.0)
    # Adjust the p-values based on the ranges
    geness_res$adjusted_pvalue <- with(geness_res, 
                                      ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                              ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                              ifelse(-log10(padj) > original_range[2], 
                                                    -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                    -log10(padj))))
    # Calculate breaks for the y-axis
    y_breaks_below <- seq(0, 15, by=5)
    y_breaks_compressed <- c(18.0, 22.0) 
    y_breaks_above <- c(27.0)
    y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
    y_labels_below <- seq(0, 15, by=5)
    y_labels_compressed <- c(18, 30)
    y_labels_above <- c(35)
    y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
    
    # Create the plot
    png("LT_K331A_vs_control.png", width=1000, height=1000)
    ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
      geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
      geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
      geom_point(size = 3) +
      labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
      scale_color_identity() +
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                      size = 6, 
                      point.padding = 0.15, 
                      color = "black", 
                      min.segment.length = .1, 
                      box.padding = .2, 
                      lwd = 2) + 
      theme_bw(base_size = 22) +
      theme(legend.position = "bottom") +
      annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
      annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
      annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
      annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
      scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
    dev.off()
    
    geness_res <- read.csv(file = "LT_vs_control-all.txt", sep=",", row.names=1)
    # Color setting
    geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                              ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
    # Predefined genes colored in green
    predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
    geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
    geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
    top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                    geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
    # Define the original and compressed ranges
    original_range <- c(18, 38)
    compressed_range <- c(18.0, 22.0)
    # Adjust the p-values based on the ranges
    geness_res$adjusted_pvalue <- with(geness_res, 
                                      ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                              ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                              ifelse(-log10(padj) > original_range[2], 
                                                    -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                    -log10(padj))))
    # Calculate breaks for the y-axis
    y_breaks_below <- seq(0, 15, by=5)
    y_breaks_compressed <- c(18.0, 22.0) 
    y_breaks_above <- c(27.0)
    y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
    y_labels_below <- seq(0, 15, by=5)
    y_labels_compressed <- c(18, 38)
    y_labels_above <- c(43)
    y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
    
    # Create the plot
    png("LT_vs_control.png", width=1000, height=1000)
    ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
      geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
      geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
      geom_point(size = 3) +
      labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
      scale_color_identity() +
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                      size = 6, 
                      point.padding = 0.15, 
                      color = "black", 
                      min.segment.length = .1, 
                      box.padding = .2, 
                      lwd = 2) + 
      theme_bw(base_size = 22) +
      theme(legend.position = "bottom") +
      annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
      annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
      annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
      annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
      scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
    dev.off()
    
    geness_res <- read.csv(file = "LTtr_vs_control-all.txt", sep=",", row.names=1)
    # Color setting
    geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                              ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
    # Predefined genes colored in green
    predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
    geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
    geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
    top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                    geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
    # Define the original and compressed ranges
    original_range <- c(15, 25)
    compressed_range <- c(15.0, 18.0)
    # Adjust the p-values based on the ranges
    geness_res$adjusted_pvalue <- with(geness_res, 
                                      ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                              ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                              ifelse(-log10(padj) > original_range[2], 
                                                    -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                    -log10(padj))))
    # Calculate breaks for the y-axis
    y_breaks_below <- seq(0, 10, by=5)
    y_breaks_compressed <- c(15.0, 18.0) 
    y_breaks_above <- c(23.0)
    y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
    y_labels_below <- seq(0, 10, by=5)
    y_labels_compressed <- c(15, 25)
    y_labels_above <- c(30)
    y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
    
    # Create the plot
    png("LTtr_vs_control.png", width=1000, height=1000)
    ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
      geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
      geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
      geom_point(size = 3) +
      labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
      scale_color_identity() +
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                      size = 6, 
                      point.padding = 0.15, 
                      color = "black", 
                      min.segment.length = .1, 
                      box.padding = .2, 
                      lwd = 2) + 
      theme_bw(base_size = 22) +
      theme(legend.position = "bottom") +
      annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
      annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
      annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
      annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
      scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
    dev.off()
    
    geness_res <- read.csv(file = "LT_K331A_vs_LT-all.txt", sep=",", row.names=1)
    # Color setting
    geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", 
                              ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
    # Predefined genes colored in green
    predefined_genes <- c()  #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
    geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
    geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
    top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
                    geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
    # Define the original and compressed ranges
    original_range <- c(10, 11)
    compressed_range <- c(10.0, 11.0)
    # Adjust the p-values based on the ranges
    geness_res$adjusted_pvalue <- with(geness_res, 
                                      ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
                                              ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
                                              ifelse(-log10(padj) > original_range[2], 
                                                    -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
                                                    -log10(padj))))
    # Create the plot
    png("LT_K331A_vs_LT.png", width=1000, height=1000)
    ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + 
      geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +  
      geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +     
      geom_point(size = 3) +
      labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + 
      scale_color_identity() +
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), 
                      size = 6, 
                      point.padding = 0.15, 
                      color = "black", 
                      min.segment.length = .1, 
                      box.padding = .2, 
                      lwd = 2) + 
      theme_bw(base_size = 22) +
      theme(legend.position = "bottom")
    dev.off()
    
    for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
      echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
    done
    
  8. clustering the genes and draw heatmap

    install.packages("gplots")
    library("gplots")
    for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
      echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
      echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
    done
    # 307 LT_K331A_vs_control-down.id
    # 667 LT_K331A_vs_control-up.id
    # 66 LT_K331A_vs_LT-down.id
    # 71 LT_K331A_vs_LT-up.id
    # 157 LTtr_vs_control-down.id
    # 484 LTtr_vs_control-up.id
    # 379 LT_vs_control-down.id
    # 749 LT_vs_control-up.id
    # 2880 total
    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id  #570 genes
    RNASeq.NoCellLine <- assay(rld)
    # Defining the custom order
    column_order <- c(
      "control_DI", "control_DII", "LTtr_DI", "LTtr_DII", "LT_DI", "LT_DII", "LT_K331A_DI", "LT_K331A_DII"
    )
    RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order]
    head(RNASeq.NoCellLine_reordered)
    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    datamat = RNASeq.NoCellLine_reordered[GOI, ]
    write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv")
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.05)
    mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
    mycol = mycol[as.vector(mycl)]
    png("DEGs_heatmap.png", width=900, height=1010)
    heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                scale='row',trace='none',col=bluered(75),
                RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4)
    dev.off()
    # Extract rows from datamat where the row names match the identifiers in subset_1
    #### cluster members #####
    subset_1<-names(subset(mycl, mycl == '1'))
    data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ])  #579
    subset_2<-names(subset(mycl, mycl == '2'))
    data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ])  #399
    subset_3<-names(subset(mycl, mycl == '3'))
    data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ])  #70
    subset_4<-names(subset(mycl, mycl == '4'))
    data <- as.data.frame(datamat[rownames(datamat) %in% subset_4, ])  #433
    subset_5<-names(subset(mycl, mycl == '5'))
    data <- as.data.frame(datamat[rownames(datamat) %in% subset_5, ])  #140
    
    # Initialize an empty data frame for the annotated data
    annotated_data <- data.frame()
    # Determine total number of genes
    total_genes <- length(rownames(data))
    # Loop through each gene to annotate
    for (i in 1:total_genes) {
        gene <- rownames(data)[i]
        result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
                        filters = 'ensembl_gene_id',
                        values = gene,
                        mart = ensembl)
        # If multiple rows are returned, take the first one
        if (nrow(result) > 1) {
            result <- result[1, ]
        }
        # Check if the result is empty
        if (nrow(result) == 0) {
            result <- data.frame(ensembl_gene_id = gene,
                                external_gene_name = NA,
                                gene_biotype = NA,
                                entrezgene_id = NA,
                                chromosome_name = NA,
                                start_position = NA,
                                end_position = NA,
                                strand = NA,
                                description = NA)
        }
        # Transpose expression values
        expression_values <- t(data.frame(t(data[gene, ])))
        colnames(expression_values) <- colnames(data)
        # Combine gene information and expression data
        combined_result <- cbind(result, expression_values)
        # Append to the final dataframe
        annotated_data <- rbind(annotated_data, combined_result)
        # Print progress every 100 genes
        if (i %% 100 == 0) {
            cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
        }
    }
    
    # Save the annotated data to a new CSV file
    write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE)
    write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE)
    write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE)
    write.csv(annotated_data, "cluster4_DARKMAGENTA.csv", row.names=FALSE)
    write.csv(annotated_data, "cluster5_DARKCYAN.csv", row.names=FALSE)
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum