GSVA plots & tables for RNA-seq and NanoString

preparing gene expression matrix: calculate DESeq2 results

 #Input: merged_gene_counts.txt

 setwd("/home/jhuang/DATA/Data_Susanne_Carotis_RNASeq/run_2023_GSVA_table/")

 library("AnnotationDbi")
 library("clusterProfiler")
 library("ReactomePA")
 #BiocManager::install("org.Hs.eg.db")
 library("org.Hs.eg.db")
 library(DESeq2)
 library(gplots)

 d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1)

 colnames(d.raw)<-c("gene_name", "leer_mock_2h_r2", "Ace2_mock_2h_r2", "leer_inf_24h_r1", "Ace2_inf_2h_r1", "leer_inf_24h_r2", "leer_inf_2h_r1", "leer_mock_2h_r1", "leer_inf_2h_r2", "Ace2_inf_2h_r2", "Ace2_mock_2h_r1", "Ace2_inf_24h_r2", "Ace2_inf_24h_r1")

 col_order <- c("gene_name", "leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2")

 reordered.raw <- d.raw[,col_order]
 reordered.raw$gene_name <- NULL
 #d <- d.raw[rowSums(reordered.raw>3)>2,]

 condition = as.factor(c("leer_mock_2h","leer_mock_2h","leer_inf_2h","leer_inf_2h","leer_inf_24h","leer_inf_24h","Ace2_mock_2h","Ace2_mock_2h","Ace2_inf_2h","Ace2_inf_2h","Ace2_inf_24h","Ace2_inf_24h"))
 ids = as.factor(c("leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2"))

 #cData = data.frame(row.names=colnames(reordered.raw), condition=condition,  batch=batch, ids=ids)
 #dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~batch+condition)
 cData = data.frame(row.names=colnames(reordered.raw), condition=condition, ids=ids)
 dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~condition)

 #----more detailed and specific with the following code!----
 dds$condition <- relevel(dds$condition, "Ace2_mock_2h")
 dds = DESeq(dds, betaPrior=FALSE)  # betaPrior default value is FALSE
 resultsNames(dds)

preparing selected_geneSets in gsva(exprs, selected_geneSets, method=”gsva”). Note that methods are different than methods for nanoString, here are ENSEMBL listed.

 #Input: "Signatures.xls" + "Signatures_additional.xls"
 library(readxl)
 library(gridExtra)
 library(ggplot2)
 library(GSVA)
 # Paths to the Excel files
 file_paths <- list("Signatures.xls", "Signatures_additional.xls")
 # Get sheet names for each file
 sheet_names_list <- lapply(file_paths, excel_sheets)

 # Initialize an empty list to hold gene sets
 geneSets <- list()
 # Loop over each file path and its corresponding sheet names
 for (i in 1:length(file_paths)) {
   file_path <- file_paths[[i]]
   sheet_names <- sheet_names_list[[i]]
   # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
   for (sheet in sheet_names) {
     # Read the sheet
     data <- read_excel(file_path, sheet = sheet)

     # Process the GeneSet names (replacing spaces with underscores, for example)
     gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])

     # Add ENSEMBL IDs to the list
     geneSets[[gene_set_name]] <- as.character(data$ENSEMBL)
   }
 }

 # Print the result to check
 #print(geneSets)
 #summary(geneSets)
 ##desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
 #desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome")
 #selected_geneSets <- geneSets[desired_geneSets]
 selected_geneSets <- geneSets
 # Print the selected gene sets
 print(selected_geneSets)

prepare violin plots and p-value table for RNA-seq data

 # 0. for RNAseq, the GSVA input requires a gene expression matrix where rows are genes and columns are samples. This matrix must be in non-log space.
 exprs <- counts(dds, normalized=TRUE)

 # 1. Compute GSVA scores:
 gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")

 # 2. Convert to data.frame for ggplot:
 gsva_df <- as.data.frame(t(gsva_scores))

 # 3. Add conditions to gsva_df:
 gsva_df$Condition <- dds$condition

 # 4. Filter the gsva_df to retain only the desired conditions:
 #group 1 vs. group 3 in the nanostring data
 gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]

 # 5. Define a function to plot violin plots:
 # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
 gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "Group3", gsva_df_filtered$Condition)  #group3=mock
 gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "Group1a", gsva_df_filtered$Condition)  #group1a=infection
 gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))

 #plot_violin <- function(data, gene_name) {
 #  # Calculate the t-test p-value for the two conditions
 #  condition1_data <- data[data$Condition == "Group1a", gene_name]
 #  condition2_data <- data[data$Condition == "Group3", gene_name]
 #  p_value <- t.test(condition1_data, condition2_data)$p.value
 #  # Convert p-value to annotation
 #  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
 #  rounded_p_value <- paste0("p = ", round(p_value, 2))
 #  plot_title <- gsub("_", " ", gene_name)
 #  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
 #    geom_violin(linewidth=1.2) + 
 #    scale_fill_manual(values = custom_colors) +
 #    labs(title=plot_title, y="GSVA Score") +
 #    ylim(-1, 1) +
 #    theme_light() +
 #    theme(
 #      axis.title.x = element_text(size=12),
 #      axis.title.y = element_text(size=12),
 #      axis.text.x  = element_text(size=10),
 #      axis.text.y  = element_text(size=10),
 #      plot.title   = element_text(size=12, hjust=0.5),
 #      legend.position = "none" # Hide legend since the colors are self-explanatory
 #    )
 #  # Add p-value annotation to the plot
 #  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
 #  return(p)
 #}
 ## 6. Generate the list of plots in a predefined order:
 #genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
 #genes <- genes[match(desired_order, genes)]
 #genes <- genes[!is.na(genes)]
 #first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

 # -- This following code does not have the colors in the figure --
 plot_violin <- function(data, gene_name) {
   # Calculate the t-test p-value for the two conditions
   condition1_data <- data[data$Condition == "Group1a", gene_name]
   condition2_data <- data[data$Condition == "Group3", gene_name]
   p_value <- t.test(condition1_data, condition2_data)$p.value

   # Convert p-value to annotation
   p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
   rounded_p_value <- paste0("p = ", round(p_value, 2))

   plot_title <- gsub("_", " ", gene_name)
   p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
     geom_violin(linewidth=1.2) + 
     labs(title=plot_title, y="GSVA Score") +
     ylim(-1, 1) +
     theme_light() +
     theme(
       axis.title.x = element_text(size=12),
       axis.title.y = element_text(size=12),
       axis.text.x  = element_text(size=10),
       axis.text.y  = element_text(size=10),
       plot.title   = element_text(size=12, hjust=0.5)
     )

   # Add p-value annotation to the plot
   p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)

   #return(p)

   # Return both the plot and the p-value
   list(plot = p, p_value = p_value)
 }

 # 6. Generate the list of plots in a predefined order:
 #desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome")
 genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
 #genes <- genes[match(desired_order, genes)]
 #first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

 # Correct the creation of p_values_df
 p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
 p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))

 # Calculate adjusted p-values
 #p_values_df$Adjusted_P_Value <- p.adjust(p_values_df$P_Value, method = "fdr")

 write.xlsx(p_values_df, "p_values_df.xlsx")

5.1. preparing selected_geneSets in gsva(exprs, selected_geneSets, method=”gsva”) for NanoString

    #Input: Signatures.xls

    library(readxl)
    library(gridExtra)
    library(ggplot2)
    library(GSVA)

    # Paths to the Excel files
    file_paths <- list("Signatures.xls", "Signatures_additional.xls")

    # Get sheet names for each file
    sheet_names_list <- lapply(file_paths, excel_sheets)

    # Initialize an empty list to hold gene sets
    geneSets <- list()

    # Loop over each file path and its corresponding sheet names
    for (i in 1:length(file_paths)) {
      file_path <- file_paths[[i]]
      sheet_names <- sheet_names_list[[i]]

      # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
      for (sheet in sheet_names) {
        # Read the sheet
        data <- read_excel(file_path, sheet = sheet)

        # Process the GeneSet names (replacing spaces with underscores, for example)
        gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])

        # Add ENSEMBL IDs to the list
        geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
      }
    }

    # Print the result to check
    summary(geneSets)

    #desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
    desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome", "T_cells",    "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
    selected_geneSets <- geneSets[desired_geneSets]
    # Print the selected gene sets
    print(selected_geneSets)

5.2. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1)

    #library(rmarkdown); render("run.Rmd", c("html_document")) under jhuang@hamburg:~/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023_2_GSVA_table with R

    # 0. for Nanostring, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples. This matrix must be in non-log space.
    exprs <- exprs(target_m666Data)  #18677    45
    #exprs <- exprs(filtered_or_neg_target_m666Data)  #2274   45
    # 1. Compute GSVA scores:
    gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
    # 2. Convert to data.frame for ggplot:
    gsva_df <- as.data.frame(t(gsva_scores))
    # 3. Add conditions to gsva_df:
    identical(rownames(pData(target_m666Data)), rownames(gsva_df))
    gsva_df$Condition <- pData(target_m666Data)$Grp
    #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
    gsva_df$SampleID <- pData(target_m666Data)$SampleID
    # 4. Filter the gsva_df to retain only the desired conditions:
    #group 1 vs. group 3 in the nanostring data
    gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
    # 5. Define a function to plot violin plots:
    # Define custom colors
    custom_colors <- c("Group1" = "lightblue", "Group1a" = "red", "Group3" = "grey")
    #To implement the custom colors, and make the adjustments to abbreviate "Inflammatory" and "Suppressive", as well as increase the font size for the groups on the x-axis, we can modify the plot_violin function as follows:
    gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
    plot_violin <- function(data, gene_name) {
      # Calculate the t-test p-value for the two conditions
      condition1_data <- data[data$Condition == "Group1", gene_name]
      condition2_data <- data[data$Condition == "Group3", gene_name]
      p_value <- t.test(condition1_data, condition2_data)$p.value
      # Convert p-value to annotation
      p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
      rounded_p_value <- paste0("p = ", round(p_value, 2))
      plot_title <- gsub("_", " ", gene_name)
      p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
        geom_violin(linewidth=1.2) + 
        scale_fill_manual(values = custom_colors) +
        labs(title=plot_title, y="GSVA Score") +
        ylim(-1, 1) +
        theme_light() +
        theme(
          axis.title.x = element_text(size=12),
          axis.title.y = element_text(size=12),
          axis.text.x  = element_text(size=10),
          axis.text.y  = element_text(size=10),
          plot.title   = element_text(size=12, hjust=0.5),
          legend.position = "none" # Hide legend since the colors are self-explanatory
        )
      # Add p-value annotation to the plot
      p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
      #return(p)

      # Return both the plot and the p-value
      list(plot = p, p_value = p_value)
    }

    library(openxlsx)

    desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome", "T_cells",    "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
    genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    genes <- genes[match(desired_order, genes)]
    genes <- genes[!is.na(genes)]
    #second_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

    # Correct the creation of p_values_df
    p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
    p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))

    write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1_19sig.xlsx")

5.3. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1a)

    # 2. Convert to data.frame for ggplot:
    gsva_df <- as.data.frame(t(gsva_scores))
    # 3. Add conditions to gsva_df:
    identical(rownames(pData(target_m666Data)), rownames(gsva_df))
    gsva_df$Condition <- pData(target_m666Data)$Group
    #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
    gsva_df$SampleID <- pData(target_m666Data)$SampleID
    # 4. Filter the gsva_df to retain only the desired conditions:
    #group 1 vs. group 3 in the nanostring data
    gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]
    # 5. Define a function to plot violin plots:
    # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
    gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
    plot_violin <- function(data, gene_name) {
      # Calculate the t-test p-value for the two conditions
      condition1_data <- data[data$Condition == "Group1a", gene_name]
      condition2_data <- data[data$Condition == "Group3", gene_name]
      p_value <- t.test(condition1_data, condition2_data)$p.value
      # Convert p-value to annotation
      p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
      rounded_p_value <- paste0("p = ", round(p_value, 2))
      plot_title <- gsub("_", " ", gene_name)
      p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
        geom_violin(linewidth=1.2) + 
        scale_fill_manual(values = custom_colors) +
        labs(title=plot_title, y="GSVA Score") +
        ylim(-1, 1) +
        theme_light() +
        theme(
          axis.title.x = element_text(size=12),
          axis.title.y = element_text(size=12),
          axis.text.x  = element_text(size=10),
          axis.text.y  = element_text(size=10),
          plot.title   = element_text(size=12, hjust=0.5),
          legend.position = "none" # Hide legend since the colors are self-explanatory
        )
      # Add p-value annotation to the plot
      p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
      #return(p)

      # Return both the plot and the p-value
      list(plot = p, p_value = p_value)
    }
    # 6. Generate the list of plots in a predefined order:
    genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    genes <- genes[match(desired_order, genes)]
    genes <- genes[!is.na(genes)]

    # Correct the creation of p_values_df
    p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
    p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))

    write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1a_19sig.xlsx")

M	T	W	T	F	S	S
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

GSVA plots & tables for RNA-seq and NanoString

Leave a Reply Cancel reply