GSVA calculation for NanoString data

gene_x 0 like s 1036 view s
Tags: R, sequencing, pipeline
library("rmarkdown")
library("tidyverse")
library(rmarkdown)
library("GeomxTools")
library("GeoMxWorkflows")
library("NanoStringNCTools")
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023")
render("run.Rmd", c("html_document"))



# ------------------------ Violin Presentation 1 --------------------
#identical(exprs(target_m666Data), assay(target_m666Data, "exprs")): contains the gene expression values that have been both normalized and log-transformed, which are often used for downstream analysis and visualization in many genomic studies.

#assay(target_m666Data, "log_q"): These are likely log2-transformed values of the quantile-normalized data. Quantile normalization ensures that the distributions of gene expression values across samples are the same, and taking the log2 of these normalized values is a common step in the analysis of high-throughput gene expression data. It's a way to make the data more suitable for downstream statistical analysis and visualization.

#assay(target_m666Data, "q_norm"): This typically represents the quantile-normalized gene expression values. Quantile normalization adjusts the expression values in each sample so that they have the same distribution. This normalization method helps remove systematic biases and makes the data more comparable across samples.

library(readxl)

# Path to the Excel file
file_path <- "Signatures.xls"

#example of a signature:
#geneSymbol geneEntrezID    ENSEMBL GeneSet
#CD160  11126   ENSG00000117281 Anergic or act. T cells
#CD244  51744   ENSG00000122223 Anergic or act. T cells
#CTLA4  1493    ENSG00000163599 Anergic or act. T cells
#HAVCR2 84868   ENSG00000135077 Anergic or act. T cells
#ICOS   29851   ENSG00000163600 Anergic or act. T cells
#KLRG1  10219   ENSG00000139187 Anergic or act. T cells
#LAG3   3902    ENSG00000089692 Anergic or act. T cells
#PDCD1  5133    ENSG00000188389 Anergic or act. T cells
#PDCD1  5133    ENSG00000276977 Anergic or act. T cells

# Get the names of the sheets
sheet_names <- excel_sheets(file_path)

# Initialize an empty list to hold gene sets
geneSets <- list()

# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
  # Read the sheet
  data <- read_excel(file_path, sheet = sheet)

  # Process the GeneSet names (replacing spaces with underscores, for example)
  gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])

  # Add ENSEMBL IDs to the list
  geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}

# Print the result to check
print(geneSets)

library(gridExtra)
library(ggplot2)
library(GSVA)

# 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
exprs <- exprs(target_m666Data)
#exprs <- exprs(filtered_or_neg_target_m666Data)

# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "Group1", gene_name]
  condition2_data <- data[data$Condition == "Group3", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value

  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))

  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )

  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)

  return(p)
}

# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated",  "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes",  "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 7. Pad the list of plots:
#plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
#remaining_plots <- plots_needed_for_full_grid - length(plots_list)
#plots_list <- c(plots_list, rep(list(NULL), remaining_plots))

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("All_Violin_Plots_NanoString_3_vs_1.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()


# ------------------------ Violin Presentation 2 --------------------

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "Group1a", gene_name]
  condition2_data <- data[data$Condition == "Group3", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value

  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))

  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )

  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)

  return(p)
}

# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated",  "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes",  "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 7. Pad the list of plots:
#plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
#remaining_plots <- plots_needed_for_full_grid - length(plots_list)
#plots_list <- c(plots_list, rep(list(NULL), remaining_plots))

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("All_Violin_Plots_NanoString_3_vs_1a.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()




# ------------------------ Heatmap Presentation --------------------

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))


# Filter the data for "Group1" samples
group1_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1", genes]

# Set the median values for each column in the data.frame
# Calculate median values for each column
group1_medians <- apply(group1_data, 2, median)



# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "1b", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("1b", "Group1b", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group1b", "Group3"))

# Filter the data for "Group1a" samples
group1a_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1a", genes]
group1a_medians <- apply(group1a_data, 2, median)

# Filter the data for "Group1b" samples
group1b_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1b", genes]
group1b_medians <- apply(group1b_data, 2, median)

# Filter the data for "Group3" samples
group3_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group3", genes]
group3_medians <- apply(group3_data, 2, median)

# Create a new data frame with the middle values for both groups
heatmap_data_nanostring <- data.frame(Group1 = group1_medians, Group1a = group1a_medians, Group1b = group1b_medians, Group3 = group3_medians)

# Merge two heatmap_data_* to a matrix
heatmap_data <- merge(heatmap_data_nanostring, heatmap_data_rnaseq, by="row.names", all=TRUE)
rownames(heatmap_data) <- heatmap_data$Row.names
heatmap_data$Row.names <- NULL

# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)

# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)

# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)

# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)

# Specify heatmap colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(50)

#TODO: adjust the color, so that it is not so dark and light!!!!, add RNAseq data for mock and infection!!
# Create the heatmap without hierarchical clustering
library(gplots)
png("Heatmap.png", width=1000, height=1000)
heatmap.2(
  heatmap_matrix,
  Colv = FALSE,  # Turn off column dendrogram
  Rowv = FALSE,  # Turn off row dendrogram
  trace = "none",  # Turn off row and column dendrograms
  scale = "row",   # Scale the rows
  col = color_scheme,
  main = "GSVA Heatmap",  # Title of the heatmap
  key = TRUE,            # Show color key
  key.title = "GSVA Score",  # Color key title
  key.xlab = NULL,       # Remove x-axis label from color key
  density.info = "none",  # Remove density plot
  cexRow = 1.2,  # Increase font size of row names
  cexCol = 1.2,  # Increase font size of column names
  srtCol = 40, keysize = 0.72, margins = c(5, 14)
)
dev.off()

# Write the table to Excel-file
library(writexl)
# Specify the file path where you want to save the Excel file
excel_file <- "heatmap_data.xlsx"

# Save the data frame to the Excel file
write_xlsx(heatmap_data, excel_file)

# Verify that the Excel file has been saved
if (file.exists(excel_file)) {
  cat("Data frame has been saved to", excel_file, "\n")
} else {
  cat("Error: Failed to save the data frame to", excel_file, "\n")
}
like unlike
点赞本文的读者
还没有人对此文章表态
本文有评论

没有评论
GSVA calculation for NanoString data

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章