GSVA calculation for Mass spectrometry (MS)-based proteomics

gene_x 0 like s 1269 view s
Tags: R, processing, protein
library("rmarkdown")
library("tidyverse")
library(rmarkdown)
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_MS/LIMMA-pipeline-proteomics/Results_20231006_165122")

# -1. prepare and read signatures (namely gene sets)
#Monocytes.csv #66
#Monocytes2.csv #201
#Neutrophils.csv #40
#Neutrophils2.csv #61
#~/Tools/csv2xls-0.4/csv_to_xls.py Alt._complement.csv Anergic_or_act._T_cells.csv Anti-inflammation.csv B_cells.csv CD40_activated.csv CD8T-NK-NKT.csv Cell_cycle.csv Classical_complement.csv Cyt._act._T_cells.csv Dendritic_cells.csv Erythrocytes.csv Granulocytes.csv IFN.csv IG_chains.csv IL-1_cytokines.csv IL-6R_complex.csv Inflammasome.csv LDG.csv Lectin_complement.csv MHC_II.csv Monocytes.csv Monocyte_cell_surface.csv Monocyte_secreted.csv Myeloid_cells.csv Neutrophils.csv NK_cells.csv NLRP3_inflammasome.csv pDC.csv Plasma_cells.csv Platelets.csv Pro-inflam._IL-1.csv SNOR_low_DOWN.csv SNOR_low_UP.csv TCRA.csv TCRB.csv TCRD.csv TCRG.csv TNF.csv T_activated.csv T_cells.csv T_regs.csv Unfolded_protein.csv  Inflammatory_neutrophils.csv Suppressive_neutrophils.csv Apoptosis.csv NFkB_complex.csv -d',' -o Signatures_46.xls

library(readxl)
file_path <- "../../Signatures_46.xls"
# Get the names of the sheets
sheet_names <- excel_sheets(file_path)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
  # Read the sheet
  data <- read_excel(file_path, sheet = sheet)
  # Process the GeneSet names (replacing spaces with underscores, for example)
  gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
  # Add ENSEMBL IDs to the list
  geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}
# Print the result to check
print(geneSets)
length(geneSets)

# 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
library(gridExtra)
library(ggplot2)
library(GSVA)
exprs_ <- read.csv(sep="\t", file="20231006_165122_final_data.tsv")

# List of desired columns
desired_columns <- c("CA1", "CA2", "CA3", "CA10", "CA5", "CA6", "CA8", "CA14", "CA12", "CA13", "Con1", "Con2", "Con3", "Con4", "Con5", "Con6")
# # Find duplicated symbols
# duplicated_symbols <- exprs_$Symbol[duplicated(exprs_$Symbol) | duplicated(exprs_$Symbol, fromLast = TRUE)]
# print(duplicated_symbols)
# Filter out rows with empty 'Symbol' values
exprs_ <- exprs_[exprs_$Symbol != "", ]
# Subset the dataframe
exprs <- exprs_[, desired_columns]
# Set the 'Symbol' column as row names
rownames(exprs) <- exprs_$Symbol
exprs_matrix <- as.matrix(exprs)

# 1. Compute GSVA scores:
library(GSVA)
gsva_scores <- gsva(exprs_matrix, geneSets, method="gsva", useNames=TRUE)

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
gsva_df$Condition <- c("COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","control","control","control","control","control","control")

# 4. Filter the gsva_df to retain only the desired conditions:

# 5. Define a function to plot violin plots:
gsva_df$Condition <- factor(gsva_df$Condition, levels = c("control", "COVID"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "control", gene_name]
  condition2_data <- data[data$Condition == "COVID", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value
  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))
  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )
  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
  return(p)
}

# 6. Generate the list of plots in a predefined order:
# [1] "Platelets"                "Granulocytes"            
# [3] "LDG"                      "pDC"                     
# [5] "Anti-inflammation"        "Pro-inflam._IL-1"        
# [7] "Dendritic_cells"          "MHC_II"                  
# [9] "Alt._complement"          "TNF"                     
#[11] "NLRP3_inflammasome"       "Unfolded_protein"        
#[13] "B_cells"                  "Monocyte_cell_surface"   
#[15] "Inflammasome"             "Monocyte_secreted"       
#[17] "IL-1_cytokines"           "SNOR_low_UP"             
#[19] "CD40_activated"           "Lectin_complement"       
#[21] "Classical_complement"     "Cell_cycle"              
#[23] "Plasma_cells"             "IG_chains"               
#[25] "Erythrocytes"             "IL-6R_complex"           
#[27] "IFN"                      "TCRB"                    
#[29] "TCRA"                     "Cyt._act._T_cells"       
#[31] "TCRG"                     "T_cells"                 
#[33] "CD8T-NK-NKT"              "Anergic_or_act._T_cells" 
#[35] "T_activated"              "Neutrophils"             
#[37] "NK_cells"                 "TCRD"                    
#[39] "T_regs"                   "SNOR_low_DOWN"           
#[41] "Monocytes"                "Myeloid_cells"           
#[43] "Inflammatory_neutrophils" "Suppressive_neutrophils" 
#[45] "Apoptosis"                "NFkB_complex"
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "Neutrophils", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells", "Inflammatory_neutrophils", "Suppressive_neutrophils", "Apoptosis", "NFkB_complex")
genes <- colnames(gsva_df)[!colnames(gsva_df) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
filtered_genes <- genes[!is.na(genes)]
print(filtered_genes)
plots_list <- lapply(filtered_genes, function(filtered_genes) plot_violin(gsva_df, filtered_genes))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("Violin_Plots_GSVA_MS.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()

# 10. Save GSVA scores as Excel-file.
library(writexl)
write_xlsx(gsva_df, "GSVA_Scores_MS.xlsx")
like unlike
点赞本文的读者
还没有人对此文章表态
本文有评论

没有评论
GSVA calculation for Mass spectrometry (MS)-based proteomics

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章