Functional Clustering of Genes Based on COG Terms Using Eggnog and Blast2GO

COG_relative_occurrence

  1. Generate protein fasta

     python3 getProteinSequences_1.py > Proximity_labelling.fasta
     python3 getProteinSequences_2.py > ALFA_pulldown.fasta
     python3 getProteinSequences_3.py > schnittmenge.fasta
  2. Using eggnog_env to generate the intial annotations

     mamba activate eggnog_env
     emapper.py -i Proximity_labelling.fasta -o eggnog_out_Proximity_labelling --cpu 60  #--resume
     emapper.py -i ALFA_pulldown.fasta -o eggnog_out_ALFA_pulldown --cpu 60
     emapper.py -i schnittmenge.fasta -o eggnog_out_schnittmenge --cpu 60
  3. Using BLAST2GO to generate GO terms and EC terms

  4. Using R to merge the two files blast2goannot.annot2 and eggnog_out.emapper.annotations.txt

     #mamba activate r_env
     # PREPARING go_terms and ec_terms: annot_* file: cut -f1-2 -d$'\t' blast2go_annot.annot2 > blast2go_annot.annot2_
     # PREPARING cp eggnog_out.emapper.annotations eggnog_out.emapper.annotations.txt. Note that the first last 3 lines starting with "#" should be removed; '#query' to query
    
     # Load required libraries
     library(openxlsx)  # For Excel file handling
     library(dplyr)     # For data manipulation
     library(clusterProfiler)  # For KEGG and GO enrichment analysis
     #library(org.Hs.eg.db)  # Replace with appropriate organism database
     #BiocManager::install("GO.db")
     #BiocManager::install("AnnotationDbi")
     library(GO.db)
     library(AnnotationDbi)
    
     # Step 1: Load the blast2go annotation file with a check for missing columns
     annot_df <- read.table("/home/jhuang/b2gWorkspace_schnittmenge/blast2go_annot.annot2_",
                         header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
     # If the structure is inconsistent, we can make sure there are exactly 3 columns:
     colnames(annot_df) <- c("GeneID", "Term")
     # Step 2: Filter and aggregate GO and EC terms as before
     go_terms <- annot_df %>%
     filter(grepl("^GO:", Term)) %>%
     group_by(GeneID) %>%
     summarize(GOs = paste(Term, collapse = ","), .groups = "drop")
     ec_terms <- annot_df %>%
     filter(grepl("^EC:", Term)) %>%
     group_by(GeneID) %>%
     summarize(EC = paste(Term, collapse = ","), .groups = "drop")
    
     eggnog_data <- read.delim("~/DATA/Data_Michelle_ProteinClustering/eggnog_out_schnittmenge.emapper.annotations.txt", header = TRUE, sep = "\t")
    
     # Perform the left joins and rename columns
     eggnog_data_updated <- eggnog_data %>%
     left_join(go_terms, by = "GeneID") %>%
     left_join(ec_terms, by = "GeneID")
     #%>% select(-EC.x, -GOs.x)
     #%>% rename(EC = EC.y, GOs = GOs.y)
    
     # Create a new workbook
     wb <- createWorkbook()
     # Add the complete dataset as the first sheet (with annotations)
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", eggnog_data_updated)
     # Save the workbook to a file
     saveWorkbook(wb, "Gene_with_Annotations_schnittmenge.xlsx", overwrite = TRUE)
    
     #Manually DELETE GOs.x and EC.x and rename GOs.y to GOs, EC.y to EC.
  5. Draw group bar plot for relative occurrence

     import matplotlib.pyplot as plt
     import pandas as pd
     import numpy as np
    
     # COG annotations for the two gene sets
     proximity_labelling = [
         'D', 'S', 'K', 'S', 'T', 'S', 'S', 'F', 'E', 'J', 'G', 'C', 'C', 'S', 'I', 'L', 'I', 'S', 'S', 'G', 'O', 'H', 'IQ', 'L', 'C', 'H', 'C', 'NU', 'O', 'C',
         'M', 'S', 'IQ', 'G', 'S', 'M', 'M', '-', 'M', 'S', 'J', 'S', 'IQ', 'K', 'E', 'J', 'K', 'E', 'J', 'S', 'K', 'E', 'S', 'J', 'S', 'V', 'C', 'I', 'K', 'IQ',
         'M', 'S', 'T', 'P', 'C', 'J', 'S', 'E', 'C', 'K', 'H', 'S', 'H', 'H'
     ]
    
     alfa_pulldown = [
         'K', 'S', 'P', 'K', 'C', 'K', 'I', 'O', '-', 'F', 'C', 'P', 'HQ', '-', 'L', 'L', 'S', 'G', 'IQ', 'S'
     ]
    
     # Define all possible COG categories (including the combinations)
     all_cog_categories_ = ['J','A', 'K', 'L', 'B', 'D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O', 'C','G', 'E', 'F', 'H', 'I', 'P', 'Q', 'R', 'S']
     all_cog_categories = all_cog_categories_[::-1]
    
     # Define functional groups for COG categories
     functional_groups = {
         "INFORMATION STORAGE AND PROCESSING": ['J', 'A', 'K', 'L', 'B'],
         "CELLULAR PROCESSES AND SIGNALING": ['D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O'],
         "METABOLISM": ['C', 'G', 'E', 'F', 'H', 'I', 'P', 'Q'],
         "POORLY CHARACTERIZED": ['R', 'S']
     }
    
     # Function to count occurrences of each COG category in a gene set, handling combinations
     def count_cog_categories(gene_set):
         counts = {cog: gene_set.count(cog) for cog in all_cog_categories}
    
         # Treat combinations (e.g., IQ, HQ, NU) as belonging to both respective categories
         combination_categories = {
             'IQ': ['I', 'Q'],
             'HQ': ['H', 'Q'],
             'NU': ['N', 'U']
         }
    
         for comb, categories in combination_categories.items():
             comb_count = gene_set.count(comb)
             for category in categories:
                 counts[category] += comb_count
             counts[comb] = 0  # Remove the combination itself as it is distributed
    
         return counts
    
     # Count COG categories for each gene set
     proximity_counts = count_cog_categories(proximity_labelling)
     alfa_counts = count_cog_categories(alfa_pulldown)
    
     # Calculate relative occurrence (percentage) of each COG category for both gene sets
     total_proximity = len(proximity_labelling)
     total_alfa = len(alfa_pulldown)
    
     # Calculate percentages for each COG category in both gene sets
     proximity_percentage = {cog: count / total_proximity * 100 for cog, count in proximity_counts.items()}
     alfa_percentage = {cog: count / total_alfa * 100 for cog, count in alfa_counts.items()}
    
     # Create a DataFrame to store the percentages for both gene sets
     df_percentage = pd.DataFrame({
         'Proximity Labelling (%)': [proximity_percentage.get(cog, 0) for cog in all_cog_categories],
         'ALFA Pulldown (%)': [alfa_percentage.get(cog, 0) for cog in all_cog_categories]
     }, index=all_cog_categories)
    
     ## Reverse the order of the COG categories
     #df_percentage = df_percentage[::-1]
     # Round the values to 1 decimal place
     df_percentage = df_percentage.round(1)
    
     # Save the relative occurrence data in a text file
     file_path = "COG_relative_occurrence.txt"
     df_percentage.to_csv(file_path, sep='\t', header=True, index=True)
    
     # Save the relative occurrence data in an Excel file
     file_path = "COG_relative_occurrence.xlsx"
     df_percentage.to_excel(file_path, header=True, index=True)
    
     # Display file path where the data was saved
     print(f"Relative occurrence data has been saved to {file_path}")
    
     # --------------------
     # Horizontal Grouped Bar Plot
     # --------------------
    
     # Plot a horizontal grouped bar chart
     fig, ax = plt.subplots(figsize=(12, 8))
    
     # Create the positions for each group of bars
     bar_height = 0.35
     index = np.arange(len(df_percentage))
    
     # Plot bars for both gene sets
     bar1 = ax.barh(index, df_percentage['Proximity Labelling (%)'], bar_height, label='Proximity Labelling', color='skyblue')
     bar2 = ax.barh(index + bar_height, df_percentage['ALFA Pulldown (%)'], bar_height, label='ALFA Pulldown', color='lightgreen')
    
     # Customize the plot
     ax.set_ylabel('COG Categories')
     ax.set_xlabel('Relative Occurrence (%)')
     ax.set_title('Relative COG Category Distribution')
     ax.set_yticks(index + bar_height / 2)
     ax.set_yticklabels(all_cog_categories)
    
     # Annotate functional groups
     #, fontweight='bold'
     for group, categories in functional_groups.items():
         # Find indices of the categories in this group
         group_indices = [all_cog_categories.index(cog) for cog in categories]
         # Add a label for the functional group
         ax.annotate(group, xy=(1, group_indices[0] + 0.5), xycoords='data', fontsize=9, color='black')
    
     # Add legend
     ax.legend()
    
     # Show the plot
     plt.tight_layout()
     plt.show()

Leave a Reply

Your email address will not be published. Required fields are marked *