Functional Clustering of Genes Based on COG Terms Using Eggnog and Blast2GO

gene_x 0 like s 67 view s

Tags: pipeline

COG_relative_occurrence

  1. Generate protein fasta

    python3 getProteinSequences_1.py > Proximity_labelling.fasta
    python3 getProteinSequences_2.py > ALFA_pulldown.fasta
    python3 getProteinSequences_3.py > schnittmenge.fasta
    
  2. Using eggnog_env to generate the intial annotations

    mamba activate eggnog_env
    emapper.py -i Proximity_labelling.fasta -o eggnog_out_Proximity_labelling --cpu 60  #--resume
    emapper.py -i ALFA_pulldown.fasta -o eggnog_out_ALFA_pulldown --cpu 60
    emapper.py -i schnittmenge.fasta -o eggnog_out_schnittmenge --cpu 60
    
  3. Using BLAST2GO to generate GO terms and EC terms

  4. Using R to merge the two files blast2go_annot.annot2_ and eggnog_out.emapper.annotations.txt

    #mamba activate r_env
    # PREPARING go_terms and ec_terms: annot_* file: cut -f1-2 -d$'\t' blast2go_annot.annot2 > blast2go_annot.annot2_
    # PREPARING cp eggnog_out.emapper.annotations eggnog_out.emapper.annotations.txt. Note that the first last 3 lines starting with "#" should be removed; '#query' to query
    
    # Load required libraries
    library(openxlsx)  # For Excel file handling
    library(dplyr)     # For data manipulation
    library(clusterProfiler)  # For KEGG and GO enrichment analysis
    #library(org.Hs.eg.db)  # Replace with appropriate organism database
    #BiocManager::install("GO.db")
    #BiocManager::install("AnnotationDbi")
    library(GO.db)
    library(AnnotationDbi)
    
    # Step 1: Load the blast2go annotation file with a check for missing columns
    annot_df <- read.table("/home/jhuang/b2gWorkspace_schnittmenge/blast2go_annot.annot2_",
                        header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
    # If the structure is inconsistent, we can make sure there are exactly 3 columns:
    colnames(annot_df) <- c("GeneID", "Term")
    # Step 2: Filter and aggregate GO and EC terms as before
    go_terms <- annot_df %>%
    filter(grepl("^GO:", Term)) %>%
    group_by(GeneID) %>%
    summarize(GOs = paste(Term, collapse = ","), .groups = "drop")
    ec_terms <- annot_df %>%
    filter(grepl("^EC:", Term)) %>%
    group_by(GeneID) %>%
    summarize(EC = paste(Term, collapse = ","), .groups = "drop")
    
    eggnog_data <- read.delim("~/DATA/Data_Michelle_ProteinClustering/eggnog_out_schnittmenge.emapper.annotations.txt", header = TRUE, sep = "\t")
    
    # Perform the left joins and rename columns
    eggnog_data_updated <- eggnog_data %>%
    left_join(go_terms, by = "GeneID") %>%
    left_join(ec_terms, by = "GeneID")
    #%>% select(-EC.x, -GOs.x)
    #%>% rename(EC = EC.y, GOs = GOs.y)
    
    # Create a new workbook
    wb <- createWorkbook()
    # Add the complete dataset as the first sheet (with annotations)
    addWorksheet(wb, "Complete_Data")
    writeData(wb, "Complete_Data", eggnog_data_updated)
    # Save the workbook to a file
    saveWorkbook(wb, "Gene_with_Annotations_schnittmenge.xlsx", overwrite = TRUE)
    
    #Manually DELETE GOs.x and EC.x and rename GOs.y to GOs, EC.y to EC.
    
  5. Draw group bar plot for relative occurrence

    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    
    # COG annotations for the two gene sets
    proximity_labelling = [
        'D', 'S', 'K', 'S', 'T', 'S', 'S', 'F', 'E', 'J', 'G', 'C', 'C', 'S', 'I', 'L', 'I', 'S', 'S', 'G', 'O', 'H', 'IQ', 'L', 'C', 'H', 'C', 'NU', 'O', 'C',
        'M', 'S', 'IQ', 'G', 'S', 'M', 'M', '-', 'M', 'S', 'J', 'S', 'IQ', 'K', 'E', 'J', 'K', 'E', 'J', 'S', 'K', 'E', 'S', 'J', 'S', 'V', 'C', 'I', 'K', 'IQ',
        'M', 'S', 'T', 'P', 'C', 'J', 'S', 'E', 'C', 'K', 'H', 'S', 'H', 'H'
    ]
    
    alfa_pulldown = [
        'K', 'S', 'P', 'K', 'C', 'K', 'I', 'O', '-', 'F', 'C', 'P', 'HQ', '-', 'L', 'L', 'S', 'G', 'IQ', 'S'
    ]
    
    # Define all possible COG categories (including the combinations)
    all_cog_categories_ = ['J','A', 'K', 'L', 'B', 'D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O', 'C','G', 'E', 'F', 'H', 'I', 'P', 'Q', 'R', 'S']
    all_cog_categories = all_cog_categories_[::-1]
    
    # Define functional groups for COG categories
    functional_groups = {
        "INFORMATION STORAGE AND PROCESSING": ['J', 'A', 'K', 'L', 'B'],
        "CELLULAR PROCESSES AND SIGNALING": ['D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O'],
        "METABOLISM": ['C', 'G', 'E', 'F', 'H', 'I', 'P', 'Q'],
        "POORLY CHARACTERIZED": ['R', 'S']
    }
    
    # Function to count occurrences of each COG category in a gene set, handling combinations
    def count_cog_categories(gene_set):
        counts = {cog: gene_set.count(cog) for cog in all_cog_categories}
    
        # Treat combinations (e.g., IQ, HQ, NU) as belonging to both respective categories
        combination_categories = {
            'IQ': ['I', 'Q'],
            'HQ': ['H', 'Q'],
            'NU': ['N', 'U']
        }
    
        for comb, categories in combination_categories.items():
            comb_count = gene_set.count(comb)
            for category in categories:
                counts[category] += comb_count
            counts[comb] = 0  # Remove the combination itself as it is distributed
    
        return counts
    
    # Count COG categories for each gene set
    proximity_counts = count_cog_categories(proximity_labelling)
    alfa_counts = count_cog_categories(alfa_pulldown)
    
    # Calculate relative occurrence (percentage) of each COG category for both gene sets
    total_proximity = len(proximity_labelling)
    total_alfa = len(alfa_pulldown)
    
    # Calculate percentages for each COG category in both gene sets
    proximity_percentage = {cog: count / total_proximity * 100 for cog, count in proximity_counts.items()}
    alfa_percentage = {cog: count / total_alfa * 100 for cog, count in alfa_counts.items()}
    
    # Create a DataFrame to store the percentages for both gene sets
    df_percentage = pd.DataFrame({
        'Proximity Labelling (%)': [proximity_percentage.get(cog, 0) for cog in all_cog_categories],
        'ALFA Pulldown (%)': [alfa_percentage.get(cog, 0) for cog in all_cog_categories]
    }, index=all_cog_categories)
    
    ## Reverse the order of the COG categories
    #df_percentage = df_percentage[::-1]
    # Round the values to 1 decimal place
    df_percentage = df_percentage.round(1)
    
    # Save the relative occurrence data in a text file
    file_path = "COG_relative_occurrence.txt"
    df_percentage.to_csv(file_path, sep='\t', header=True, index=True)
    
    # Save the relative occurrence data in an Excel file
    file_path = "COG_relative_occurrence.xlsx"
    df_percentage.to_excel(file_path, header=True, index=True)
    
    # Display file path where the data was saved
    print(f"Relative occurrence data has been saved to {file_path}")
    
    # --------------------
    # Horizontal Grouped Bar Plot
    # --------------------
    
    # Plot a horizontal grouped bar chart
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Create the positions for each group of bars
    bar_height = 0.35
    index = np.arange(len(df_percentage))
    
    # Plot bars for both gene sets
    bar1 = ax.barh(index, df_percentage['Proximity Labelling (%)'], bar_height, label='Proximity Labelling', color='skyblue')
    bar2 = ax.barh(index + bar_height, df_percentage['ALFA Pulldown (%)'], bar_height, label='ALFA Pulldown', color='lightgreen')
    
    # Customize the plot
    ax.set_ylabel('COG Categories')
    ax.set_xlabel('Relative Occurrence (%)')
    ax.set_title('Relative COG Category Distribution')
    ax.set_yticks(index + bar_height / 2)
    ax.set_yticklabels(all_cog_categories)
    
    # Annotate functional groups
    #, fontweight='bold'
    for group, categories in functional_groups.items():
        # Find indices of the categories in this group
        group_indices = [all_cog_categories.index(cog) for cog in categories]
        # Add a label for the functional group
        ax.annotate(group, xy=(1, group_indices[0] + 0.5), xycoords='data', fontsize=9, color='black')
    
    # Add legend
    ax.legend()
    
    # Show the plot
    plt.tight_layout()
    plt.show()
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum