Analysis of Peak Distribution in Promoters

gene_x 0 like s 747 view s

Tags: python, Biopython, genomics

import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import numpy as np

#db = gffutils.create_db('gencode.v43.annotation.gtf', dbfn='gencode.v43.annotation.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
#python3 plot_peaks4.py peaks.bed gencode.v43.annotation.db

def plot_peak_distribution(peaks_file, gencode_db):
    db = gffutils.FeatureDB(gencode_db, keep_order=True)

    peaks = pd.read_table(peaks_file, header=None)
    peaks.columns = ['chrom', 'start', 'end', 'name', 'score']

    tss_positions = []
    tss_genes = []
    for gene in db.features_of_type('gene'):
        if gene.strand == '+':
            tss_positions.append(gene.start)
        else:
            tss_positions.append(gene.end)
        tss_genes.append(gene)

    peak_tss_distances = []
    peak_names = []
    closest_genes = []
    closest_gene_names = []
    closest_strands = []
    for _, peak in peaks.iterrows():
        #the double forward slash // is used to perform integer division instead of floating-point division
        peak_center = (peak['start'] + peak['end']) // 2
        closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
        distance_to_tss = peak_center - tss_positions[closest_tss_index]
        peak_tss_distances.append(distance_to_tss)
        peak_names.append(peak['name'])
        closest_genes.append(tss_genes[closest_tss_index].id)
        #closest_genes.append(tss_genes[closest_tss_index].attributes['gene_name'][0])
        if 'gene_name' in tss_genes[closest_tss_index].attributes:
            closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
            #print(closest_genesymbol)
        else:
            closest_gene_name = 'N/A'  # Set a default value if 'gene_name' attribute is not present
            #print(closest_genesymbol)
        closest_gene_names.append(closest_gene_name)
        #attributes_obj = tss_genes[closest_tss_index].attributes
        #attribute_names = attributes_obj.keys()
        #print(attribute_names)
        #dict_keys(['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene'])
        #print(tss_genes[closest_tss_index].attributes['havana_gene'])
        #['ENSG00000235519.3'], ['lncRNA'], ['TLCD5'], ['2'], NONE, NONE, ['OTTHUMG00000195005.1']
        closest_strands.append(tss_genes[closest_tss_index].strand)

    # Save distances and gene info to an Excel file
    df = pd.DataFrame({
        'Peak Name': peak_names,
        'Distance to TSS': peak_tss_distances,
        'Closest Gene (Ensemble ID)': closest_genes,
        'Closest Gene (Gene name)': closest_gene_names,
        'Gene Strand': closest_strands
    })
    df.to_excel('peak_tss_distances.xlsx', index=False)

    # Calculate histogram and save bins and counts to a file
    counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
    hist_df.to_csv('peak_distribution.csv', index=False)
    total_peaks = hist_df['Count'].sum()
    with open('peak_distribution.csv', 'a') as f:
        f.write(f'\nTotal number of peaks: {total_peaks}')

    plt.hist(peak_tss_distances, bins=1000)
    plt.xlabel('Distance to TSS')
    plt.ylabel('Number of peaks')
    plt.title('Distribution of peaks around TSS')
    plt.savefig('peak_distribution.png')
    plt.show()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
    parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
    parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')

    args = parser.parse_args()
    plot_peak_distribution(args.peaks_file, args.gencode_db)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默

最受欢迎文章

  1. Why Do Significant Gene Lists Change After Adding Additional Conditions in Differential Gene Expression Analysis?
  2. Updating Human Gene Identifiers using Ensembl BioMart: A Step-by-Step Guide
  3. Setup conda environments
  4. Motif Discovery in Biological Sequences: A Comparison of MEME and HOMER
  5. File format for single channel analysis of Agilent microarray data with Limma?
  6. pheatmap vs heatmap.2
  7. 洛那法尼(lonafarnib):从抗癌症到抗病毒的多功能药物
  8. RNAseq running with umi_tools
  9. Calling peaks using findPeaks of HOMER
  10. Guide to Submitting Data to GEO (Gene Expression Omnibus)

最新文章


最多评论文章


推荐相似文章

Defining and Categorizing Promoter Types Based on the 'GRGGC' Motif Frequency, Distribution, and Distance to the Transcription Start Site (TSS)

Evaluating the Proximity of Genomic Features (=integration sites) to Peaks Using a Permutation Test genomic features

Density of motif plots and its statistical tests

Identifying the Nearest Genomic Peaks within Defined Regions


© 2023 XGenes.com Impressum