Calculate AT content of the flanking regions of peaks

gene_x 0 like s 686 view s

Tags: python, scripts, ChIP-seq

#!/usr/bin/env python3
#python3 plot_peaks.py peaks.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
#~/Tools/csv2xls-0.4/csv_to_xls.py _peaks_and_motifs_in_promoters_with_LT_peak.txt peak_tss_distances_.txt -d$'\t' -o peaks_and_motifs_in_promoters_with_LT_peak.xls;
#rename the name of sheet1 to "peaks_and_motifs_in_promoters_with_LT_peak" and "all_LT_peaks_NHDF" 
#mv peak_tss_distances_.xlsx all_LT_peaks_NHDF.xlsx

import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import pyfaidx
import numpy as np

def calculate_at_content(seq):
    # Function to calculate the AT-content of a given sequence
    at_count = seq.count('A') + seq.count('T')
    total_count = len(seq)
    at_content = round((at_count / total_count) * 100, 1)
    return at_content

def plot_peak_distribution(peaks_file, gencode_db, fasta_file, upstream_length=500, downstream_length=500):
    db = gffutils.FeatureDB(gencode_db, keep_order=True)
    peaks = pd.read_table(peaks_file, header=None)
    peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
    tss_positions = []
    tss_genes = []
    for gene in db.features_of_type('gene'):
        if gene.strand == '+':
            tss_positions.append(gene.start)
        else:
            tss_positions.append(gene.end)
        tss_genes.append(gene)
    closest_tss_indices = []
    peak_tss_distances = []
    peak_names = []
    closest_genes = []
    closest_gene_names = []
    closest_strands = []
    closest_gene_types = []  # New list for gene_type
    start_positions = []  # New list for Start Positions
    end_positions = []  # New list for End Positions
    start_distance_to_tss = []  # New list for Distance to TSS of Start Positions
    end_distance_to_tss = []  # New list for Distance to TSS of End Positions
    peak_starts = []
    peak_ends = []

    # Load the FASTA file using pyfaidx
    fasta = pyfaidx.Fasta(fasta_file)

    # New lists for AT-content
    start_flank_seqs = []
    end_flank_seqs = []
    start_at_content = []
    end_at_content = []

    for _, peak in peaks.iterrows():
        peak_center = (peak['start'] + peak['end']) // 2
        peak_start = peak['start']
        peak_end = peak['end']
        closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
        closest_tss_indices.append(closest_tss_index)
        distance_to_tss = peak_center - tss_positions[closest_tss_index]
        peak_tss_distances.append(distance_to_tss)
        peak_names.append(peak['name'])
        closest_genes.append(tss_genes[closest_tss_index].id)
        if 'gene_name' in tss_genes[closest_tss_index].attributes:
            closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
        else:
            closest_gene_name = 'N/A'  # Set a default value if 'gene_name' attribute is not present
        closest_gene_names.append(closest_gene_name)
        closest_strands.append(tss_genes[closest_tss_index].strand)

        if 'gene_type' in tss_genes[closest_tss_index].attributes:
            closest_gene_type = tss_genes[closest_tss_index].attributes['gene_type'][0]
        else:
            closest_gene_type = 'N/A'  # Set a default value if 'gene_type' attribute is not present
        closest_gene_types.append(closest_gene_type)

        #start_distance_to_tss.append(peak_start - tss_positions[closest_tss_index])  # Add Distance to TSS of Start Position to the list
        #end_distance_to_tss.append(peak_end - tss_positions[closest_tss_index])  # Add Distance to TSS of End Position to the list

        peak_starts.append(peak_start)
        peak_ends.append(peak_end)

        # Get the flanking region coordinates
        start_flank_start = peak_start - upstream_length
        start_flank_end = peak_start - 1
        end_flank_start = peak_end + 1
        end_flank_end = peak_end + downstream_length

        # Retrieve the flanking region sequences from the FASTA file
        start_flank_seq = fasta[peak['chrom']][start_flank_start:start_flank_end].seq.upper()
        end_flank_seq = fasta[peak['chrom']][end_flank_start:end_flank_end].seq.upper()
        start_flank_seqs.append(start_flank_seq)
        end_flank_seqs.append(end_flank_seq)

        # # Calculate the AT-content of the flanking regions based on the gene strand
        # if tss_genes[closest_tss_index].strand== '+':
        #     start_at = calculate_at_content(start_flank_seq)
        #     end_at = calculate_at_content(end_flank_seq)
        # else:
        #     start_at = calculate_at_content(end_flank_seq)
        #     end_at = calculate_at_content(start_flank_seq)
        start_at = calculate_at_content(start_flank_seq)
        end_at = calculate_at_content(end_flank_seq)
        start_at_content.append(start_at)
        end_at_content.append(end_at)

    df = pd.DataFrame({
        'Peak Name': peak_names,
        'Peak Start': peak_starts,
        'Peak End': peak_ends,
        #'Closest TSS Index': closest_tss_indices,
        'Distance to TSS': peak_tss_distances,
        'Closest Gene (Ensemble ID)': closest_genes,
        'Closest Gene (Gene name)': closest_gene_names,
        'Gene Strand': closest_strands,
        'Gene Type': closest_gene_types,  # Add new column for gene_type
        #'start_distance_to_tss': start_distance_to_tss,
        #'end_distance_to_tss': end_distance_to_tss,
        'Start-Flanking AT-content (500 nt)': start_at_content,
        'End-Flanking AT-content (500 nt)': end_at_content,
        'Start-Flanking Sequence (500 nt)': start_flank_seqs,
        'End-Flanking Sequence (500 nt)': end_flank_seqs
    })

    df.to_excel('peak_tss_distances_.xlsx', index=False)
    df.to_csv('peak_tss_distances_.txt', sep='\t', index=False)  # Save as tab-separated text file
    counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
    hist_df.to_csv('peak_distribution_.csv', index=False)
    total_peaks = hist_df['Count'].sum()
    with open('peak_distribution_.csv', 'a') as f:
        f.write(f'\nTotal number of peaks: {total_peaks}')
    plt.hist(peak_tss_distances, bins=1000)
    plt.xlabel('Distance to TSS')
    plt.ylabel('Number of peaks')
    plt.title('Distribution of peaks around TSS')
    plt.savefig('peak_distribution_.png')
    plt.show()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
    parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
    parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
    parser.add_argument('fasta_file', type=str, help='Path to the FASTA file')
    args = parser.parse_args()
    plot_peak_distribution(args.peaks_file, args.gencode_db, args.fasta_file)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum