gene_x 0 like s 751 view s
Tags: python, scripts, ChIP-seq
#!/usr/bin/env python3
#python3 plot_peaks.py peaks.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
#~/Tools/csv2xls-0.4/csv_to_xls.py _peaks_and_motifs_in_promoters_with_LT_peak.txt peak_tss_distances_.txt -d$'\t' -o peaks_and_motifs_in_promoters_with_LT_peak.xls;
#rename the name of sheet1 to "peaks_and_motifs_in_promoters_with_LT_peak" and "all_LT_peaks_NHDF"
#mv peak_tss_distances_.xlsx all_LT_peaks_NHDF.xlsx
import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import pyfaidx
import numpy as np
def calculate_at_content(seq):
# Function to calculate the AT-content of a given sequence
at_count = seq.count('A') + seq.count('T')
total_count = len(seq)
at_content = round((at_count / total_count) * 100, 1)
return at_content
def plot_peak_distribution(peaks_file, gencode_db, fasta_file, upstream_length=500, downstream_length=500):
db = gffutils.FeatureDB(gencode_db, keep_order=True)
peaks = pd.read_table(peaks_file, header=None)
peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
tss_positions = []
tss_genes = []
for gene in db.features_of_type('gene'):
if gene.strand == '+':
tss_positions.append(gene.start)
else:
tss_positions.append(gene.end)
tss_genes.append(gene)
closest_tss_indices = []
peak_tss_distances = []
peak_names = []
closest_genes = []
closest_gene_names = []
closest_strands = []
closest_gene_types = [] # New list for gene_type
start_positions = [] # New list for Start Positions
end_positions = [] # New list for End Positions
start_distance_to_tss = [] # New list for Distance to TSS of Start Positions
end_distance_to_tss = [] # New list for Distance to TSS of End Positions
peak_starts = []
peak_ends = []
# Load the FASTA file using pyfaidx
fasta = pyfaidx.Fasta(fasta_file)
# New lists for AT-content
start_flank_seqs = []
end_flank_seqs = []
start_at_content = []
end_at_content = []
for _, peak in peaks.iterrows():
peak_center = (peak['start'] + peak['end']) // 2
peak_start = peak['start']
peak_end = peak['end']
closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
closest_tss_indices.append(closest_tss_index)
distance_to_tss = peak_center - tss_positions[closest_tss_index]
peak_tss_distances.append(distance_to_tss)
peak_names.append(peak['name'])
closest_genes.append(tss_genes[closest_tss_index].id)
if 'gene_name' in tss_genes[closest_tss_index].attributes:
closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
else:
closest_gene_name = 'N/A' # Set a default value if 'gene_name' attribute is not present
closest_gene_names.append(closest_gene_name)
closest_strands.append(tss_genes[closest_tss_index].strand)
if 'gene_type' in tss_genes[closest_tss_index].attributes:
closest_gene_type = tss_genes[closest_tss_index].attributes['gene_type'][0]
else:
closest_gene_type = 'N/A' # Set a default value if 'gene_type' attribute is not present
closest_gene_types.append(closest_gene_type)
#start_distance_to_tss.append(peak_start - tss_positions[closest_tss_index]) # Add Distance to TSS of Start Position to the list
#end_distance_to_tss.append(peak_end - tss_positions[closest_tss_index]) # Add Distance to TSS of End Position to the list
peak_starts.append(peak_start)
peak_ends.append(peak_end)
# Get the flanking region coordinates
start_flank_start = peak_start - upstream_length
start_flank_end = peak_start - 1
end_flank_start = peak_end + 1
end_flank_end = peak_end + downstream_length
# Retrieve the flanking region sequences from the FASTA file
start_flank_seq = fasta[peak['chrom']][start_flank_start:start_flank_end].seq.upper()
end_flank_seq = fasta[peak['chrom']][end_flank_start:end_flank_end].seq.upper()
start_flank_seqs.append(start_flank_seq)
end_flank_seqs.append(end_flank_seq)
# # Calculate the AT-content of the flanking regions based on the gene strand
# if tss_genes[closest_tss_index].strand== '+':
# start_at = calculate_at_content(start_flank_seq)
# end_at = calculate_at_content(end_flank_seq)
# else:
# start_at = calculate_at_content(end_flank_seq)
# end_at = calculate_at_content(start_flank_seq)
start_at = calculate_at_content(start_flank_seq)
end_at = calculate_at_content(end_flank_seq)
start_at_content.append(start_at)
end_at_content.append(end_at)
df = pd.DataFrame({
'Peak Name': peak_names,
'Peak Start': peak_starts,
'Peak End': peak_ends,
#'Closest TSS Index': closest_tss_indices,
'Distance to TSS': peak_tss_distances,
'Closest Gene (Ensemble ID)': closest_genes,
'Closest Gene (Gene name)': closest_gene_names,
'Gene Strand': closest_strands,
'Gene Type': closest_gene_types, # Add new column for gene_type
#'start_distance_to_tss': start_distance_to_tss,
#'end_distance_to_tss': end_distance_to_tss,
'Start-Flanking AT-content (500 nt)': start_at_content,
'End-Flanking AT-content (500 nt)': end_at_content,
'Start-Flanking Sequence (500 nt)': start_flank_seqs,
'End-Flanking Sequence (500 nt)': end_flank_seqs
})
df.to_excel('peak_tss_distances_.xlsx', index=False)
df.to_csv('peak_tss_distances_.txt', sep='\t', index=False) # Save as tab-separated text file
counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
hist_df.to_csv('peak_distribution_.csv', index=False)
total_peaks = hist_df['Count'].sum()
with open('peak_distribution_.csv', 'a') as f:
f.write(f'\nTotal number of peaks: {total_peaks}')
plt.hist(peak_tss_distances, bins=1000)
plt.xlabel('Distance to TSS')
plt.ylabel('Number of peaks')
plt.title('Distribution of peaks around TSS')
plt.savefig('peak_distribution_.png')
plt.show()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
parser.add_argument('fasta_file', type=str, help='Path to the FASTA file')
args = parser.parse_args()
plot_peak_distribution(args.peaks_file, args.gencode_db, args.fasta_file)
点赞本文的读者
还没有人对此文章表态
没有评论
Density of motif plots and its statistical tests
© 2023 XGenes.com Impressum