gene_x 0 like s 747 view s
Tags: python, Biopython, genomics
import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import numpy as np
#db = gffutils.create_db('gencode.v43.annotation.gtf', dbfn='gencode.v43.annotation.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
#python3 plot_peaks4.py peaks.bed gencode.v43.annotation.db
def plot_peak_distribution(peaks_file, gencode_db):
db = gffutils.FeatureDB(gencode_db, keep_order=True)
peaks = pd.read_table(peaks_file, header=None)
peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
tss_positions = []
tss_genes = []
for gene in db.features_of_type('gene'):
if gene.strand == '+':
tss_positions.append(gene.start)
else:
tss_positions.append(gene.end)
tss_genes.append(gene)
peak_tss_distances = []
peak_names = []
closest_genes = []
closest_gene_names = []
closest_strands = []
for _, peak in peaks.iterrows():
#the double forward slash // is used to perform integer division instead of floating-point division
peak_center = (peak['start'] + peak['end']) // 2
closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
distance_to_tss = peak_center - tss_positions[closest_tss_index]
peak_tss_distances.append(distance_to_tss)
peak_names.append(peak['name'])
closest_genes.append(tss_genes[closest_tss_index].id)
#closest_genes.append(tss_genes[closest_tss_index].attributes['gene_name'][0])
if 'gene_name' in tss_genes[closest_tss_index].attributes:
closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
#print(closest_genesymbol)
else:
closest_gene_name = 'N/A' # Set a default value if 'gene_name' attribute is not present
#print(closest_genesymbol)
closest_gene_names.append(closest_gene_name)
#attributes_obj = tss_genes[closest_tss_index].attributes
#attribute_names = attributes_obj.keys()
#print(attribute_names)
#dict_keys(['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene'])
#print(tss_genes[closest_tss_index].attributes['havana_gene'])
#['ENSG00000235519.3'], ['lncRNA'], ['TLCD5'], ['2'], NONE, NONE, ['OTTHUMG00000195005.1']
closest_strands.append(tss_genes[closest_tss_index].strand)
# Save distances and gene info to an Excel file
df = pd.DataFrame({
'Peak Name': peak_names,
'Distance to TSS': peak_tss_distances,
'Closest Gene (Ensemble ID)': closest_genes,
'Closest Gene (Gene name)': closest_gene_names,
'Gene Strand': closest_strands
})
df.to_excel('peak_tss_distances.xlsx', index=False)
# Calculate histogram and save bins and counts to a file
counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
hist_df.to_csv('peak_distribution.csv', index=False)
total_peaks = hist_df['Count'].sum()
with open('peak_distribution.csv', 'a') as f:
f.write(f'\nTotal number of peaks: {total_peaks}')
plt.hist(peak_tss_distances, bins=1000)
plt.xlabel('Distance to TSS')
plt.ylabel('Number of peaks')
plt.title('Distribution of peaks around TSS')
plt.savefig('peak_distribution.png')
plt.show()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
args = parser.parse_args()
plot_peak_distribution(args.peaks_file, args.gencode_db)
点赞本文的读者
还没有人对此文章表态
没有评论
Density of motif plots and its statistical tests
Identifying the Nearest Genomic Peaks within Defined Regions
© 2023 XGenes.com Impressum