Peak and Motif analyses in Promoters

./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db

1_generate_promoter_sequences.py
./2_find_motifs_in_promoters.py

2_find_motifs_in_promoters.py

generate peaks_and_motifs_in_promoters_with_LT_peak.xls

peaks_and_motifs_in_promoters_with_LT_peak.xls

#--3.1. generate peak_tss_distances.csv and peak_tss_distances.png
python3 plot_peaks.py peaks_NHDF_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
python3 plot_peaks.py peaks_HEK293_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
python3 plot_peaks.py peaks_PFSK-1_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
python3 plot_peaks.py peaks_K331A_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa

#--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_peaks$Closest.Gene..Ensemble.ID., another is the remaining
#sed -i 's/[][]//g' motif_counts_positions.txt
data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)

# Read the data
peaks <- read.table("peak_tss_distances.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
# Filter the rows
filtered_peaks <- subset(peaks, abs(`Distance.to.TSS`) <= 3000)
# Print the filtered peaks
write.table(filtered_peaks, file="filtered_peak_tss_distances.txt", sep="\t", row.names = FALSE)

# Create a new data frame for rows in 'data' where the PromoterID is in filtered_peaks's Closest Gene (Ensemble ID)
data_in_filtered <- data[data$PromoterID %in% filtered_peaks$Closest.Gene..Ensemble.ID., ]
data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString)
data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString)
write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE)
# Create another data frame for the remaining rows
data_not_in_filtered <- data[!data$PromoterID %in% filtered_peaks$Closest.Gene..Ensemble.ID., ]
data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString)
data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString)
write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE)
#780 vs 19262
#1707 vs ca. 60000
#sed -i 's/Symbol: //g' motifs_in_promoters_with_LT_peak.txt
#sed -i 's/Type: //g' motifs_in_promoters_with_LT_peak.txt
#sed -i 's/Symbol: //g' motifs_in_promoters_without_LT_peak.txt
#sed -i 's/Type: //g' motifs_in_promoters_without_LT_peak.txt
#sed -i 's/"//g' motifs_in_promoters_with_LT_peak.txt
#sed -i 's/"//g' motifs_in_promoters_without_LT_peak.txt

#--3.3. prepare data for motifs_in_promoters_with_LT_peak.txt
data1 <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)

data1$Motif_GRGGC_Count <- as.numeric(data1$Motif_GRGGC_Count)
data1$Motif_GCCYR_Count <- as.numeric(data1$Motif_GCCYR_Count)
data1$Motif_GRGGC_Center_Positions <- strsplit(as.character(data1$Motif_GRGGC_Center_Positions), ",")
data1$Motif_GCCYR_Center_Positions <- strsplit(as.character(data1$Motif_GCCYR_Center_Positions), ",")
# Create data1 frames for each motif
data1_motif1 <- data.frame(PromoterID = rep(data1$PromoterID, data1$Motif_GRGGC_Count), 
                          Motif = "Motif GRGGC", 
                          Position = unlist(data1$Motif_GRGGC_Center_Positions))
data1_motif2 <- data.frame(PromoterID = rep(data1$PromoterID, data1$Motif_GCCYR_Count), 
                          Motif = "Motif GCCYR", 
                          Position = unlist(data1$Motif_GCCYR_Center_Positions))
# Combine motif data1 frames
data1_positions <- rbind(data1_motif1, data1_motif2)
# Get positions as numeric values, excluding NAs
motif1_positions_with_LT_peak <- as.numeric(data1_positions$Position[data1_positions$Motif == "Motif GRGGC"])
motif2_positions_with_LT_peak <- as.numeric(data1_positions$Position[data1_positions$Motif == "Motif GCCYR"])

#--3.4. prepare data for motifs_in_promoters_without_LT_peak.txt
data2 <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data2$Motif_GRGGC_Count <- as.numeric(data2$Motif_GRGGC_Count)
data2$Motif_GCCYR_Count <- as.numeric(data2$Motif_GCCYR_Count)
data2$Motif_GRGGC_Center_Positions <- strsplit(as.character(data2$Motif_GRGGC_Center_Positions), ",")
data2$Motif_GCCYR_Center_Positions <- strsplit(as.character(data2$Motif_GCCYR_Center_Positions), ",")
# Create data2 frames for each motif
data2_motif1 <- data.frame(PromoterID = rep(data2$PromoterID, data2$Motif_GRGGC_Count), 
                          Motif = "Motif GRGGC", 
                          Position = unlist(data2$Motif_GRGGC_Center_Positions))
data2_motif2 <- data.frame(PromoterID = rep(data2$PromoterID, data2$Motif_GCCYR_Count), 
                          Motif = "Motif GCCYR", 
                          Position = unlist(data2$Motif_GCCYR_Center_Positions))
# Combine motif data2 frames
data2_positions <- rbind(data2_motif1, data2_motif2)

# Get positions as numeric values, excluding NAs
motif1_positions_without_LT_peak <- as.numeric(data2_positions$Position[data2_positions$Motif == "Motif GRGGC"])
motif2_positions_without_LT_peak <- as.numeric(data2_positions$Position[data2_positions$Motif == "Motif GCCYR"])

#--3.5. Plot Histogram of counts and density of positions
# Define a common x-axis limit for all plots
x_limit <- max(max(data1$Motif_GRGGC_Count), max(data1$Motif_GCCYR_Count), max(data2$Motif_GRGGC_Count), max(data2$Motif_GCCYR_Count))
# Adjust the number of breaks (nbins) to control the width of histograms
nbins <- 30

y_limit1 <- 300
y_limit2 <- 12000
# Plot Histogram of counts with the same scale for x-axis
png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
hist(data1$Motif_GRGGC_Count, main = "Histogram of Motif GRGGC in promoters with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = 20)
dev.off()
png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
hist(data2$Motif_GRGGC_Count, main = "Histogram of Motif GRGGC in promoters without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
dev.off()
png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
hist(data1$Motif_GCCYR_Count, main = "Histogram of Motif GCCYR in promoters with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = nbins)
dev.off()
png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
hist(data2$Motif_GCCYR_Count, main = "Histogram of Motif GCCYR in promoters without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
dev.off()

# Set up the layout for the plots
png("Histogram_of_Motifs.png", width = 1000, height = 1000)
par(mfrow = c(2, 2))
hist(data1$Motif_GRGGC_Count, main = "Motif GRGGC Counts with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = 20)
hist(data2$Motif_GRGGC_Count, main = "Motif GRGGC Counts without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
hist(data1$Motif_GCCYR_Count, main = "Motif GCCYR Counts with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = nbins)
hist(data2$Motif_GCCYR_Count, main = "Motif GCCYR Counts without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
dev.off()

# Set the ylim value
ylim <- c(0.0001, 0.0003)
png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
plot(density(motif1_positions_with_LT_peak, na.rm = TRUE), main="Density of GRGGC in promoters with LT peak", xlab="Position", ylim = ylim)
dev.off()
png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
plot(density(motif1_positions_without_LT_peak, na.rm = TRUE), main="Density of GRGGC in promoters without LT peak", xlab="Position", ylim = ylim)
dev.off()
png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
plot(density(motif2_positions_with_LT_peak, na.rm = TRUE), main="Density of GCCYR in promoters with LT peak", xlab="Position", ylim = ylim)
dev.off()
png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
plot(density(motif2_positions_without_LT_peak, na.rm = TRUE), main="Density of GCCYR in promoters without LT peak", xlab="Position", ylim = ylim)
dev.off()

png("Density_of_Motifs.png", width = 1000, height = 1000)
par(mfrow = c(2, 2))
plot(density(motif1_positions_with_LT_peak, na.rm = TRUE), main = "Density of GRGGC in promoters with LT peak", xlab = "Position", ylim = ylim)
plot(density(motif1_positions_without_LT_peak, na.rm = TRUE), main = "Density of GRGGC in promoters without LT peak", xlab = "Position", ylim = ylim)
plot(density(motif2_positions_with_LT_peak, na.rm = TRUE), main = "Density of GCCYR in promoters with LT peak", xlab = "Position", ylim = ylim)
plot(density(motif2_positions_without_LT_peak, na.rm = TRUE), main = "Density of GCCYR in promoters without LT peak", xlab = "Position", ylim = ylim)
dev.off()

#--3.6. assign each promoter to a prefined class according to Motifs_Count and Avg_Distance_To_TSS_of_Motifs
library(dplyr)
data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
# Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count
data <- data %>%
  mutate(Motifs_Count = Motif_GRGGC_Count + Motif_GCCYR_Count)
# Calculate absolute average Motif_GRGGC distance to TSS
data$Avg_Distance_To_TSS_of_Motif_GRGGC <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), 
                                         function(x) round(mean(as.numeric(x), na.rm = TRUE)))
# Calculate absolute average Motif_GCCYR distance to TSS
data$Avg_Distance_To_TSS_of_Motif_GCCYR <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "), 
                                         function(x) round(mean(as.numeric(x), na.rm = TRUE)))
# Revised function to calculate the mean of two strings of comma-separated numbers
mean_of_strings <- function(str1, str2) {
  # Convert strings of numbers to numeric vectors
  vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
  vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
  # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
  if (is.null(vec1)) {
    vec1 <- numeric(0)
  }
  if (is.null(vec2)) {
    vec2 <- numeric(0)
  }
  # Compute the mean of the two vectors
  return(round(mean(c(vec1, vec2), na.rm = TRUE)))
}
# Apply the function to each row of the dataframe
data$Avg_Distance_To_TSS_of_Motifs <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions)
# Assign classes based on the given rules
data <- data %>%
  mutate(
    Class_based_Motif_GRGGC = case_when(
      Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 1",
      Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 2",
      Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 3",
      Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 4",
      Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 5",
      Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 6",
      Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 7",
      Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 8",
      Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 9"
    )
  )
data <- data %>%
  mutate(
    Class_based_Motif_GCCYR = case_when(
      Motif_GCCYR_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 500 ~ "Class 1",
      Motif_GCCYR_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 1000 ~ "Class 2",
      Motif_GCCYR_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 1000 ~ "Class 3",
      Motif_GCCYR_Count >= 50 & Motif_GCCYR_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 500 ~ "Class 4",
      Motif_GCCYR_Count >= 50 & Motif_GCCYR_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 1000 ~ "Class 5",
      Motif_GCCYR_Count >= 50 & Motif_GCCYR_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 1000 ~ "Class 6",
      Motif_GCCYR_Count >= 100 & Motif_GCCYR_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 500 ~ "Class 7",
      Motif_GCCYR_Count >= 100 & Motif_GCCYR_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 1000 ~ "Class 8",
      Motif_GCCYR_Count >= 100 & Motif_GCCYR_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 1000 ~ "Class 9"
    )
  )
data <- data %>%
  mutate(
    Class_based_Motifs = case_when(
      Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 1",
      Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 2",
      Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 3",
      Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 4",
      Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 5",
      Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 6",
      Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 7",
      Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 8",
      Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 9"
    )
  )
# Define the desired column order
desired_columns <- c("PromoterID","Gene_Symbol","Gene_Type","Motif_GRGGC_Count","Motif_GRGGC_Center_Positions","Avg_Distance_To_TSS_of_Motif_GRGGC","Class_based_Motif_GRGGC",  "Motif_GCCYR_Count","Motif_GCCYR_Center_Positions","Avg_Distance_To_TSS_of_Motif_GCCYR","Class_based_Motif_GCCYR",  "Motifs_Count","Avg_Distance_To_TSS_of_Motifs","Class_based_Motifs")
# Reorder the columns
data <- select(data, all_of(desired_columns))
#sed -i 's/Motifs/Motifs_GRGGC+GCCYR/g' motifs_in_promoters_with_LT_peak_Category.txt
write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE)

#--3.7. add peak information into motif table
./add_peaks_to_motifs.py
#~/Tools/csv2xls-0.4/csv_to_xls.py _peaks_and_motifs_in_promoters_with_LT_peak.txt peak_tss_distances.txt -d$'\t' -o peaks_and_motifs_in_promoters_with_LT_peak.xls;
#rename the name of sheet1 to "peaks_and_motifs_in_promoters_with_LT_peak" and "all_LT_peaks_NHDF"

plot_peaks.py (updated code)

plot_peaks.py (updated code)

#python3 plot_peaks.py peaks_NHDF_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db 
#mv peak_tss_distances.xlsx peak_tss_distances_NHDF.xlsx
#mv peak_tss_distances.txt peak_tss_distances_NHDF.txt
#mv peak_distribution.csv peak_distribution_NHDF.csv
#mv peak_distribution.png peak_distribution_NHDF.png
#python3 plot_peaks.py peaks_HEK293_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db 
#mv peak_tss_distances.xlsx peak_tss_distances_HEK293.xlsx
#mv peak_tss_distances.txt peak_tss_distances_HEK293.txt
#mv peak_distribution.csv peak_distribution_HEK293.csv
#mv peak_distribution.png peak_distribution_HEK293.png
#python3 plot_peaks.py peaks_PFSK-1_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db 
#mv peak_tss_distances.xlsx peak_tss_distances_PFSK-1.xlsx
#mv peak_tss_distances.txt peak_tss_distances_PFSK-1.txt
#mv peak_distribution.csv peak_distribution_PFSK-1.csv
#mv peak_distribution.png peak_distribution_PFSK-1.png

import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import numpy as np

def plot_peak_distribution(peaks_file, gencode_db):
    db = gffutils.FeatureDB(gencode_db, keep_order=True)
    peaks = pd.read_table(peaks_file, header=None)
    peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
    tss_positions = []
    tss_genes = []
    for gene in db.features_of_type('gene'):
        if gene.strand == '+':
            tss_positions.append(gene.start)
        else:
            tss_positions.append(gene.end)
        tss_genes.append(gene)
    peak_tss_distances = []
    peak_names = []
    closest_genes = []
    closest_gene_names = []
    closest_strands = []
    closest_gene_types = []  # New list for gene_type

    for _, peak in peaks.iterrows():
        peak_center = (peak['start'] + peak['end']) // 2
        closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
        distance_to_tss = peak_center - tss_positions[closest_tss_index]
        peak_tss_distances.append(distance_to_tss)
        peak_names.append(peak['name'])
        closest_genes.append(tss_genes[closest_tss_index].id)
        if 'gene_name' in tss_genes[closest_tss_index].attributes:
            closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
        else:
            closest_gene_name = 'N/A'  # Set a default value if 'gene_name' attribute is not present
        closest_gene_names.append(closest_gene_name)
        closest_strands.append(tss_genes[closest_tss_index].strand)

        if 'gene_type' in tss_genes[closest_tss_index].attributes:
            closest_gene_type = tss_genes[closest_tss_index].attributes['gene_type'][0]
        else:
            closest_gene_type = 'N/A'  # Set a default value if 'gene_type' attribute is not present
        closest_gene_types.append(closest_gene_type)

    df = pd.DataFrame({
        'Peak Name': peak_names,
        'Distance to TSS': peak_tss_distances,
        'Closest Gene (Ensemble ID)': closest_genes,
        'Closest Gene (Gene name)': closest_gene_names,
        'Gene Strand': closest_strands,
        'Gene Type': closest_gene_types  # Add new column for gene_type
    })

    df.to_excel('peak_tss_distances.xlsx', index=False)
    df.to_csv('peak_tss_distances.txt', sep='\t', index=False)  # Save as tab-separated text file
    bins_no=600   #1000 for nHDF, 600 for HEK293 and PFSK-1
    counts, bin_edges = np.histogram(peak_tss_distances, bins=bins_no)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
    hist_df.to_csv('peak_distribution.csv', index=False)
    total_peaks = hist_df['Count'].sum()
    with open('peak_distribution.csv', 'a') as f:
        f.write(f'\nTotal number of peaks: {total_peaks}')

    # Set the figure background to be transparent
    plt.gcf().set_facecolor('none')
    plt.gcf().patch.set_alpha(0.0)

    plt.hist(peak_tss_distances, bins=bins_no)

    plt.xlabel('Distance to TSS', fontsize=17)
    plt.ylabel('Number of peaks', fontsize=17)
    plt.title('')
    #plt.title('Distribution of peaks around TSS', fontsize=12)

    # Set tick label font sizes
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    # Adjust the space on the left of the figure
    plt.gcf().subplots_adjust(left=0.15)

    # Set x-axis range
    plt.xlim(-8000, 8000)

    plt.savefig('peak_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
    parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
    parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
    args = parser.parse_args()
    plot_peak_distribution(args.peaks_file, args.gencode_db)

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Leave a Reply Cancel reply