Defining and Categorizing Promoter Types Based on the ‘GRGGC’ Motif Frequency and Distance to TSS

  1. generate_promter_sequences

     #!/usr/bin/env python3
     #./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db
     import gffutils
     from pyfaidx import Fasta
     import argparse
     from Bio import SeqIO
     from Bio.SeqRecord import SeqRecord
     from Bio.Seq import Seq
    
     def extract_promoter_sequences(gencode_db, genome_records, upstream_length=3000, downstream_length=3000):
         db = gffutils.FeatureDB(gencode_db, keep_order=True)
    
         promoters = []
         for gene in db.features_of_type('gene'): #CDS
             if gene.strand == '+':
                 promoter_start = max(gene.start - upstream_length, 1)
                 promoter_end = gene.start + downstream_length - 1
             else: # gene.strand == '-'
                 promoter_start = gene.end - downstream_length + 1
                 promoter_end = min(gene.end + upstream_length, len(genome_records[gene.chrom]))
    
             promoter_seq = str(genome_records[gene.chrom][promoter_start-1:promoter_end])
    
             # commenting the following two lines don't effect the output size, both are 62703 records 
             #if not set(promoter_seq.upper()) <= {"A", "C", "G", "T"}:
             #    print(f"Invalid sequence for id {gene.id}: {promoter_seq}")
    
             if gene.strand == '-':
                 promoter_seq = str(Seq(promoter_seq).reverse_complement())
    
             record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description="")
             promoters.append(record)
         return promoters
    
     if __name__ == "__main__":
         parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
         parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.gtf.db file')
         args = parser.parse_args()
    
         genome_records = Fasta('/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa')
    
         promoter_sequences = extract_promoter_sequences(args.gencode_db, genome_records)
         SeqIO.write(promoter_sequences, "promoter_sequences_3000_3000.fasta", "fasta")
         #62703 containing ENSG00000198804.2
  2. find motifs in promoters

     #!/usr/bin/env python3
     #./find_motifs_in_promoters.py
     import re
     from Bio import SeqIO
    
     def find_motif_frequency_and_distribution(fasta_file, output_file, motif_1="GRGGC", motif_2="GCCYR"):
         motif_1 = motif_1.replace("R", "[AG]").replace("Y", "[CT]")
         motif_2 = motif_2.replace("R", "[AG]").replace("Y", "[CT]")
    
         promoter_sequences = SeqIO.parse(fasta_file, "fasta")
         #with open(output_file, 'w') as f:
         #    f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Positions\tMotif2_Positions\n")
         #    for promoter in promoter_sequences:
         #        motif1_positions = [m.start() for m in re.finditer(motif_1, str(promoter.seq))]
         #        motif2_positions = [m.start() for m in re.finditer(motif_2, str(promoter.seq))]
         #        line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
         #        f.write(line)
         with open(output_file, 'w') as f:
             f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Center_Positions\tMotif2_Center_Positions\n")
             for promoter in promoter_sequences:
                 motif1_positions = [(m.start() + len(motif_1) // 2 - 3000) for m in re.finditer(motif_1, str(promoter.seq))]
                 motif2_positions = [(m.start() + len(motif_2) // 2 - 3000) for m in re.finditer(motif_2, str(promoter.seq))]
                 line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
                 f.write(line)
    
     if __name__ == "__main__":
         find_motif_frequency_and_distribution("promoter_sequences_3000_3000.fasta", "motif_counts_positions.txt")
  3. R codes

     #--3.1. draw plots for all motifs
     #DELETE in Kate '[' and ']' in motif_counts_positions.txt
     data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
     data <- rename(data, Motif_GRGGC_Count=Motif1_Count, Motif_GCCYR_Count=Motif2_Count, Motif_GRGGC_Center_Positions = Motif1_Center_Positions, Motif_GCCYR_Center_Positions = Motif2_Center_Positions)
    
     data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
     data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
     # Histogram of Motif1 counts
     png("Histogram_of_Motif_GRGGC_counts.png", width=800, height=800)
     hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
     dev.off()
    
     # Histogram of Motif2 counts
     png("Histogram_of_Motif_GCCYR_counts.png", width=800, height=800)
     hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
     dev.off()
    
     # # Convert list-like string to actual list of numbers
     # data$Motif_GRGGC_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GRGGC_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
     # data$Motif_GCCYR_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GCCYR_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
     data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
     data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
    
     # Create data frames for each motif
     data_motif_GRGGC <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                               Motif = "Motif GRGGC", 
                               Position = unlist(data$Motif_GRGGC_Center_Positions))
    
     data_motif_GCCYR <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                               Motif = "Motif GCCYR", 
                               Position = unlist(data$Motif_GCCYR_Center_Positions))
    
     # Combine motif data frames
     data_positions <- rbind(data_motif_GRGGC, data_motif_GCCYR)
    
     # Get positions as numeric values, excluding NAs
     motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
     # Plot density of Motif1 positions
     png("Density_of_Motif_GRGGC_positions.png", width=800, height=800)
     plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
     dev.off()
    
     # Get positions as numeric values, excluding NAs
     motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
     # Plot density of Motif2 positions
     png("Density_of_Motif_GCCYR_positions.png", width=800, height=800)
     plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
     dev.off()
    
     #--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_data$Closest.Gene..Ensemble.ID., another is the remaining
     # Create a new data frame for rows in 'data' where the PromoterID is in filtered_data's Closest Gene (Ensemble ID)
     data_in_filtered <- data[data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
     data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString)
     data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString)
     write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE)
    
     # Create another data frame for the remaining rows
     data_not_in_filtered <- data[!data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
     data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString)
     data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString)
     write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE)
    
     #--3.3. draw plots for motifs_in_promoters_with_LT_peak.txt
     #DELETE with Kate '"' in motifs_in_promoters_with_LT_peak.txt
     data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
     data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
     data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
     # Histogram of Motif_GRGGC counts
     png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
     hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
     dev.off()
    
     # Histogram of Motif_GCCYR counts
     png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
     hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
     dev.off()
    
     data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
     data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
    
     # Create data frames for each motif
     data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                               Motif = "Motif GRGGC", 
                               Position = unlist(data$Motif_GRGGC_Center_Positions))
    
     data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                               Motif = "Motif GCCYR", 
                               Position = unlist(data$Motif_GCCYR_Center_Positions))
    
     # Combine motif data frames
     data_positions <- rbind(data_motif1, data_motif2)
    
     # Get positions as numeric values, excluding NAs
     motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
     # Plot density of Motif_GRGGC positions
     png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
     plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
     dev.off()
    
     # Get positions as numeric values, excluding NAs
     motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
     # Plot density of Motif_GCCYR positions
     png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
     plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
     dev.off()
    
     #--3.4. draw plots for motifs_in_promoters_without_LT_peak.txt
     #DELETE '"' with Kate in motifs_in_promoters_without_LT_peak.txt
     data <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
     data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
     data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
     # Histogram of Motif_GRGGC counts
     png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
     hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
     dev.off()
    
     # Histogram of Motif_GCCYR counts
     png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
     hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
     dev.off()
    
     data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
     data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
    
     # Create data frames for each motif
     data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                               Motif = "Motif GRGGC", 
                               Position = unlist(data$Motif_GRGGC_Center_Positions))
    
     data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                               Motif = "Motif GCCYR", 
                               Position = unlist(data$Motif_GCCYR_Center_Positions))
    
     # Combine motif data frames
     data_positions <- rbind(data_motif1, data_motif2)
    
     # Get positions as numeric values, excluding NAs
     motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
     # Plot density of Motif_GRGGC positions
     png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
     plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
     dev.off()
    
     # Get positions as numeric values, excluding NAs
     motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
     # Plot density of Motif_GCCYR positions
     png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
     plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
     dev.off()
    
     #--3.5. assign each promoter to a prefined class according to Total_Motifs and Avg_Motif_Distance_To_TSS
     data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
     data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
     data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
     # Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count
     data <- data %>%
       mutate(Total_Motifs = Motif_GRGGC_Count + Motif_GCCYR_Count)
    
     # Calculate absolute average Motif_GRGGC distance to TSS
     data$Avg_Motif_GRGGC_Distance_To_TSS <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), 
                                              function(x) round(mean(as.numeric(x), na.rm = TRUE)))
     # Calculate absolute average Motif_GCCYR distance to TSS
     data$Avg_Motif_GCCYR_Distance_To_TSS <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "), 
                                              function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    
     # Revised function to calculate the mean of two strings of comma-separated numbers
     mean_of_strings <- function(str1, str2) {
       # Convert strings of numbers to numeric vectors
       vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
       vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
       # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
       if (is.null(vec1)) {
         vec1 <- numeric(0)
       }
       if (is.null(vec2)) {
         vec2 <- numeric(0)
       }
       # Compute the mean of the two vectors
       return(round(mean(c(vec1, vec2), na.rm = TRUE)))
     }
     # Apply the function to each row of the dataframe
     data$Avg_Motif_Distance_To_TSS <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions)
    
     # Assign classes based on the given rules
     #change the class name as string "Class 1", "Class 2" ... in the following code.
     data <- data %>%
       mutate(
         Class = case_when(
           Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
           Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000",
           Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
           Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000",
           Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
           Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000"
         )
       )
    
     write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE)  
     #~/Tools/csv2xls-0.4/csv_to_xls.py motifs_in_promoters_with_LT_peak_Category.txt -d$'\t' -o  motifs_in_promoters_with_LT_peak_Category.xls;

Leave a Reply

Your email address will not be published. Required fields are marked *