-
generate_promter_sequences
#!/usr/bin/env python3 #./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db import gffutils from pyfaidx import Fasta import argparse from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq def extract_promoter_sequences(gencode_db, genome_records, upstream_length=3000, downstream_length=3000): db = gffutils.FeatureDB(gencode_db, keep_order=True) promoters = [] for gene in db.features_of_type('gene'): #CDS if gene.strand == '+': promoter_start = max(gene.start - upstream_length, 1) promoter_end = gene.start + downstream_length - 1 else: # gene.strand == '-' promoter_start = gene.end - downstream_length + 1 promoter_end = min(gene.end + upstream_length, len(genome_records[gene.chrom])) promoter_seq = str(genome_records[gene.chrom][promoter_start-1:promoter_end]) # commenting the following two lines don't effect the output size, both are 62703 records #if not set(promoter_seq.upper()) <= {"A", "C", "G", "T"}: # print(f"Invalid sequence for id {gene.id}: {promoter_seq}") if gene.strand == '-': promoter_seq = str(Seq(promoter_seq).reverse_complement()) record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description="") promoters.append(record) return promoters if __name__ == "__main__": parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.') parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.gtf.db file') args = parser.parse_args() genome_records = Fasta('/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa') promoter_sequences = extract_promoter_sequences(args.gencode_db, genome_records) SeqIO.write(promoter_sequences, "promoter_sequences_3000_3000.fasta", "fasta") #62703 containing ENSG00000198804.2
-
find motifs in promoters
#!/usr/bin/env python3 #./find_motifs_in_promoters.py import re from Bio import SeqIO def find_motif_frequency_and_distribution(fasta_file, output_file, motif_1="GRGGC", motif_2="GCCYR"): motif_1 = motif_1.replace("R", "[AG]").replace("Y", "[CT]") motif_2 = motif_2.replace("R", "[AG]").replace("Y", "[CT]") promoter_sequences = SeqIO.parse(fasta_file, "fasta") #with open(output_file, 'w') as f: # f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Positions\tMotif2_Positions\n") # for promoter in promoter_sequences: # motif1_positions = [m.start() for m in re.finditer(motif_1, str(promoter.seq))] # motif2_positions = [m.start() for m in re.finditer(motif_2, str(promoter.seq))] # line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n" # f.write(line) with open(output_file, 'w') as f: f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Center_Positions\tMotif2_Center_Positions\n") for promoter in promoter_sequences: motif1_positions = [(m.start() + len(motif_1) // 2 - 3000) for m in re.finditer(motif_1, str(promoter.seq))] motif2_positions = [(m.start() + len(motif_2) // 2 - 3000) for m in re.finditer(motif_2, str(promoter.seq))] line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n" f.write(line) if __name__ == "__main__": find_motif_frequency_and_distribution("promoter_sequences_3000_3000.fasta", "motif_counts_positions.txt")
-
R codes
#--3.1. draw plots for all motifs #DELETE in Kate '[' and ']' in motif_counts_positions.txt data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE) data <- rename(data, Motif_GRGGC_Count=Motif1_Count, Motif_GCCYR_Count=Motif2_Count, Motif_GRGGC_Center_Positions = Motif1_Center_Positions, Motif_GCCYR_Center_Positions = Motif2_Center_Positions) data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count) data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count) # Histogram of Motif1 counts png("Histogram_of_Motif_GRGGC_counts.png", width=800, height=800) hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts") dev.off() # Histogram of Motif2 counts png("Histogram_of_Motif_GCCYR_counts.png", width=800, height=800) hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts") dev.off() # # Convert list-like string to actual list of numbers # data$Motif_GRGGC_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GRGGC_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)])) # data$Motif_GCCYR_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GCCYR_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)])) data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",") data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",") # Create data frames for each motif data_motif_GRGGC <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), Motif = "Motif GRGGC", Position = unlist(data$Motif_GRGGC_Center_Positions)) data_motif_GCCYR <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), Motif = "Motif GCCYR", Position = unlist(data$Motif_GCCYR_Center_Positions)) # Combine motif data frames data_positions <- rbind(data_motif_GRGGC, data_motif_GCCYR) # Get positions as numeric values, excluding NAs motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"]) # Plot density of Motif1 positions png("Density_of_Motif_GRGGC_positions.png", width=800, height=800) plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position") dev.off() # Get positions as numeric values, excluding NAs motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"]) # Plot density of Motif2 positions png("Density_of_Motif_GCCYR_positions.png", width=800, height=800) plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position") dev.off() #--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_data$Closest.Gene..Ensemble.ID., another is the remaining # Create a new data frame for rows in 'data' where the PromoterID is in filtered_data's Closest Gene (Ensemble ID) data_in_filtered <- data[data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ] data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString) data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString) write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE) # Create another data frame for the remaining rows data_not_in_filtered <- data[!data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ] data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString) data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString) write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE) #--3.3. draw plots for motifs_in_promoters_with_LT_peak.txt #DELETE with Kate '"' in motifs_in_promoters_with_LT_peak.txt data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE) data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count) data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count) # Histogram of Motif_GRGGC counts png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800) hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts") dev.off() # Histogram of Motif_GCCYR counts png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800) hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts") dev.off() data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",") data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",") # Create data frames for each motif data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), Motif = "Motif GRGGC", Position = unlist(data$Motif_GRGGC_Center_Positions)) data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), Motif = "Motif GCCYR", Position = unlist(data$Motif_GCCYR_Center_Positions)) # Combine motif data frames data_positions <- rbind(data_motif1, data_motif2) # Get positions as numeric values, excluding NAs motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"]) # Plot density of Motif_GRGGC positions png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800) plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position") dev.off() # Get positions as numeric values, excluding NAs motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"]) # Plot density of Motif_GCCYR positions png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800) plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position") dev.off() #--3.4. draw plots for motifs_in_promoters_without_LT_peak.txt #DELETE '"' with Kate in motifs_in_promoters_without_LT_peak.txt data <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE) data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count) data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count) # Histogram of Motif_GRGGC counts png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800) hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts") dev.off() # Histogram of Motif_GCCYR counts png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800) hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts") dev.off() data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",") data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",") # Create data frames for each motif data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), Motif = "Motif GRGGC", Position = unlist(data$Motif_GRGGC_Center_Positions)) data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), Motif = "Motif GCCYR", Position = unlist(data$Motif_GCCYR_Center_Positions)) # Combine motif data frames data_positions <- rbind(data_motif1, data_motif2) # Get positions as numeric values, excluding NAs motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"]) # Plot density of Motif_GRGGC positions png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800) plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position") dev.off() # Get positions as numeric values, excluding NAs motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"]) # Plot density of Motif_GCCYR positions png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800) plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position") dev.off() #--3.5. assign each promoter to a prefined class according to Total_Motifs and Avg_Motif_Distance_To_TSS data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE) data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count) data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count) # Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count data <- data %>% mutate(Total_Motifs = Motif_GRGGC_Count + Motif_GCCYR_Count) # Calculate absolute average Motif_GRGGC distance to TSS data$Avg_Motif_GRGGC_Distance_To_TSS <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), function(x) round(mean(as.numeric(x), na.rm = TRUE))) # Calculate absolute average Motif_GCCYR distance to TSS data$Avg_Motif_GCCYR_Distance_To_TSS <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "), function(x) round(mean(as.numeric(x), na.rm = TRUE))) # Revised function to calculate the mean of two strings of comma-separated numbers mean_of_strings <- function(str1, str2) { # Convert strings of numbers to numeric vectors vec1 <- as.numeric(strsplit(str1, split = ",")[[1]]) vec2 <- as.numeric(strsplit(str2, split = ",")[[1]]) # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string) if (is.null(vec1)) { vec1 <- numeric(0) } if (is.null(vec2)) { vec2 <- numeric(0) } # Compute the mean of the two vectors return(round(mean(c(vec1, vec2), na.rm = TRUE))) } # Apply the function to each row of the dataframe data$Avg_Motif_Distance_To_TSS <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions) # Assign classes based on the given rules #change the class name as string "Class 1", "Class 2" ... in the following code. data <- data %>% mutate( Class = case_when( Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000", Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000", Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000", Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000", Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000", Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000" ) ) write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE) #~/Tools/csv2xls-0.4/csv_to_xls.py motifs_in_promoters_with_LT_peak_Category.txt -d$'\t' -o motifs_in_promoters_with_LT_peak_Category.xls;
Defining and Categorizing Promoter Types Based on the ‘GRGGC’ Motif Frequency and Distance to TSS
Leave a reply