Preparing a GTF file from GenBank for bacterial RNA-seq analysis, using the example of WA
-
In the example Yersinia enterocolitica strain WA
#[IMPORTANT FINAL STEPS] [STEP1] genbank2fasta.py WA.gb [STEP2] python3 ~/Scripts/genbank_to_gtf.py WA.gb WA.gtf #(1col or 3cols) [STEP3] python3 ~/Scripts/modify_gtf.py WA.gtf WA_m.gtf #(1col or 3cols) "CDS" --> "exon"
-
In the example of 1585 wildtype genome WA_m.gtf
#Ziel all ncRNA, rRNA, tmRNA, tRNA as exon, the original types set as gene_biotype "protein_coding"; gene_biotype "tRNA"; ... #cut -d$'\t' -f3 WA_m.gtf | sort -u #exon #gene #rRNA #tRNA #Gene; CDS (renamed exon=4121); rRNA (22); tRNA (84); ncRNA [STEP4] replace transcript_id " --> transcript_id "tx- under kate [STEP5] python3 ~/Scripts/add_geneid_genename_genebiotype.py WA_m.gtf WA_m_.gtf #(4cols+3cols) ergänzen all lines as gene_id; gene_name; transcript_id; and add all lines annotated with "gene" at the end with 'gene_biotype "protein_coding";' #NO ERROR in WA case. #DEBUG ERROR Stop at line : CP143516 biopython exon 1 1356 . + . gene_id "dnaA"; gene_name "dnaA"; transcript_id "tx-V0R30_00005" # Error Message: Cannot find transcript_id! [STEP6] python3 ~/Scripts/add_exonid_to_exon_tRNA_rRNA_tmRNA_ncRNA.py WA_m_.gtf WA_m__.gtf # (4cols+4cols) Process features of types Gene, CDS, rRNA, tRNA, ncRNA, and tmRNA; For rRNA, tRNA, tmRNA, and ncRNA features, it will add an additional line with exon_id following the pattern described. [STEP7] cp WA_m__.gtf WA_m___.gtf; MANUALLY modify the lines tRNA --> exon and 'gene_biotype "tRNA";' #for ncRNA (3), tmRNA (1), rRNA (19), tRNA (69) in 1585_m___.gtf #for rRNA (22), tRNA (84) in WA_m___.gtf #grep "gene_biotype \"tRNA\"" WA_m___.gtf | wc -l #CHECK they are correctly changed! [RESULTS] #CP143516 biopython gene 1 1356 . + . gene_id "dnaA"; gene_name "dnaA"; transcript_id "tx-V0R30_00005"; gene_biotype "protein_coding" #CP143516 biopython exon 1 1356 . + . gene_id "dnaA"; gene_name "dnaA"; transcript_id "tx-V0R30_00005"; exon_id "exon-V0R30_00005"; #... #[Intermediate RESULTS] #/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/genome/rsem #grep ">" genome.transcripts.fa | wc -l #2240 only exon level, but not tRNA, rRNA, tmRNA, ncRNA records! [STEP8 (Optional, not used in the example)] To include also the 92 records in the results, we can change the annotations records of rRNA, tRNA, tmRNA, ncRNA to exon, so that we can also the records! # see WA_m___.gtf # grep "exon" WA_m___.gtf | wc -l # 2332 # grep ">" /home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/genome/genome.transcripts.fa | wc -l # grep ">" /home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/genome/rsem/genome.transcripts.fa | wc -l # 2332 # 4227 #For example exon-V0R30_00090 is a tRNA. #>tx-V0R30_00090 #GGCTCCTTGGTCAAGCGGTTAAGACACCGCCCTTTCACGGCGGTAACACGGGTTCGAGTCCCGTAGGAGTCACCA #We can also filter the count number in the downstream analysis from the big matrix! #tmRNA: V0R30_09330; ncRNA: V0R30_05655, V0R30_06520, V0R30_11060 >tx-V0R30_05655 AGTCATCTGTGGTGTTCGTAAGTTTGCTTTTTATTTGGGCCTAACACTCTTTGATCAGGGAGCCCAATAGGTTTTCTCGCAGCGCACACGCCTCTATAGGAGGACTTGCAAAACGAGAAACAGGGCACCCACCTGTATATAGCAGGCCGAATGATCAAGCTATTTATAACTACGGCATCAACGGACTCTAT >tx-V0R30_06520 TATTTCGGGTAATCGCTATAGCAATATAGAGGAAAGTCCATGCTCACACAATCTGAGATGATTGTAGTGTTCGTGCTTGATGAAACAATAAATCAAGGCATTAATTTGACGGCAATGAAATAACCTAAGTCATTGGATATGGTTAGAATAGTTTGAAAGTGCCACAGTGACGTAGCTTTTATAGAAATATAAAAGGTGGAACGCGGTAAACCCCTCGAGTGAGCAATCCAAATTGGGTAGGAGCACTTGTTTAGCGGAATTCAACGTATAGACGAGACGATTTTTACGCGAAAGTAAAAATATGTAGACAGATGGTTACCACCGACGTACCAGTGTAACTAGTACACATTATGAGTACAACGGAACAGAACATGGCTTACAGAA >tx-V0R30_11060 CCGTGCTAGGTGGGGAGGTAGCGGTTCCCTGTACTCGAAATCCGCTTTATGCGAGACTTAATTCCTTTGTTGAGGGCGTATTTTTGTGAAGTCTGCCCGAAGCACGTAGTGTTTGAAGATTTCGGTCCTATGCAATATGAACCCATGAACCATGTCAGGTCCTGACGGAAGCAGCATTAAGTGGATCCTCATATGTGCCGTAGGGTAGCCGAGATTTAGCTGTCGACTTCGGTAACGTTTATGATATGCGTTCGATACAAAGGTGCATGG
-
nextflow running
nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_WA --fasta WA.fasta --gtf WA_m___.gtf -profile test_full -resume --max_memory 126.GB --max_time 2400.h --save_reference --aligner star_salmon --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --min_mapped_reads 0 nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 126.GB --max_time 2400.h --save_reference --aligner star_salmon --skip_deseq2_qc
-
[TODO] generate 1585 Excel table, we need DNA-sequence, we can use the DNA seqeunce from results_WA/genome/genome.transcripts.fa in the gb_to_excel.py
python3 ~/Scripts/gb_to_excel.py 1585.gb 1585.xlsx #insert C./P. (Chrom or Plasmid1, Plasmid2, Plasmid3) as the second chrom in the 1585.xlsx.
-
GO enrichment analysis of non-model bacterial RNA-seq_Sepidermidis
~/Scripts/1_filter_interproscan_out.py ~/Scripts/2_split_proteins_with_and_without_GO_terms3.py ~/Scripts/3_generate_fasta_not_annotated_by_interpro.py 1. Use Custom Annotations for Enrichment Analysis For GO term enrichment analysis without standard database support, you can create a custom set of annotations if you have the necessary GO term associations for your genes. This involves manually mapping your genes to GO terms based on your experimental data or literature research. 2. GO Term Enrichment without biomaRt If you already have GO term associations for your genes or can obtain them through other means, you can proceed with GO term enrichment analysis using clusterProfiler or similar tools by creating a custom OrgDb or using the enricher function directly with your custom GO term mappings. Here’s a simplified example using clusterProfiler with custom GO term mappings: library(clusterProfiler) # Assuming you have a named vector of genes and corresponding GO terms genes <- c("gene1", "gene2", "gene3", ...) # Your gene identifiers go_terms <- c("GO:0008150", "GO:0006355", "GO:0004672", ...) # Corresponding GO terms for these genes # Create a named list where names are GO terms and elements are vectors of gene IDs associated with each GO term go_list <- split(genes, go_terms) # Your list of interesting genes (e.g., those differentially expressed) interesting_genes <- c("gene1", "gene2", ...) # Perform enrichment analysis enrichment_results <- enricher(interesting_genes, TERM2GENE = go_list, pAdjustMethod = "BH", pvalueCutoff = 0.05, qvalueCutoff = 0.2) # View the enrichment results print(enrichment_results) #Install and Run InterProScan. #https://interproscan-docs.readthedocs.io/en/latest/HowToRun.html python3 setup.py -f interproscan.properties ./interproscan.sh -i 1585_CDS.fasta -f tsv -goterms -pa #InterProScan's remote option also supports additional output formats and analyses, such as producing graphical overviews (-svg for SVG output) or specific types of analyses only (-appl TIGRFAM,Pfam to limit the search to certain databases). Check the InterProScan documentation for a full list of options and more detailed explanations. https://interproscan-docs.readthedocs.io/en/latest/UserDocs.html#obtaining-a-copy-of-interproscan -dp,--disable-precalc Optional. Disables use of the precalculated match lookup service. All match calculations will be run locally. InterProScan test run This distribution of InterProScan provides a set of protein test sequences, which you can use to check how InterProScan behaves on your system. First, if you have not yet run the initialisation script run the following command: python3 setup.py -f interproscan.properties This command will press and index the hmm models to prepare them into a format used by hmmscan. This command need only be run once. You can then run the following two test case commands: ./interproscan.sh -i test_all_appl.fasta -f tsv -dp ./interproscan.sh -i test_all_appl.fasta -f tsv ./interproscan.sh -i 1585_CDS_test.fasta -f tsv -goterms -pa #./interproscan.sh -i 1585_CDS_test.fasta -f TSV,XML -goterms awk -F'\t' '{if(NR>1) print $1, $14}' 1585_CDS_test.fasta.tsv1 > filtered_GO_terms.txt #4d_vs_15m-up.id #4d_vs_15m-down.id # Install and load required packages #install.packages("rentrez") #install.packages("KEGGREST") #install.packages("dplyr") # for data manipulation #install.packages("ggplot2") # for visualization library(rentrez) library(KEGGREST) library(dplyr) library(ggplot2) # Read GenBank file genbank_file <- "../../../1585.gb" genbank_data <- readLines(genbank_file) # Extract accession number from the GenBank file accession <- grep("^ACCESSION", genbank_data, value = TRUE) accession <- gsub("^ACCESSION\\s+", "", accession) accession <- trimws(accession) # Function to retrieve sequence information from GenBank file get_sequence_from_genbank <- function(genbank_file) { # Read GenBank file genbank_data <- readLines(genbank_file) # Find sequence lines starting with "ORIGIN" origin_line <- grep("^ORIGIN", genbank_data) # Extract sequence lines sequence_lines <- genbank_data[(origin_line + 1):length(genbank_data)] # Remove non-sequence characters and concatenate into a single sequence sequence <- gsub("[0-9 ]", "", sequence_lines) sequence <- paste(sequence, collapse = "") return(sequence) } # GenBank file genbank_file <- "1585.gb" # Retrieve sequence from GenBank file sequence <- get_sequence_from_genbank(genbank_file) # Perform KEGG pathway enrichment analysis kegg_pathways <- keggList("pathway") # Toy example: Randomly select 100 genes and map to KEGG pathways set.seed(123) genes <- sample(1:1000, 100) # Mapping genes to KEGG pathways mapped_pathways <- sapply(genes, function(gene) { kegg_gene <- keggGet("link", gene) if (length(kegg_gene) > 0) { pathway_ids <- kegg_gene$ENTRY %>% unique() pathway_names <- kegg_pathways[kegg_pathways$ID %in% pathway_ids, "NAME"] if (length(pathway_names) > 0) { return(paste(pathway_names, collapse = "; ")) } else { return(NA) } } else { return(NA) } }) # Filter out NAs mapped_pathways <- mapped_pathways[!is.na(mapped_pathways)] # Count pathway occurrences pathway_counts <- table(unlist(strsplit(mapped_pathways, "; "))) # Sort pathways by occurrence pathway_counts <- sort(pathway_counts, decreasing = TRUE) # Top 10 enriched pathways top_pathways <- head(pathway_counts, 10) # Plot the top enriched pathways barplot(top_pathways, main = "Top 10 Enriched KEGG Pathways", xlab = "Pathway", ylab = "Frequency")