Processing Data_Tam_RNAseq_2024_MHB_vs_Urine_ATCC19606

Preparing raw data

 They are wildtype strains grown in different medium.
 Urine - human urine
 AUM - artificial urine medium
 MHB - Mueller-Hinton broth
 Urine（人类尿液）：pH值、比重、温度、污染物、化学成分、微生物负荷。
 AUM（人工尿液培养基）：pH值、营养成分、无菌性、渗透压、温度、污染物。
 MHB（Mueller-Hinton培养基）：pH值、无菌性、营养成分、温度、渗透压、抗生素浓度。

 mkdir raw_data; cd raw_data
 ln -s ../X101SC24105589-Z01-J001/01.RawData/AUM-1/AUM-1_1.fq.gz AUM_r1_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/AUM-1/AUM-1_2.fq.gz AUM_r1_R2.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/AUM-2/AUM-2_1.fq.gz AUM_r2_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/AUM-2/AUM-2_2.fq.gz AUM_r2_R2.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/AUM-3/AUM-3_1.fq.gz AUM_r3_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/AUM-3/AUM-3_2.fq.gz AUM_r3_R2.fq.gz

 ln -s ../X101SC24105589-Z01-J001/01.RawData/MHB-1/MHB-1_1.fq.gz MHB_r1_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/MHB-1/MHB-1_2.fq.gz MHB_r1_R2.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/MHB-2/MHB-2_1.fq.gz MHB_r2_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/MHB-2/MHB-2_2.fq.gz MHB_r2_R2.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/MHB-3/MHB-3_1.fq.gz MHB_r3_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/MHB-3/MHB-3_2.fq.gz MHB_r3_R2.fq.gz

 ln -s ../X101SC24105589-Z01-J001/01.RawData/Urine-1/Urine-1_1.fq.gz Urine_r1_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/Urine-1/Urine-1_2.fq.gz Urine_r1_R2.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/Urine-2/Urine-2_1.fq.gz Urine_r2_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/Urine-2/Urine-2_2.fq.gz Urine_r2_R2.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/Urine-3/Urine-3_1.fq.gz Urine_r3_R1.fq.gz
 ln -s ../X101SC24105589-Z01-J001/01.RawData/Urine-3/Urine-3_2.fq.gz Urine_r3_R2.fq.gz

(Optional) using trinity to find the most closely reference

 Trinity --seqType fq --max_memory 50G --left trimmed/wt_r1_R1.fastq.gz  --right trimmed/wt_r1_R2.fastq.gz --CPU 12

 #https://www.genome.jp/kegg/tables/br08606.html#prok
 acb     KGB     Acinetobacter baumannii ATCC 17978  2007    GenBank
 abm     KGB     Acinetobacter baumannii SDF     2008    GenBank
 aby     KGB     Acinetobacter baumannii AYE     2008    GenBank
 abc     KGB     Acinetobacter baumannii ACICU   2008    GenBank
 abn     KGB     Acinetobacter baumannii AB0057  2008    GenBank
 abb     KGB     Acinetobacter baumannii AB307-0294  2008    GenBank
 abx     KGB     Acinetobacter baumannii 1656-2  2012    GenBank
 abz     KGB     Acinetobacter baumannii MDR-ZJ06    2012    GenBank
 abr     KGB     Acinetobacter baumannii MDR-TJ  2012    GenBank
 abd     KGB     Acinetobacter baumannii TCDC-AB0715     2012    GenBank
 abh     KGB     Acinetobacter baumannii TYTH-1  2012    GenBank
 abad    KGB     Acinetobacter baumannii D1279779    2013    GenBank
 abj     KGB     Acinetobacter baumannii BJAB07104   2013    GenBank
 abab    KGB     Acinetobacter baumannii BJAB0715    2013    GenBank
 abaj    KGB     Acinetobacter baumannii BJAB0868    2013    GenBank
 abaz    KGB     Acinetobacter baumannii ZW85-1  2013    GenBank
 abk     KGB     Acinetobacter baumannii AbH12O-A2   2014    GenBank
 abau    KGB     Acinetobacter baumannii AB030   2014    GenBank
 abaa    KGB     Acinetobacter baumannii AB031   2014    GenBank
 abw     KGB     Acinetobacter baumannii AC29    2014    GenBank
 abal    KGB     Acinetobacter baumannii LAC-4   2015    GenBank
 #Note that the Acinetobacter baumannii strain ATCC 19606 chromosome, complete genome (GenBank: CP059040.1) was choosen as reference!

Downloading CP059040.fasta and CP059040.gff from GenBank

(Optional) Preparing CP059040.fasta, CP059040_gene.gff3 and CP059040.bed

 #Reference genome: https://www.ncbi.nlm.nih.gov/nuccore/CP059040
 cp /media/jhuang/Elements2/Data_Tam_RNASeq3/CP059040.fasta .     # Elements (Anna C.arnes)
 cp /media/jhuang/Elements2/Data_Tam_RNASeq3/CP059040_gene.gff3 .
 cp /media/jhuang/Elements2/Data_Tam_RNASeq3/CP059040_gene.gtf .
 cp /media/jhuang/Elements2/Data_Tam_RNASeq3/CP059040.bed .
 rsync -a -P CP059040.fasta jhuang@hamm:~/DATA/Data_Tam_RNAseq_2024/
 rsync -a -P CP059040_gene.gff3 jhuang@hamm:~/DATA/Data_Tam_RNAseq_2024/
 rsync -a -P CP059040.bed jhuang@hamm:~/DATA/Data_Tam_RNAseq_2024/
 (base) jhuang@WS-2290C:/media/jhuang/Elements2/Data_Tam_RNASeq3$ find . -name "CP059040*"
 ./CP059040.fasta
 ./CP059040.bed
 ./CP059040.gb
 ./CP059040.gff3
 ./CP059040.gff3_backup
 ./CP059040_full.gb
 ./CP059040_gene.gff3
 ./CP059040_gene.gtf
 ./CP059040_gene_old.gff3
 ./CP059040_rRNA.gff3
 ./CP059040_rRNA_v.gff3

 # ---- REF: Acinetobacter baumannii ATCC 17978 (DEBUG, gene_name failed) ----
 #gffread -E -F -T GCA_000015425.1_ASM1542v1_genomic.gff -o GCA_000015425.1_ASM1542v1_genomic.gtf_
 #grep "CDS" GCA_000015425.1_ASM1542v1_genomic.gtf_ > GCA_000015425.1_ASM1542v1_genomic.gtf
 #sed -i -e "s/\tCDS\t/\texon\t/g" GCA_000015425.1_ASM1542v1_genomic.gtf
 #gffread -E -F --bed GCA_000015425.1_ASM1542v1_genomic.gtf -o GCA_000015425.1_ASM1542v1_genomic.bed

 grep "locus_tag" GCA_000015425.1_ASM1542v1_genomic.gtf_ > GCA_000015425.1_ASM1542v1_genomic.gtf
 sed -i -e "s/\ttranscript\t/\texon\t/g" GCA_000015425.1_ASM1542v1_genomic.gtf # or using fc_count_type=transcript
 sed -i -e "s/\tgene_name\t/\tName\t/g" GCA_000015425.1_ASM1542v1_genomic.gtf
 gffread -E -F --bed GCA_000015425.1_ASM1542v1_genomic.gtf -o GCA_000015425.1_ASM1542v1_genomic.bed
 #grep "gene_name" GCA_000015425.1_ASM1542v1_genomic.gtf | wc -l  #69=3887-3803

 cp CP059040.gff3 CP059040_backup.gff3
 sed -i -e "s/\tGenbank\tgene\t/\tGenbank_gene\t/g" CP059040.gff3
 grep "Genbank_gene" CP059040.gff3 > CP059040_gene.gff3
 sed -i -e "s/\tGenbank_gene\t/\tGenbank\tgene\t/g" CP059040_gene.gff3

 #3796-3754=42--> they are pseudogene since grep "pseudogene" CP059040.gff3 | wc -l = 42
 # --------------------------------------------------------------------------------------------------------------------------------------------------
 # ---------- PREPARING gff3 file including gene_biotype=protein_coding+gene_biotype=tRNA = total(3754)) and gene_biotype=pseudogene(42) ------------
 cp CP059040.gff3 CP059040_backup.gff3
 sed -i -e "s/\tGenbank\tgene\t/\tGenbank_gene\t/g" CP059040.gff3
 grep "Genbank_gene" CP059040.gff3 > CP059040_gene.gff3
 sed -i -e "s/\tGenbank_gene\t/\tGenbank\tgene\t/g" CP059040_gene.gff3
 grep "gene_biotype=pseudogene" CP059040.gff3_backup >> CP059040_gene.gff3    #-->3796

 #The whole point of the GTF format was to standardise certain aspects that are left open in GFF. Hence, there are many different valid ways to encode the same information in a valid GFF format, and any parser or converter needs to be written specifically for the choices the author of the GFF file made. For example, a GTF file requires the gene ID attribute to be called "gene_id", while in GFF files, it may be "ID", "Gene", something different, or completely missing.
 # from gff3 to gtf
 sed -i -e "s/\tID=gene-/\tgene_id \"/g" CP059040_gene.gtf
 sed -i -e "s/;/\"; /g" CP059040_gene.gtf
 sed -i -e "s/=/=\"/g" CP059040_gene.gtf

 #sed -i -e "s/\n/\"\n/g" CP059040_gene.gtf
 #using editor instead!

 #The following is GTF-format.
 CP000521.1      Genbank exon    95      1492    .       +       .       transcript_id "gene0"; gene_id "gene0"; Name "A1S_0001"; gbkey "Gene"; gene_biotype "protein_coding"; locus_tag "A1S_0001";

 #NZ_MJHA01000001.1       RefSeq  region  1       8663    .       +       .       ID=id0;Dbxref=taxon:575584;Name=unnamed1;collected-by=IG Schaub;collection-date=1948;country=USA: Vancouver;culture-collection=ATCC:19606;gbkey=Src;genome=plasmid;isolation-source=urine;lat-lon=37.53 N 75.4 W;map=unlocalized;mol_type=genomic DNA;nat-host=Homo sapiens;plasmid-name=unnamed1;strain=ATCC 19606;type-material=type strain of Acinetobacter baumannii
 #NZ_MJHA01000001.1       RefSeq  gene    228     746     .       -       .       ID=gene0;Name=BIT33_RS00005;gbkey=Gene;gene_biotype=protein_coding;locus_tag=BIT33_RS00005;old_locus_tag=BIT33_18795
 #NZ_MJHA01000001.1       Protein Homology        CDS     228     746     .       -       0       ID=cds0;Parent=gene0;Dbxref=Genbank:WP_000839337.1;Name=WP_000839337.1;gbkey=CDS;inference=COORDINATES: similar to AA sequence:RefSeq:WP_000839337.1;product=hypothetical protein;protein_id=WP_000839337.1;transl_table=11

 ##gff-version 3
 ##sequence-region CP059040.1 1 3980852
 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=470

 gffread -E -F --bed CP059040.gff3 -o CP059040.bed    #-->3796
 ##prepare the GTF-format (see above) --> ERROR! ----> using CP059040.gff3
 ##stringtie adeIJ.abx_r1.sorted.bam -o adeIJ.abx_r1.sorted_transcripts.gtf -v -G /media/jhuang/Elements/Data_Tam_RNASeq3/CP059040.gff3 -A adeIJ.abx_r1.sorted.gene_abund.txt -C adeIJ.abx_r1.sorted.bam.cov_refs.gtf -e -b adeIJ.abx_r1.sorted_ballgown
 #[01/21 10:57:46] Loading reference annotation (guides)..
 #GFF warning: merging adjacent/overlapping segments of gene-H0N29_00815 on CP059040.1 (179715-179786, 179788-180810)
 #[01/21 10:57:46] 3796 reference transcripts loaded.
 #Default stack size for threads: 8388608
 #WARNING: no reference transcripts found for genomic sequence "gi|1906906720|gb|CP059040.1|"! (mismatched reference names?)
 #WARNING: no reference transcripts were found for the genomic sequences where reads were mapped!
 #Please make sure the -G annotation file uses the same naming convention for the genome sequences.
 #[01/21 10:58:30] All threads finished.

 #  ERROR: failed to find the gene identifier attribute in the 9th column of the provided GTF file.
 #  The specified gene identifier attribute is 'Name'
 #  An example of attributes included in your GTF annotation is 'ID=exon-H0N29_00075-1;Parent=rna-H0N29_00075;gbkey=rRNA;locus_tag=H0N29_00075;product=16S ribosomal RNA'
 #  The program has to termin

 #  ERROR: failed to find the gene identifier attribute in the 9th column of the provided GTF file.
 #  The specified gene identifier attribute is 'gene_biotype'
 #  An example of attributes included in your GTF annotation is 'ID=exon-H0N29_00075-1;Parent=rna-H0N29_00075;gbkey=rRNA;locus_tag=H0N29_00075;product=16S ribosomal RNA'
 #  The program has to terminate.

 #grep "ID=cds-" CP059040.gff3 | wc -l
 #grep "ID=exon-" CP059040.gff3 | wc -l
 #grep "ID=gene-" CP059040.gff3 | wc -l   #the same as H0N29_18980/5=3796
 grep "gbkey=" CP059040.gff3 | wc -l  7695
 grep "ID=id-" CP059040.gff3 | wc -l  5
 grep "locus_tag=" CP059040.gff3 | wc -l    7689
 #...
 cds   3701                             locus_tag=xxxx, no gene_biotype
 exon   96                              locus_tag=xxxx, no gene_biotype
 gene   3796                            locus_tag=xxxx, gene_biotype=xxxx,
 id  (riboswitch+direct_repeat,5)       both no --> ignoring them!!  # grep "ID=id-" CP059040.gff3
 rna    96                              locus_tag=xxxx, no gene_biotype
 ------------------
     7694

 cp CP059040.gff3_backup CP059040.gff3
 grep "^##" CP059040.gff3 > CP059040_gene.gff3
 grep "ID=gene" CP059040.gff3 >> CP059040_gene.gff3
 #!!!!VERY_IMPORTANT!!!!: change type '\tCDS\t' to '\texon\t'!
 sed -i -e "s/\tgene\t/\texon\t/g" CP059040_gene.gff3

Preparing the directory trimmed

 mkdir trimmed trimmed_unpaired;
 for sample_id in AUM_r1 AUM_r2 AUM_r3 Urine_r1 Urine_r2 Urine_r3 MHB_r1 MHB_r2 MHB_r3; do \
 for sample_id in MHB_r1 MHB_r2 MHB_r3; do \
         java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fq.gz raw_data/${sample_id}_R2.fq.gz trimmed/${sample_id}_R1.fq.gz trimmed_unpaired/${sample_id}_R1.fq.gz trimmed/${sample_id}_R2.fq.gz trimmed_unpaired/${sample_id}_R2.fq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
 done

Preparing samplesheet.csv

 sample,fastq_1,fastq_2,strandedness
 AUM_r1,AUM_r1_R1.fq.gz,AUM_r1_R2.fq.gz,auto
 AUM_r2,AUM_r2_R1.fq.gz,AUM_r2_R2.fq.gz,auto
 AUM_r3,AUM_r3_R1.fq.gz,AUM_r3_R2.fq.gz,auto
 MHB_r1,MHB_r1_R1.fq.gz,MHB_r1_R2.fq.gz,auto
 MHB_r2,MHB_r2_R1.fq.gz,MHB_r2_R2.fq.gz,auto
 MHB_r3,MHB_r3_R1.fq.gz,MHB_r3_R2.fq.gz,auto
 Urine_r1,Urine_r1_R1.fq.gz,Urine_r1_R2.fq.gz,auto
 Urine_r2,Urine_r2_R1.fq.gz,Urine_r2_R2.fq.gz,auto
 Urine_r3,Urine_r3_R1.fq.gz,Urine_r3_R2.fq.gz,auto

nextflow run

 #Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/

 docker pull nfcore/rnaseq
 ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq

 #Default: --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'exon'
 #(host_env) !NOT_WORKING! jhuang@WS-2290C:~/DATA/Data_Tam_RNAseq_2024$ /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CP059040.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CP059040.gff"        -profile docker -resume  --max_cpus 55 --max_memory 512.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'

 # -- DEBUG_1 (CDS --> exon in CP059040.gff) --
 #Checking the record (see below) in results/genome/CP059040.gtf
 #In ./results/genome/CP059040.gtf e.g. "CP059040.1      Genbank transcript      1       1398    .       +       .       transcript_id "gene-H0N29_00005"; gene_id "gene-H0N29_00005"; gene_name "dnaA"; Name "dnaA"; gbkey "Gene"; gene "dnaA"; gene_biotype "protein_coding"; locus_tag "H0N29_00005";"
 #--featurecounts_feature_type 'transcript' returns only the tRNA results
 #Since the tRNA records have "transcript and exon". In gene records, we have "transcript and CDS". replace the CDS with exon

 grep -P "\texon\t" CP059040.gff | sort | wc -l    #96
 grep -P "cmsearch\texon\t" CP059040.gff | wc -l    #=10  ignal recognition particle sRNA small typ, transfer-messenger RNA, 5S ribosomal RNA
 grep -P "Genbank\texon\t" CP059040.gff | wc -l    #=12  16S and 23S ribosomal RNA
 grep -P "tRNAscan-SE\texon\t" CP059040.gff | wc -l    #tRNA 74
 wc -l star_salmon/AUM_r3/quant.genes.sf  #--featurecounts_feature_type 'transcript' results in 96 records!

 grep -P "\tCDS\t" CP059040.gff | wc -l  #3701
 sed 's/\tCDS\t/\texon\t/g' CP059040.gff > CP059040_m.gff
 grep -P "\texon\t" CP059040_m.gff | sort | wc -l  #3797

 # -- DEBUG_2: combination of 'CP059040_m.gff' and 'exon' results in ERROR, using 'transcript' instead!
 --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CP059040_m.gff" --featurecounts_feature_type 'transcript'

 # ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ----
 (host_env) /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040_m.gff"        -profile docker -resume  --max_cpus 55 --max_memory 512.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'

 # -- DEBUG_3: make sure the header of fasta is the same to the *_m.gff file

Import data and pca-plot

 #mamba activate r_env

 #install.packages("ggfun")
 # Import the required libraries
 library("AnnotationDbi")
 library("clusterProfiler")
 library("ReactomePA")
 library(gplots)
 library(tximport)
 library(DESeq2)
 #library("org.Hs.eg.db")
 library(dplyr)
 library(tidyverse)
 #install.packages("devtools")
 #devtools::install_version("gtable", version = "0.3.0")
 library(gplots)
 library("RColorBrewer")
 #install.packages("ggrepel")
 library("ggrepel")
 # install.packages("openxlsx")
 library(openxlsx)
 library(EnhancedVolcano)
 library(DESeq2)
 library(edgeR)

 setwd("~/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/results/star_salmon")
 # Define paths to your Salmon output quantification files
 files <- c("Urine_r1" = "./Urine_r1/quant.sf",
         "Urine_r2" = "./Urine_r2/quant.sf",
         "Urine_r3" = "./Urine_r3/quant.sf",
         "MHB_r1" = "./MHB_r1/quant.sf",
         "MHB_r2" = "./MHB_r2/quant.sf",
         "MHB_r3" = "./MHB_r3/quant.sf")
 # Import the transcript abundance data with tximport
 txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
 # Define the replicates and condition of the samples
 replicate <- factor(c("r1", "r2", "r3", "r1", "r2", "r3"))
 condition <- factor(c("Urine","Urine","Urine", "MHB","MHB","MHB"))
 # Define the colData for DESeq2
 colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))

 # ------------------------
 # 1️⃣ Setup and input files
 # ------------------------

 # Read in transcript-to-gene mapping
 tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
 colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")

 # Prepare tx2gene for gene-level summarization (remove gene_name if needed)
 tx2gene_geneonly <- tx2gene[, c("transcript_id", "gene_id")]

 # -------------------------------
 # 2️⃣ Transcript-level counts
 # -------------------------------
 # Create DESeqDataSet directly from tximport (transcript-level)
 dds_tx <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
 write.csv(counts(dds_tx), file="transcript_counts.csv")

 # --------------------------------
 # 3️⃣ Gene-level summarization
 # --------------------------------
 # Re-import Salmon data summarized at gene level
 txi_gene <- tximport(files, type="salmon", tx2gene=tx2gene_geneonly, txOut=FALSE)

 # Create DESeqDataSet for gene-level counts
 dds <- DESeqDataSetFromTximport(txi_gene, colData=colData, design=~condition+replicate)

 # --------------------------------
 # 4️⃣ Raw counts table (with gene names)
 # --------------------------------
 # Extract raw gene-level counts
 counts_data <- as.data.frame(counts(dds, normalized=FALSE))
 counts_data$gene_id <- rownames(counts_data)

 # Add gene names
 tx2gene_unique <- unique(tx2gene[, c("gene_id", "gene_name")])
 counts_data <- merge(counts_data, tx2gene_unique, by="gene_id", all.x=TRUE)

 # Reorder columns: gene_id, gene_name, then counts
 count_cols <- setdiff(colnames(counts_data), c("gene_id", "gene_name"))
 counts_data <- counts_data[, c("gene_id", "gene_name", count_cols)]

 # --------------------------------
 # 5️⃣ Calculate CPM
 # --------------------------------
 library(edgeR)
 library(openxlsx)

 # Prepare count matrix for CPM calculation
 count_matrix <- as.matrix(counts_data[, !(colnames(counts_data) %in% c("gene_id", "gene_name"))])

 # Calculate CPM
 #cpm_matrix <- cpm(count_matrix, normalized.lib.sizes=FALSE)
 total_counts <- colSums(count_matrix)
 cpm_matrix <- t(t(count_matrix) / total_counts) * 1e6
 cpm_matrix <- as.data.frame(cpm_matrix)

 # Add gene_id and gene_name back to CPM table
 cpm_counts <- cbind(counts_data[, c("gene_id", "gene_name")], cpm_matrix)

 # --------------------------------
 # 6️⃣ Save outputs
 # --------------------------------
 write.csv(counts_data, "gene_raw_counts.csv", row.names=FALSE)
 write.xlsx(counts_data, "gene_raw_counts.xlsx", row.names=FALSE)
 write.xlsx(cpm_counts, "gene_cpm_counts.xlsx", row.names=FALSE)

PCA dim(counts(dds)) head(counts(dds), 10) rld <- rlogTransformation(dds)

 # draw simple pca and heatmap
 #mat <- assay(rld)
 #mm <- model.matrix(~condition, colData(rld))
 #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
 #assay(rld) <- mat
 # -- pca --
 png("pca.png", 1200, 800)
 plotPCA(rld, intgroup=c("condition"))
 dev.off()
 # -- heatmap --
 png("heatmap.png", 1200, 800)
 distsRL <- dist(t(assay(rld)))
 mat <- as.matrix(distsRL)
 hc <- hclust(distsRL)
 hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
 heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
 dev.off()

Select the differentially expressed genes

 #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
 #https://www.biostars.org/p/282295/
 #https://www.biostars.org/p/335751/
 #> dds$condition
 #[1] Urine Urine Urine MHB   MHB   MHB
 #Levels: MHB Urine
 #CONSOLE: mkdir star_salmon/degenes

 setwd("degenes")
 #---- relevel to control ----
 dds$condition <- relevel(dds$condition, "MHB")
 dds = DESeq(dds, betaPrior=FALSE)
 resultsNames(dds)
 clist <- c("Urine_vs_MHB")

 for (i in clist) {
   contrast = paste("condition", i, sep="_")
   res = results(dds, name=contrast)
   res <- res[!is.na(res$log2FoldChange),]
   res_df <- as.data.frame(res)

   write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
   up <- subset(res_df, padj<=0.05 & log2FoldChange>=1.35)
   down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1.35)
   write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
   write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
 }

 # -- Under host-env --
 grep -P "\tgene\t" CP059040.gff > CP059040_gene.gff
 python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff Urine_vs_MHB-all.txt Urine_vs_MHB-all.csv
 python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff Urine_vs_MHB-up.txt Urine_vs_MHB-up.csv
 python3 ~/Scripts/replace_gene_names.py /home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/CP059040_gene.gff Urine_vs_MHB-down.txt Urine_vs_MHB-down.csv

 res <- read.csv("Urine_vs_MHB-all.csv")
 # Replace empty GeneName with modified GeneID
 res$GeneName <- ifelse(
   res$GeneName == "" | is.na(res$GeneName),
   gsub("gene-", "", res$GeneID),
   res$GeneName
 )
 duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

 res <- res %>%
   group_by(GeneName) %>%
   slice_min(padj, with_ties = FALSE) %>%
   ungroup()
 res <- as.data.frame(res)
 # Sort res first by padj (ascending) and then by log2FoldChange (descending)
 res <- res[order(res$padj, -res$log2FoldChange), ]

 # Assuming res is your dataframe and already processed
 # Filter up-regulated genes: log2FoldChange > 2 and padj < 1e-2
 up_regulated <- res[res$log2FoldChange > 2 & res$padj < 1e-2, ]
 # Filter down-regulated genes: log2FoldChange < -2 and padj < 1e-2
 down_regulated <- res[res$log2FoldChange < -2 & res$padj < 1e-2, ]
 # Create a new workbook
 wb <- createWorkbook()
 # Add the complete dataset as the first sheet
 addWorksheet(wb, "Complete_Data")
 writeData(wb, "Complete_Data", res)
 # Add the up-regulated genes as the second sheet
 addWorksheet(wb, "Up_Regulated")
 writeData(wb, "Up_Regulated", up_regulated)
 # Add the down-regulated genes as the third sheet
 addWorksheet(wb, "Down_Regulated")
 writeData(wb, "Down_Regulated", down_regulated)
 # Save the workbook to a file
 saveWorkbook(wb, "Gene_Expression_Urine_vs_MHB.xlsx", overwrite = TRUE)

 # Set the 'GeneName' column as row.names
 rownames(res) <- res$GeneName
 # Drop the 'GeneName' column since it's now the row names
 res$GeneName <- NULL
 head(res)

 ## Ensure the data frame matches the expected format
 ## For example, it should have columns: log2FoldChange, padj, etc.
 #res <- as.data.frame(res)
 ## Remove rows with NA in log2FoldChange (if needed)
 #res <- res[!is.na(res$log2FoldChange),]

 # Replace padj = 0 with a small value
 res$padj[res$padj == 0] <- 1e-305

 #library(EnhancedVolcano)
 # Assuming res is already sorted and processed
 png("Urine_vs_MHB.png", width=1200, height=2000)
 #max.overlaps = 10
 EnhancedVolcano(res,
                 lab = rownames(res),
                 x = 'log2FoldChange',
                 y = 'padj',
                 pCutoff = 1e-2,
                 FCcutoff = 2,
                 title = '',
                 subtitleLabSize = 18,
                 pointSize = 3.0,
                 labSize = 5.0,
                 colAlpha = 1,
                 legendIconSize = 4.0,
                 drawConnectors = TRUE,
                 widthConnectors = 0.5,
                 colConnectors = 'black',
                 subtitle = expression("Urine versus MHB"))
 dev.off()

KEGG and GO annotations in non-model organisms

https://www.biobam.com/functional-analysis/

Assign KEGG and GO Terms (see diagram above)

Since your organism is non-model, standard R databases (org.Hs.eg.db, etc.) won’t work. You’ll need to manually retrieve KEGG and GO annotations.
- Preparing file 1 eggnog_out.emapper.annotations.txt for the R-code below: (KEGG Terms): EggNog based on orthology and phylogenies
  
  EggNOG-mapper assigns both KEGG Orthology (KO) IDs and GO terms.
  
  Install EggNOG-mapper:
```
  mamba create -n eggnog_env python=3.8 eggnog-mapper -c conda-forge -c bioconda  #eggnog-mapper_2.1.12
  mamba activate eggnog_env
```
  Run annotation:
```
  #diamond makedb --in eggnog6.prots.faa -d eggnog_proteins.dmnd
  mkdir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
  download_eggnog_data.py --dbname eggnog.db -y --data_dir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
  #NOT_WORKING: emapper.py -i CP059040_gene.fasta -o eggnog_dmnd_out --cpu 60 -m diamond[hmmer,mmseqs] --dmnd_db /home/jhuang/REFs/eggnog_data/data/eggnog_proteins.dmnd
  python ~/Scripts/update_fasta_header.py CP059040_protein_.fasta CP059040_protein.fasta
  emapper.py -i CP059040_protein.fasta -o eggnog_out --cpu 60 --resume
  #----> result annotations.tsv: Contains KEGG, GO, and other functional annotations.
  #---->  470.IX87_14445:
      * 470 likely refers to the organism or strain (e.g., Acinetobacter baumannii ATCC 19606 or another related strain).
      * IX87_14445 would refer to a specific gene or protein within that genome.
```
  Extract KEGG KO IDs from annotations.emapper.annotations.
- Preparing file 2 blast2goannot.annot2 for the R-code below:
  - Basic (GO Terms from ‘Blast2GO 5 Basic’, saved in blast2go_annot.annot): Using Blast/Diamond + Blast2GO_GUI based on sequence alignment + GO mapping
    - ‘Load protein sequences’ (Tags: NONE, generated columns: Nr, SeqName) –>
    - Buttons ‘blast’ (Tags: BLASTED, generated columns: Description, Length, #Hits, e-Value, sim mean),
    - Button ‘mapping’ (Tags: MAPPED, generated columns: #GO, GO IDs, GO Names), “Mapping finished – Please proceed now to annotation.”
    - Button ‘annot’ (Tags: ANNOTATED, generated columns: Enzyme Codes, Enzyme Names), “Annotation finished.”
      - Used parameter ‘Annotation CutOff’: The Blast2GO Annotation Rule seeks to find the most specific GO annotations with a certain level of reliability. An annotation score is calculated for each candidate GO which is composed by the sequence similarity of the Blast Hit, the evidence code of the source GO and the position of the particular GO in the Gene Ontology hierarchy. This annotation score cutoff select the most specific GO term for a given GO branch which lies above this value.
      - Used parameter ‘GO Weight’ is a value which is added to Annotation Score of a more general/abstract Gene Ontology term for each of its more specific, original source GO terms. In this case, more general GO terms which summarise many original source terms (those ones directly associated to the Blast Hits) will have a higher Annotation Score.
  - Advanced (GO Terms from ‘Blast2GO 5 Basic’): Interpro based protein families / domains –> Button interpro
    - Button ‘interpro’ (Tags: INTERPRO, generated columns: InterPro IDs, InterPro GO IDs, InterPro GO Names) –> “InterProScan Finished – You can now merge the obtained GO Annotations.”
  - MERGE the results of InterPro GO IDs (advanced) to GO IDs (basic) and generate final GO IDs, saved in blast2go_annot.annot2
    - Button ‘interpro’/’Merge InterProScan GOs to Annotation’ –> “Merge (add and validate) all GO terms retrieved via InterProScan to the already existing GO annotation.” –> “Finished merging GO terms from InterPro with annotations. Maybe you want to run ANNEX (Annotation Augmentation).”
    - (NOT_USED) Button ‘annot’/’ANNEX’ –> “ANNEX finished. Maybe you want to do the next step: Enzyme Code Mapping.”
  - PREPARING go_terms and ecterms: annot* file:
    
    cut -f1-2 -d$’\t’ blast2go_annot.annot2 > blast2goannot.annot2

Perform KEGG and GO Enrichment in R

     #BiocManager::install("GO.db")
     #BiocManager::install("AnnotationDbi")

     # Load required libraries
     library(openxlsx)  # For Excel file handling
     library(dplyr)     # For data manipulation
     library(tidyr)
     library(stringr)
     library(clusterProfiler)  # For KEGG and GO enrichment analysis
     #library(org.Hs.eg.db)  # Replace with appropriate organism database
     library(GO.db)
     library(AnnotationDbi)

     setwd("~/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/results/star_salmon/degenes")
     # Step 1: Load the blast2go annotation file with a check for missing columns
     annot_df <- read.table("/home/jhuang/b2gWorkspace_Tam_RNAseq_2024/blast2go_annot.annot2_",
                         header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)

     # If the structure is inconsistent, we can make sure there are exactly 3 columns:
     colnames(annot_df) <- c("GeneID", "Term")
     # Step 2: Filter and aggregate GO and EC terms as before
     go_terms <- annot_df %>%
     filter(grepl("^GO:", Term)) %>%
     group_by(GeneID) %>%
     summarize(GOs = paste(Term, collapse = ","), .groups = "drop")
     ec_terms <- annot_df %>%
     filter(grepl("^EC:", Term)) %>%
     group_by(GeneID) %>%
     summarize(EC = paste(Term, collapse = ","), .groups = "drop")

     # Load the results
     res <- read.csv("Urine_vs_MHB-all.csv")   #up259, down138

     # Replace empty GeneName with modified GeneID
     res$GeneName <- ifelse(
         res$GeneName == "" | is.na(res$GeneName),
         gsub("gene-", "", res$GeneID),
         res$GeneName
     )

     # Remove duplicated genes by selecting the gene with the smallest padj
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     res <- res %>%
     group_by(GeneName) %>%
     slice_min(padj, with_ties = FALSE) %>%
     ungroup()

     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]
     # Read eggnog annotations
     eggnog_data <- read.delim("~/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/eggnog_out.emapper.annotations.txt", header = TRUE, sep = "\t")
     # Remove the "gene-" prefix from GeneID in res to match eggnog 'query' format
     res$GeneID <- gsub("gene-", "", res$GeneID)
     # Merge eggnog data with res based on GeneID
     res <- res %>%
     left_join(eggnog_data, by = c("GeneID" = "query"))

     # DEBUG: NOT_NECESSARY, since res has already GeneName
     ##Convert row names to a new column 'GeneName' in res
     #res_with_geneName <- res %>%
     #mutate(GeneName = rownames(res)) %>%
     #as.data.frame()  # Ensure that it's a regular data frame without row names
     ## View the result
     #head(res_with_geneName)

     # Merge with the res dataframe
     # Perform the left joins and rename columns
     res_updated <- res %>%
     left_join(go_terms, by = "GeneID") %>%
     left_join(ec_terms, by = "GeneID") %>% dplyr::select(-EC.x, -GOs.x) %>% dplyr::rename(EC = EC.y, GOs = GOs.y)

     # DEBUG: NOT_NECESSARY, since 'GeneName' is already the first column.
     ## Reorder columns to move 'GeneName' as the first column in res_updated
     #res_updated <- res_updated %>%
     #select(GeneName, everything())

     ## Count the number of rows in the KEGG_ko, GOs, EC columns that have non-missing values
     #num_non_missing_KEGG_ko <- sum(res_updated$KEGG_ko != "-" & !is.na(res_updated$KEGG_ko))
     #print(num_non_missing_KEGG_ko)
     ##[1] 2030
     #num_non_missing_GOs <- sum(res_updated$GOs != "-" & !is.na(res_updated$GOs))
     #print(num_non_missing_GOs)
     ##[1] 2865 --> 2875
     #num_non_missing_EC <- sum(res_updated$EC != "-" & !is.na(res_updated$EC))
     #print(num_non_missing_EC)
     ##[1] 1701

     # Filter up-regulated genes
     up_regulated <- res_updated[res_updated$log2FoldChange > 2 & res_updated$padj < 0.01, ]
     # Filter down-regulated genes
     down_regulated <- res_updated[res_updated$log2FoldChange < -2 & res_updated$padj < 0.01, ]

     # Create a new workbook
     wb <- createWorkbook()
     # Add the complete dataset as the first sheet (with annotations)
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res_updated)
     # Add the up-regulated genes as the second sheet (with annotations)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     # Add the down-regulated genes as the third sheet (with annotations)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     # Save the workbook to a file
     saveWorkbook(wb, "Gene_Expression_with_Annotations_Urine_vs_MHB.xlsx", overwrite = TRUE)

     # Set GeneName as row names after the join
     rownames(res_updated) <- res_updated$GeneName
     res_updated <- res_updated %>% dplyr::select(-GeneName)
     ## Set the 'GeneName' column as row.names
     #rownames(res_updated) <- res_updated$GeneName
     ## Drop the 'GeneName' column since it's now the row names
     #res_updated$GeneName <- NULL

     # ---- Perform KEGG enrichment analysis (up_regulated) ----
     gene_list_kegg_up <- up_regulated$KEGG_ko
     gene_list_kegg_up <- gsub("ko:", "", gene_list_kegg_up)
     kegg_enrichment_up <- enrichKEGG(gene = gene_list_kegg_up, organism = 'ko')
     # -- convert the GeneID (Kxxxxxx) to the true GeneID --
     # Step 0: Create KEGG to GeneID mapping
     kegg_to_geneid_up <- up_regulated %>%
     dplyr::select(KEGG_ko, GeneID) %>%
     filter(!is.na(KEGG_ko)) %>%  # Remove missing KEGG KO entries
     mutate(KEGG_ko = str_remove(KEGG_ko, "ko:"))  # Remove 'ko:' prefix if present
     # Step 1: Clean KEGG_ko values (separate multiple KEGG IDs)
     kegg_to_geneid_clean <- kegg_to_geneid_up %>%
     mutate(KEGG_ko = str_remove_all(KEGG_ko, "ko:")) %>%  # Remove 'ko:' prefixes
     separate_rows(KEGG_ko, sep = ",") %>%  # Ensure each KEGG ID is on its own row
     filter(KEGG_ko != "-") %>%  # Remove invalid KEGG IDs ("-")
     distinct()  # Remove any duplicate mappings
     # Step 2.1: Expand geneID column in kegg_enrichment_up
     expanded_kegg <- kegg_enrichment_up %>%
     as.data.frame() %>%
     separate_rows(geneID, sep = "/") %>%  # Split multiple KEGG IDs (Kxxxxx)
     left_join(kegg_to_geneid_clean, by = c("geneID" = "KEGG_ko"), relationship = "many-to-many") %>%  # Explicitly handle many-to-many
     distinct() %>%  # Remove duplicate matches
     group_by(ID) %>%
     summarise(across(everything(), ~ paste(unique(na.omit(.)), collapse = "/")), .groups = "drop")  # Re-collapse results
     #dplyr::glimpse(expanded_kegg)
     # Step 3.1: Replace geneID column in the original dataframe
     kegg_enrichment_up_df <- as.data.frame(kegg_enrichment_up)
     # Remove old geneID column and merge new one
     kegg_enrichment_up_df <- kegg_enrichment_up_df %>%
     dplyr::select(-geneID) %>%  # Remove old geneID column
     left_join(expanded_kegg %>% dplyr::select(ID, GeneID), by = "ID") %>%  # Merge new GeneID column
     dplyr::rename(geneID = GeneID)  # Rename column back to geneID

     # ---- Perform KEGG enrichment analysis (down_regulated) ----
     # Step 1: Extract KEGG KO terms from down-regulated genes
     gene_list_kegg_down <- down_regulated$KEGG_ko
     gene_list_kegg_down <- gsub("ko:", "", gene_list_kegg_down)
     # Step 2: Perform KEGG enrichment analysis
     kegg_enrichment_down <- enrichKEGG(gene = gene_list_kegg_down, organism = 'ko')
     # --- Convert KEGG gene IDs (Kxxxxxx) to actual GeneIDs ---
     # Step 3: Create KEGG to GeneID mapping from down_regulated dataset
     kegg_to_geneid_down <- down_regulated %>%
     dplyr::select(KEGG_ko, GeneID) %>%
     filter(!is.na(KEGG_ko)) %>%  # Remove missing KEGG KO entries
     mutate(KEGG_ko = str_remove(KEGG_ko, "ko:"))  # Remove 'ko:' prefix if present
     # Step 4: Clean KEGG_ko values (handle multiple KEGG IDs)
     kegg_to_geneid_down_clean <- kegg_to_geneid_down %>%
     mutate(KEGG_ko = str_remove_all(KEGG_ko, "ko:")) %>%  # Remove 'ko:' prefixes
     separate_rows(KEGG_ko, sep = ",") %>%  # Ensure each KEGG ID is on its own row
     filter(KEGG_ko != "-") %>%  # Remove invalid KEGG IDs ("-")
     distinct()  # Remove duplicate mappings
     # Step 5: Expand geneID column in kegg_enrichment_down
     expanded_kegg_down <- kegg_enrichment_down %>%
     as.data.frame() %>%
     separate_rows(geneID, sep = "/") %>%  # Split multiple KEGG IDs (Kxxxxx)
     left_join(kegg_to_geneid_down_clean, by = c("geneID" = "KEGG_ko"), relationship = "many-to-many") %>%  # Handle many-to-many mappings
     distinct() %>%  # Remove duplicate matches
     group_by(ID) %>%
     summarise(across(everything(), ~ paste(unique(na.omit(.)), collapse = "/")), .groups = "drop")  # Re-collapse results
     # Step 6: Replace geneID column in the original kegg_enrichment_down dataframe
     kegg_enrichment_down_df <- as.data.frame(kegg_enrichment_down) %>%
     dplyr::select(-geneID) %>%  # Remove old geneID column
     left_join(expanded_kegg_down %>% dplyr::select(ID, GeneID), by = "ID") %>%  # Merge new GeneID column
     dplyr::rename(geneID = GeneID)  # Rename column back to geneID
     # View the updated dataframe
     head(kegg_enrichment_down_df)

     # Create a new workbook
     wb <- createWorkbook()
     # Save enrichment results to the workbook
     addWorksheet(wb, "KEGG_Enrichment_Up")
     writeData(wb, "KEGG_Enrichment_Up", as.data.frame(kegg_enrichment_up_df))
     # Save enrichment results to the workbook
     addWorksheet(wb, "KEGG_Enrichment_Down")
     writeData(wb, "KEGG_Enrichment_Down", as.data.frame(kegg_enrichment_down_df))
     saveWorkbook(wb, "KEGG_Enrichment.xlsx", overwrite = TRUE)

     # ---- Perform GO enrichment analysis (TODO: extract the merged GO IDs from 'Blast2GO 5 Basic' and adapt the code below!)----

     # Define gene list (up-regulated genes)
     gene_list_go_up <- up_regulated$GeneID  # Extract the 149 up-regulated genes
     gene_list_go_down <- down_regulated$GeneID  # Extract the 65 down-regulated genes

     # Define background gene set (all genes in res)
     background_genes <- res_updated$GeneID  # Extract the 3646 background genes

     # Prepare GO annotation data from res
     go_annotation <- res_updated[, c("GOs","GeneID")]  # Extract relevant columns
     go_annotation <- go_annotation %>%
     tidyr::separate_rows(GOs, sep = ",")  # Split multiple GO terms into separate rows

     # Perform GO enrichment analysis, where pAdjustMethod is one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"
     go_enrichment_up <- enricher(
         gene = gene_list_go_up,                # Up-regulated genes
         TERM2GENE = go_annotation,       # Custom GO annotation
         pvalueCutoff = 1.0,             # Significance threshold
         pAdjustMethod = "BH",
         universe = background_genes      # Define the background gene set
     )
     go_enrichment_up <- as.data.frame(go_enrichment_up)

     go_enrichment_down <- enricher(
         gene = gene_list_go_down,                # Up-regulated genes
         TERM2GENE = go_annotation,       # Custom GO annotation
         pvalueCutoff = 1.0,             # Significance threshold
         pAdjustMethod = "BH",
         universe = background_genes      # Define the background gene set
     )
     go_enrichment_down <- as.data.frame(go_enrichment_down)

     ## Remove the 'p.adjust' column since no adjusted methods have been applied!
     #go_enrichment_up <- go_enrichment_up[, !names(go_enrichment_up) %in% "p.adjust"]
     # Update the Description column with the term descriptions
     go_enrichment_up$Description <- sapply(go_enrichment_up$ID, function(go_id) {
     # Using select to get the term description
     term <- tryCatch({
         AnnotationDbi::select(GO.db, keys = go_id, columns = "TERM", keytype = "GOID")
     }, error = function(e) {
         message(paste("Error for GO term:", go_id))  # Print which GO ID caused the error
         return(data.frame(TERM = NA))  # In case of error, return NA
     })

     if (nrow(term) > 0) {
         return(term$TERM)
     } else {
         return(NA)  # If no description found, return NA
     }
     })
     ## Print the updated data frame
     #print(go_enrichment_up)

     ## Remove the 'p.adjust' column since no adjusted methods have been applied!
     #go_enrichment_down <- go_enrichment_down[, !names(go_enrichment_down) %in% "p.adjust"]
     # Update the Description column with the term descriptions
     go_enrichment_down$Description <- sapply(go_enrichment_down$ID, function(go_id) {
     # Using select to get the term description
     term <- tryCatch({
         AnnotationDbi::select(GO.db, keys = go_id, columns = "TERM", keytype = "GOID")
     }, error = function(e) {
         message(paste("Error for GO term:", go_id))  # Print which GO ID caused the error
         return(data.frame(TERM = NA))  # In case of error, return NA
     })

     if (nrow(term) > 0) {
         return(term$TERM)
     } else {
         return(NA)  # If no description found, return NA
     }
     })

     addWorksheet(wb, "GO_Enrichment_Up")
     writeData(wb, "GO_Enrichment_Up", as.data.frame(go_enrichment_up))

     addWorksheet(wb, "GO_Enrichment_Down")
     writeData(wb, "GO_Enrichment_Down", as.data.frame(go_enrichment_down))

     # Save the workbook with enrichment results
     saveWorkbook(wb, "KEGG_and_GO_Enrichments_Urine_vs_MHB.xlsx", overwrite = TRUE)

     #Error for GO term: GO:0006807: replace GO:0006807  obsolete nitrogen compound metabolic process
     #TODO: marked the color as yellow if the p.adjusted <= 0.05 in GO_enrichment!

Finalizing the KEGG and GO Enrichment table

     1. NOTE: geneIDs in KEGG_Enrichment have been already translated from ko to geneID in H0N29_*-format;
     2. NEED_MANUAL_DELETION: p.adjust values have been calculated, we have to filter all records in GO_Enrichment-results by |p.adjust|<=0.05.

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Processing Data_Tam_RNAseq_2024_MHB_vs_Urine_ATCC19606

Leave a Reply Cancel reply