Daily Archives: 2026年4月10日

RNAseq using Rscript (Data_Tam_RNAseq_2026_on_AYE)

04_PCA_CombinedGroups

complete_deg_pipeline.R

  1. Preparing raw data

     mkdir raw_data; cd raw_data
    
     # control samples (8)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1/1_1.fq.gz AYE-WT_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1/1_2.fq.gz AYE-WT_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2/2_1.fq.gz AYE-WT_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2/2_2.fq.gz AYE-WT_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3/3_1.fq.gz AYE-WT_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3/3_2.fq.gz AYE-WT_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4/4_1.fq.gz AYE-T_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4/4_2.fq.gz AYE-T_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5/5_1.fq.gz AYE-T_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5/5_2.fq.gz AYE-T_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6/6_1.fq.gz AYE-T_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6/6_2.fq.gz AYE-T_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/7/7_1.fq.gz AYE-O_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/7/7_2.fq.gz AYE-O_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/8/8_1.fq.gz AYE-O_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/8/8_2.fq.gz AYE-O_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/9/9_1.fq.gz AYE-O_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/9/9_2.fq.gz AYE-O_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/10/10_1.fq.gz O-Trans_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/10/10_2.fq.gz O-Trans_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/11/11_1.fq.gz O-Trans_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/11/11_2.fq.gz O-Trans_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/12/12_1.fq.gz O-Trans_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/12/12_2.fq.gz O-Trans_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1new/1new_1.fq.gz WT-Trans_ctr_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/1new/1new_2.fq.gz WT-Trans_ctr_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2new/2new_1.fq.gz WT-Trans_ctr_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/2new/2new_2.fq.gz WT-Trans_ctr_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3new/3new_1.fq.gz WT-Trans_ctr_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/3new/3new_2.fq.gz WT-Trans_ctr_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/49/49_1.fq.gz AYE-WT_ctr_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/49/49_2.fq.gz AYE-WT_ctr_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/50/50_1.fq.gz AYE-WT_ctr_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/50/50_2.fq.gz AYE-WT_ctr_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/51/51_1.fq.gz AYE-WT_ctr_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/51/51_2.fq.gz AYE-WT_ctr_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/52/52_1.fq.gz AYE-O_ctr_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/52/52_2.fq.gz AYE-O_ctr_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/53/53_1.fq.gz AYE-O_ctr_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/53/53_2.fq.gz AYE-O_ctr_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/54/54_1.fq.gz AYE-O_ctr_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/54/54_2.fq.gz AYE-O_ctr_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/55/55_1.fq.gz AYE-T_ctr_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/55/55_2.fq.gz AYE-T_ctr_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/56/56_1.fq.gz AYE-T_ctr_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/56/56_2.fq.gz AYE-T_ctr_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/57/57_1.fq.gz AYE-T_ctr_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/57/57_2.fq.gz AYE-T_ctr_solid_r3_R2.fastq.gz
    
     # Diclofenac(双氯芬酸)treatment (6)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/25/25_1.fq.gz AYE-WT_Diclo750_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/25/25_2.fq.gz AYE-WT_Diclo750_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/26/26_1.fq.gz AYE-WT_Diclo750_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/26/26_2.fq.gz AYE-WT_Diclo750_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/27/27_1.fq.gz AYE-WT_Diclo750_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/27/27_2.fq.gz AYE-WT_Diclo750_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/28/28_1.fq.gz AYE-T_Diclo375_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/28/28_2.fq.gz AYE-T_Diclo375_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/29/29_1.fq.gz AYE-T_Diclo375_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/29/29_2.fq.gz AYE-T_Diclo375_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/30/30_1.fq.gz AYE-T_Diclo375_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/30/30_2.fq.gz AYE-T_Diclo375_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/31/31_1.fq.gz AYE-O_Diclo375_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/31/31_2.fq.gz AYE-O_Diclo375_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/32/32_1.fq.gz AYE-O_Diclo375_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/32/32_2.fq.gz AYE-O_Diclo375_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/33/33_1.fq.gz AYE-O_Diclo375_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/33/33_2.fq.gz AYE-O_Diclo375_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/34/34_1.fq.gz O-Trans_Diclo375_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/34/34_2.fq.gz O-Trans_Diclo375_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/35/35_1.fq.gz O-Trans_Diclo375_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/35/35_2.fq.gz O-Trans_Diclo375_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/36/36_1.fq.gz O-Trans_Diclo375_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/36/36_2.fq.gz O-Trans_Diclo375_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4new/4new_1.fq.gz WT-Trans_Diclo750_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/4new/4new_2.fq.gz WT-Trans_Diclo750_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5new/5new_1.fq.gz WT-Trans_Diclo750_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/5new/5new_2.fq.gz WT-Trans_Diclo750_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6new/6new_1.fq.gz WT-Trans_Diclo750_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/6new/6new_2.fq.gz WT-Trans_Diclo750_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/73/73_1.fq.gz AYE-WT_Diclo1250_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/73/73_2.fq.gz AYE-WT_Diclo1250_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/74/74_1.fq.gz AYE-WT_Diclo1250_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/74/74_2.fq.gz AYE-WT_Diclo1250_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/75/75_1.fq.gz AYE-WT_Diclo1250_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/75/75_2.fq.gz AYE-WT_Diclo1250_solid_r3_R2.fastq.gz
    
     # Rifampicin(利福平)treatment (4)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/13/13_1.fq.gz AYE-WT_Rifampicin1.5_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/13/13_2.fq.gz AYE-WT_Rifampicin1.5_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/14/14_1.fq.gz AYE-WT_Rifampicin1.5_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/14/14_2.fq.gz AYE-WT_Rifampicin1.5_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/15/15_1.fq.gz AYE-WT_Rifampicin1.5_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/15/15_2.fq.gz AYE-WT_Rifampicin1.5_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/16/16_1.fq.gz AYE-T_Rifampicin2_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/16/16_2.fq.gz AYE-T_Rifampicin2_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/17/17_1.fq.gz AYE-T_Rifampicin2_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/17/17_2.fq.gz AYE-T_Rifampicin2_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/18/18_1.fq.gz AYE-T_Rifampicin2_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/18/18_2.fq.gz AYE-T_Rifampicin2_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/19/19_1.fq.gz AYE-O_Rifampicin2_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/19/19_2.fq.gz AYE-O_Rifampicin2_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/20/20_1.fq.gz AYE-O_Rifampicin2_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/20/20_2.fq.gz AYE-O_Rifampicin2_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/21/21_1.fq.gz AYE-O_Rifampicin2_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/21/21_2.fq.gz AYE-O_Rifampicin2_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/22/22_1.fq.gz O-Trans_Rifampicin2_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/22/22_2.fq.gz O-Trans_Rifampicin2_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/23/23_1.fq.gz O-Trans_Rifampicin2_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/23/23_2.fq.gz O-Trans_Rifampicin2_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/24/24_1.fq.gz O-Trans_Rifampicin2_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/24/24_2.fq.gz O-Trans_Rifampicin2_r3_R2.fastq.gz
    
     # Meropenem(美罗培南)treatment (4)
     ln -s ../X101SC26025981-Z01-J001/01.RawData/37/37_1.fq.gz AYE-WT_Mero0.35-0.5_r1_R1.fastq.gz  #AYE-WT_Mero0.5_r1
     ln -s ../X101SC26025981-Z01-J001/01.RawData/37/37_2.fq.gz AYE-WT_Mero0.35-0.5_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/38/38_1.fq.gz AYE-WT_Mero0.35-0.5_r2_R1.fastq.gz  #AYE-WT_YX_Mero0.35_r2
     ln -s ../X101SC26025981-Z01-J001/01.RawData/38/38_2.fq.gz AYE-WT_Mero0.35-0.5_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/39/39_1.fq.gz AYE-WT_Mero0.35-0.5_r3_R1.fastq.gz  #AYE-WT_public_Mero0.35_r3
     ln -s ../X101SC26025981-Z01-J001/01.RawData/39/39_2.fq.gz AYE-WT_Mero0.35-0.5_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/40/40_1.fq.gz AYE-T_Mero0.15_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/40/40_2.fq.gz AYE-T_Mero0.15_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/41/41_1.fq.gz AYE-T_Mero0.15_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/41/41_2.fq.gz AYE-T_Mero0.15_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/42/42_1.fq.gz AYE-T_Mero0.15_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/42/42_2.fq.gz AYE-T_Mero0.15_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/43/43_1.fq.gz AYE-O_Mero0.5_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/43/43_2.fq.gz AYE-O_Mero0.5_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/44/44_1.fq.gz AYE-O_Mero0.5_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/44/44_2.fq.gz AYE-O_Mero0.5_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/45/45_1.fq.gz AYE-O_Mero0.5_r3_R1.fastq.gz  #Mero0.45
     ln -s ../X101SC26025981-Z01-J001/01.RawData/45/45_2.fq.gz AYE-O_Mero0.5_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/46/46_1.fq.gz O-Trans_Mero0.25_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/46/46_2.fq.gz O-Trans_Mero0.25_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/47/47_1.fq.gz O-Trans_Mero0.25_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/47/47_2.fq.gz O-Trans_Mero0.25_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/48/48_1.fq.gz O-Trans_Mero0.25_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/48/48_2.fq.gz O-Trans_Mero0.25_r3_R2.fastq.gz
    
     # Azithromycin(阿奇霉素)treatment (5), among them, F_ctr_solid is clinical isolate.
     ln -s ../X101SC26025981-Z01-J001/01.RawData/58/58_1.fq.gz F_ctr_solid_r1_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/58/58_2.fq.gz F_ctr_solid_r1_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/59/59_1.fq.gz F_ctr_solid_r2_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/59/59_2.fq.gz F_ctr_solid_r2_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/60/60_1.fq.gz F_ctr_solid_r3_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/60/60_2.fq.gz F_ctr_solid_r3_R2.fastq.gz  #clinical
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/61/61_1.fq.gz AYE-WT_Azi20_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/61/61_2.fq.gz AYE-WT_Azi20_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/62/62_1.fq.gz AYE-WT_Azi20_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/62/62_2.fq.gz AYE-WT_Azi20_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/63/63_1.fq.gz AYE-WT_Azi20_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/63/63_2.fq.gz AYE-WT_Azi20_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/67/67_1.fq.gz AYE-T_Azi20_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/67/67_2.fq.gz AYE-T_Azi20_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/68/68_1.fq.gz AYE-T_Azi20_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/68/68_2.fq.gz AYE-T_Azi20_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/69/69_1.fq.gz AYE-T_Azi20_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/69/69_2.fq.gz AYE-T_Azi20_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/64/64_1.fq.gz AYE-O_Azi20_solid_r1_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/64/64_2.fq.gz AYE-O_Azi20_solid_r1_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/65/65_1.fq.gz AYE-O_Azi20_solid_r2_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/65/65_2.fq.gz AYE-O_Azi20_solid_r2_R2.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/66/66_1.fq.gz AYE-O_Azi20_solid_r3_R1.fastq.gz
     ln -s ../X101SC26025981-Z01-J001/01.RawData/66/66_2.fq.gz AYE-O_Azi20_solid_r3_R2.fastq.gz
    
     ln -s ../X101SC26025981-Z01-J001/01.RawData/70/70_1.fq.gz F_Azi20_solid_r1_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/70/70_2.fq.gz F_Azi20_solid_r1_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/71/71_1.fq.gz F_Azi20_solid_r2_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/71/71_2.fq.gz F_Azi20_solid_r2_R2.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/72/72_1.fq.gz F_Azi20_solid_r3_R1.fastq.gz  #clinical
     ln -s ../X101SC26025981-Z01-J001/01.RawData/72/72_2.fq.gz F_Azi20_solid_r3_R2.fastq.gz  #clinical
  2. Preparing the directory trimmed

     mkdir trimmed trimmed_unpaired;
     for sample_id in AYE-O_Azi20_solid_r1 AYE-O_Azi20_solid_r2 AYE-O_Azi20_solid_r3 AYE-O_ctr_r1 AYE-O_ctr_r2 AYE-O_ctr_r3 AYE-O_ctr_solid_r1 AYE-O_ctr_solid_r2 AYE-O_ctr_solid_r3 AYE-O_Diclo375_r1 AYE-O_Diclo375_r2 AYE-O_Diclo375_r3 AYE-O_Mero0.5_r1 AYE-O_Mero0.5_r2 AYE-O_Mero0.5_r3 AYE-O_Rifampicin2_r1 AYE-O_Rifampicin2_r2 AYE-O_Rifampicin2_r3 AYE-T_Azi20_solid_r1 AYE-T_Azi20_solid_r2 AYE-T_Azi20_solid_r3 AYE-T_ctr_r1 AYE-T_ctr_r2 AYE-T_ctr_r3 AYE-T_ctr_solid_r1 AYE-T_ctr_solid_r2 AYE-T_ctr_solid_r3 AYE-T_Diclo375_r1 AYE-T_Diclo375_r2 AYE-T_Diclo375_r3 AYE-T_Mero0.15_r1 AYE-T_Mero0.15_r2 AYE-T_Mero0.15_r3 AYE-T_Rifampicin2_r1 AYE-T_Rifampicin2_r2 AYE-T_Rifampicin2_r3 AYE-WT_Azi20_solid_r1 AYE-WT_Azi20_solid_r2 AYE-WT_Azi20_solid_r3 AYE-WT_ctr_r1 AYE-WT_ctr_r2 AYE-WT_ctr_r3 AYE-WT_ctr_solid_r1 AYE-WT_ctr_solid_r2 AYE-WT_ctr_solid_r3 AYE-WT_Diclo1250_solid_r1 AYE-WT_Diclo1250_solid_r2 AYE-WT_Diclo1250_solid_r3 AYE-WT_Diclo750_r1 AYE-WT_Diclo750_r2 AYE-WT_Diclo750_r3 AYE-WT_Mero0.35-0.5_r1 AYE-WT_Mero0.35-0.5_r2 AYE-WT_Mero0.35-0.5_r3 AYE-WT_Rifampicin1.5_r1 AYE-WT_Rifampicin1.5_r2 AYE-WT_Rifampicin1.5_r3 F_Azi20_solid_r1 F_Azi20_solid_r2 F_Azi20_solid_r3 F_ctr_solid_r1 F_ctr_solid_r2 F_ctr_solid_r3 O-Trans_ctr_r1 O-Trans_ctr_r2 O-Trans_ctr_r3 O-Trans_Diclo375_r1 O-Trans_Diclo375_r2 O-Trans_Diclo375_r3 O-Trans_Mero0.25_r1 O-Trans_Mero0.25_r2 O-Trans_Mero0.25_r3 O-Trans_Rifampicin2_r1 O-Trans_Rifampicin2_r2 O-Trans_Rifampicin2_r3 WT-Trans_ctr_r1 WT-Trans_ctr_r2 WT-Trans_ctr_r3 WT-Trans_Diclo750_r1 WT-Trans_Diclo750_r2 WT-Trans_Diclo750_r3; do \
     for sample_id in AYE-T_Diclo375_r2; do \
             java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fastq.gz raw_data/${sample_id}_R2.fastq.gz trimmed/${sample_id}_R1.fastq.gz trimmed_unpaired/${sample_id}_R1.fastq.gz trimmed/${sample_id}_R2.fastq.gz trimmed_unpaired/${sample_id}_R2.fastq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
     done
  3. (Optional) using trinity to find the most closely reference

     #Trinity --seqType fq --max_memory 50G --left trimmed/wt_r1_R1.fastq.gz  --right trimmed/wt_r1_R2.fastq.gz --CPU 12
    
     #https://www.genome.jp/kegg/tables/br08606.html#prok
     acb     KGB     Acinetobacter baumannii ATCC 17978  2007    GenBank
     abm     KGB     Acinetobacter baumannii SDF     2008    GenBank
     aby     KGB     Acinetobacter baumannii AYE     2008    GenBank --> *
     abc     KGB     Acinetobacter baumannii ACICU   2008    GenBank
     abn     KGB     Acinetobacter baumannii AB0057  2008    GenBank
     abb     KGB     Acinetobacter baumannii AB307-0294  2008    GenBank
     abx     KGB     Acinetobacter baumannii 1656-2  2012    GenBank
     abz     KGB     Acinetobacter baumannii MDR-ZJ06    2012    GenBank
     abr     KGB     Acinetobacter baumannii MDR-TJ  2012    GenBank
     abd     KGB     Acinetobacter baumannii TCDC-AB0715     2012    GenBank
     abh     KGB     Acinetobacter baumannii TYTH-1  2012    GenBank
     abad    KGB     Acinetobacter baumannii D1279779    2013    GenBank
     abj     KGB     Acinetobacter baumannii BJAB07104   2013    GenBank
     abab    KGB     Acinetobacter baumannii BJAB0715    2013    GenBank
     abaj    KGB     Acinetobacter baumannii BJAB0868    2013    GenBank
     abaz    KGB     Acinetobacter baumannii ZW85-1  2013    GenBank
     abk     KGB     Acinetobacter baumannii AbH12O-A2   2014    GenBank
     abau    KGB     Acinetobacter baumannii AB030   2014    GenBank
     abaa    KGB     Acinetobacter baumannii AB031   2014    GenBank
     abw     KGB     Acinetobacter baumannii AC29    2014    GenBank
     abal    KGB     Acinetobacter baumannii LAC-4   2015    GenBank
     #Note that the Acinetobacter baumannii strain ATCC 19606 chromosome, complete genome (GenBank: CU459141.1) was choosen as reference!
  4. Preparing samplesheet.csv

     sample,fastq_1,fastq_2,strandedness
     Urine_r1,Urine_r1_R1.fq.gz,Urine_r1_R2.fq.gz,auto
     ...
  5. Downloading CU459141.fasta and CU459141.gff from GenBank and preparing CU459141_m.gff

     #Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/
     #Default NOT_WORKING: --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'exon'
     #(host_env) !NOT_WORKING! jhuang@WS-2290C:~/DATA/Data_Tam_RNAseq_2024$ /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CU459141.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CU459141.gff"        -profile docker -resume  --max_cpus 55 --max_memory 512.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'
    
     # -- DEBUG_1 (CDS --> exon in CP059040.gff) --
     #Checking the record (see below) in results/genome/CP059040.gtf
     #In ./results/genome/CP059040.gtf e.g. "CP059040.1      Genbank transcript      1       1398    .       +       .       transcript_id "gene-H0N29_00005"; gene_id "gene-H0N29_00005"; gene_name "dnaA"; Name "dnaA"; gbkey "Gene"; gene "dnaA"; gene_biotype "protein_coding"; locus_tag "H0N29_00005";"
     #--featurecounts_feature_type 'transcript' returns only the tRNA results
     #Since the tRNA records have "transcript and exon". In gene records, we have "transcript and CDS". replace the CDS with exon
    
     grep -P "\texon\t" CP059040.gff | sort | wc -l    #96
     grep -P "cmsearch\texon\t" CP059040.gff | wc -l    #=10  ignal recognition particle sRNA small typ, transfer-messenger RNA, 5S ribosomal RNA
     grep -P "Genbank\texon\t" CP059040.gff | wc -l    #=12  16S and 23S ribosomal RNA
     grep -P "tRNAscan-SE\texon\t" CP059040.gff | wc -l    #tRNA 74
     wc -l star_salmon/AUM_r3/quant.genes.sf  #--featurecounts_feature_type 'transcript' results in 96 records!
    
     grep -P "\tCDS\t" CU459141.gff3 | wc -l  #3659
     sed 's/\tCDS\t/\texon\t/g' CU459141.gff3 > CU459141_m.gff
     grep -P "\texon\t" CU459141_m.gff | sort | wc -l  #3760
    
     # -- DEBUG_2: combination of 'CU459141_m.gff' and 'exon' results in ERROR, using 'transcript' instead!
     --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141_m.gff" --featurecounts_feature_type 'transcript'
    
     # -- DEBUG_3: make sure the header of fasta is the same to the *_m.gff file
  6. nextflow run

     # ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ----
     (host_env) mv trimmed/*.fastq.gz .
     (host_env) nextflow run nf-core/rnaseq -r 3.14.0 -profile docker \

    –input samplesheet.csv –outdir results –fasta “/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141.fasta” –gff “/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141_m.gff” -resume –max_cpus 90 –max_memory 900.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –gtf_group_features ‘gene_id’ –gtf_extra_attributes ‘gene_name’ –featurecounts_group_type ‘gene_biotype’ –featurecounts_feature_type ‘transcript’

  7. Import data and pca-plot

     #TODO_AFTERNOON: at least sent today a PCA, am besten all comparison tables!
    
     #mamba activate r_env
    
     #install.packages("ggfun")
     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     #library("org.Hs.eg.db")
     library(dplyr)
     library(tidyverse)
     #install.packages("devtools")
     #devtools::install_version("gtable", version = "0.3.0")
     library(gplots)
     library("RColorBrewer")
     #install.packages("ggrepel")
     library("ggrepel")
     # install.packages("openxlsx")
     library(openxlsx)
     library(EnhancedVolcano)
     library(DESeq2)
     library(edgeR)
    
     setwd("~/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon")
     # Define paths to your Salmon output quantification files
    
     # Store sample names in a character vector
     samples <- c(
         "AYE-O_Azi20_solid_r1", "AYE-O_Azi20_solid_r2", "AYE-O_Azi20_solid_r3", "AYE-O_ctr_r1", "AYE-O_ctr_r2",
         "AYE-O_ctr_r3", "AYE-O_ctr_solid_r1", "AYE-O_ctr_solid_r2", "AYE-O_ctr_solid_r3",
         "AYE-O_Diclo375_r1", "AYE-O_Diclo375_r2", "AYE-O_Diclo375_r3", "AYE-O_Mero0.5_r1",
         "AYE-O_Mero0.5_r2", "AYE-O_Mero0.5_r3", "AYE-O_Rifampicin2_r1", "AYE-O_Rifampicin2_r2",
         "AYE-O_Rifampicin2_r3", "AYE-T_Azi20_solid_r1", "AYE-T_Azi20_solid_r2", "AYE-T_Azi20_solid_r3",
         "AYE-T_ctr_r1", "AYE-T_ctr_r2", "AYE-T_ctr_r3", "AYE-T_ctr_solid_r1", "AYE-T_ctr_solid_r2",
         "AYE-T_ctr_solid_r3", "AYE-T_Diclo375_r1", "AYE-T_Diclo375_r2", "AYE-T_Diclo375_r3",
         "AYE-T_Mero0.15_r1", "AYE-T_Mero0.15_r2", "AYE-T_Mero0.15_r3", "AYE-T_Rifampicin2_r1",
         "AYE-T_Rifampicin2_r2", "AYE-T_Rifampicin2_r3", "AYE-WT_Azi20_solid_r1", "AYE-WT_Azi20_solid_r2",
         "AYE-WT_Azi20_solid_r3", "AYE-WT_ctr_r1", "AYE-WT_ctr_r2", "AYE-WT_ctr_r3", "AYE-WT_ctr_solid_r1",
         "AYE-WT_ctr_solid_r2", "AYE-WT_ctr_solid_r3", "AYE-WT_Diclo1250_solid_r1", "AYE-WT_Diclo1250_solid_r2",
         "AYE-WT_Diclo1250_solid_r3", "AYE-WT_Diclo750_r1", "AYE-WT_Diclo750_r2", "AYE-WT_Diclo750_r3",
         "AYE-WT_Mero0.35-0.5_r1", "AYE-WT_Mero0.35-0.5_r2", "AYE-WT_Mero0.35-0.5_r3", "AYE-WT_Rifampicin1.5_r1",
         "AYE-WT_Rifampicin1.5_r2", "AYE-WT_Rifampicin1.5_r3", "F_Azi20_solid_r1", "F_Azi20_solid_r2",
         "F_Azi20_solid_r3", "F_ctr_solid_r1", "F_ctr_solid_r2", "F_ctr_solid_r3", "O-Trans_ctr_r1",
         "O-Trans_ctr_r2", "O-Trans_ctr_r3", "O-Trans_Diclo375_r1", "O-Trans_Diclo375_r2", "O-Trans_Diclo375_r3",
         "O-Trans_Mero0.25_r1", "O-Trans_Mero0.25_r2", "O-Trans_Mero0.25_r3", "O-Trans_Rifampicin2_r1",
         "O-Trans_Rifampicin2_r2", "O-Trans_Rifampicin2_r3", "WT-Trans_ctr_r1", "WT-Trans_ctr_r2",
         "WT-Trans_ctr_r3", "WT-Trans_Diclo750_r1", "WT-Trans_Diclo750_r2", "WT-Trans_Diclo750_r3"
     )
    
     ## Automatically generate the named vector
     files <- setNames(paste0("./", samples, "/quant.sf"), samples)
    
     # -----------------------------------------------------------------
     # ---- Step 1: Create Detailed Metadata from Your Sample Names ----
    
     # Extract metadata from sample names
     samples <- names(files)
    
     # Parse the complex sample names
     metadata <- data.frame(
     sample = samples,
     stringsAsFactors = FALSE
     )
    
     # Extract strain (everything before first underscore or hyphen treatment)
     metadata$strain <- sapply(strsplit(samples, "[-_]"), function(x) {
     if(x[1] %in% c("AYE", "O", "WT", "F")) {
         if(x[1] == "AYE" && length(x) > 1 && x[2] %in% c("WT", "T", "O")) {
         paste(x[1:2], collapse = "-")
         } else if(x[1] %in% c("O", "WT") && x[2] == "Trans") {
         paste(x[1:2], collapse = "-")
         } else {
         x[1]
         }
     } else {
         x[1]
     }
     })
    
     # Extract treatment type
     metadata$treatment <- sapply(samples, function(x) {
     if(grepl("_ctr", x)) return("ctrl")
     if(grepl("Diclo", x)) return("Diclo")
     if(grepl("Mero", x)) return("Mero")
     if(grepl("Azi", x)) return("Azi")
     if(grepl("Rifampicin", x)) return("Rifampicin")
     return("ctrl")
     })
    
     # Extract concentration
     metadata$concentration <- sapply(samples, function(x) {
     if(grepl("Diclo1250", x)) return("1250")
     if(grepl("Diclo750", x)) return("750")
     if(grepl("Diclo375", x)) return("375")
     if(grepl("Mero0.5", x)) return("0.5")
     if(grepl("Mero0.35", x)) return("0.35")
     if(grepl("Mero0.25", x)) return("0.25")
     if(grepl("Mero0.15", x)) return("0.15")
     if(grepl("Azi20", x)) return("20")
     if(grepl("Rifampicin2", x)) return("2")
     if(grepl("Rifampicin1.5", x)) return("1.5")
     return("0")
     })
    
     # Extract condition (solid vs liquid)
     metadata$condition <- ifelse(grepl("_solid", samples), "solid", "liquid")
    
     # Extract replicate
     metadata$replicate <- sapply(strsplit(samples, "_"), function(x) {
     rep_part <- x[length(x)]
     gsub("r", "", rep_part)
     })
    
     # Create combined group for easy comparisons
     metadata$group <- paste(metadata$strain, metadata$treatment, metadata$concentration, sep = "_")
    
     # Set row names
     rownames(metadata) <- metadata$sample
    
     # Reorder to match txi columns
     metadata <- metadata[colnames(txi$counts), ]
    
     # ---------------------------------------------
     # ---- Step 2: Choose Your Design Strategy ----
    
     # Strategy A: Full Factorial Design (if balanced)
     dds <- DESeqDataSetFromTximport(txi, metadata,
                              design = ~ strain + treatment + condition)
    
     # --> Strategy B: Combined Group Factor ⭐ RECOMMENDED
     metadata$group <- factor(paste(metadata$strain,
                                     metadata$treatment,
                                     metadata$concentration,
                                     metadata$condition,
                                     sep = "_"))
    
     dds <- DESeqDataSetFromTximport(txi, metadata,
                                     design = ~ group)
     dds <- DESeq(dds)
    
     # See all available comparisons
     resultsNames(dds)
    
     # -------------------------------------------------------------
     # ---- Step 3: Set Up Specific Comparisons from Your Notes ----
     # ==========================================
     # 1. Define Exact Comparisons from Your Notes
     # ==========================================
     planned_comparisons <- list(
     # --- Baseline / Strain Controls ---
     AYE_T_ctr_vs_AYE_WT_ctr            = list(treat = "AYE-T_ctrl_0_liquid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_ctr_vs_AYE_WT_ctr            = list(treat = "AYE-O_ctrl_0_liquid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     O_Trans_ctr_vs_AYE_WT_ctr          = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     WT_Trans_ctr_vs_AYE_WT_ctr         = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_ctr_vs_AYE_T                 = list(treat = "AYE-O_ctrl_0_liquid",   ctrl = "AYE-T_ctrl_0_liquid"),
     O_Trans_ctr_vs_AYE_T               = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-T_ctrl_0_liquid"),
     WT_Trans_ctr_vs_AYE_T              = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-T_ctrl_0_liquid"),
    
     # --- Condition Effects (Solid vs Liquid) ---
     AYE_WT_ctr_solid_vs_AYE_WT_ctr     = list(treat = "AYE-WT_ctrl_0_solid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_ctr_solid_vs_AYE_O_ctr       = list(treat = "AYE-O_ctrl_0_solid",    ctrl = "AYE-O_ctrl_0_liquid"),
     AYE_T_ctr_solid_vs_AYE_T_ctr       = list(treat = "AYE-T_ctrl_0_solid",    ctrl = "AYE-T_ctrl_0_liquid"),
     AYE_O_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-O_ctrl_0_solid",    ctrl = "AYE-WT_ctrl_0_solid"),
     AYE_T_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-T_ctrl_0_solid",    ctrl = "AYE-WT_ctrl_0_solid"),
    
     # --- Diclofenac ---
     AYE_WT_Diclo750_vs_AYE_WT_ctr      = list(treat = "AYE-WT_Diclo_750_liquid",   ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Diclo375_vs_AYE_WT_ctr       = list(treat = "AYE-T_Diclo_375_liquid",    ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_Diclo375_vs_AYE_WT_ctr       = list(treat = "AYE-O_Diclo_375_liquid",    ctrl = "AYE-WT_ctrl_0_liquid"),
     O_Trans_Diclo375_vs_AYE_WT_ctr     = list(treat = "O-Trans_Diclo_375_liquid",  ctrl = "AYE-WT_ctrl_0_liquid"),
     WT_Trans_Diclo750_vs_AYE_WT_ctr    = list(treat = "WT-Trans_Diclo_750_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     Diclo_AYE_WT_1250_solid_vs_solid_ctr = list(treat = "AYE-WT_Diclo_1250_solid", ctrl = "AYE-WT_ctrl_0_solid"),
    
     # --- Meropenem ---
     AYE_WT_Mero_vs_AYE_WT_ctr          = list(treat = "AYE-WT_Mero_0.35_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Mero_vs_AYE_WT_ctr           = list(treat = "AYE-T_Mero_0.15_liquid",      ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_O_Mero_vs_AYE_WT_ctr           = list(treat = "AYE-O_Mero_0.5_liquid",       ctrl = "AYE-WT_ctrl_0_liquid"),
     O_Trans_Mero_vs_AYE_WT_ctr         = list(treat = "O-Trans_Mero_0.25_liquid",    ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Mero_vs_AYE_T_ctr            = list(treat = "AYE-T_Mero_0.15_liquid",      ctrl = "AYE-T_ctrl_0_liquid"),
    
     # --- Azithromycin (Solid) ---
     AYE_WT_Azi_vs_solid_ctr            = list(treat = "AYE-WT_Azi_20_solid", ctrl = "AYE-WT_ctrl_0_solid"),
     AYE_T_Azi_vs_solid_ctr             = list(treat = "AYE-T_Azi_20_solid",  ctrl = "AYE-T_ctrl_0_solid"),
     AYE_O_Azi_vs_solid_ctr             = list(treat = "AYE-O_Azi_20_solid",  ctrl = "AYE-O_ctrl_0_solid"),
     F_Azi_vs_F_solid_ctr               = list(treat = "F_Azi_20_solid",      ctrl = "F_ctrl_0_solid"),
    
     # --- Rifampicin ---
     AYE_WT_Rif_vs_AYE_WT_ctr           = list(treat = "AYE-WT_Rifampicin_1.5_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
     AYE_T_Rif_vs_AYE_T_ctr             = list(treat = "AYE-T_Rifampicin_2_liquid",    ctrl = "AYE-T_ctrl_0_liquid"),
     AYE_O_Rif_vs_AYE_O_ctr             = list(treat = "AYE-O_Rifampicin_2_liquid",    ctrl = "AYE-O_ctrl_0_liquid"),
     O_Trans_Rif_vs_O_Trans_ctr         = list(treat = "O-Trans_Rifampicin_2_liquid",  ctrl = "O-Trans_ctrl_0_liquid")
     )
    
     # ==========================================
     # 2. Verification & Validation Script
     # ==========================================
     # Identify which column in colData holds your group names
     group_col <- if("group" %in% colnames(colData(dds))) "group" else
                 if("treatment" %in% colnames(colData(dds))) "treatment" else
                 stop("❌ Please specify the correct colData column containing group names.")
    
     actual_groups <- unique(colData(dds)[[group_col]])
    
     cat("\n", paste(rep("=", 85), collapse=""), "\n")
     cat("📋 VERIFICATION OF NOTE-DERIVED COMPARISONS\n")
     cat(paste(rep("=", 85), collapse=""), "\n\n")
    
     validation_results <- data.frame(
     Comparison_Name = character(),
     Treatment_String = character(),
     Control_String = character(),
     Status = character(),
     Suggested_Contrast = character(),
     stringsAsFactors = FALSE
     )
    
     for(name in names(planned_comparisons)) {
     trt <- planned_comparisons[[name]]$treat
     ctl <- planned_comparisons[[name]]$ctrl
    
     # Find closest matches in actual data
     trt_match <- actual_groups[grepl(trt, actual_groups, fixed = TRUE)]
     ctl_match <- actual_groups[grepl(ctl, actual_groups, fixed = TRUE)]
    
     status <- if(length(trt_match) > 0 && length(ctl_match) > 0) "✅ VALID" else "⚠️  CHECK"
     contrast_str <- if(status == "✅ VALID")
         paste0('c("', group_col, '", "', trt_match[1], '", "', ctl_match[1], '")') else "N/A"
    
     validation_results <- rbind(validation_results, data.frame(
         Comparison_Name = name,
         Treatment_String = trt,
         Control_String = ctl,
         Status = status,
         Suggested_Contrast = contrast_str,
         stringsAsFactors = FALSE
     ))
    
     cat(sprintf("%-45s | T:%-25s C:%-20s | %s\n", name, trt, ctl, status))
     if(status == "⚠️  CHECK") {
         if(length(trt_match) == 0) cat("   🔍 Treat not found. Closest: ", paste(head(actual_groups[grepl(strsplit(trt, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n")
         if(length(ctl_match) == 0) cat("   🔍 Ctrl not found.  Closest: ", paste(head(actual_groups[grepl(strsplit(ctl, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n")
     }
     }
    
     # ==========================================
     # 3. Auto-Generate DESeq2 results() Calls (Optional)
     # ==========================================
     valid_comparisons <- validation_results[validation_results$Status == "✅ VALID", ]
     if(nrow(valid_comparisons) > 0) {
     cat("\n📜 READY-TO-RUN DESeq2 CONTRASTS:\n")
     cat(paste(rep("-", 60), collapse=""), "\n")
     for(i in seq_len(nrow(valid_comparisons))) {
         cat(sprintf('res_%s <- results(dds, contrast = %s)\n',
                     gsub("[^A-Za-z0-9]", "_", valid_comparisons$Comparison_Name[i]),
                     valid_comparisons$Suggested_Contrast[i]))
     }
     } else {
     cat("\n⚠️  No exact matches found. Check your colData group naming convention.\n")
     }
    
     # -----------------------------
     # ---- Step 4: PCA figures ----
    
     # 🔍 What each figure shows:
     #
     #    01_PCA_by_Strain.png → Tests if genetic background (AYE-WT, AYE-T, AYE-O, Trans, F) is the dominant source of variation.
     #    02_PCA_by_Treatment.png → Shows clustering by antibiotic/drug exposure (ctrl, Diclo, Mero, Azi, Rifampicin).
     #    03_PCA_by_Condition.png → Reveals batch/growth media effects (solid vs liquid).
     #    04_PCA_CombinedGroups.png → Full experimental grouping with labeled sample names for quick outlier detection.
     #    05_PCA_Ellipses.png → Adds 95% confidence boundaries per strain to visualize group spread and overlap.
     #
     # ⚠️ Quick Checklist Before Running:
     #
     #    Ensure metadata columns (strain, treatment, condition, group) are attached to colData(dds).
     #    If ggrepel is missing, run install.packages("ggrepel").
     #    All PNGs will save to your current working directory (getwd()).
    
     # Install if missing: install.packages(c("ggplot2", "ggrepel"))
     library(DESeq2)
     library(ggplot2)
     library(ggrepel)
    
     # 1. Variance Stabilizing Transformation & Extract PCA Data
     vsd <- vst(dds, blind = FALSE)
     pca_data <- plotPCA(vsd, intgroup = c("strain", "treatment", "condition", "group"), returnData = TRUE)
     percent_var <- round(100 * attr(pca_data, "percentVar"))
    
     # Consistent theme for all plots
     base_theme <- theme_bw(base_size = 12) +
     theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
             legend.position = "right",
             legend.title = element_text(face = "bold"),
             panel.grid.major = element_line(color = "grey90"),
             panel.grid.minor = element_blank())
    
     # --- Plot 1: Colored by Strain ---
     p1 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, shape = condition)) +
     geom_point(size = 3, alpha = 0.8) +
     geom_text_repel(aes(label = name), size = 2.5, max.overlaps = 20, show.legend = FALSE) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Samples Colored by Strain",
         color = "Strain", shape = "Condition") +
     base_theme
     ggsave("01_PCA_by_Strain.png", p1, width = 8, height = 6, dpi = 300)
    
     # --- Plot 2: Colored by Treatment ---
     p2 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = treatment, shape = condition)) +
     geom_point(size = 3, alpha = 0.8) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Samples Colored by Treatment",
         color = "Treatment", shape = "Condition") +
     base_theme
     ggsave("02_PCA_by_Treatment.png", p2, width = 8, height = 6, dpi = 300)
    
     # --- Plot 3: Colored by Condition (Solid vs Liquid) ---
     p3 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = condition, shape = strain)) +
     geom_point(size = 3, alpha = 0.8) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Samples Colored by Growth Condition",
         color = "Condition", shape = "Strain") +
     base_theme
     ggsave("03_PCA_by_Condition.png", p3, width = 8, height = 6, dpi = 300)
    
     # --- Plot 4: Combined Groups with Sample Labels ---
     p4 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = group)) +
     geom_point(size = 3, alpha = 0.8) +
     geom_text_repel(aes(label = name), size = 2, max.overlaps = 30, box.padding = 0.3) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: Combined Experimental Groups",
         color = "Group") +
     base_theme + theme(legend.position = "none")
     ggsave("04_PCA_CombinedGroups.png", p4, width = 9, height = 7, dpi = 300)
    
     # --- Plot 5: 95% Confidence Ellipses (by Strain) ---
     p5 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, fill = strain)) +
     geom_point(size = 3, alpha = 0.7) +
     stat_ellipse(level = 0.95, alpha = 0.2, geom = "polygon", show.legend = FALSE) +
     labs(x = paste0("PC1: ", percent_var[1], "% variance"),
         y = paste0("PC2: ", percent_var[2], "% variance"),
         title = "PCA: 95% Confidence Ellipses by Strain",
         color = "Strain", fill = "Strain") +
     base_theme
     ggsave("05_PCA_Ellipses.png", p5, width = 8, height = 6, dpi = 300)
    
     message("✅ All 5 PCA plots saved to working directory!")
  8. Run Differential Expression & PCA Analysis Complete

     (r_env) jhuang@WS-2290C:/mnt/md1/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon$ Rscript complete_deg_pipeline.R

Variant Calling: 229E Passaging Dataset (2024–2026) – Inter- & Intra-host Analysis using SPANDx and VPhaser2, only one deletion generated by snippy (Data_Pietschmann_229ECoronavirus_Mutations_2026)

variant_annot_2026__final.xls

  1. Input data:

     # ---- Datasets 2024 (in total 4) ----
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R1.fastq.gz hCoV229E_Rluc_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R2.fastq.gz hCoV229E_Rluc_R2.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R1.fastq.gz DMSO_p10_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R2.fastq.gz DMSO_p10_R2.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R1.fastq.gz K22_p10_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R2.fastq.gz K22_p10_R2.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R1.fastq.gz X7523_p10_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R2.fastq.gz X7523_p10_R2.fastq.gz
    
     # ---- Datasets 2025 (in total 3) ----
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R1_001.fastq.gz DMSO_p16_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R2_001.fastq.gz DMSO_p16_R2.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R1_001.fastq.gz K22_p16_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R2_001.fastq.gz K22_p16_R2.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R1_001.fastq.gz X7523_p16_R1.fastq.gz
     ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R2_001.fastq.gz X7523_p16_R2.fastq.gz
    
     # ---- Datasets 2026 (in total 3) ----
     ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R1.fastq.gz DMSO_p26_R1.fastq.gz
     ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R2.fastq.gz DMSO_p26_R2.fastq.gz
     ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R1.fastq.gz K22_p26_R1.fastq.gz
     ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R2.fastq.gz K22_p26_R2.fastq.gz
     ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R1.fastq.gz X7523_p26_R1.fastq.gz
     ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R2.fastq.gz X7523_p26_R2.fastq.gz
  2. Call variant calling using snippy

     ln -s ~/Tools/bacto/db/ .;
     ln -s ~/Tools/bacto/envs/ .;
     ln -s ~/Tools/bacto/local/ .;
     cp ~/Tools/bacto/Snakefile .;
     cp ~/Tools/bacto/bacto-0.1.json .;
     cp ~/Tools/bacto/cluster.json .;
    
     #download CU459141.gb from GenBank
     mv ~/Downloads/sequence\(2\).gb db/PP810610.gb
    
     #setting the following in bacto-0.1.json
         "fastqc": false,
         "taxonomic_classifier": false,
         "assembly": true,
         "typing_ariba": false,
         "typing_mlst": true,
         "pangenome": true,
         "variants_calling": true,
         "phylogeny_fasttree": true,
         "phylogeny_raxml": true,
         "recombination": false, (due to gubbins-error set false)
         "genus": "Alphacoronavirus",
         "kingdom": "Viruses",
         "species": "Human coronavirus 229E",
         "mykrobe": {
             "species": "corona"
         },
         "reference": "db/PP810610.gb"
    
     mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
     (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
  3. Summarize all SNPs and Indels from the snippy result directory.

     cp ~/Scripts/summarize_snippy_res_ordered.py .
     # IMPORTANT_ADAPT the array isolates = ["hCoV229E_Rluc", "DMSO_p10", "K22_p10", "X7523_p10", "DMSO_p16", "K22_p16", "X7523_p16", "DMSO_p26", "K22_p26", "X7523_p26"]
     mamba activate plot-numpy1
     python3 ./summarize_snippy_res_ordered.py snippy
     #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv
     cd snippy
     #REMOVE_the_line? I don't find the sence of the line:    grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
  4. Using spandx calling variants (almost the same results to the one from viral-ngs!)

     mamba deactivate
     mamba activate /home/jhuang/miniconda3/envs/spandx
     mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
     cp PP810610.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
     vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
     /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610    #-d
     ~/Scripts/genbank2fasta.py PP810610.gb
     mv PP810610.gb_converted.fna PP810610.fasta    #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file
     ln -s /home/jhuang/Tools/spandx/ spandx
     (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref PP810610.fasta --annotation --database PP810610 -resume
    
     # RERUN SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation --> IT WORKS!
     cd Outputs/Master_vcf
     conda activate /home/jhuang/miniconda3/envs/spandx
     (spandx) cp -r ../../snippy/hCoV229E_Rluc/reference .
     (spandx) cp ../../spandx/bin/SNP_matrix.sh ./
     #Note that ${variant_genome_path}=NC_001348 in the following command, but it was not used after the following command modification.
     #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to
     "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
     (spandx) bash SNP_matrix.sh PP810610 .
  5. Calling inter-host variants by merging the results from snippy+spandx (Manually! Note that the sample order of two tools are different, we should reorder the SNP-columns of spandx-results.)

     # Inter-host variants(宿主间变异):一种病毒在两个人之间有不同的基因变异,这些变异可能与宿主的免疫反应、疾病表现或病毒传播的方式相关。
     cp All_SNPs_indels_annotated.txt All_SNPs_indels_annotated_backup.txt
     vim All_SNPs_indels_annotated.txt
    
     #in the file ids: grep "$(echo -e '\t')353$(echo -e '\t')" All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt
     #Replace \n with " All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt\ngrep "
     #Replace grep " --> grep "$(echo -e '\t')
     #Replace " All_ --> $(echo -e '\t')" All_
    
     # Potential intra-host variants: 10871, 19289, 21027, 23435.
     CHROM   POS REF ALT TYPE    hCoV229E_Rluc   DMSO_p10    K22_p10 X7523_p10   DMSO_p16    K22_p16 X7523_p16   DMSO_p26    K22_p26 X7523_p26   Effect  Impact  Functional_Class    Codon_change    Protein_and_nucleotide_change   Amino_Acid_Length   Gene_name   Biotype
     PP810610    1464    T   C   SNP C   C   C   C   C   C   C   C   C   C   missense_variant    MODERATE    MISSENSE    gTt/gCt p.Val416Ala/c.1247T>C   6757    CDS_1   protein_coding
     PP810610    1492    T   A   SNP T/A T/A T/A T/A T/A T/A T/A T   T/A T/A synonymous_variant  LOW SILENT  acT/acA p.Thr425Thr/c.1275T>A   6757    CDS_1   protein_coding
     PP810610    1699    C   T   SNP T   T   T   T   T   T   T   T   T   T   synonymous_variant  LOW SILENT  gtC/gtT p.Val494Val/c.1482C>T   6757    CDS_1   protein_coding
     PP810610    4809    A   C   SNP A   A   A   A   A   A   A/C A   A   A/C missense_variant    MODERATE    MISSENSE    gAt/gCt p.Asp1531Ala/c.4592A>C  6757    CDS_1   protein_coding
     PP810610    6470    C   T   SNP C   C   C   C   C   C   C   C   C/T C   synonymous_variant  LOW SILENT  Ctg/Ttg p.Leu2085Leu/c.6253C>T  6757    CDS_1   protein_coding
     PP810610    6691    C   T   SNP T   T   T   T   T   T   T   T   T   T   synonymous_variant  LOW SILENT  tgC/tgT p.Cys2158Cys/c.6474C>T  6757    CDS_1   protein_coding
     PP810610    6919    C   G   SNP G   G   G   G   G   G   G   G   G   G   synonymous_variant  LOW SILENT  ggC/ggG p.Gly2234Gly/c.6702C>G  6757    CDS_1   protein_coding
     PP810610    7294    T   A   SNP A   A   A   A   A   A   A   A   A   A   missense_variant    MODERATE    MISSENSE    agT/agA p.Ser2359Arg/c.7077T>A  6757    CDS_1   protein_coding
     PP810610    8289    C   A   SNP C/A C/A C   C/A C   C   C/A C   C   C/A missense_variant    MODERATE    MISSENSE    gCc/gAc p.Ala2691Asp/c.8072C>A  6757    CDS_1   protein_coding
     PP810610    8294    A   G   SNP A/G A   A/G A   A   A/G A   A   A/G A   missense_variant    MODERATE    MISSENSE    Aaa/Gaa p.Lys2693Glu/c.8077A>G  6757    CDS_1   protein_coding
     PP810610    8376    G   T   SNP G/T G   G   G   G   G   G   G   G   G   missense_variant    MODERATE    MISSENSE    cGt/cTt p.Arg2720Leu/c.8159G>T  6757    CDS_1   protein_coding
     PP810610    9146    T   C   SNP T   T   T   T/C T   T   T/C T   T   T   missense_variant    MODERATE    MISSENSE    Ttt/Ctt p.Phe2977Leu/c.8929T>C  6757    CDS_1   protein_coding
     PP810610    9174    G   A   SNP G   G   G   G/A G   G   G/A G   G   G   missense_variant    MODERATE    MISSENSE    tGc/tAc p.Cys2986Tyr/c.8957G>A  6757    CDS_1   protein_coding
     PP810610    9261    C   T   SNP C   C   C   C   C   C   C   C   C/T C   missense_variant    MODERATE    MISSENSE    gCt/gTt p.Ala3015Val/c.9044C>T  6757    CDS_1   protein_coding
     PP810610    10145   A   G   SNP A   A   A   A/G A   A   A/G A   A   A/G missense_variant    MODERATE    MISSENSE    Atg/Gtg p.Met3310Val/c.9928A>G  6757    CDS_1   protein_coding
     PP810610    10239   T   G   SNP T   T   T/G T   T   T/G T   T   T   T   missense_variant    MODERATE    MISSENSE    gTg/gGg p.Val3341Gly/c.10022T>G 6757    CDS_1   protein_coding
     PP810610    10310   G   A   SNP G   G   G   G/A G   G   G/A G   G   G/A missense_variant    MODERATE    MISSENSE    Gtt/Att p.Val3365Ile/c.10093G>A 6757    CDS_1   protein_coding
     PP810610    10432   C   T   SNP C   C   C   C   C   C   C   C   C   C/T synonymous_variant  LOW SILENT  ttC/ttT p.Phe3405Phe/c.10215C>T 6757    CDS_1   protein_coding
     PP810610    10871   C   T   SNP C   C/T T   C/T C/T T   C/T C   T   C/T missense_variant    MODERATE    MISSENSE    Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757    CDS_1   protein_coding
     PP810610    10898   G   A   SNP G   G/A G   G/A G   G   G/A G   G   G/A missense_variant    MODERATE    MISSENSE    Ggc/Agc p.Gly3561Ser/c.10681G>A 6757    CDS_1   protein_coding
     PP810610    11418   C   A   SNP C   C   C   C   C   C   C   C   C/A C   missense_variant    MODERATE    MISSENSE    aCt/aAt p.Thr3734Asn/c.11201C>A 6757    CDS_1   protein_coding
     PP810610    11577   A   C   SNP A   A/C A   A   A/C A   A   A/C A   A   missense_variant    MODERATE    MISSENSE    gAa/gCa p.Glu3787Ala/c.11360A>C 6757    CDS_1   protein_coding
     PP810610    14472   T   C   SNP C   C   C   C   C   C   C   C   C   C   missense_variant    MODERATE    MISSENSE    aTg/aCg p.Met4752Thr/c.14255T>C 6757    CDS_1   protein_coding
     PP810610    15270   G   A   SNP G   G   G   G   G   G   G   G   G   G/A missense_variant    MODERATE    MISSENSE    cGa/cAa p.Arg5018Gln/c.15053G>A 6757    CDS_1   protein_coding
     PP810610    15458   T   C   SNP C   C   C   C   C   C   C   C   C   C   synonymous_variant  LOW SILENT  Ttg/Ctg p.Leu5081Leu/c.15241T>C 6757    CDS_1   protein_coding
     PP810610    16035   C   A   SNP A   A   A   A   A   A   A   A   A   A   stop_gained HIGH    NONSENSE    tCa/tAa p.Ser5273*/c.15818C>A   6757    CDS_1   protein_coding
     PP810610    17119   A   G   SNP A   A   A   A   A   A   A   A   A   A/G synonymous_variant  LOW SILENT  aaA/aaG p.Lys5634Lys/c.16902A>G 6757    CDS_1   protein_coding
     PP810610    17430   T   C   SNP C   C   C   C   C   C   C   C   C   C   missense_variant    MODERATE    MISSENSE    tTa/tCa p.Leu5738Ser/c.17213T>C 6757    CDS_1   protein_coding
     PP810610    18640   T   G   SNP T   T   T   T/G T   T   T/G T   T   T/G missense_variant    MODERATE    MISSENSE    tgT/tgG p.Cys6141Trp/c.18423T>G 6757    CDS_1   protein_coding
     PP810610    18646   C   T   SNP C   C   C   C/T C   C   C   C   C   C   synonymous_variant  LOW SILENT  taC/taT p.Tyr6143Tyr/c.18429C>T 6757    CDS_1   protein_coding
     PP810610    18701   A   G   SNP A   A   A   A/G A   A   A/G A   A   A   missense_variant    MODERATE    MISSENSE    Aca/Gca p.Thr6162Ala/c.18484A>G 6757    CDS_1   protein_coding
     PP810610    19028   C   T   SNP C   C   C   C/T C   C   C   C   C   C   stop_gained HIGH    NONSENSE    Caa/Taa p.Gln6271*/c.18811C>T   6757    CDS_1   protein_coding
     PP810610    19289   G   T   SNP G   G   T   G   G   G/T G   G   G/T G   missense_variant    MODERATE    MISSENSE    Gtt/Ttt p.Val6358Phe/c.19072G>T 6757    CDS_1   protein_coding
     PP810610    19294   C   T   SNP C   C   C   C   C   C/T C/T C   C/T C   synonymous_variant  LOW SILENT  taC/taT p.Tyr6359Tyr/c.19077C>T 6757    CDS_1   protein_coding
     PP810610    21027   C   T   SNP C   C/T C   C/T C/T C   C/T T   C   C/T missense_variant    MODERATE    MISSENSE    aCt/aTt p.Thr178Ile/c.533C>T    1173    CDS_2   protein_coding
     PP810610    21183   T   G   SNP G   G   G   G   G   G   G   G   G   G   missense_variant    MODERATE    MISSENSE    tTt/tGt p.Phe230Cys/c.689T>G    1173    CDS_2   protein_coding
     PP810610    21633   T   C   SNP T   T/C T   T   T   T   T   T   T   T   missense_variant    MODERATE    MISSENSE    gTt/gCt p.Val380Ala/c.1139T>C   1173    CDS_2   protein_coding
     PP810610    22215   T   G   SNP T   T   T   T/G T   T   T/G T   T   T   missense_variant    MODERATE    MISSENSE    gTt/gGt p.Val574Gly/c.1721T>G   1173    CDS_2   protein_coding
     PP810610    22487   A   G   SNP A   A   A   A   A   A/G A   A   A/G A   missense_variant    MODERATE    MISSENSE    Agt/Ggt p.Ser665Gly/c.1993A>G   1173    CDS_2   protein_coding
     PP810610    22630   C   T   SNP C   C   C   C   C   C   C   C   C   C/T synonymous_variant  LOW SILENT  taC/taT p.Tyr712Tyr/c.2136C>T   1173    CDS_2   protein_coding
     PP810610    22636   T   G   SNP G   G   G   G   G   G   G   G   G   G   missense_variant    MODERATE    MISSENSE    aaT/aaG p.Asn714Lys/c.2142T>G   1173    CDS_2   protein_coding
     PP810610    22763   G   T   SNP G   G   G   G   G   G   G   G   G/T G   missense_variant    MODERATE    MISSENSE    Gct/Tct p.Ala757Ser/c.2269G>T   1173    CDS_2   protein_coding
     PP810610    22788   T   C   SNP T   T   T   T   T   T/C T   T/C T/C T   missense_variant    MODERATE    MISSENSE    gTt/gCt p.Val765Ala/c.2294T>C   1173    CDS_2   protein_coding
     PP810610    23022   T   C   SNP C   C   C   C   C   C   C   C   C   C   missense_variant    MODERATE    MISSENSE    tTa/tCa p.Leu843Ser/c.2528T>C   1173    CDS_2   protein_coding
     PP810610    23435   C   T   SNP C   C   T   C/T C   C/T C/T C   C/T C   missense_variant    MODERATE    MISSENSE    Ctt/Ttt p.Leu981Phe/c.2941C>T   1173    CDS_2   protein_coding
     PP810610    24781   C   T,G SNP T   T   T   T   T   T   T   T   T   T/G missense_variant    MODERATE    MISSENSE    aCt/aGt p.Thr36Ser/c.107C>G 77  CDS_5   protein_coding
     PP810610    24971   A   G   SNP A   A   A   A   A   A   A   A   A/G A   missense_variant    MODERATE    MISSENSE    Aat/Gat p.Asn18Asp/c.52A>G  225 CDS_6   protein_coding
     PP810610    25025   C   T   SNP C   C/T C   C   C/T C   C   C/T C   C/T missense_variant    MODERATE    MISSENSE    Cac/Tac p.His36Tyr/c.106C>T 225 CDS_6   protein_coding
     PP810610    25163   C   T   SNP T   T   T   T   T   T   T   T   T   T   missense_variant    MODERATE    MISSENSE    Ctt/Ttt p.Leu82Phe/c.244C>T 225 CDS_6   protein_coding
     PP810610    25264   C   T   SNP T   T   T   T   T   T   T   T   T   T   synonymous_variant  LOW SILENT  gtC/gtT p.Val115Val/c.345C>T    225 CDS_6   protein_coding
     PP810610    25592   T   C   SNP T   T/C T   T   T/C T   T   T/C T   T/C missense_variant    MODERATE    MISSENSE    Ttc/Ctc p.Phe225Leu/c.673T>C    225 CDS_6   protein_coding
     PP810610    26104   C   T   SNP C   C   C   C   C   C   C   C   C   C/T missense_variant    MODERATE    MISSENSE    cCt/cTt p.Pro165Leu/c.494C>T    389 CDS_7   protein_coding
     PP810610    26838   G   T   SNP T   T   T   T   T   T   T   T   T   T                               
    
     PP810610    24731   TGTGGTGCTTATA   T   DEL TGTGGTGCTTATA   TGTGGTGCTTATA   TGTGGTGCTTATA   TGTGGTGCTTATA   TGTGGTGCTTATA   TGTGGTGCTTATA   TGTGGTGCTTATA   TGTGGTGCTTATA   T   TGTGGTGCTTATA   conservative_inframe_deletion           c.61_72delGTGCTTATAGTG  p.Val21_Val24del            
  6. Calling intra-host variants using viral-ngs

     # #Install Gap2Seq in the docker-env --> generated the new docker-env viral-ngs-fixed:la
     # bin/assembly.py gapfill_gap2seq tmp/02_assembly/hCoV229E_Rluc.assembly2-scaffolded.fasta data/01_per_sample/hCoV229E_Rluc.cleaned.bam tmp/02_assembly/# hCoV229E_Rluc.assembly2-gapfilled.fasta --memLimitGb 12 --maskErrors --randomSeed 0
     # ----> go to the script tools/gap2seq.py, install the required tool (for example gap2seq) and adapt the correct version.
     # conda install -y -c bioconda gap2seq
     # root@f47daf7c44ee:/work# Gap2Seq -h
     # #>Gap2Seq 3.1
     # vim /home/jhuang/Tools/viral-ngs_docker/bin/tools/gap2seq.py
     # #Adapt the TOOL_NAME and TOOL_VERSION in the script
     #
     # #Save the docker-env with newly installed Gap2Seq
     # exit
     # docker ps -a
     # docker commit f47daf7c44ee viral-ngs-fixed:la
    
     #调用新的 docker-env having Gap2Seq
     docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk   viral-ngs-fixed:la bash
     cd /work
     conda activate viral-ngs-env
     snakemake --directory /work --printshellcmds --cores 80
    
     # The complete running commands for the complete data in 2026 as follows:
    
     #-1- Run several times of docker viral-ngs-fixed:la so that all ${sample}.assembly2-scaffolded.fasta generated by running each time only a isolate (one record in samples-assembly.txt and empty in samples-runs.txt)
     docker run ...
     cd /work
     conda activate viral-ngs-env
     snakemake --directory /work --printshellcmds --cores 80
     # --> Note that tmp/02_assembly/${sample}.assembly2-scaffolded.fasta and tmp/02_assembly/${sample}.assembly2-gapfilled.fasta were generated!
    
     #-2- Then run the following commands: tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --> tmp/02_assembly/${sample}.assembly3-modify.fasta
     for sample in DMSO_p10 K22_p10 X7523_p10    DMSO_p16 K22_p16 X7523_p16    DMSO_p26 K22_p26 X7523_p26; do
             #MANUALLY running the following commands!
             bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
     done
    
     #-3- Run docker for all isolates (full in amples-assembly.txt and samples-runs.txt).
     docker run ...
     cd /work
     conda activate viral-ngs-env
     snakemake --directory /work --printshellcmds --cores 80
  7. Generate variant_annot.xls and coverages.xls

     sudo chown -R jhuang:jhuang data
     # -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost
     cp isnvs.annot.txt isnvs.annot_complete.txt
     ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls
     #delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file.
    
     awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5
     cut -f1-4 isnvs.annot_complete.csv > f1_4
     cut -f6- isnvs.annot_complete.csv > f6_
     paste f1_4 f5 > f1_5
     paste f1_5 f6_ > isnvs_annot_complete_.txt
     #correct f5 in header of isnvs_annot_complete_.txt to iSNV_freq
     #Add header: chr    pos sample  alleles iSNV_freq   eff_type    eff_codon_dna   eff_aa  eff_aa_pos  eff_prot_len    eff_gene    eff_protein
     ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete_.txt -d$'\t' -o variant_annot.xls
    
     #MANUALLY generate variant_annot_0.01.csv variant_annot_0.05.csv
     awk ' $5 >= 0.05 ' isnvs_annot_complete_.txt > 0.05.csv
     cut -f2 0.05.csv | uniq > ids_0.05
    
     awk ' $5 >= 0.02 ' isnvs_annot_complete_.txt > 0.02.csv
     cut -f2 0.02.csv | uniq > ids_0.02
    
     awk ' $5 >= 0.01 ' isnvs_annot_complete_.txt > 0.01.csv
     cut -f2 0.01.csv | uniq > ids_0.01
    
     #Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.05.txt\ngrep -P "PP810610\\t' in ids_0.05 and then deleting the 'pos' line
     #Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.01.txt\ngrep -P "PP810610\\t'  in ids_0.01 and then deleting the 'pos' line
     #Run ids_0.05 and ids_0.01
    
     cp ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt hCoV229E_Rluc_variants
     #Manully adding the 3-way SNPs by getting original data from isnvs.annot.csv, for example,
     PP810610    8289    hCoV229E_Rluc   C,A,T   0.325, 0.439    missense_variant    8072C>A,8072C>T Ala2691Asp,Ala2691Val   2691    6758    Gene_217_20492  XBA84229.1
    
     # Delete the three records which already reported in intra-host results hCoV229E_Rluc_variants: they are 10871, 19289, 23435.
     PP810610       10871   C       T       SNP     C       C/T     T       C/T     C/T     T       C/T     missense_variant        MODERATE        MISSENSE        Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757    CDS_1   protein_coding
     PP810610       19289   G       T       SNP     G       G       T       G       G       G/T     G       missense_variant        MODERATE        MISSENSE        Gtt/Ttt p.Val6358Phe/c.19072G>T 6757    CDS_1   protein_coding
     PP810610       23435   C       T       SNP     C       C       T       C/T     C       C/T     C/T     missense_variant        MODERATE        MISSENSE        Ctt/Ttt p.Leu981Phe/c.2941C>T   1173    CDS_2   protein_coding
    
     ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_0.02.txt All_SNPs_indels_annotated.txt -d$'\t' -o variant_annot_2026.xls
     #Modify sheetname to variant_annot_0.05 and variant_annot_0.01 and add the header in Excel file.
     #Note in the complete list, Set 2024 is NOT a subset of Set 2025 because the element 26283 is in set 2024 but missing from set 2025.
    
     # -- calculate the coverage
     samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt
     samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt
     samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt
     samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt
     ~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls
     #draw coverage and see if they are continuous?
     samtools depth ./data/02_align_to_self/p16_DMSO.mapped.bam > p16_DMSO_cov.txt
     samtools depth ./data/02_align_to_self/p16_K22.mapped.bam > p16_K22_cov.txt
     samtools depth ./data/02_align_to_self/p16_X7523.mapped.bam > p16_K7523_cov.txt
     ~/Tools/csv2xls-0.4/csv_to_xls.py p16_DMSO_cov.txt p16_K22_cov.txt p16_K7523_cov.txt -d$'\t' -o coverages_p16.xls
    
             # Load required packages
             library(ggplot2)
             library(dplyr)
    
             # Read the coverage data
             cov_data <- read.table("p16_K7523_cov.txt", header = FALSE, sep = "\t",
                             col.names = c("Chromosome", "Position", "Coverage"))
    
             # Create full position range for the given chromosome
             full_range <- data.frame(Position = seq(min(cov_data$Position), max(cov_data$Position)))
    
             # Merge with actual coverage data and fill missing positions with 0
             cov_full <- full_range %>%
             left_join(cov_data[, c("Position", "Coverage")], by = "Position") %>%
             mutate(Coverage = ifelse(is.na(Coverage), 0, Coverage))
    
             # Save the plot to PNG
             png("p16_K7523_coverage_filled.png", width = 1200, height = 600)
    
             ggplot(cov_full, aes(x = Position, y = Coverage)) +
             geom_line(color = "steelblue", size = 0.3) +
             labs(title = "Coverage Plot for p16_K7523 (Missing = 0)",
             x = "Genomic Position",
             y = "Coverage Depth") +
             theme_minimal() +
             theme(
             plot.title = element_text(hjust = 0.5),
             axis.text = element_text(size = 10),
             axis.title = element_text(size = 12)
             )
    
             dev.off()
  8. Comparing intra- and inter-host variants (40 positions are common between spandx and vphaser2)

     Please find attached the complete variant calling results for our 229E passaging experiment (2024–2026 datasets): variant_annot_2026__final.xls.
    
     • Sheet 1 (All_SNPs_Indels_annotated): Inter-host variants called by spandx. Shows consensus alleles at each variant position across all 10 samples relative to the PP810610 reference.
     • Sheet 2 (isnvs_annot): Intra-host variants called by vphaser2. Provides precise allele frequencies (iSNV_freq) per sample to quantify within-host diversity.
    
     Overlap:
    
     Green-highlighted rows indicate variants consistently detected by both pipelines.
     Sheet 2 is a subset of Sheet 1, focusing only on positions with intra-host heterogeneity. Fixed differences from the reference (where all 10 isolates share the same non-reference allele) remain in Sheet 1 (white rows). A few positions in Sheet 2 show iSNV_freq ≈ 1.0 (e.g., POS 15458, 22636). These reflect positions where vphaser2 detected minor variation in at least one sample, even if most samples show near-fixation.
  9. (Optional) Consensus sequences of each and of all isolates

     cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa
     cp data/02_assembly/*.fasta ./
     for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
     mv ${sample}.fasta ${sample}.fa
     cat all.fa ${sample}.fa >> all.fa
     done
    
     cat RSV_dedup.fa all.fa > RSV_all.fa
     mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln
     snp-sites RSV_all.aln -o RSV_all_.aln