-
Preparing raw data
mkdir raw_data; cd raw_data # control samples (8) ln -s ../X101SC26025981-Z01-J001/01.RawData/1/1_1.fq.gz AYE-WT_ctr_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/1/1_2.fq.gz AYE-WT_ctr_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/2/2_1.fq.gz AYE-WT_ctr_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/2/2_2.fq.gz AYE-WT_ctr_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/3/3_1.fq.gz AYE-WT_ctr_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/3/3_2.fq.gz AYE-WT_ctr_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/4/4_1.fq.gz AYE-T_ctr_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/4/4_2.fq.gz AYE-T_ctr_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/5/5_1.fq.gz AYE-T_ctr_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/5/5_2.fq.gz AYE-T_ctr_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/6/6_1.fq.gz AYE-T_ctr_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/6/6_2.fq.gz AYE-T_ctr_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/7/7_1.fq.gz AYE-O_ctr_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/7/7_2.fq.gz AYE-O_ctr_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/8/8_1.fq.gz AYE-O_ctr_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/8/8_2.fq.gz AYE-O_ctr_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/9/9_1.fq.gz AYE-O_ctr_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/9/9_2.fq.gz AYE-O_ctr_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/10/10_1.fq.gz O-Trans_ctr_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/10/10_2.fq.gz O-Trans_ctr_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/11/11_1.fq.gz O-Trans_ctr_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/11/11_2.fq.gz O-Trans_ctr_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/12/12_1.fq.gz O-Trans_ctr_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/12/12_2.fq.gz O-Trans_ctr_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/1new/1new_1.fq.gz WT-Trans_ctr_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/1new/1new_2.fq.gz WT-Trans_ctr_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/2new/2new_1.fq.gz WT-Trans_ctr_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/2new/2new_2.fq.gz WT-Trans_ctr_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/3new/3new_1.fq.gz WT-Trans_ctr_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/3new/3new_2.fq.gz WT-Trans_ctr_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/49/49_1.fq.gz AYE-WT_ctr_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/49/49_2.fq.gz AYE-WT_ctr_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/50/50_1.fq.gz AYE-WT_ctr_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/50/50_2.fq.gz AYE-WT_ctr_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/51/51_1.fq.gz AYE-WT_ctr_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/51/51_2.fq.gz AYE-WT_ctr_solid_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/52/52_1.fq.gz AYE-O_ctr_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/52/52_2.fq.gz AYE-O_ctr_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/53/53_1.fq.gz AYE-O_ctr_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/53/53_2.fq.gz AYE-O_ctr_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/54/54_1.fq.gz AYE-O_ctr_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/54/54_2.fq.gz AYE-O_ctr_solid_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/55/55_1.fq.gz AYE-T_ctr_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/55/55_2.fq.gz AYE-T_ctr_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/56/56_1.fq.gz AYE-T_ctr_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/56/56_2.fq.gz AYE-T_ctr_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/57/57_1.fq.gz AYE-T_ctr_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/57/57_2.fq.gz AYE-T_ctr_solid_r3_R2.fastq.gz # Diclofenac(双氯芬酸)treatment (6) ln -s ../X101SC26025981-Z01-J001/01.RawData/25/25_1.fq.gz AYE-WT_Diclo750_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/25/25_2.fq.gz AYE-WT_Diclo750_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/26/26_1.fq.gz AYE-WT_Diclo750_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/26/26_2.fq.gz AYE-WT_Diclo750_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/27/27_1.fq.gz AYE-WT_Diclo750_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/27/27_2.fq.gz AYE-WT_Diclo750_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/28/28_1.fq.gz AYE-T_Diclo375_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/28/28_2.fq.gz AYE-T_Diclo375_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/29/29_1.fq.gz AYE-T_Diclo375_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/29/29_2.fq.gz AYE-T_Diclo375_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/30/30_1.fq.gz AYE-T_Diclo375_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/30/30_2.fq.gz AYE-T_Diclo375_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/31/31_1.fq.gz AYE-O_Diclo375_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/31/31_2.fq.gz AYE-O_Diclo375_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/32/32_1.fq.gz AYE-O_Diclo375_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/32/32_2.fq.gz AYE-O_Diclo375_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/33/33_1.fq.gz AYE-O_Diclo375_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/33/33_2.fq.gz AYE-O_Diclo375_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/34/34_1.fq.gz O-Trans_Diclo375_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/34/34_2.fq.gz O-Trans_Diclo375_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/35/35_1.fq.gz O-Trans_Diclo375_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/35/35_2.fq.gz O-Trans_Diclo375_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/36/36_1.fq.gz O-Trans_Diclo375_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/36/36_2.fq.gz O-Trans_Diclo375_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/4new/4new_1.fq.gz WT-Trans_Diclo750_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/4new/4new_2.fq.gz WT-Trans_Diclo750_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/5new/5new_1.fq.gz WT-Trans_Diclo750_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/5new/5new_2.fq.gz WT-Trans_Diclo750_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/6new/6new_1.fq.gz WT-Trans_Diclo750_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/6new/6new_2.fq.gz WT-Trans_Diclo750_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/73/73_1.fq.gz AYE-WT_Diclo1250_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/73/73_2.fq.gz AYE-WT_Diclo1250_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/74/74_1.fq.gz AYE-WT_Diclo1250_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/74/74_2.fq.gz AYE-WT_Diclo1250_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/75/75_1.fq.gz AYE-WT_Diclo1250_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/75/75_2.fq.gz AYE-WT_Diclo1250_solid_r3_R2.fastq.gz # Rifampicin(利福平)treatment (4) ln -s ../X101SC26025981-Z01-J001/01.RawData/13/13_1.fq.gz AYE-WT_Rifampicin1.5_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/13/13_2.fq.gz AYE-WT_Rifampicin1.5_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/14/14_1.fq.gz AYE-WT_Rifampicin1.5_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/14/14_2.fq.gz AYE-WT_Rifampicin1.5_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/15/15_1.fq.gz AYE-WT_Rifampicin1.5_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/15/15_2.fq.gz AYE-WT_Rifampicin1.5_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/16/16_1.fq.gz AYE-T_Rifampicin2_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/16/16_2.fq.gz AYE-T_Rifampicin2_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/17/17_1.fq.gz AYE-T_Rifampicin2_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/17/17_2.fq.gz AYE-T_Rifampicin2_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/18/18_1.fq.gz AYE-T_Rifampicin2_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/18/18_2.fq.gz AYE-T_Rifampicin2_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/19/19_1.fq.gz AYE-O_Rifampicin2_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/19/19_2.fq.gz AYE-O_Rifampicin2_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/20/20_1.fq.gz AYE-O_Rifampicin2_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/20/20_2.fq.gz AYE-O_Rifampicin2_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/21/21_1.fq.gz AYE-O_Rifampicin2_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/21/21_2.fq.gz AYE-O_Rifampicin2_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/22/22_1.fq.gz O-Trans_Rifampicin2_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/22/22_2.fq.gz O-Trans_Rifampicin2_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/23/23_1.fq.gz O-Trans_Rifampicin2_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/23/23_2.fq.gz O-Trans_Rifampicin2_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/24/24_1.fq.gz O-Trans_Rifampicin2_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/24/24_2.fq.gz O-Trans_Rifampicin2_r3_R2.fastq.gz # Meropenem(美罗培南)treatment (4) ln -s ../X101SC26025981-Z01-J001/01.RawData/37/37_1.fq.gz AYE-WT_Mero0.35-0.5_r1_R1.fastq.gz #AYE-WT_Mero0.5_r1 ln -s ../X101SC26025981-Z01-J001/01.RawData/37/37_2.fq.gz AYE-WT_Mero0.35-0.5_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/38/38_1.fq.gz AYE-WT_Mero0.35-0.5_r2_R1.fastq.gz #AYE-WT_YX_Mero0.35_r2 ln -s ../X101SC26025981-Z01-J001/01.RawData/38/38_2.fq.gz AYE-WT_Mero0.35-0.5_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/39/39_1.fq.gz AYE-WT_Mero0.35-0.5_r3_R1.fastq.gz #AYE-WT_public_Mero0.35_r3 ln -s ../X101SC26025981-Z01-J001/01.RawData/39/39_2.fq.gz AYE-WT_Mero0.35-0.5_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/40/40_1.fq.gz AYE-T_Mero0.15_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/40/40_2.fq.gz AYE-T_Mero0.15_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/41/41_1.fq.gz AYE-T_Mero0.15_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/41/41_2.fq.gz AYE-T_Mero0.15_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/42/42_1.fq.gz AYE-T_Mero0.15_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/42/42_2.fq.gz AYE-T_Mero0.15_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/43/43_1.fq.gz AYE-O_Mero0.5_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/43/43_2.fq.gz AYE-O_Mero0.5_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/44/44_1.fq.gz AYE-O_Mero0.5_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/44/44_2.fq.gz AYE-O_Mero0.5_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/45/45_1.fq.gz AYE-O_Mero0.5_r3_R1.fastq.gz #Mero0.45 ln -s ../X101SC26025981-Z01-J001/01.RawData/45/45_2.fq.gz AYE-O_Mero0.5_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/46/46_1.fq.gz O-Trans_Mero0.25_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/46/46_2.fq.gz O-Trans_Mero0.25_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/47/47_1.fq.gz O-Trans_Mero0.25_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/47/47_2.fq.gz O-Trans_Mero0.25_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/48/48_1.fq.gz O-Trans_Mero0.25_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/48/48_2.fq.gz O-Trans_Mero0.25_r3_R2.fastq.gz # Azithromycin(阿奇霉素)treatment (5), among them, F_ctr_solid is clinical isolate. ln -s ../X101SC26025981-Z01-J001/01.RawData/58/58_1.fq.gz F_ctr_solid_r1_R1.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/58/58_2.fq.gz F_ctr_solid_r1_R2.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/59/59_1.fq.gz F_ctr_solid_r2_R1.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/59/59_2.fq.gz F_ctr_solid_r2_R2.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/60/60_1.fq.gz F_ctr_solid_r3_R1.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/60/60_2.fq.gz F_ctr_solid_r3_R2.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/61/61_1.fq.gz AYE-WT_Azi20_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/61/61_2.fq.gz AYE-WT_Azi20_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/62/62_1.fq.gz AYE-WT_Azi20_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/62/62_2.fq.gz AYE-WT_Azi20_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/63/63_1.fq.gz AYE-WT_Azi20_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/63/63_2.fq.gz AYE-WT_Azi20_solid_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/67/67_1.fq.gz AYE-T_Azi20_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/67/67_2.fq.gz AYE-T_Azi20_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/68/68_1.fq.gz AYE-T_Azi20_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/68/68_2.fq.gz AYE-T_Azi20_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/69/69_1.fq.gz AYE-T_Azi20_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/69/69_2.fq.gz AYE-T_Azi20_solid_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/64/64_1.fq.gz AYE-O_Azi20_solid_r1_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/64/64_2.fq.gz AYE-O_Azi20_solid_r1_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/65/65_1.fq.gz AYE-O_Azi20_solid_r2_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/65/65_2.fq.gz AYE-O_Azi20_solid_r2_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/66/66_1.fq.gz AYE-O_Azi20_solid_r3_R1.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/66/66_2.fq.gz AYE-O_Azi20_solid_r3_R2.fastq.gz ln -s ../X101SC26025981-Z01-J001/01.RawData/70/70_1.fq.gz F_Azi20_solid_r1_R1.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/70/70_2.fq.gz F_Azi20_solid_r1_R2.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/71/71_1.fq.gz F_Azi20_solid_r2_R1.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/71/71_2.fq.gz F_Azi20_solid_r2_R2.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/72/72_1.fq.gz F_Azi20_solid_r3_R1.fastq.gz #clinical ln -s ../X101SC26025981-Z01-J001/01.RawData/72/72_2.fq.gz F_Azi20_solid_r3_R2.fastq.gz #clinical -
Preparing the directory trimmed
mkdir trimmed trimmed_unpaired; for sample_id in AYE-O_Azi20_solid_r1 AYE-O_Azi20_solid_r2 AYE-O_Azi20_solid_r3 AYE-O_ctr_r1 AYE-O_ctr_r2 AYE-O_ctr_r3 AYE-O_ctr_solid_r1 AYE-O_ctr_solid_r2 AYE-O_ctr_solid_r3 AYE-O_Diclo375_r1 AYE-O_Diclo375_r2 AYE-O_Diclo375_r3 AYE-O_Mero0.5_r1 AYE-O_Mero0.5_r2 AYE-O_Mero0.5_r3 AYE-O_Rifampicin2_r1 AYE-O_Rifampicin2_r2 AYE-O_Rifampicin2_r3 AYE-T_Azi20_solid_r1 AYE-T_Azi20_solid_r2 AYE-T_Azi20_solid_r3 AYE-T_ctr_r1 AYE-T_ctr_r2 AYE-T_ctr_r3 AYE-T_ctr_solid_r1 AYE-T_ctr_solid_r2 AYE-T_ctr_solid_r3 AYE-T_Diclo375_r1 AYE-T_Diclo375_r2 AYE-T_Diclo375_r3 AYE-T_Mero0.15_r1 AYE-T_Mero0.15_r2 AYE-T_Mero0.15_r3 AYE-T_Rifampicin2_r1 AYE-T_Rifampicin2_r2 AYE-T_Rifampicin2_r3 AYE-WT_Azi20_solid_r1 AYE-WT_Azi20_solid_r2 AYE-WT_Azi20_solid_r3 AYE-WT_ctr_r1 AYE-WT_ctr_r2 AYE-WT_ctr_r3 AYE-WT_ctr_solid_r1 AYE-WT_ctr_solid_r2 AYE-WT_ctr_solid_r3 AYE-WT_Diclo1250_solid_r1 AYE-WT_Diclo1250_solid_r2 AYE-WT_Diclo1250_solid_r3 AYE-WT_Diclo750_r1 AYE-WT_Diclo750_r2 AYE-WT_Diclo750_r3 AYE-WT_Mero0.35-0.5_r1 AYE-WT_Mero0.35-0.5_r2 AYE-WT_Mero0.35-0.5_r3 AYE-WT_Rifampicin1.5_r1 AYE-WT_Rifampicin1.5_r2 AYE-WT_Rifampicin1.5_r3 F_Azi20_solid_r1 F_Azi20_solid_r2 F_Azi20_solid_r3 F_ctr_solid_r1 F_ctr_solid_r2 F_ctr_solid_r3 O-Trans_ctr_r1 O-Trans_ctr_r2 O-Trans_ctr_r3 O-Trans_Diclo375_r1 O-Trans_Diclo375_r2 O-Trans_Diclo375_r3 O-Trans_Mero0.25_r1 O-Trans_Mero0.25_r2 O-Trans_Mero0.25_r3 O-Trans_Rifampicin2_r1 O-Trans_Rifampicin2_r2 O-Trans_Rifampicin2_r3 WT-Trans_ctr_r1 WT-Trans_ctr_r2 WT-Trans_ctr_r3 WT-Trans_Diclo750_r1 WT-Trans_Diclo750_r2 WT-Trans_Diclo750_r3; do \ for sample_id in AYE-T_Diclo375_r2; do \ java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fastq.gz raw_data/${sample_id}_R2.fastq.gz trimmed/${sample_id}_R1.fastq.gz trimmed_unpaired/${sample_id}_R1.fastq.gz trimmed/${sample_id}_R2.fastq.gz trimmed_unpaired/${sample_id}_R2.fastq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log; done -
(Optional) using trinity to find the most closely reference
#Trinity --seqType fq --max_memory 50G --left trimmed/wt_r1_R1.fastq.gz --right trimmed/wt_r1_R2.fastq.gz --CPU 12 #https://www.genome.jp/kegg/tables/br08606.html#prok acb KGB Acinetobacter baumannii ATCC 17978 2007 GenBank abm KGB Acinetobacter baumannii SDF 2008 GenBank aby KGB Acinetobacter baumannii AYE 2008 GenBank --> * abc KGB Acinetobacter baumannii ACICU 2008 GenBank abn KGB Acinetobacter baumannii AB0057 2008 GenBank abb KGB Acinetobacter baumannii AB307-0294 2008 GenBank abx KGB Acinetobacter baumannii 1656-2 2012 GenBank abz KGB Acinetobacter baumannii MDR-ZJ06 2012 GenBank abr KGB Acinetobacter baumannii MDR-TJ 2012 GenBank abd KGB Acinetobacter baumannii TCDC-AB0715 2012 GenBank abh KGB Acinetobacter baumannii TYTH-1 2012 GenBank abad KGB Acinetobacter baumannii D1279779 2013 GenBank abj KGB Acinetobacter baumannii BJAB07104 2013 GenBank abab KGB Acinetobacter baumannii BJAB0715 2013 GenBank abaj KGB Acinetobacter baumannii BJAB0868 2013 GenBank abaz KGB Acinetobacter baumannii ZW85-1 2013 GenBank abk KGB Acinetobacter baumannii AbH12O-A2 2014 GenBank abau KGB Acinetobacter baumannii AB030 2014 GenBank abaa KGB Acinetobacter baumannii AB031 2014 GenBank abw KGB Acinetobacter baumannii AC29 2014 GenBank abal KGB Acinetobacter baumannii LAC-4 2015 GenBank #Note that the Acinetobacter baumannii strain ATCC 19606 chromosome, complete genome (GenBank: CU459141.1) was choosen as reference! -
Preparing samplesheet.csv
sample,fastq_1,fastq_2,strandedness Urine_r1,Urine_r1_R1.fq.gz,Urine_r1_R2.fq.gz,auto ... -
Downloading CU459141.fasta and CU459141.gff from GenBank and preparing CU459141_m.gff
#Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/ #Default NOT_WORKING: --gtf_group_features 'gene_id' --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'exon' #(host_env) !NOT_WORKING! jhuang@WS-2290C:~/DATA/Data_Tam_RNAseq_2024$ /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CU459141.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024/CU459141.gff" -profile docker -resume --max_cpus 55 --max_memory 512.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'star_salmon' --gtf_group_features 'gene_id' --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript' # -- DEBUG_1 (CDS --> exon in CP059040.gff) -- #Checking the record (see below) in results/genome/CP059040.gtf #In ./results/genome/CP059040.gtf e.g. "CP059040.1 Genbank transcript 1 1398 . + . transcript_id "gene-H0N29_00005"; gene_id "gene-H0N29_00005"; gene_name "dnaA"; Name "dnaA"; gbkey "Gene"; gene "dnaA"; gene_biotype "protein_coding"; locus_tag "H0N29_00005";" #--featurecounts_feature_type 'transcript' returns only the tRNA results #Since the tRNA records have "transcript and exon". In gene records, we have "transcript and CDS". replace the CDS with exon grep -P "\texon\t" CP059040.gff | sort | wc -l #96 grep -P "cmsearch\texon\t" CP059040.gff | wc -l #=10 ignal recognition particle sRNA small typ, transfer-messenger RNA, 5S ribosomal RNA grep -P "Genbank\texon\t" CP059040.gff | wc -l #=12 16S and 23S ribosomal RNA grep -P "tRNAscan-SE\texon\t" CP059040.gff | wc -l #tRNA 74 wc -l star_salmon/AUM_r3/quant.genes.sf #--featurecounts_feature_type 'transcript' results in 96 records! grep -P "\tCDS\t" CU459141.gff3 | wc -l #3659 sed 's/\tCDS\t/\texon\t/g' CU459141.gff3 > CU459141_m.gff grep -P "\texon\t" CU459141_m.gff | sort | wc -l #3760 # -- DEBUG_2: combination of 'CU459141_m.gff' and 'exon' results in ERROR, using 'transcript' instead! --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141_m.gff" --featurecounts_feature_type 'transcript' # -- DEBUG_3: make sure the header of fasta is the same to the *_m.gff file -
nextflow run
# ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ---- (host_env) mv trimmed/*.fastq.gz . (host_env) nextflow run nf-core/rnaseq -r 3.14.0 -profile docker \–input samplesheet.csv –outdir results –fasta “/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141.fasta” –gff “/home/jhuang/DATA/Data_Tam_RNAseq_2026_on_AYE/CU459141_m.gff” -resume –max_cpus 90 –max_memory 900.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –gtf_group_features ‘gene_id’ –gtf_extra_attributes ‘gene_name’ –featurecounts_group_type ‘gene_biotype’ –featurecounts_feature_type ‘transcript’
-
Import data and pca-plot
#TODO_AFTERNOON: at least sent today a PCA, am besten all comparison tables! #mamba activate r_env #install.packages("ggfun") # Import the required libraries library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library(gplots) library(tximport) library(DESeq2) #library("org.Hs.eg.db") library(dplyr) library(tidyverse) #install.packages("devtools") #devtools::install_version("gtable", version = "0.3.0") library(gplots) library("RColorBrewer") #install.packages("ggrepel") library("ggrepel") # install.packages("openxlsx") library(openxlsx) library(EnhancedVolcano) library(DESeq2) library(edgeR) setwd("~/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon") # Define paths to your Salmon output quantification files # Store sample names in a character vector samples <- c( "AYE-O_Azi20_solid_r1", "AYE-O_Azi20_solid_r2", "AYE-O_Azi20_solid_r3", "AYE-O_ctr_r1", "AYE-O_ctr_r2", "AYE-O_ctr_r3", "AYE-O_ctr_solid_r1", "AYE-O_ctr_solid_r2", "AYE-O_ctr_solid_r3", "AYE-O_Diclo375_r1", "AYE-O_Diclo375_r2", "AYE-O_Diclo375_r3", "AYE-O_Mero0.5_r1", "AYE-O_Mero0.5_r2", "AYE-O_Mero0.5_r3", "AYE-O_Rifampicin2_r1", "AYE-O_Rifampicin2_r2", "AYE-O_Rifampicin2_r3", "AYE-T_Azi20_solid_r1", "AYE-T_Azi20_solid_r2", "AYE-T_Azi20_solid_r3", "AYE-T_ctr_r1", "AYE-T_ctr_r2", "AYE-T_ctr_r3", "AYE-T_ctr_solid_r1", "AYE-T_ctr_solid_r2", "AYE-T_ctr_solid_r3", "AYE-T_Diclo375_r1", "AYE-T_Diclo375_r2", "AYE-T_Diclo375_r3", "AYE-T_Mero0.15_r1", "AYE-T_Mero0.15_r2", "AYE-T_Mero0.15_r3", "AYE-T_Rifampicin2_r1", "AYE-T_Rifampicin2_r2", "AYE-T_Rifampicin2_r3", "AYE-WT_Azi20_solid_r1", "AYE-WT_Azi20_solid_r2", "AYE-WT_Azi20_solid_r3", "AYE-WT_ctr_r1", "AYE-WT_ctr_r2", "AYE-WT_ctr_r3", "AYE-WT_ctr_solid_r1", "AYE-WT_ctr_solid_r2", "AYE-WT_ctr_solid_r3", "AYE-WT_Diclo1250_solid_r1", "AYE-WT_Diclo1250_solid_r2", "AYE-WT_Diclo1250_solid_r3", "AYE-WT_Diclo750_r1", "AYE-WT_Diclo750_r2", "AYE-WT_Diclo750_r3", "AYE-WT_Mero0.35-0.5_r1", "AYE-WT_Mero0.35-0.5_r2", "AYE-WT_Mero0.35-0.5_r3", "AYE-WT_Rifampicin1.5_r1", "AYE-WT_Rifampicin1.5_r2", "AYE-WT_Rifampicin1.5_r3", "F_Azi20_solid_r1", "F_Azi20_solid_r2", "F_Azi20_solid_r3", "F_ctr_solid_r1", "F_ctr_solid_r2", "F_ctr_solid_r3", "O-Trans_ctr_r1", "O-Trans_ctr_r2", "O-Trans_ctr_r3", "O-Trans_Diclo375_r1", "O-Trans_Diclo375_r2", "O-Trans_Diclo375_r3", "O-Trans_Mero0.25_r1", "O-Trans_Mero0.25_r2", "O-Trans_Mero0.25_r3", "O-Trans_Rifampicin2_r1", "O-Trans_Rifampicin2_r2", "O-Trans_Rifampicin2_r3", "WT-Trans_ctr_r1", "WT-Trans_ctr_r2", "WT-Trans_ctr_r3", "WT-Trans_Diclo750_r1", "WT-Trans_Diclo750_r2", "WT-Trans_Diclo750_r3" ) ## Automatically generate the named vector files <- setNames(paste0("./", samples, "/quant.sf"), samples) # ----------------------------------------------------------------- # ---- Step 1: Create Detailed Metadata from Your Sample Names ---- # Extract metadata from sample names samples <- names(files) # Parse the complex sample names metadata <- data.frame( sample = samples, stringsAsFactors = FALSE ) # Extract strain (everything before first underscore or hyphen treatment) metadata$strain <- sapply(strsplit(samples, "[-_]"), function(x) { if(x[1] %in% c("AYE", "O", "WT", "F")) { if(x[1] == "AYE" && length(x) > 1 && x[2] %in% c("WT", "T", "O")) { paste(x[1:2], collapse = "-") } else if(x[1] %in% c("O", "WT") && x[2] == "Trans") { paste(x[1:2], collapse = "-") } else { x[1] } } else { x[1] } }) # Extract treatment type metadata$treatment <- sapply(samples, function(x) { if(grepl("_ctr", x)) return("ctrl") if(grepl("Diclo", x)) return("Diclo") if(grepl("Mero", x)) return("Mero") if(grepl("Azi", x)) return("Azi") if(grepl("Rifampicin", x)) return("Rifampicin") return("ctrl") }) # Extract concentration metadata$concentration <- sapply(samples, function(x) { if(grepl("Diclo1250", x)) return("1250") if(grepl("Diclo750", x)) return("750") if(grepl("Diclo375", x)) return("375") if(grepl("Mero0.5", x)) return("0.5") if(grepl("Mero0.35", x)) return("0.35") if(grepl("Mero0.25", x)) return("0.25") if(grepl("Mero0.15", x)) return("0.15") if(grepl("Azi20", x)) return("20") if(grepl("Rifampicin2", x)) return("2") if(grepl("Rifampicin1.5", x)) return("1.5") return("0") }) # Extract condition (solid vs liquid) metadata$condition <- ifelse(grepl("_solid", samples), "solid", "liquid") # Extract replicate metadata$replicate <- sapply(strsplit(samples, "_"), function(x) { rep_part <- x[length(x)] gsub("r", "", rep_part) }) # Create combined group for easy comparisons metadata$group <- paste(metadata$strain, metadata$treatment, metadata$concentration, sep = "_") # Set row names rownames(metadata) <- metadata$sample # Reorder to match txi columns metadata <- metadata[colnames(txi$counts), ] # --------------------------------------------- # ---- Step 2: Choose Your Design Strategy ---- # Strategy A: Full Factorial Design (if balanced) dds <- DESeqDataSetFromTximport(txi, metadata, design = ~ strain + treatment + condition) # --> Strategy B: Combined Group Factor ⭐ RECOMMENDED metadata$group <- factor(paste(metadata$strain, metadata$treatment, metadata$concentration, metadata$condition, sep = "_")) dds <- DESeqDataSetFromTximport(txi, metadata, design = ~ group) dds <- DESeq(dds) # See all available comparisons resultsNames(dds) # ------------------------------------------------------------- # ---- Step 3: Set Up Specific Comparisons from Your Notes ---- # ========================================== # 1. Define Exact Comparisons from Your Notes # ========================================== planned_comparisons <- list( # --- Baseline / Strain Controls --- AYE_T_ctr_vs_AYE_WT_ctr = list(treat = "AYE-T_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_O_ctr_vs_AYE_WT_ctr = list(treat = "AYE-O_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), O_Trans_ctr_vs_AYE_WT_ctr = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), WT_Trans_ctr_vs_AYE_WT_ctr = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-WT_ctrl_0_liquid"), AYE_O_ctr_vs_AYE_T = list(treat = "AYE-O_ctrl_0_liquid", ctrl = "AYE-T_ctrl_0_liquid"), O_Trans_ctr_vs_AYE_T = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-T_ctrl_0_liquid"), WT_Trans_ctr_vs_AYE_T = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-T_ctrl_0_liquid"), # --- Condition Effects (Solid vs Liquid) --- AYE_WT_ctr_solid_vs_AYE_WT_ctr = list(treat = "AYE-WT_ctrl_0_solid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_O_ctr_solid_vs_AYE_O_ctr = list(treat = "AYE-O_ctrl_0_solid", ctrl = "AYE-O_ctrl_0_liquid"), AYE_T_ctr_solid_vs_AYE_T_ctr = list(treat = "AYE-T_ctrl_0_solid", ctrl = "AYE-T_ctrl_0_liquid"), AYE_O_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-O_ctrl_0_solid", ctrl = "AYE-WT_ctrl_0_solid"), AYE_T_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-T_ctrl_0_solid", ctrl = "AYE-WT_ctrl_0_solid"), # --- Diclofenac --- AYE_WT_Diclo750_vs_AYE_WT_ctr = list(treat = "AYE-WT_Diclo_750_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_T_Diclo375_vs_AYE_WT_ctr = list(treat = "AYE-T_Diclo_375_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_O_Diclo375_vs_AYE_WT_ctr = list(treat = "AYE-O_Diclo_375_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), O_Trans_Diclo375_vs_AYE_WT_ctr = list(treat = "O-Trans_Diclo_375_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), WT_Trans_Diclo750_vs_AYE_WT_ctr = list(treat = "WT-Trans_Diclo_750_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), Diclo_AYE_WT_1250_solid_vs_solid_ctr = list(treat = "AYE-WT_Diclo_1250_solid", ctrl = "AYE-WT_ctrl_0_solid"), # --- Meropenem --- AYE_WT_Mero_vs_AYE_WT_ctr = list(treat = "AYE-WT_Mero_0.35_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_T_Mero_vs_AYE_WT_ctr = list(treat = "AYE-T_Mero_0.15_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_O_Mero_vs_AYE_WT_ctr = list(treat = "AYE-O_Mero_0.5_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), O_Trans_Mero_vs_AYE_WT_ctr = list(treat = "O-Trans_Mero_0.25_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_T_Mero_vs_AYE_T_ctr = list(treat = "AYE-T_Mero_0.15_liquid", ctrl = "AYE-T_ctrl_0_liquid"), # --- Azithromycin (Solid) --- AYE_WT_Azi_vs_solid_ctr = list(treat = "AYE-WT_Azi_20_solid", ctrl = "AYE-WT_ctrl_0_solid"), AYE_T_Azi_vs_solid_ctr = list(treat = "AYE-T_Azi_20_solid", ctrl = "AYE-T_ctrl_0_solid"), AYE_O_Azi_vs_solid_ctr = list(treat = "AYE-O_Azi_20_solid", ctrl = "AYE-O_ctrl_0_solid"), F_Azi_vs_F_solid_ctr = list(treat = "F_Azi_20_solid", ctrl = "F_ctrl_0_solid"), # --- Rifampicin --- AYE_WT_Rif_vs_AYE_WT_ctr = list(treat = "AYE-WT_Rifampicin_1.5_liquid", ctrl = "AYE-WT_ctrl_0_liquid"), AYE_T_Rif_vs_AYE_T_ctr = list(treat = "AYE-T_Rifampicin_2_liquid", ctrl = "AYE-T_ctrl_0_liquid"), AYE_O_Rif_vs_AYE_O_ctr = list(treat = "AYE-O_Rifampicin_2_liquid", ctrl = "AYE-O_ctrl_0_liquid"), O_Trans_Rif_vs_O_Trans_ctr = list(treat = "O-Trans_Rifampicin_2_liquid", ctrl = "O-Trans_ctrl_0_liquid") ) # ========================================== # 2. Verification & Validation Script # ========================================== # Identify which column in colData holds your group names group_col <- if("group" %in% colnames(colData(dds))) "group" else if("treatment" %in% colnames(colData(dds))) "treatment" else stop("❌ Please specify the correct colData column containing group names.") actual_groups <- unique(colData(dds)[[group_col]]) cat("\n", paste(rep("=", 85), collapse=""), "\n") cat("📋 VERIFICATION OF NOTE-DERIVED COMPARISONS\n") cat(paste(rep("=", 85), collapse=""), "\n\n") validation_results <- data.frame( Comparison_Name = character(), Treatment_String = character(), Control_String = character(), Status = character(), Suggested_Contrast = character(), stringsAsFactors = FALSE ) for(name in names(planned_comparisons)) { trt <- planned_comparisons[[name]]$treat ctl <- planned_comparisons[[name]]$ctrl # Find closest matches in actual data trt_match <- actual_groups[grepl(trt, actual_groups, fixed = TRUE)] ctl_match <- actual_groups[grepl(ctl, actual_groups, fixed = TRUE)] status <- if(length(trt_match) > 0 && length(ctl_match) > 0) "✅ VALID" else "⚠️ CHECK" contrast_str <- if(status == "✅ VALID") paste0('c("', group_col, '", "', trt_match[1], '", "', ctl_match[1], '")') else "N/A" validation_results <- rbind(validation_results, data.frame( Comparison_Name = name, Treatment_String = trt, Control_String = ctl, Status = status, Suggested_Contrast = contrast_str, stringsAsFactors = FALSE )) cat(sprintf("%-45s | T:%-25s C:%-20s | %s\n", name, trt, ctl, status)) if(status == "⚠️ CHECK") { if(length(trt_match) == 0) cat(" 🔍 Treat not found. Closest: ", paste(head(actual_groups[grepl(strsplit(trt, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n") if(length(ctl_match) == 0) cat(" 🔍 Ctrl not found. Closest: ", paste(head(actual_groups[grepl(strsplit(ctl, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n") } } # ========================================== # 3. Auto-Generate DESeq2 results() Calls (Optional) # ========================================== valid_comparisons <- validation_results[validation_results$Status == "✅ VALID", ] if(nrow(valid_comparisons) > 0) { cat("\n📜 READY-TO-RUN DESeq2 CONTRASTS:\n") cat(paste(rep("-", 60), collapse=""), "\n") for(i in seq_len(nrow(valid_comparisons))) { cat(sprintf('res_%s <- results(dds, contrast = %s)\n', gsub("[^A-Za-z0-9]", "_", valid_comparisons$Comparison_Name[i]), valid_comparisons$Suggested_Contrast[i])) } } else { cat("\n⚠️ No exact matches found. Check your colData group naming convention.\n") } # ----------------------------- # ---- Step 4: PCA figures ---- # 🔍 What each figure shows: # # 01_PCA_by_Strain.png → Tests if genetic background (AYE-WT, AYE-T, AYE-O, Trans, F) is the dominant source of variation. # 02_PCA_by_Treatment.png → Shows clustering by antibiotic/drug exposure (ctrl, Diclo, Mero, Azi, Rifampicin). # 03_PCA_by_Condition.png → Reveals batch/growth media effects (solid vs liquid). # 04_PCA_CombinedGroups.png → Full experimental grouping with labeled sample names for quick outlier detection. # 05_PCA_Ellipses.png → Adds 95% confidence boundaries per strain to visualize group spread and overlap. # # ⚠️ Quick Checklist Before Running: # # Ensure metadata columns (strain, treatment, condition, group) are attached to colData(dds). # If ggrepel is missing, run install.packages("ggrepel"). # All PNGs will save to your current working directory (getwd()). # Install if missing: install.packages(c("ggplot2", "ggrepel")) library(DESeq2) library(ggplot2) library(ggrepel) # 1. Variance Stabilizing Transformation & Extract PCA Data vsd <- vst(dds, blind = FALSE) pca_data <- plotPCA(vsd, intgroup = c("strain", "treatment", "condition", "group"), returnData = TRUE) percent_var <- round(100 * attr(pca_data, "percentVar")) # Consistent theme for all plots base_theme <- theme_bw(base_size = 12) + theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 13), legend.position = "right", legend.title = element_text(face = "bold"), panel.grid.major = element_line(color = "grey90"), panel.grid.minor = element_blank()) # --- Plot 1: Colored by Strain --- p1 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, shape = condition)) + geom_point(size = 3, alpha = 0.8) + geom_text_repel(aes(label = name), size = 2.5, max.overlaps = 20, show.legend = FALSE) + labs(x = paste0("PC1: ", percent_var[1], "% variance"), y = paste0("PC2: ", percent_var[2], "% variance"), title = "PCA: Samples Colored by Strain", color = "Strain", shape = "Condition") + base_theme ggsave("01_PCA_by_Strain.png", p1, width = 8, height = 6, dpi = 300) # --- Plot 2: Colored by Treatment --- p2 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = treatment, shape = condition)) + geom_point(size = 3, alpha = 0.8) + labs(x = paste0("PC1: ", percent_var[1], "% variance"), y = paste0("PC2: ", percent_var[2], "% variance"), title = "PCA: Samples Colored by Treatment", color = "Treatment", shape = "Condition") + base_theme ggsave("02_PCA_by_Treatment.png", p2, width = 8, height = 6, dpi = 300) # --- Plot 3: Colored by Condition (Solid vs Liquid) --- p3 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = condition, shape = strain)) + geom_point(size = 3, alpha = 0.8) + labs(x = paste0("PC1: ", percent_var[1], "% variance"), y = paste0("PC2: ", percent_var[2], "% variance"), title = "PCA: Samples Colored by Growth Condition", color = "Condition", shape = "Strain") + base_theme ggsave("03_PCA_by_Condition.png", p3, width = 8, height = 6, dpi = 300) # --- Plot 4: Combined Groups with Sample Labels --- p4 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = group)) + geom_point(size = 3, alpha = 0.8) + geom_text_repel(aes(label = name), size = 2, max.overlaps = 30, box.padding = 0.3) + labs(x = paste0("PC1: ", percent_var[1], "% variance"), y = paste0("PC2: ", percent_var[2], "% variance"), title = "PCA: Combined Experimental Groups", color = "Group") + base_theme + theme(legend.position = "none") ggsave("04_PCA_CombinedGroups.png", p4, width = 9, height = 7, dpi = 300) # --- Plot 5: 95% Confidence Ellipses (by Strain) --- p5 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, fill = strain)) + geom_point(size = 3, alpha = 0.7) + stat_ellipse(level = 0.95, alpha = 0.2, geom = "polygon", show.legend = FALSE) + labs(x = paste0("PC1: ", percent_var[1], "% variance"), y = paste0("PC2: ", percent_var[2], "% variance"), title = "PCA: 95% Confidence Ellipses by Strain", color = "Strain", fill = "Strain") + base_theme ggsave("05_PCA_Ellipses.png", p5, width = 8, height = 6, dpi = 300) message("✅ All 5 PCA plots saved to working directory!") -
Run Differential Expression & PCA Analysis Complete
(r_env) jhuang@WS-2290C:/mnt/md1/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon$ Rscript complete_deg_pipeline.R
Variant Calling: 229E Passaging Dataset (2024–2026) – Inter- & Intra-host Analysis using SPANDx and VPhaser2, only one deletion generated by snippy
-
Input data:
# ---- Datasets 2024 (in total 4) ---- ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R1.fastq.gz hCoV229E_Rluc_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R2.fastq.gz hCoV229E_Rluc_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R1.fastq.gz DMSO_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R2.fastq.gz DMSO_p10_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R1.fastq.gz K22_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R2.fastq.gz K22_p10_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R1.fastq.gz X7523_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R2.fastq.gz X7523_p10_R2.fastq.gz # ---- Datasets 2025 (in total 3) ---- ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R1_001.fastq.gz DMSO_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R2_001.fastq.gz DMSO_p16_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R1_001.fastq.gz K22_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R2_001.fastq.gz K22_p16_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R1_001.fastq.gz X7523_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R2_001.fastq.gz X7523_p16_R2.fastq.gz # ---- Datasets 2026 (in total 3) ---- ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R1.fastq.gz DMSO_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R2.fastq.gz DMSO_p26_R2.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R1.fastq.gz K22_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R2.fastq.gz K22_p26_R2.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R1.fastq.gz X7523_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R2.fastq.gz X7523_p26_R2.fastq.gz -
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .; ln -s ~/Tools/bacto/envs/ .; ln -s ~/Tools/bacto/local/ .; cp ~/Tools/bacto/Snakefile .; cp ~/Tools/bacto/bacto-0.1.json .; cp ~/Tools/bacto/cluster.json .; #download CU459141.gb from GenBank mv ~/Downloads/sequence\(2\).gb db/PP810610.gb #setting the following in bacto-0.1.json "fastqc": false, "taxonomic_classifier": false, "assembly": true, "typing_ariba": false, "typing_mlst": true, "pangenome": true, "variants_calling": true, "phylogeny_fasttree": true, "phylogeny_raxml": true, "recombination": false, (due to gubbins-error set false) "genus": "Alphacoronavirus", "kingdom": "Viruses", "species": "Human coronavirus 229E", "mykrobe": { "species": "corona" }, "reference": "db/PP810610.gb" mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3 (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds -
Summarize all SNPs and Indels from the snippy result directory.
cp ~/Scripts/summarize_snippy_res_ordered.py . # IMPORTANT_ADAPT the array isolates = ["hCoV229E_Rluc", "DMSO_p10", "K22_p10", "X7523_p10", "DMSO_p16", "K22_p16", "X7523_p16", "DMSO_p26", "K22_p26", "X7523_p26"] mamba activate plot-numpy1 python3 ./summarize_snippy_res_ordered.py snippy #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv cd snippy #REMOVE_the_line? I don't find the sence of the line: grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv -
Using spandx calling variants (almost the same results to the one from viral-ngs!)
mamba deactivate mamba activate /home/jhuang/miniconda3/envs/spandx mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610 cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 #-d ~/Scripts/genbank2fasta.py PP810610.gb mv PP810610.gb_converted.fna PP810610.fasta #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file ln -s /home/jhuang/Tools/spandx/ spandx (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref PP810610.fasta --annotation --database PP810610 -resume # RERUN SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation --> IT WORKS! cd Outputs/Master_vcf conda activate /home/jhuang/miniconda3/envs/spandx (spandx) cp -r ../../snippy/hCoV229E_Rluc/reference . (spandx) cp ../../spandx/bin/SNP_matrix.sh ./ #Note that ${variant_genome_path}=NC_001348 in the following command, but it was not used after the following command modification. #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh (spandx) bash SNP_matrix.sh PP810610 . -
Calling inter-host variants by merging the results from snippy+spandx (Manually! Note that the sample order of two tools are different, we should reorder the SNP-columns of spandx-results.)
# Inter-host variants(宿主间变异):一种病毒在两个人之间有不同的基因变异,这些变异可能与宿主的免疫反应、疾病表现或病毒传播的方式相关。 cp All_SNPs_indels_annotated.txt All_SNPs_indels_annotated_backup.txt vim All_SNPs_indels_annotated.txt #in the file ids: grep "$(echo -e '\t')353$(echo -e '\t')" All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt #Replace \n with " All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt\ngrep " #Replace grep " --> grep "$(echo -e '\t') #Replace " All_ --> $(echo -e '\t')" All_ # Potential intra-host variants: 10871, 19289, 21027, 23435. CHROM POS REF ALT TYPE hCoV229E_Rluc DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26 Effect Impact Functional_Class Codon_change Protein_and_nucleotide_change Amino_Acid_Length Gene_name Biotype PP810610 1464 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE gTt/gCt p.Val416Ala/c.1247T>C 6757 CDS_1 protein_coding PP810610 1492 T A SNP T/A T/A T/A T/A T/A T/A T/A T T/A T/A synonymous_variant LOW SILENT acT/acA p.Thr425Thr/c.1275T>A 6757 CDS_1 protein_coding PP810610 1699 C T SNP T T T T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val494Val/c.1482C>T 6757 CDS_1 protein_coding PP810610 4809 A C SNP A A A A A A A/C A A A/C missense_variant MODERATE MISSENSE gAt/gCt p.Asp1531Ala/c.4592A>C 6757 CDS_1 protein_coding PP810610 6470 C T SNP C C C C C C C C C/T C synonymous_variant LOW SILENT Ctg/Ttg p.Leu2085Leu/c.6253C>T 6757 CDS_1 protein_coding PP810610 6691 C T SNP T T T T T T T T T T synonymous_variant LOW SILENT tgC/tgT p.Cys2158Cys/c.6474C>T 6757 CDS_1 protein_coding PP810610 6919 C G SNP G G G G G G G G G G synonymous_variant LOW SILENT ggC/ggG p.Gly2234Gly/c.6702C>G 6757 CDS_1 protein_coding PP810610 7294 T A SNP A A A A A A A A A A missense_variant MODERATE MISSENSE agT/agA p.Ser2359Arg/c.7077T>A 6757 CDS_1 protein_coding PP810610 8289 C A SNP C/A C/A C C/A C C C/A C C C/A missense_variant MODERATE MISSENSE gCc/gAc p.Ala2691Asp/c.8072C>A 6757 CDS_1 protein_coding PP810610 8294 A G SNP A/G A A/G A A A/G A A A/G A missense_variant MODERATE MISSENSE Aaa/Gaa p.Lys2693Glu/c.8077A>G 6757 CDS_1 protein_coding PP810610 8376 G T SNP G/T G G G G G G G G G missense_variant MODERATE MISSENSE cGt/cTt p.Arg2720Leu/c.8159G>T 6757 CDS_1 protein_coding PP810610 9146 T C SNP T T T T/C T T T/C T T T missense_variant MODERATE MISSENSE Ttt/Ctt p.Phe2977Leu/c.8929T>C 6757 CDS_1 protein_coding PP810610 9174 G A SNP G G G G/A G G G/A G G G missense_variant MODERATE MISSENSE tGc/tAc p.Cys2986Tyr/c.8957G>A 6757 CDS_1 protein_coding PP810610 9261 C T SNP C C C C C C C C C/T C missense_variant MODERATE MISSENSE gCt/gTt p.Ala3015Val/c.9044C>T 6757 CDS_1 protein_coding PP810610 10145 A G SNP A A A A/G A A A/G A A A/G missense_variant MODERATE MISSENSE Atg/Gtg p.Met3310Val/c.9928A>G 6757 CDS_1 protein_coding PP810610 10239 T G SNP T T T/G T T T/G T T T T missense_variant MODERATE MISSENSE gTg/gGg p.Val3341Gly/c.10022T>G 6757 CDS_1 protein_coding PP810610 10310 G A SNP G G G G/A G G G/A G G G/A missense_variant MODERATE MISSENSE Gtt/Att p.Val3365Ile/c.10093G>A 6757 CDS_1 protein_coding PP810610 10432 C T SNP C C C C C C C C C C/T synonymous_variant LOW SILENT ttC/ttT p.Phe3405Phe/c.10215C>T 6757 CDS_1 protein_coding PP810610 10871 C T SNP C C/T T C/T C/T T C/T C T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding PP810610 10898 G A SNP G G/A G G/A G G G/A G G G/A missense_variant MODERATE MISSENSE Ggc/Agc p.Gly3561Ser/c.10681G>A 6757 CDS_1 protein_coding PP810610 11418 C A SNP C C C C C C C C C/A C missense_variant MODERATE MISSENSE aCt/aAt p.Thr3734Asn/c.11201C>A 6757 CDS_1 protein_coding PP810610 11577 A C SNP A A/C A A A/C A A A/C A A missense_variant MODERATE MISSENSE gAa/gCa p.Glu3787Ala/c.11360A>C 6757 CDS_1 protein_coding PP810610 14472 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE aTg/aCg p.Met4752Thr/c.14255T>C 6757 CDS_1 protein_coding PP810610 15270 G A SNP G G G G G G G G G G/A missense_variant MODERATE MISSENSE cGa/cAa p.Arg5018Gln/c.15053G>A 6757 CDS_1 protein_coding PP810610 15458 T C SNP C C C C C C C C C C synonymous_variant LOW SILENT Ttg/Ctg p.Leu5081Leu/c.15241T>C 6757 CDS_1 protein_coding PP810610 16035 C A SNP A A A A A A A A A A stop_gained HIGH NONSENSE tCa/tAa p.Ser5273*/c.15818C>A 6757 CDS_1 protein_coding PP810610 17119 A G SNP A A A A A A A A A A/G synonymous_variant LOW SILENT aaA/aaG p.Lys5634Lys/c.16902A>G 6757 CDS_1 protein_coding PP810610 17430 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu5738Ser/c.17213T>C 6757 CDS_1 protein_coding PP810610 18640 T G SNP T T T T/G T T T/G T T T/G missense_variant MODERATE MISSENSE tgT/tgG p.Cys6141Trp/c.18423T>G 6757 CDS_1 protein_coding PP810610 18646 C T SNP C C C C/T C C C C C C synonymous_variant LOW SILENT taC/taT p.Tyr6143Tyr/c.18429C>T 6757 CDS_1 protein_coding PP810610 18701 A G SNP A A A A/G A A A/G A A A missense_variant MODERATE MISSENSE Aca/Gca p.Thr6162Ala/c.18484A>G 6757 CDS_1 protein_coding PP810610 19028 C T SNP C C C C/T C C C C C C stop_gained HIGH NONSENSE Caa/Taa p.Gln6271*/c.18811C>T 6757 CDS_1 protein_coding PP810610 19289 G T SNP G G T G G G/T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding PP810610 19294 C T SNP C C C C C C/T C/T C C/T C synonymous_variant LOW SILENT taC/taT p.Tyr6359Tyr/c.19077C>T 6757 CDS_1 protein_coding PP810610 21027 C T SNP C C/T C C/T C/T C C/T T C C/T missense_variant MODERATE MISSENSE aCt/aTt p.Thr178Ile/c.533C>T 1173 CDS_2 protein_coding PP810610 21183 T G SNP G G G G G G G G G G missense_variant MODERATE MISSENSE tTt/tGt p.Phe230Cys/c.689T>G 1173 CDS_2 protein_coding PP810610 21633 T C SNP T T/C T T T T T T T T missense_variant MODERATE MISSENSE gTt/gCt p.Val380Ala/c.1139T>C 1173 CDS_2 protein_coding PP810610 22215 T G SNP T T T T/G T T T/G T T T missense_variant MODERATE MISSENSE gTt/gGt p.Val574Gly/c.1721T>G 1173 CDS_2 protein_coding PP810610 22487 A G SNP A A A A A A/G A A A/G A missense_variant MODERATE MISSENSE Agt/Ggt p.Ser665Gly/c.1993A>G 1173 CDS_2 protein_coding PP810610 22630 C T SNP C C C C C C C C C C/T synonymous_variant LOW SILENT taC/taT p.Tyr712Tyr/c.2136C>T 1173 CDS_2 protein_coding PP810610 22636 T G SNP G G G G G G G G G G missense_variant MODERATE MISSENSE aaT/aaG p.Asn714Lys/c.2142T>G 1173 CDS_2 protein_coding PP810610 22763 G T SNP G G G G G G G G G/T G missense_variant MODERATE MISSENSE Gct/Tct p.Ala757Ser/c.2269G>T 1173 CDS_2 protein_coding PP810610 22788 T C SNP T T T T T T/C T T/C T/C T missense_variant MODERATE MISSENSE gTt/gCt p.Val765Ala/c.2294T>C 1173 CDS_2 protein_coding PP810610 23022 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu843Ser/c.2528T>C 1173 CDS_2 protein_coding PP810610 23435 C T SNP C C T C/T C C/T C/T C C/T C missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding PP810610 24781 C T,G SNP T T T T T T T T T T/G missense_variant MODERATE MISSENSE aCt/aGt p.Thr36Ser/c.107C>G 77 CDS_5 protein_coding PP810610 24971 A G SNP A A A A A A A A A/G A missense_variant MODERATE MISSENSE Aat/Gat p.Asn18Asp/c.52A>G 225 CDS_6 protein_coding PP810610 25025 C T SNP C C/T C C C/T C C C/T C C/T missense_variant MODERATE MISSENSE Cac/Tac p.His36Tyr/c.106C>T 225 CDS_6 protein_coding PP810610 25163 C T SNP T T T T T T T T T T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu82Phe/c.244C>T 225 CDS_6 protein_coding PP810610 25264 C T SNP T T T T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val115Val/c.345C>T 225 CDS_6 protein_coding PP810610 25592 T C SNP T T/C T T T/C T T T/C T T/C missense_variant MODERATE MISSENSE Ttc/Ctc p.Phe225Leu/c.673T>C 225 CDS_6 protein_coding PP810610 26104 C T SNP C C C C C C C C C C/T missense_variant MODERATE MISSENSE cCt/cTt p.Pro165Leu/c.494C>T 389 CDS_7 protein_coding PP810610 26838 G T SNP T T T T T T T T T T PP810610 24731 TGTGGTGCTTATA T DEL TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA T TGTGGTGCTTATA conservative_inframe_deletion c.61_72delGTGCTTATAGTG p.Val21_Val24del -
Calling intra-host variants using viral-ngs
# #Install Gap2Seq in the docker-env --> generated the new docker-env viral-ngs-fixed:la # bin/assembly.py gapfill_gap2seq tmp/02_assembly/hCoV229E_Rluc.assembly2-scaffolded.fasta data/01_per_sample/hCoV229E_Rluc.cleaned.bam tmp/02_assembly/# hCoV229E_Rluc.assembly2-gapfilled.fasta --memLimitGb 12 --maskErrors --randomSeed 0 # ----> go to the script tools/gap2seq.py, install the required tool (for example gap2seq) and adapt the correct version. # conda install -y -c bioconda gap2seq # root@f47daf7c44ee:/work# Gap2Seq -h # #>Gap2Seq 3.1 # vim /home/jhuang/Tools/viral-ngs_docker/bin/tools/gap2seq.py # #Adapt the TOOL_NAME and TOOL_VERSION in the script # # #Save the docker-env with newly installed Gap2Seq # exit # docker ps -a # docker commit f47daf7c44ee viral-ngs-fixed:la #调用新的 docker-env having Gap2Seq docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk viral-ngs-fixed:la bash cd /work conda activate viral-ngs-env snakemake --directory /work --printshellcmds --cores 80 # The complete running commands for the complete data in 2026 as follows: #-1- Run several times of docker viral-ngs-fixed:la so that all ${sample}.assembly2-scaffolded.fasta generated by running each time only a isolate (one record in samples-assembly.txt and empty in samples-runs.txt) docker run ... cd /work conda activate viral-ngs-env snakemake --directory /work --printshellcmds --cores 80 # --> Note that tmp/02_assembly/${sample}.assembly2-scaffolded.fasta and tmp/02_assembly/${sample}.assembly2-gapfilled.fasta were generated! #-2- Then run the following commands: tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --> tmp/02_assembly/${sample}.assembly3-modify.fasta for sample in DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26; do #MANUALLY running the following commands! bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index done #-3- Run docker for all isolates (full in amples-assembly.txt and samples-runs.txt). docker run ... cd /work conda activate viral-ngs-env snakemake --directory /work --printshellcmds --cores 80 -
Generate variant_annot.xls and coverages.xls
sudo chown -R jhuang:jhuang data # -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost cp isnvs.annot.txt isnvs.annot_complete.txt ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls #delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file. awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5 cut -f1-4 isnvs.annot_complete.csv > f1_4 cut -f6- isnvs.annot_complete.csv > f6_ paste f1_4 f5 > f1_5 paste f1_5 f6_ > isnvs_annot_complete_.txt #correct f5 in header of isnvs_annot_complete_.txt to iSNV_freq #Add header: chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete_.txt -d$'\t' -o variant_annot.xls #MANUALLY generate variant_annot_0.01.csv variant_annot_0.05.csv awk ' $5 >= 0.05 ' isnvs_annot_complete_.txt > 0.05.csv cut -f2 0.05.csv | uniq > ids_0.05 awk ' $5 >= 0.02 ' isnvs_annot_complete_.txt > 0.02.csv cut -f2 0.02.csv | uniq > ids_0.02 awk ' $5 >= 0.01 ' isnvs_annot_complete_.txt > 0.01.csv cut -f2 0.01.csv | uniq > ids_0.01 #Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.05.txt\ngrep -P "PP810610\\t' in ids_0.05 and then deleting the 'pos' line #Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.01.txt\ngrep -P "PP810610\\t' in ids_0.01 and then deleting the 'pos' line #Run ids_0.05 and ids_0.01 cp ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt hCoV229E_Rluc_variants #Manully adding the 3-way SNPs by getting original data from isnvs.annot.csv, for example, PP810610 8289 hCoV229E_Rluc C,A,T 0.325, 0.439 missense_variant 8072C>A,8072C>T Ala2691Asp,Ala2691Val 2691 6758 Gene_217_20492 XBA84229.1 # Delete the three records which already reported in intra-host results hCoV229E_Rluc_variants: they are 10871, 19289, 23435. PP810610 10871 C T SNP C C/T T C/T C/T T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding PP810610 19289 G T SNP G G T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding PP810610 23435 C T SNP C C T C/T C C/T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_0.02.txt All_SNPs_indels_annotated.txt -d$'\t' -o variant_annot_2026.xls #Modify sheetname to variant_annot_0.05 and variant_annot_0.01 and add the header in Excel file. #Note in the complete list, Set 2024 is NOT a subset of Set 2025 because the element 26283 is in set 2024 but missing from set 2025. # -- calculate the coverage samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt ~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls #draw coverage and see if they are continuous? samtools depth ./data/02_align_to_self/p16_DMSO.mapped.bam > p16_DMSO_cov.txt samtools depth ./data/02_align_to_self/p16_K22.mapped.bam > p16_K22_cov.txt samtools depth ./data/02_align_to_self/p16_X7523.mapped.bam > p16_K7523_cov.txt ~/Tools/csv2xls-0.4/csv_to_xls.py p16_DMSO_cov.txt p16_K22_cov.txt p16_K7523_cov.txt -d$'\t' -o coverages_p16.xls # Load required packages library(ggplot2) library(dplyr) # Read the coverage data cov_data <- read.table("p16_K7523_cov.txt", header = FALSE, sep = "\t", col.names = c("Chromosome", "Position", "Coverage")) # Create full position range for the given chromosome full_range <- data.frame(Position = seq(min(cov_data$Position), max(cov_data$Position))) # Merge with actual coverage data and fill missing positions with 0 cov_full <- full_range %>% left_join(cov_data[, c("Position", "Coverage")], by = "Position") %>% mutate(Coverage = ifelse(is.na(Coverage), 0, Coverage)) # Save the plot to PNG png("p16_K7523_coverage_filled.png", width = 1200, height = 600) ggplot(cov_full, aes(x = Position, y = Coverage)) + geom_line(color = "steelblue", size = 0.3) + labs(title = "Coverage Plot for p16_K7523 (Missing = 0)", x = "Genomic Position", y = "Coverage Depth") + theme_minimal() + theme( plot.title = element_text(hjust = 0.5), axis.text = element_text(size = 10), axis.title = element_text(size = 12) ) dev.off() -
Comparing intra- and inter-host variants (40 positions are common between spandx and vphaser2)
Please find attached the complete variant calling results for our 229E passaging experiment (2024–2026 datasets): variant_annot_2026__final.xls. • Sheet 1 (All_SNPs_Indels_annotated): Inter-host variants called by spandx. Shows consensus alleles at each variant position across all 10 samples relative to the PP810610 reference. • Sheet 2 (isnvs_annot): Intra-host variants called by vphaser2. Provides precise allele frequencies (iSNV_freq) per sample to quantify within-host diversity. Overlap: Green-highlighted rows indicate variants consistently detected by both pipelines. Sheet 2 is a subset of Sheet 1, focusing only on positions with intra-host heterogeneity. Fixed differences from the reference (where all 10 isolates share the same non-reference allele) remain in Sheet 1 (white rows). A few positions in Sheet 2 show iSNV_freq ≈ 1.0 (e.g., POS 15458, 22636). These reflect positions where vphaser2 detected minor variation in at least one sample, even if most samples show near-fixation. -
(Optional) Consensus sequences of each and of all isolates
cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa cp data/02_assembly/*.fasta ./ for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \ mv ${sample}.fasta ${sample}.fa cat all.fa ${sample}.fa >> all.fa done cat RSV_dedup.fa all.fa > RSV_all.fa mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln snp-sites RSV_all.aln -o RSV_all_.aln
Phylogenetic tree construction
scripts_tree_construction_v2.zip
conda activate /home/jhuang/miniconda3/envs/bengal3_ac3
1. Download genomes from Genbank to local disk
LOCAL_ONLY_ACCESSIONS=(“CP002773.1” “CP093323.1” “CP012097.1”, “CP014137.1”, “JBUDMV010000001.1”) saving in the directory local_Sply_fastas, GE11174.fasta should be ./
2. Replace EXTRA_ASSEMBLIES (manual edit or use sed/python as shown above)
EXTRA_ASSEMBLIES=(
=== Gibbsiella (core comparison) ===
"GCF_002291425.1" # G. quercinecans FRB97ᵀ
"GCF_004342245.1" # G. quercinecans DSM 25889
#"GCF_039539505.1" # G. papilionis PWX6
# "GCF_039539505.1": "Gibbsiella papilionis PWX6 (GCF_039539505.1)",
"JBUDMV010000001.1"
"GCA_039540155.1" # G. greigii USA56
#"GCA_004342265.1" # G. dentisursi DSM 23818ᵀ
# === Serratia (address biochemical mis-ID) ===
#mkdir -p local_Sply_fastas
#mv ~/Downloads/CP002773.1.fasta local_Sply_fastas/
#mv ~/Downloads/CP093323.1.fasta local_Sply_fastas/
#mv ~/Downloads/CP012097.1.fasta local_Sply_fastas/
#"CP002773.1" # S. plymuthica AS9
#"CP093323.1" # S. plymuthica B37/06
#"CP012097.1" # S. plymuthica 3Re4-18
"GCA_001663135.1" # S. odorifera 4Rx13 (already validated)
#"GCA_002951025.1" # S. fonticola CDC 470-73ᵀ
# === Close outgroup (same family Yersiniaceae) ===
#"GCF_005484965.1" # Brenneria nigrifluens LMG 5956ᵀ
# "GCF_005484965.1": "Brenneria nigrifluens LMG 5956 (GCF_005484965.1)"
#"GCF_000239205.1" # Brenneria goodwinii BC-1
"CP014137.1"
)
3. EXACT desired display labels (as requested)
EXACT display names
desired = { “JBVKFS000000000”: “Gibbsiella quercinecans HH131225 (JBVKFS000000000) ⭐”, “GCF_002291425.1”: “Gibbsiella quercinecans FRB97 (GCF_002291425.1)”, “GCF_004342245.1”: “Gibbsiella quercinecans DSM 25889 (GCF_004342245.1)”, “JBUDMV010000001.1”: “Gibbsiella dentisursi 2e (JBUDMV010000001)”, “GCA_039540155.1”: “Gibbsiella greigii USA56 (GCA_039540155.1)”, “CP002773.1”: “Serratia plymuthica AS9 (CP002773.1)”, “CP093323.1”: “Serratia plymuthica B37/06 (CP093323.1)”, “CP012097.1”: “Serratia plymuthica 3Re4-18 (CP012097.1)”, “GCA_001663135.1”: “Serratia odorifera 4Rx13 (GCA_001663135.1)”, “CP014137.1”: “Brenneria goodwinii FRB141 (CP014137.1)” }
4. Re-run the pipeline
./build_wgs_tree_fig3B_v3.sh
5. Verify output labels
cat work_wgs_tree/plot/labels.tsv
6. Delete the unnecessary isolates from labels.tsv, fastas and gffs, for example delete two isolates starting with ‘-‘
sample display
GCF_002291425.1 Gibbsiella quercinecans FRB97 (GCF_002291425.1)
GCA_039540155.1 Gibbsiella greigii USA56 (GCA_039540155.1)
- GCF_039539505.1 G_papilionis_PWX6 (GCF_039539505.1)
- GCF_005484965.1 B_nigrifluens_LMG5956 (GCF_005484965.1)
JBVKFS000000000 Gibbsiella quercinecans HH131225 (JBVKFS000000000) ⭐
GCF_004342245.1 Gibbsiella quercinecans DSM 25889 (GCF_004342245.1)
JBUDMV010000001.1 Gibbsiella dentisursi 2e (JBUDMV010000001)
CP002773.1 Serratia plymuthica AS9 (CP002773.1)
CP093323.1 Serratia plymuthica B37/06 (CP093323.1)
CP012097.1 Serratia plymuthica 3Re4-18 (CP012097.1)
GCA_001663135.1 Serratia odorifera 4Rx13 (GCA_001663135.1)
CP014137.1 Brenneria goodwinii FRB141 (CP014137.1)
🔧 If you want to skip re-running everything If Prokka/GFFs are already done and you just need to re-run Roary→RAxML→Plot with correct inputs:
A. Ensure collect_fastas copied all 10 FASTAs
ls -1 work_wgs_tree_expanded/fastas/*.fna | wc -l # Should be 10
B. Re-run Roary with relaxed core threshold (for mixed genera)
roary -e –mafft -p 64 -cd 95 -i 95 \ -f work_wgs_tree_expanded/roary_final \ work_wgs_tree_expanded/gffs/*.gff #–> 60736 SNPs
C. Run RAxML-NG with explicit FASTA format
CORE_ALN=”work_wgs_tree_expanded/roary_final/core_gene_alignment.aln” raxml-ng –all \ –msa “$CORE_ALN” \ –msa-format FASTA \ –model GTR+G \ –bs-trees 1000 \ –threads 80 \ –prefix work_wgs_tree_expanded/raxmlng/core \ –redo
D. Run plotting with correct labels
Adapt the plot_tree.R by entering the outgroup-id, e.g. CP014137.1
Version 1: WITHOUT bootstrap values
Rscript plot_tree.R \ work_wgs_tree_expanded/raxmlng/core.raxml.support \ work_wgs_tree_expanded/plot/labels.tsv \ work_wgs_tree_expanded/plot/core_tree_fig3B_no_bootstrap.pdf \ work_wgs_tree_expanded/plot/core_tree_fig3B_no_bootstrap.png \ FALSE
Version 2: WITH bootstrap values
Rscript plot_tree.R \ work_wgs_tree_expanded/raxmlng/core.raxml.support \ work_wgs_tree_expanded/plot/labels.tsv \ work_wgs_tree_expanded/plot/core_tree_fig3B_with_bootstrap.pdf \ work_wgs_tree_expanded/plot/core_tree_fig3B_with_bootstrap.png \ TRUE
REPORT
I agree that showing where Gibbsiella sits among related Gram-negative bacteria would be interesting. However, for the generation of the phylogenetic tree, we are using a core-genome alignment approach (Roary + RAxML-NG), which has an important technical constraint: Core-genome methods require sufficient gene content overlap. When taxa are too evolutionarily distant, the number of shared (“core”) genes drops sharply, resulting in sparse alignments and reduced branch support. With too few core genes, the tree may reflect alignment artifacts rather than true evolutionary history.
I have currently selected the following species for tree construction: TARGET ISOLATE • G. quercinecans HH131225 – Clinical case (highlighted in red) GIBBSIELLA (intra-genus diversity) • G. quercinecans (2 strains) • G. papilionis • G. greigii SERRATIA (address biochemical misidentification) • S. plymuthica (3 strains) • S. odorifera OUTGROUP (rooting) • Brenneria nigrifluens This ensures a robust core alignment (51,495 SNPs) and high-confidence branching for the clinically relevant comparison.
Figure Legend: Figure 1. Core-genome maximum-likelihood phylogeny of Gibbsiella quercinecans HH131225 (red) with reference strains from the genus Gibbsiella, biochemically similar Serratia species, and Brenneria outgroup. Tip labels show species, strain designation, and GenBank assembly accession or chromosome accession. Scale bar: substitutions per site. Tree inferred from 60,736 core-genome SNPs using Roary (Page et al., 2015) and RAxML-NG (Kozlov et al., 2019) with the GTR+G model and 1,000 bootstrap replicates.
References: Page et al., 2015: Page, A. J., Cummins, C. A., Hunt, M., Wong, V. K., Reuter, S., Holden, M. T., … & Parkhill, J. (2015). Roary: rapid large-scale prokaryote pan genome analysis. Bioinformatics, 31(22), 3691–3693. https://doi.org/10.1093/bioinformatics/btv421 Kozlov et al., 2019: Kozlov, A. M., Darriba, D., Flouri, T., Morel, B., & Stamatakis, A. (2019). RAxML-NG: a fast, scalable and user-friendly tool for maximum likelihood phylogenetic inference. Bioinformatics, 35(21), 4453–4455. https://doi.org/10.1093/bioinformatics/btz305
Run viral_ngs
Calling inter-host variants by merging the results from snippy+spandx (Manually!) Calling intra-host variants using viral-ngs (http://xgenes.com/article/article-content/347/variant-calling-for-herpes-simplex-virus-1-from-patient-sample-using-capture-probe-sequencing/) #TODO: How? Merge intra- and inter-host variants, comparing the variants to the alignments of the assemblies to confirm its correctness.
#TODO: If the results from 2024 contains only intra-host variants, explain this time I also give the results of the inter-host variants!
Variant calling (inter-host + intra-host) for Data_Pietschmann_229ECoronavirus_Mutations_2024+2025+2026 (via docker own_viral_ngs) v2
-
Input data:
# ---- Datasets 2024 (in total 4) ---- ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R1.fastq.gz hCoV229E_Rluc_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R2.fastq.gz hCoV229E_Rluc_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R1.fastq.gz DMSO_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R2.fastq.gz DMSO_p10_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R1.fastq.gz K22_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R2.fastq.gz K22_p10_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R1.fastq.gz X7523_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R2.fastq.gz X7523_p10_R2.fastq.gz # ---- Datasets 2025 (in total 3) ---- ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R1_001.fastq.gz DMSO_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R2_001.fastq.gz DMSO_p16_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R1_001.fastq.gz K22_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R2_001.fastq.gz K22_p16_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R1_001.fastq.gz X7523_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R2_001.fastq.gz X7523_p16_R2.fastq.gz # ---- Datasets 2026 (in total 3) ---- ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R1.fastq.gz DMSO_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R2.fastq.gz DMSO_p26_R2.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R1.fastq.gz K22_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R2.fastq.gz K22_p26_R2.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R1.fastq.gz X7523_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R2.fastq.gz X7523_p26_R2.fastq.gz -
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .; ln -s ~/Tools/bacto/envs/ .; ln -s ~/Tools/bacto/local/ .; cp ~/Tools/bacto/Snakefile .; cp ~/Tools/bacto/bacto-0.1.json .; cp ~/Tools/bacto/cluster.json .; #download CU459141.gb from GenBank mv ~/Downloads/sequence\(2\).gb db/PP810610.gb #setting the following in bacto-0.1.json "fastqc": false, "taxonomic_classifier": false, "assembly": true, "typing_ariba": false, "typing_mlst": true, "pangenome": true, "variants_calling": true, "phylogeny_fasttree": true, "phylogeny_raxml": true, "recombination": false, (due to gubbins-error set false) "genus": "Alphacoronavirus", "kingdom": "Viruses", "species": "Human coronavirus 229E", "mykrobe": { "species": "corona" }, "reference": "db/PP810610.gb" mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3 (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds -
Summarize all SNPs and Indels from the snippy result directory.
cp ~/Scripts/summarize_snippy_res_ordered.py . # IMPORTANT_ADAPT the array isolates = ["hCoV229E_Rluc", "DMSO_p10", "K22_p10", "X7523_p10", "DMSO_p16", "K22_p16", "X7523_p16", "DMSO_p26", "K22_p26", "X7523_p26"] mamba activate plot-numpy1 python3 ./summarize_snippy_res_ordered.py snippy #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv cd snippy #REMOVE_the_line? I don't find the sence of the line: grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv -
Using spandx calling variants (almost the same results to the one from viral-ngs!)
mamba deactivate mamba activate /home/jhuang/miniconda3/envs/spandx mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610 cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 #-d ~/Scripts/genbank2fasta.py PP810610.gb mv PP810610.gb_converted.fna PP810610.fasta #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file ln -s /home/jhuang/Tools/spandx/ spandx (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref PP810610.fasta --annotation --database PP810610 -resume # Rerun SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation cd Outputs/Master_vcf (spandx) cp -r ../../snippy/hCoV229E_Rluc/reference . (spandx) cp ../../spandx/bin/SNP_matrix.sh ./ #Note that ${variant_genome_path}=NC_001348 in the following command, but it was not used after command replacement. #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh (spandx) bash SNP_matrix.sh PP810610 . -
Calling inter-host variants by merging the results from snippy+spandx (Manually!)
# Inter-host variants(宿主间变异):一种病毒在两个人之间有不同的基因变异,这些变异可能与宿主的免疫反应、疾病表现或病毒传播的方式相关。 cp All_SNPs_indels_annotated.txt All_SNPs_indels_annotated_backup.txt vim All_SNPs_indels_annotated.txt #in the file ids: grep "$(echo -e '\t')353$(echo -e '\t')" All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt #Replace \n with " All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt\ngrep " #Replace grep " --> grep "$(echo -e '\t') #Replace " All_ --> $(echo -e '\t')" All_ # Potential intra-host variants: 10871, 19289, 23435. CHROM POS REF ALT TYPE hCoV229E_Rluc_trimmed p10_DMSO_trimmed p10_K22_trimmed p10_K7523_trimmed p16_DMSO_trimmed p16_K22_trimmed p16_X7523_trimmed Effect Impact Functional_Class Codon_change Protein_and_nucleotide_change Amino_Acid_Length Gene_name Biotype PP810610 1464 T C SNP C C C C C C C missense_variant MODERATE MISSENSE gTt/gCt p.Val416Ala/c.1247T>C 6757 CDS_1 protein_coding PP810610 1699 C T SNP T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val494Val/c.1482C>T 6757 CDS_1 protein_coding PP810610 6691 C T SNP T T T T T T T synonymous_variant LOW SILENT tgC/tgT p.Cys2158Cys/c.6474C>T 6757 CDS_1 protein_coding PP810610 6919 C G SNP G G G G G G G synonymous_variant LOW SILENT ggC/ggG p.Gly2234Gly/c.6702C>G 6757 CDS_1 protein_coding PP810610 7294 T A SNP A A A A A A A missense_variant MODERATE MISSENSE agT/agA p.Ser2359Arg/c.7077T>A 6757 CDS_1 protein_coding * PP810610 10871 C T SNP C C/T T C/T C/T T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding PP810610 14472 T C SNP C C C C C C C missense_variant MODERATE MISSENSE aTg/aCg p.Met4752Thr/c.14255T>C 6757 CDS_1 protein_coding PP810610 15458 T C SNP C C C C C C C synonymous_variant LOW SILENT Ttg/Ctg p.Leu5081Leu/c.15241T>C 6757 CDS_1 protein_coding PP810610 16035 C A SNP A A A A A A A stop_gained HIGH NONSENSE tCa/tAa p.Ser5273*/c.15818C>A 6757 CDS_1 protein_coding PP810610 17430 T C SNP C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu5738Ser/c.17213T>C 6757 CDS_1 protein_coding * PP810610 19289 G T SNP G G T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding PP810610 21183 T G SNP G G G G G G G missense_variant MODERATE MISSENSE tTt/tGt p.Phe230Cys/c.689T>G 1173 CDS_2 protein_coding PP810610 22636 T G SNP G G G G G G G missense_variant MODERATE MISSENSE aaT/aaG p.Asn714Lys/c.2142T>G 1173 CDS_2 protein_coding PP810610 23022 T C SNP C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu843Ser/c.2528T>C 1173 CDS_2 protein_coding * PP810610 23435 C T SNP C C T C/T C C/T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding PP810610 24512 C T SNP T T T T T T T missense_variant MODERATE MISSENSE Ctc/Ttc p.Leu36Phe/c.106C>T 88 CDS_4 protein_coding PP810610 24781 C T SNP T T T T T T T missense_variant MODERATE MISSENSE aCt/aTt p.Thr36Ile/c.107C>T 77 CDS_5 protein_coding PP810610 25163 C T SNP T T T T T T T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu82Phe/c.244C>T 225 CDS_6 protein_coding PP810610 25264 C T SNP T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val115Val/c.345C>T 225 CDS_6 protein_coding PP810610 26838 G T SNP T T T T T T T -
Calling intra-host variants using viral-ngs
# Intra-host variants(宿主内变异):同一个人感染了某种病毒,但在其体内的不同细胞或器官中可能存在多个不同的病毒变异株。 #How to run and debug the viral-ngs docker? # ---- DEBUG_2026_1: using docker instead ---- mkdir viralngs; cd viralngs ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile ln -s ~/Tools/viral-ngs_docker/bin bin cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/refsel.acids refsel.acids cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/lastal.acids lastal.acids cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/config.yaml config.yaml cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-runs.txt samples-runs.txt cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-depletion.txt samples-depletion.txt cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-metagenomics.txt samples-metagenomics.txt cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly.txt samples-assembly.txt cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly-failures.txt samples-assembly-failures.txt # Adapt the sample-*.txt mkdir viralngs/data mkdir viralngs/data/00_raw mkdir bams ref_fa="PP810610.fasta"; #for sample in hCoV229E_Rluc p10_DMSO p10_K22; do #for sample in p10_K7523 p16_DMSO p16_K22 p16_X7523; do for sample in hCoV229E_Rluc DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26; do bwa index ${ref_fa}; \ bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq | samtools view -bS - > bams/${sample}_genome_alignment.bam; \ done conda activate viral-ngs4 #for sample in hCoV229E_Rluc p10_DMSO p10_K22; do #for sample in p10_K7523 p16_DMSO p16_K22 p16_X7523; do for sample in hCoV229E_Rluc DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26; do picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs/data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \ done conda deactivate # -- ! Firstly set the samples-assembly.txt empty, so that only focus on running depletion! docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk own_viral_ngs bash cd /work snakemake --directory /work --printshellcmds --cores 80 # -- ! Secondly manully run assembly steps # --> By itereative add the unfinished assembly in the list, each time replace one, and run "cd /work; snakemake --directory /work --printshellcmds --cores 80" after exiting and re-entering the docker-env, since some tools were during the running automatically deleted.
Here is the combined, consolidated fix sequence for your Docker container:
🔧 Complete Fix Commands (Run Inside Docker Container)
#!/bin/bash
# ============================================================
# FIX SCRIPT FOR viral-ngs Docker Environment
# Run these commands INSIDE your running Docker container
# ============================================================
echo "=== Step 1: Activate base environment for config changes ==="
conda activate base
echo "=== Step 2: Disable conda safety checks (fixes 'unsafe path' errors) ==="
conda config --set safety_checks disabled
conda config --set allow_softlinks true
echo "=== Step 3: Verify config was applied ==="
conda config --show safety_checks
# Expected output: safety_checks: disabled
echo "=== Step 4: (Optional) Update conda for compatibility ==="
conda update -n base -c defaults conda -y
echo "=== Step 5: Activate viral-ngs environment ==="
conda activate viral-ngs-env
echo "Current env: $CONDA_DEFAULT_ENV" && which python
#Current env: viral-ngs-env
#/opt/miniconda/bin/python --> ⚠️ Problem Identified!
#conda deactivate; conda deactivate;
## 1. Install mamba in your base environment (one-time setup)
#conda install -n base -c conda-forge mamba -y
## 2. Activate your existing environment
#conda activate viral-ngs-env
## 3. Use mamba to install missing packages (instead of conda)
#mamba install -c bioconda biopython mafft -y
## 4. Verify existing tools are still there
#which python
#conda list # Shows all packages, both old and new
conda install python=3.6.7 -y
which python
echo "Current env: $CONDA_DEFAULT_ENV" && which python
#Current env: viral-ngs-env
#/opt/miniconda/envs/viral-ngs-env/bin/python (Python 3.6.7)
echo "=== Step 6: Install missing Python packages ==="
conda install -y -c conda-forge biopython
echo "=== Step 7: Install missing binary tools (with specific versions if needed) ==="
conda install -y -c bioconda perl=5.32.1 prinseq-lite samtools
echo "=== Step 8: Verify all installations ==="
echo "--- Checking samtools ---"
which samtools && samtools --version
#/opt/miniconda/envs/viral-ngs-env/bin/samtools samtools 1.9 using htslib 1.9
echo "--- Checking perl ---"
which perl && perl --version
echo "--- Checking prinseq-lite ---"
which prinseq-lite.pl && prinseq-lite.pl -version
echo "--- Checking Biopython ---"
python -c "import Bio; print('Biopython OK:', Bio.__version__)"
echo "=== Step 9: Refresh environment PATH ==="
hash -r
echo "=== ✅ All fixes applied! You can now re-run your pipeline ==="
echo "Tip: Run 'snakemake --unlock' first if pipeline is locked, then:"
echo " snakemake -j
<threads> --rerun-incomplete"
In Dockerfile
#ENV CONDA_ALLOW_UNSAFE_PATHS=1 #RUN conda update -n base -c defaults conda -y
🐳 To Make Fixes Permanent: Commit the Container
After running the fixes above, save your working container:
# 1. Exit the container (but don't delete it)
exit
docker ps -a
docker commit c51d44624f1b viral-ngs-fixed:2026-03-19
# 3. Next time, run the fixed image
#docker run -it -v /mnt/md1/... [your other volumes] viral-ngs-fixed:2026-03-19 bash
docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk viral-ngs-fixed:2026-03-19 bash
conda activate viral-ngs-env
which samtools && samtools --version
#/opt/miniconda/envs/viral-ngs-env/bin/samtools samtools 1.9 using htslib 1.9
which perl && perl --version
#/opt/miniconda/envs/viral-ngs-env/bin/perl (v5.26.2)
which prinseq-lite.pl && prinseq-lite.pl -version
#/opt/miniconda/envs/viral-ngs-env/bin/prinseq-lite.pl (PRINSEQ-lite 0.20.4)
python -c "import Bio; print('Biopython OK:', Bio.__version__)"
#Biopython OK: 1.72
conda install -c bioconda trimmomatic -y
which trimmomatic
#/opt/miniconda/envs/viral-ngs-env/bin/trimmomatic (trimmomatic-0.39)
exit
docker ps -a
docker commit e70395e5625c viral-ngs-fixed:l
docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk viral-ngs-fixed:l bash
cd /work
#!!!! IMPORTANT !!!!
conda activate viral-ngs-env
snakemake --directory /work --printshellcmds --cores 80
#DEBUG the specific commmand as follows, for example install Gap2Seq in the docker-env. bin/assembly.py gapfill_gap2seq tmp/02_assembly/hCoV229E_Rluc.assembly2-scaffolded.fasta data/01_per_sample/hCoV229E_Rluc.cleaned.bam tmp/02_assembly/hCoV229E_Rluc.assembly2-gapfilled.fasta –memLimitGb 12 –maskErrors –randomSeed 0 —-> go to the script tools/gap2seq.py, install the required tool (for example gap2seq) and adapt the correct version. conda install -y -c bioconda gap2seq root@f47daf7c44ee:/work# Gap2Seq -h #>Gap2Seq 3.1 vim /home/jhuang/Tools/viral-ngs_docker/bin/tools/gap2seq.py #Adapt the TOOL_NAME and TOOL_VERSION in the script
#Save the docker-env with newly installed Gap2Seq exit docker ps -a docker commit f47daf7c44ee viral-ngs-fixed:la
#调用新的 docker-env installed with Gap2Seq docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk viral-ngs-fixed:la bash cd /work conda activate viral-ngs-env snakemake –directory /work –printshellcmds –cores 80
#MANUALLY running the following commands! bin/assembly.py gapfill_gap2seq in_scaffold=tmp/02_assembly/hCoV229E_Rluc.assembly2-scaffolded.fasta in_bam=data/01_per_sample/hCoV229E_Rluc.cleaned.bam out_scaffold=tmp/02_assembly/hCoV229E_Rluc.assembly2-gapfilled.fasta mem_limit_gb=12 time_soft_limit_minutes=60.0 mask_errors=True gap2seq_opts= random_seed=0 threads=None loglevel=INFO tmp_dir=/tmp tmp_dirKeep=False
#MANUALLY running the following commands! bin/assembly.py impute_from_reference tmp/02_assembly/hCoV229E_Rluc.assembly2-gapfilled.fasta tmp/02_assembly/hCoV229E_Rluc.assembly2-scaffold_ref.fasta tmp/02_assembly/hCoV229E_Rluc.assembly3-modify.fasta –newName hCoV229E_Rluc –replaceLength 55 –minLengthFraction 0.05 –minUnambig 0.05 –index
#!!!! TODO_NEXT_WEEK !!!!: run several times of docker so that all ${sample}.assembly2-scaffolded.fasta generated! Then run the following commands, after that run docker for all isolates. for sample in DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26; do #MANUALLY running the following commands! bin/assembly.py gapfill_gap2seq in_scaffold=tmp/02_assembly/${sample}.assembly2-scaffolded.fasta in_bam=data/01_per_sample/${sample}.cleaned.bam out_scaffold=tmp/02_assembly/${sample}.assembly2-gapfilled.fasta mem_limit_gb=12 time_soft_limit_minutes=60.0 mask_errors=True gap2seq_opts= random_seed=0 threads=None loglevel=INFO tmp_dir=/tmp tmp_dirKeep=False #MANUALLY running the following commands! bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta –newName ${sample} –replaceLength 55 –minLengthFraction 0.05 –minUnambig 0.05 –index done
🔍 Troubleshooting Checklist
If issues persist after running the fix:
# Check conda config
conda config --show | grep -E "safety|softlink"
# List installed packages in viral-ngs-env
conda activate viral-ngs-env
conda list | grep -E "samtools|prinseq|perl|biopython"
# Test each tool manually
samtools --version
prinseq-lite.pl -version
perl -e 'use Bio::Seq; print "BioPerl OK\n"'
# Check PATH includes conda bin directories
echo $PATH | tr ':' '\n' | grep conda
⚠️ Important Notes
| Issue | Solution |
|---|---|
safety_checks disabled not working |
Must run conda config in base env, not viral-ngs-env |
| Packages still fail to install | Try conda clean --all -y first, then reinstall |
samtools: command not found after install |
Run hash -r or restart shell to refresh PATH |
| Pipeline still fails after fixes | Run snakemake --unlock --rerun-incomplete to resume |
conda config --set safety_checks disable
conda activate viral-ngs-env
conda install -y -c conda-forge biopython
docker ps -a
# Look for the container you are working in, e.g., "viral-ngs-container"
#bb117a6ca70a
docker commit
viral-ngs-fixed:latest
docker run -it viral-ngs-fixed:latest bash
# # —- NOTE that the following steps need rerun –> DOES NOT WORK, USE STRATEGY ABOVE —-
# #for sample in p10_K22 p10_K7523; do
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/read_utils.py merge_bams data/01_cleaned/${sample}.cleaned.bam tmp/01_cleaned/${sample}.cleaned.bam –picardOptions SORT_ORDER=queryname
# bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.cleaned.bam data/01_per_sample/${sample}.cleaned.bam –JVMmemory 30g
# done
#
# #Note that the error generated by nextflow is from the step gapfill_gap2seq!
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/assembly.py assemble_spades data/01_per_sample/${sample}.taxfilt.bam /home/jhuang/REFs/viral_ngs_dbs/trim_clip/contaminants.fasta tmp/02_assembly/${sample}.assembly1-spades.fasta –nReads 10000000 –threads 15 –memLimitGb 12
# done
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# for sample in p10_K22 p10_K7523; do
# bin/assembly.py order_and_orient tmp/02_assembly/${sample}.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta –min_pct_contig_aligned 0.05 –outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta –nGenomeSegments 1 –outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta –threads 15
# done
#
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta –memLimitGb 12 –maskErrors –randomSeed 0 –loglevel DEBUG
# done
#IMPORTANT: Reun the following commands!
for sample in hCoV229E_Rluc DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26; do
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta –newName ${sample} –replaceLength 55 –minLengthFraction 0.05 –minUnambig 0.05 –index –loglevel DEBUG
done
# for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523; do
# bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta –outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz –min_coverage 2 –novo_params ‘-r Random -l 20 -g 40 -x 20 -t 502’ –threads 15 –loglevel DEBUG
# bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta –outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz –min_coverage 3 –novo_params ‘-r Random -l 20 -g 40 -x 20 -t 100’ –threads 15 –loglevel DEBUG
# done
# — ! Thirdly set the samples-assembly.txt completely and run “snakemake –directory /work –printshellcmds –cores 40”
# —————————- BUG list of the docker pipeline, mostly are due to the version incompability —————————-
#BUG_1: FileNotFoundError: [Errno 2] No such file or directory: ‘/home/jhuang/Tools/samtools-1.9/samtools’: ‘/home/jhuang/Tools/samtools-1.9/samtools’
#DEBUG_1 (DEPRECATED):
# – In docker install independent samtools
conda create -n samtools-1.9-env samtools=1.9 -c bioconda -c conda-forge
# – persistence the modified docker, next time run own docker image
docker ps
#CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
#881a1ad6a990 quay.io/broadinstitute/viral-ngs “bash” 8 minutes ago Up 8 minutes intelligent_yalow
docker commit 881a1ad6a990 own_viral_ngs
docker image ls
docker run -it own_viral_ngs bash
#Change the path as “/opt/miniconda/envs/samtools-1.9-env/bin/samtools” in /work/bin/tools/samtools.py
# If another tool expect for samtools could not be installed, also use the same method above to install it on own_viral_ngs!
#DEBUG_1_BETTER_SIMPLE: TOOL_VERSION = ‘1.6’ –> ‘1.9’ in ~/Tools/viral-ngs_docker/bin/tools/samtools.py
#BUG_2:
bin/taxon_filter.py deplete data/00_raw/2040_04.bam tmp/01_cleaned/2040_04.raw.bam tmp/01_cleaned/2040_04.bmtagger_depleted.bam tmp/01_cleaned/2040_04.rmdup.bam data/01_cleaned/2040_04.cleaned.bam –bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA –blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus –threads 15 –srprismMemory 14250 –JVMmemory 50g –loglevel DEBUG
#2025-05-23 09:58:45,326 – __init__:445:_attempt_install – DEBUG – Currently installed version of blast: 2.7.1-h4422958_6
#2025-05-23 09:58:45,327 – __init__:448:_attempt_install – DEBUG – Expected version of blast: 2.6.0
#2025-05-23 09:58:45,327 – __init__:449:_attempt_install – DEBUG – Incorrect version of blast installed. Removing it…
#DEBUG_2: TOOL_VERSION = “2.6.0” –> “2.7.1” in ~/Tools/viral-ngs_docker/bin/tools/blast.py
#BUG_3:
bin/read_utils.py bwamem_idxstats data/01_cleaned/1762_04.cleaned.bam /home/jhuang/REFs/viral_ngs_dbs/spikeins/ercc_spike-ins.fasta –outStats reports/spike_count/1762_04.spike_count.txt –minScoreToFilter 60 –loglevel DEBUG
#DEBUG_3: TOOL_VERSION = “0.7.15” –> “0.7.17” in ~/Tools/viral-ngs_docker/bin/tools/bwa.py
#BUG_4: FileNotFoundError: [Errno 2] No such file or directory: ‘/usr/local/bin/trimmomatic’: ‘/usr/local/bin/trimmomatic’
#DEBUG_4: TOOL_VERSION = “0.36” –> “0.38” in ~/Tools/viral-ngs_docker/bin/tools/trimmomatic.py
#BUG_5: FileNotFoundError: [Errno 2] No such file or directory: ‘/usr/bin/spades.py’: ‘/usr/bin/spades.py’
#DEBUG_5: TOOL_VERSION = “0.36” –> “0.38” in ~/Tools/viral-ngs_docker/bin/tools/trimmomatic.py
# def install_and_get_path(self):
# # the conda version wraps the jar file with a shell script
# return ‘trimmomatic’
#BUG_6: bin/assembly.py order_and_orient tmp/02_assembly/2039_04.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/2039_04.assembly2-scaffolded.fasta –min_pct_contig_aligned 0.05 –outAlternateContigs tmp/02_assembly/2039_04.assembly2-alternate_sequences.fasta –nGenomeSegments 1 –outReference tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta –threads 15 –loglevel DEBUG
2025-05-23 17:40:19,526 – __init__:445:_attempt_install – DEBUG – Currently installed version of mummer4: 4.0.0beta2-pl526hf484d3e_4
2025-05-23 17:40:19,527 – __init__:448:_attempt_install – DEBUG – Expected version of mummer4: 4.0.0rc1
2025-05-23 17:40:19,527 – __init__:449:_attempt_install – DEBUG – Incorrect version of mummer4 installed. Removing it..
DEBUG_6: TOOL_VERSION = “4.0.0rc1” –> “4.0.0beta2” in ~/Tools/viral-ngs_docker/bin/tools/mummer.py
#BUG_7: bin/assembly.py order_and_orient tmp/02_assembly/2039_04.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/2039_04.assembly2-scaffolded.fasta –min_pct_contig_aligned 0.05 –outAlternateContigs tmp/02_assembly/2039_04.assembly2-alternate_sequences.fasta –nGenomeSegments 1 –outReference tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta –threads 15 –loglevel DEBUG
File “bin/assembly.py”, line 549, in
base_counts = [sum([len(seg.seq.replace(“N”, “”)) for seg in scaffold]) \
AttributeError: ‘Seq’ object has no attribute ‘replace’
DEBUG_7: base_counts = [sum([len(seg.seq.replace(“N”, “”)) for seg in scaffold]) –> base_counts = [sum([len(seg.seq.ungap(‘N’)) for seg in scaffold]) in ~/Tools/viral-ngs_docker/bin/assembly.py
BUG_8:
bin/assembly.py refine_assembly tmp/02_assembly/1243_2.assembly3-modify.fasta data/01_per_sample/1243_2.cleaned.bam tmp/02_assembly/1243_2.assembly4-refined.fasta –outVcf tmp/02_assembly/1243_2.assembly3.vcf.gz –min_coverage 2 –novo_params ‘-r Random -l 20 -g 40 -x 20 -t 502’ –threads 15 –loglevel DEBUG
File “/work/bin/tools/gatk.py”, line 75, in execute
FileNotFoundError: [Errno 2] No such file or directory: ‘/usr/local/bin/gatk’: ‘/usr/local/bin/gatk’
#DEBUG_8: -v /usr/local/bin/gatk:/usr/local/bin/gatk in ‘docker run’ and change default python in the script via a shebang; TOOL_VERSION = “3.8” –> “3.6” in ~/Tools/viral-ngs_docker/bin/tools/gatk.py
BUG_9:
pyyaml is missing!
#DEBUG_9: NO_ERROR if rerun!
bin/assembly.py impute_from_reference tmp/02_assembly/2039_04.assembly2-gapfilled.fasta tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta tmp/02_assembly/2039_04.assembly3-modify.fasta –newName 2039_04 –replaceLength 55 –minLengthFraction 0.05 –minUnambig 0.05 –index –loglevel DEBUG
for sample in 2039_04 2040_04; do
for sample in 1762_04 1243_2 875_04; do
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta –newName ${sample} –replaceLength 55 –minLengthFraction 0.05 –minUnambig 0.05 –index –loglevel DEBUG
done
#BUG_10: bin/reports.py consolidate_fastqc reports/fastqc/2039_04/align_to_self reports/fastqc/2040_04/align_to_self reports/fastqc/1762_04/align_to_self reports/fastqc/1243_2/align_to_self reports/fastqc/875_04/align_to_self reports/summary.fastqc.align_to_self.txt
#DEBUG_10: File “bin/intrahost.py”, line 527 and line 579 in merge_to_vcf
# #MODIFIED_BACK
samp_to_seqIndex[sampleName] = seq.seq.ungap(‘-‘)
#samp_to_seqIndex[sampleName] = seq.seq.replace(“-“, “”)
#BUG_11: bin/interhost.py multichr_mafft ref_genome/reference.fasta data/02_assembly/2039_04.fasta data/02_assembly/2040_04.fasta data/02_assembly/1762_04.fasta data/02_assembly/1243_2.fasta data/02_assembly/875_04.fasta data/03_multialign_to_ref –ep 0.123 –maxiters 1000 –preservecase –localpair –outFilePrefix aligned –sampleNameListFile data/03_multialign_to_ref/sampleNameList.txt –threads 15 –loglevel DEBUG
2025-05-26 15:04:19,014 – cmd:195:main_argparse – INFO – command: bin/interhost.py multichr_mafft inFastas=[‘ref_genome/reference.fasta’, ‘data/02_assembly/2039_04.fasta’, ‘data/02_assembly/2040_04.fasta’, ‘data/02_assembly/1762_04.fasta’, ‘data/02_assembly/1243_2.fasta’, ‘data/02_assembly/875_04.fasta’] localpair=True globalpair=None preservecase=True reorder=None gapOpeningPenalty=1.53 ep=0.123 verbose=False outputAsClustal=None maxiters=1000 outDirectory=data/03_multialign_to_ref outFilePrefix=aligned sampleRelationFile=None sampleNameListFile=data/03_multialign_to_ref/sampleNameList.txt threads=15 loglevel=DEBUG tmp_dir=/tmp tmp_dirKeep=False
2025-05-26 15:04:19,014 – cmd:209:main_argparse – DEBUG – using tempDir: /tmp/tmp-interhost-multichr_mafft-nuws9mhp
2025-05-26 15:04:21,085 – __init__:445:_attempt_install – DEBUG – Currently installed version of mafft: 7.402-0
2025-05-26 15:04:21,085 – __init__:448:_attempt_install – DEBUG – Expected version of mafft: 7.221
2025-05-26 15:04:21,085 – __init__:449:_attempt_install – DEBUG – Incorrect version of mafft installed. Removing it…
#DEBUG_11: TOOL_VERSION = “7.221” –> “7.402” in ~/Tools/viral-ngs_docker/bin/tools/mafft.py
#BUG_12: bin/interhost.py snpEff data/04_intrahost/isnvs.vcf.gz PP810610.1 data/04_intrahost/isnvs.annot.vcf.gz j.huang@uke.de –loglevel DEBUG
2025-06-10 13:14:07,526 – __init__:445:_attempt_install – DEBUG – Currently installed version of snpeff: 4.3.1t-3
2025-06-10 13:14:07,527 – __init__:448:_attempt_install – DEBUG – Expected version of snpeff: 4.1l
#DEBUG_12: -v /usr/local/bin/gatk:/usr/local/bin/gatk in ‘docker run’ and change default python in the script via a shebang; TOOL_VERSION = “4.1l” –> “4.3.1t” in ~/Tools/viral-ngs_docker/bin/tools/snpeff.py
7. Comparing intra- and inter-host variants, comparing the variants to the alignments of the assemblies to confirm its correctness.
From the step 5, only 5 inter-host variants were confirmed: they are 10871, 19289, 23435.
PP810610 10871 hCoV229E_Rluc hCoV229E_Rluc C,T 0.0057070386810399495 0.011348936781066188 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_DMSO p10_DMSO C,T 0.0577716643741403 0.10886819833916395 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K22 p10_K22 C,T 1.0 0.0 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K7523 p10_K7523 C,T 0.8228321896444167 0.2915587546587828 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_DMSO p16_DMSO C,T 0.02927088877062267 0.05682820768240093 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_K22 p16_K22 C,T 0.9911209766925638 0.017600372505084394 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_X7523 p16_X7523 C,T 0.8776699029126214 0.21473088886794223 1.0 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 19289 hCoV229E_Rluc hCoV229E_Rluc G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_DMSO p10_DMSO G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_K22 p10_K22 G,T 1.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_K7523 p10_K7523 G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_DMSO p16_DMSO G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_K22 p16_K22 G,T 0.9884823848238482 0.02276991943361173 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_X7523 p16_X7523 G,T 0.0 0.0 1.0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 23435 hCoV229E_Rluc hCoV229E_Rluc C,T 0.0 0.0 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_DMSO p10_DMSO C,T 0.031912415560214305 0.061788026586653055 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K22 p10_K22 C,T 1.0 0.0 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K7523 p10_K7523 C,T 0.8352090032154341 0.27526984832663026 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_DMSO p16_DMSO C,T 0.0 0.0 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_K22 p16_K22 C,T 0.958498023715415 0.07955912449811753 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_X7523 p16_X7523 C,T 0.13175164058556285 0.22878629157715102 1.0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
8. Generate variant_annot.xls and coverages.xls
sudo chown -R jhuang:jhuang data
# — generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost
cp isnvs.annot.txt isnvs.annot_complete.txt
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$’\t’ -o isnvs.annot_complete.xls
#delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file.
awk ‘{printf “%.3f\n”, $5}’ isnvs.annot_complete.csv > f5
cut -f1-4 isnvs.annot_complete.csv > f1_4
cut -f6- isnvs.annot_complete.csv > f6_
paste f1_4 f5 > f1_5
paste f1_5 f6_ > isnvs_annot_complete_.txt
#correct f5 in header of isnvs_annot_complete_.txt to iSNV_freq
#header: chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete_.txt -d$’\t’ -o variant_annot.xls
#MANUALLY generate variant_annot_0.01.csv variant_annot_0.05.csv
awk ‘ $5 >= 0.05 ‘ isnvs_annot_complete_.txt > 0.05.csv
cut -f2 0.05.csv
awk ‘ $5 >= 0.01 ‘ isnvs_annot_complete_.txt > 0.01.csv
cut -f2 0.05.csv | uniq > ids_0.05
cut -f2 0.01.csv | uniq > ids_0.01
#Replace ‘\n’ with ‘\\t” isnvs_annot_complete_.txt >> isnvs_annot_0.05.txt\ngrep -P “PP810610\\t’ in ids_0.05 and then deleting the ‘pos’ line
#Replace ‘\n’ with ‘\\t” isnvs_annot_complete_.txt >> isnvs_annot_0.01.txt\ngrep -P “PP810610\\t’ in ids_0.01 and then deleting the ‘pos’ line
#Run ids_0.05 and ids_0.01
cp ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt hCoV229E_Rluc_variants
# Delete the three records which already reported in intra-host results hCoV229E_Rluc_variants: they are 10871, 19289, 23435.
PP810610 10871 C T SNP C C/T T C/T C/T T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding
PP810610 19289 G T SNP G G T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding
PP810610 23435 C T SNP C C T C/T C C/T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_0.05.txt isnvs_annot_0.01.txt hCoV229E_Rluc_variants -d$’\t’ -o variant_annot.xls
#Modify sheetname to variant_annot_0.05 and variant_annot_0.01 and add the header in Excel file.
#Note in the complete list, Set 2024 is NOT a subset of Set 2025 because the element 26283 is in set 2024 but missing from set 2025.
# — calculate the coverage
samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt
samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt
samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt
samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt
~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$’\t’ -o coverages.xls
#draw coverage and see if they are continuous?
samtools depth ./data/02_align_to_self/p16_DMSO.mapped.bam > p16_DMSO_cov.txt
samtools depth ./data/02_align_to_self/p16_K22.mapped.bam > p16_K22_cov.txt
samtools depth ./data/02_align_to_self/p16_X7523.mapped.bam > p16_K7523_cov.txt
~/Tools/csv2xls-0.4/csv_to_xls.py p16_DMSO_cov.txt p16_K22_cov.txt p16_K7523_cov.txt -d$’\t’ -o coverages_p16.xls
# Load required packages
library(ggplot2)
library(dplyr)
# Read the coverage data
cov_data <- read.table("p16_K7523_cov.txt", header = FALSE, sep = "\t",
col.names = c("Chromosome", "Position", "Coverage"))
# Create full position range for the given chromosome
full_range <- data.frame(Position = seq(min(cov_data$Position), max(cov_data$Position)))
# Merge with actual coverage data and fill missing positions with 0
cov_full %
left_join(cov_data[, c(“Position”, “Coverage”)], by = “Position”) %>%
mutate(Coverage = ifelse(is.na(Coverage), 0, Coverage))
# Save the plot to PNG
png(“p16_K7523_coverage_filled.png”, width = 1200, height = 600)
ggplot(cov_full, aes(x = Position, y = Coverage)) +
geom_line(color = “steelblue”, size = 0.3) +
labs(title = “Coverage Plot for p16_K7523 (Missing = 0)”,
x = “Genomic Position”,
y = “Coverage Depth”) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12)
)
dev.off()
9. (Optional) Consensus sequences of each and of all isolates
cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa
cp data/02_assembly/*.fasta ./
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
mv ${sample}.fasta ${sample}.fa
cat all.fa ${sample}.fa >> all.fa
done
cat RSV_dedup.fa all.fa > RSV_all.fa
mafft –clustalout –adjustdirection RSV_all.fa > RSV_all.aln
snp-sites RSV_all.aln -o RSV_all_.aln
10. Report
Please find attached the variant analysis results for Thomas. Variant frequencies in the new samples are highlighted in yellow.
Although PP810610 is used as the reference, only differences observed in the samples p10_DMSO, p10_K22, p10_K7523, p16_DMSO, p16_K22, and p16_X7523 compared to hCoV229E_Rluc are reported in the sheets variant_annot_0.05 and variant_annot_0.01 (see variant_annot.xls). Variants already present in hCoV229E_Rluc are excluded from these sheets. In total, 17 mutations were found in hCoV229E_Rluc relative to PP810610, detailed in the sheet “hCoV229E_Rluc_variants” (see variant_annot.xls).
—— Explanation of iSNV_freq in the sheets variant_annot_0.05 and variant_annot_0.01 ——
The iSNV_freq column shows the frequency of the second allele at each position. For example, at position 23435 on chr PP810610:
chr Position Sample Alleles iSNV_freq
PP810610 23435 hCoV229E_Rluc C,T 0
PP810610 23435 p10_DMSO C,T 0.032
PP810610 23435 p10_K22 C,T 0.995
PP810610 23435 p10_K7523 C,T 0.835
PP810610 23435 p16_DMSO C,T 0
PP810610 23435 p16_K22 C,T 0.958
PP810610 23435 p16_X7523 C,T 0.132
The second allele (T) frequencies are:
0 (only C)
0.032 (3.2% T)
0.995 (99.5% T)
0.835 (83.5% T)
0 (only C)
0.958 (95.8% T)
0.132 (13.2% T)
# —- Explanation of Mutation at Position 19289 —-
Regarding the mutation at position 19289 — you’re absolutely right, and I had also noticed the discrepancy.
In the 2024 analysis, I performed intra-host variant calling, which detects only those variants with frequencies strictly between 0% and 100% within a single sample. Since position 19289 showed 100% G in p10_DMSO, 100% T in p10_K22, and 100% G in p10_K7523, it was not identified as an intra-host variant at that time. Rather, it’s a clear example of an inter-host variant — a fixed difference between samples.
In the 2025 analysis, I again used intra-host variant calling. This time, the mutation at position 19289 in p16_K22 was detected at 98.8% T, which falls within the threshold and therefore appears in the intra-host variant table.
After noticing this, I also ran a dedicated inter-host variant calling analysis, which specifically highlights differences between samples rather than within them. The results can be found in the third table (“hCoV229E_Rluc_variants”) of the variant_annot.xls file I sent you previously. As you’ll see, all 17 positions are identical across the 7 samples, indicating that no additional inter-host variants were detected beyond what we had already observed.
Lastly, please find the coverage data in the attached files.
# —
Just following up on the mutation at position 19289. By tweaking some settings in the inter-host variant calling, we can also detect variants at positions like 19289. However, in these results, a “/” indicates intra-host variants that require further validation through intra-host variant calling. The intra-host variant calling uses a more precise mapping strategy, enabling a more accurate estimation of allele frequencies.
Here’s an example from the inter-host variant table showing the mutation at 19289 with the adjusted settings:
CHROM POS REF ALT TYPE hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523 p16_DMSO p16_K22 p16_X7523
PP810610 19289 G T SNP G G T G G G/T G
# —————————————— END —————————————-
#Check if the 0.05 and 0.01 are superset of 0.05 and 0.01 of 2024 version: comparing ‘cut -f2 0.05.csv | uniq > ids_0.05_’ and ‘cut -f2 0.01.csv | uniq > ids_0.01_’ between 2024 and 2025
869, 1492, 4809, 5797, 8289, 8294, 8331, 8376, 9146, 9174, 9933, 9954, 9993,
10145, 10239, 10310, 10871, 10898, 10970, 11577, 12634, 17941, 18640, 18646,
18701, 18815, 19028, 19294, 19388, 21027, 21633, 21671, 21928, 22215, 23435,
23633, 24738, 25025, 25592, 26885
869, 1492, 3422, 4074, 4809, 5345, 5373, 5543, 5797, 6470, 8289, 8294, 8331,
8376, 9146, 9174, 9261, 9933, 9954, 9993, 10145, 10239, 10310, 10871, 10898,
10970, 11194, 11568, 11577, 11706, 12634, 13113, 13912, 15615, 17941, 18640,
18646, 18701, 18815, 18919, 19028, 19165, 19289, 19294, 19388, 21027, 21633,
21671, 21747, 21928, 22215, 22318, 22630, 22788, 22820, 22906, 22918, 23435,
23586, 23633, 24738, 24903, 25025, 25432, 25592, 26104, 26281, 26307, 26411,
26500, 26746, 26885
✅ Ja, Set 1 ist eine Teilmenge von Set 2. Alle Elemente von Set 1 sind auch in Set 2 enthalten.
Set 1: {1492,8289,8294,9174,10239,10310,10871,10898,11577,18640,21027,21633,22215,23435,24738,25025,25592}
Set 2: {1492,8289,8294,9174,10145,10239,10310,10871,10898,11577,18640,19289,21027,21633,22215,23435,24738,25025,25592}
Since every element of Set 1 is in Set 2, we have:
Set 1 ⊆ Set 2 In other words, Set 1 is a subset of Set 2.
diff 0.05_test_uniq.txt 0.05_test.csv
diff 0.01_test_uniq.txt 0.01_test.csv
> chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
8a10,17
> PP810610 3422 hCoV229E_Rluc C,T 0 missense_variant 3205C>T Leu1069Phe 1069 6758 Gene_217_20492 XBA84229.1
> PP810610 3422 p10_DMSO C,T 0 missense_variant 3205C>T Leu1069Phe 1069 6758 Gene_217_20492 XBA84229.1
> PP810610 3422 p10_K22 C,T 0 missense_variant 3205C>T Leu1069Phe 1069 6758 Gene_217_20492 XBA84229.1
> PP810610 3422 p10_K7523 C,T 0 missense_variant 3205C>T Leu1069Phe 1069 6758 Gene_217_20492 XBA84229.1
> PP810610 4074 hCoV229E_Rluc G,T 0 missense_variant 3857G>T Gly1286Val 1286 6758 Gene_217_20492 XBA84229.1
> PP810610 4074 p10_DMSO G,T 0 missense_variant 3857G>T Gly1286Val 1286 6758 Gene_217_20492 XBA84229.1
> PP810610 4074 p10_K22 G,T 0 missense_variant 3857G>T Gly1286Val 1286 6758 Gene_217_20492 XBA84229.1
> PP810610 4074 p10_K7523 G,T 0 missense_variant 3857G>T Gly1286Val 1286 6758 Gene_217_20492 XBA84229.1
12a22,33
> PP810610 5345 hCoV229E_Rluc C,T 0 synonymous_variant 5128C>T Leu1710Leu 1710 6758 Gene_217_20492 XBA84229.1
> PP810610 5345 p10_DMSO C,T 0 synonymous_variant 5128C>T Leu1710Leu 1710 6758 Gene_217_20492 XBA84229.1
> PP810610 5345 p10_K22 C,T 0 synonymous_variant 5128C>T Leu1710Leu 1710 6758 Gene_217_20492 XBA84229.1
> PP810610 5345 p10_K7523 C,T 0 synonymous_variant 5128C>T Leu1710Leu 1710 6758 Gene_217_20492 XBA84229.1
> PP810610 5373 hCoV229E_Rluc C,A 0 stop_gained 5156C>A Ser1719* 1719 6758 Gene_217_20492 XBA84229.1
> PP810610 5373 p10_DMSO C,A 0 stop_gained 5156C>A Ser1719* 1719 6758 Gene_217_20492 XBA84229.1
> PP810610 5373 p10_K22 C,A 0 stop_gained 5156C>A Ser1719* 1719 6758 Gene_217_20492 XBA84229.1
> PP810610 5373 p10_K7523 C,A 0 stop_gained 5156C>A Ser1719* 1719 6758 Gene_217_20492 XBA84229.1
> PP810610 5543 hCoV229E_Rluc C,T 0 missense_variant 5326C>T His1776Tyr 1776 6758 Gene_217_20492 XBA84229.1
> PP810610 5543 p10_DMSO C,T 0 missense_variant 5326C>T His1776Tyr 1776 6758 Gene_217_20492 XBA84229.1
> PP810610 5543 p10_K22 C,T 0 missense_variant 5326C>T His1776Tyr 1776 6758 Gene_217_20492 XBA84229.1
> PP810610 5543 p10_K7523 C,T 0 missense_variant 5326C>T His1776Tyr 1776 6758 Gene_217_20492 XBA84229.1
16a38,41
> PP810610 6470 hCoV229E_Rluc C,T 0 synonymous_variant 6253C>T Leu2085Leu 2085 6758 Gene_217_20492 XBA84229.1
> PP810610 6470 p10_DMSO C,T 0 synonymous_variant 6253C>T Leu2085Leu 2085 6758 Gene_217_20492 XBA84229.1
> PP810610 6470 p10_K22 C,T 0 synonymous_variant 6253C>T Leu2085Leu 2085 6758 Gene_217_20492 XBA84229.1
> PP810610 6470 p10_K7523 C,T 0 synonymous_variant 6253C>T Leu2085Leu 2085 6758 Gene_217_20492 XBA84229.1
40a66,69
> PP810610 9261 hCoV229E_Rluc C,T 0 missense_variant 9044C>T Ala3015Val 3015 6758 Gene_217_20492 XBA84229.1
> PP810610 9261 p10_DMSO C,T 0 missense_variant 9044C>T Ala3015Val 3015 6758 Gene_217_20492 XBA84229.1
> PP810610 9261 p10_K22 C,T 0 missense_variant 9044C>T Ala3015Val 3015 6758 Gene_217_20492 XBA84229.1
> PP810610 9261 p10_K7523 C,T 0 missense_variant 9044C>T Ala3015Val 3015 6758 Gene_217_20492 XBA84229.1
*1* 72c101
A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1 (OLD calculation,take this to integrate)
—
> PP810610 10898 p10_K7523 G,A 0.062 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1 (NEW calculation)
76a106,113
> PP810610 11194 hCoV229E_Rluc C,A 0 synonymous_variant 10977C>A Ser3659Ser 3659 6758 Gene_217_20492 XBA84229.1
> PP810610 11194 p10_DMSO C,A 0 synonymous_variant 10977C>A Ser3659Ser 3659 6758 Gene_217_20492 XBA84229.1
> PP810610 11194 p10_K22 C,A 0 synonymous_variant 10977C>A Ser3659Ser 3659 6758 Gene_217_20492 XBA84229.1
> PP810610 11194 p10_K7523 C,A 0 synonymous_variant 10977C>A Ser3659Ser 3659 6758 Gene_217_20492 XBA84229.1
> PP810610 11568 hCoV229E_Rluc C,T 0 missense_variant 11351C>T Thr3784Ile 3784 6758 Gene_217_20492 XBA84229.1
> PP810610 11568 p10_DMSO C,T 0 missense_variant 11351C>T Thr3784Ile 3784 6758 Gene_217_20492 XBA84229.1
> PP810610 11568 p10_K22 C,T 0 missense_variant 11351C>T Thr3784Ile 3784 6758 Gene_217_20492 XBA84229.1
> PP810610 11568 p10_K7523 C,T 0 missense_variant 11351C>T Thr3784Ile 3784 6758 Gene_217_20492 XBA84229.1
80a118,121
> PP810610 11706 hCoV229E_Rluc C,A 0 missense_variant 11489C>A Pro3830Gln 3830 6758 Gene_217_20492 XBA84229.1
> PP810610 11706 p10_DMSO C,A 0 missense_variant 11489C>A Pro3830Gln 3830 6758 Gene_217_20492 XBA84229.1
> PP810610 11706 p10_K22 C,A 0 missense_variant 11489C>A Pro3830Gln 3830 6758 Gene_217_20492 XBA84229.1
> PP810610 11706 p10_K7523 C,A 0 missense_variant 11489C>A Pro3830Gln 3830 6758 Gene_217_20492 XBA84229.1
84a126,137
> PP810610 13113 hCoV229E_Rluc C,T 0 synonymous_variant 12897C>T Tyr4299Tyr 4299 6758 Gene_217_20492 XBA84229.1
> PP810610 13113 p10_DMSO C,T 0 synonymous_variant 12897C>T Tyr4299Tyr 4299 6758 Gene_217_20492 XBA84229.1
> PP810610 13113 p10_K22 C,T 0 synonymous_variant 12897C>T Tyr4299Tyr 4299 6758 Gene_217_20492 XBA84229.1
> PP810610 13113 p10_K7523 C,T 0 synonymous_variant 12897C>T Tyr4299Tyr 4299 6758 Gene_217_20492 XBA84229.1
> PP810610 13912 hCoV229E_Rluc G,A 0 missense_variant 13696G>A Gly4566Ser 4566 6758 Gene_217_20492 XBA84229.1
> PP810610 13912 p10_DMSO G,A 0 missense_variant 13696G>A Gly4566Ser 4566 6758 Gene_217_20492 XBA84229.1
> PP810610 13912 p10_K22 G,A 0 missense_variant 13696G>A Gly4566Ser 4566 6758 Gene_217_20492 XBA84229.1
> PP810610 13912 p10_K7523 G,A 0 missense_variant 13696G>A Gly4566Ser 4566 6758 Gene_217_20492 XBA84229.1
> PP810610 15615 hCoV229E_Rluc C,A 0 synonymous_variant 15399C>A Val5133Val 5133 6758 Gene_217_20492 XBA84229.1
> PP810610 15615 p10_DMSO C,A 0 synonymous_variant 15399C>A Val5133Val 5133 6758 Gene_217_20492 XBA84229.1
> PP810610 15615 p10_K22 C,A 0 synonymous_variant 15399C>A Val5133Val 5133 6758 Gene_217_20492 XBA84229.1
> PP810610 15615 p10_K7523 C,A 0 synonymous_variant 15399C>A Val5133Val 5133 6758 Gene_217_20492 XBA84229.1
104a158,161
> PP810610 18919 hCoV229E_Rluc C,T 0 missense_variant 18703C>T Arg6235Cys 6235 6758 Gene_217_20492 XBA84229.1
> PP810610 18919 p10_DMSO C,T 0 missense_variant 18703C>T Arg6235Cys 6235 6758 Gene_217_20492 XBA84229.1
> PP810610 18919 p10_K22 C,T 0 missense_variant 18703C>T Arg6235Cys 6235 6758 Gene_217_20492 XBA84229.1
> PP810610 18919 p10_K7523 C,T 0 missense_variant 18703C>T Arg6235Cys 6235 6758 Gene_217_20492 XBA84229.1
108a166,173
> PP810610 19165 hCoV229E_Rluc C,A 0 missense_variant 18949C>A Arg6317Ser 6317 6758 Gene_217_20492 XBA84229.1
> PP810610 19165 p10_DMSO C,A 0 missense_variant 18949C>A Arg6317Ser 6317 6758 Gene_217_20492 XBA84229.1
> PP810610 19165 p10_K22 C,A 0 missense_variant 18949C>A Arg6317Ser 6317 6758 Gene_217_20492 XBA84229.1
> PP810610 19165 p10_K7523 C,A 0 missense_variant 18949C>A Arg6317Ser 6317 6758 Gene_217_20492 XBA84229.1
> PP810610 19289 hCoV229E_Rluc G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
> PP810610 19289 p10_DMSO G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
> PP810610 19289 p10_K22 G,T 1 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
> PP810610 19289 p10_K7523 G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
128a194,197
> PP810610 21747 hCoV229E_Rluc C,A 0 missense_variant 1253C>A Ser418Tyr 418 1173 Gene_20494_24015 XBA84230.1
> PP810610 21747 p10_DMSO C,A 0 missense_variant 1253C>A Ser418Tyr 418 1173 Gene_20494_24015 XBA84230.1
> PP810610 21747 p10_K22 C,A 0 missense_variant 1253C>A Ser418Tyr 418 1173 Gene_20494_24015 XBA84230.1
> PP810610 21747 p10_K7523 C,A 0 missense_variant 1253C>A Ser418Tyr 418 1173 Gene_20494_24015 XBA84230.1
*2* 131c200
C Gly478Gly 478 1173 Gene_20494_24015 XBA84230.1
—
> PP810610 21928 p10_K22 T,C 0.029 synonymous_variant 1434T>C Gly478Gly 478 1173 Gene_20494_24015 XBA84230.1
140a210,229
> PP810610 22630 hCoV229E_Rluc C,T 0 synonymous_variant 2136C>T Tyr712Tyr 712 1173 Gene_20494_24015 XBA84230.1
> PP810610 22630 p10_DMSO C,T 0 synonymous_variant 2136C>T Tyr712Tyr 712 1173 Gene_20494_24015 XBA84230.1
> PP810610 22630 p10_K22 C,T 0 synonymous_variant 2136C>T Tyr712Tyr 712 1173 Gene_20494_24015 XBA84230.1
> PP810610 22630 p10_K7523 C,T 0 synonymous_variant 2136C>T Tyr712Tyr 712 1173 Gene_20494_24015 XBA84230.1
> PP810610 22788 hCoV229E_Rluc T,C 0 missense_variant 2294T>C Val765Ala 765 1173 Gene_20494_24015 XBA84230.1
> PP810610 22788 p10_DMSO T,C 0 missense_variant 2294T>C Val765Ala 765 1173 Gene_20494_24015 XBA84230.1
> PP810610 22788 p10_K22 T,C 0 missense_variant 2294T>C Val765Ala 765 1173 Gene_20494_24015 XBA84230.1
> PP810610 22788 p10_K7523 T,C 0 missense_variant 2294T>C Val765Ala 765 1173 Gene_20494_24015 XBA84230.1
> PP810610 22820 hCoV229E_Rluc C,T 0 missense_variant 2326C>T Arg776Cys 776 1173 Gene_20494_24015 XBA84230.1
> PP810610 22820 p10_DMSO C,T 0 missense_variant 2326C>T Arg776Cys 776 1173 Gene_20494_24015 XBA84230.1
> PP810610 22820 p10_K22 C,T 0 missense_variant 2326C>T Arg776Cys 776 1173 Gene_20494_24015 XBA84230.1
> PP810610 22820 p10_K7523 C,T 0 missense_variant 2326C>T Arg776Cys 776 1173 Gene_20494_24015 XBA84230.1
> PP810610 22906 hCoV229E_Rluc C,T 0 synonymous_variant 2412C>T Asn804Asn 804 1173 Gene_20494_24015 XBA84230.1
> PP810610 22906 p10_DMSO C,T 0 synonymous_variant 2412C>T Asn804Asn 804 1173 Gene_20494_24015 XBA84230.1
> PP810610 22906 p10_K22 C,T 0 synonymous_variant 2412C>T Asn804Asn 804 1173 Gene_20494_24015 XBA84230.1
> PP810610 22906 p10_K7523 C,T 0 synonymous_variant 2412C>T Asn804Asn 804 1173 Gene_20494_24015 XBA84230.1
> PP810610 22918 hCoV229E_Rluc C,A 0 synonymous_variant 2424C>A Ala808Ala 808 1173 Gene_20494_24015 XBA84230.1
> PP810610 22918 p10_DMSO C,A 0 synonymous_variant 2424C>A Ala808Ala 808 1173 Gene_20494_24015 XBA84230.1
> PP810610 22918 p10_K22 C,A 0 synonymous_variant 2424C>A Ala808Ala 808 1173 Gene_20494_24015 XBA84230.1
> PP810610 22918 p10_K7523 C,A 0 synonymous_variant 2424C>A Ala808Ala 808 1173 Gene_20494_24015 XBA84230.1
*3* 143c232
T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
—
> PP810610 23435 p10_K22 C,T 1 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
144a234,237
> PP810610 23586 hCoV229E_Rluc C,A 0 missense_variant 3092C>A Pro1031Gln 1031 1173 Gene_20494_24015 XBA84230.1
> PP810610 23586 p10_DMSO C,A 0 missense_variant 3092C>A Pro1031Gln 1031 1173 Gene_20494_24015 XBA84230.1
> PP810610 23586 p10_K22 C,A 0 missense_variant 3092C>A Pro1031Gln 1031 1173 Gene_20494_24015 XBA84230.1
> PP810610 23586 p10_K7523 C,A 0 missense_variant 3092C>A Pro1031Gln 1031 1173 Gene_20494_24015 XBA84230.1
‘-minus this record-‘ 149,152c242,249
T,64C>A Leu22Phe,Leu22Ile 22 77 Gene_24674_24907 XBA84233.1
T,64C>A Leu22Phe,Leu22Ile 22 77 Gene_24674_24907 XBA84233.1
T,64C>A Leu22Phe,Leu22Ile 22 77 Gene_24674_24907 XBA84233.1
T,64C>A Leu22Phe,Leu22Ile 22 77 Gene_24674_24907 XBA84233.1
—
> PP810610 24738 hCoV229E_Rluc C,A,T 0 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
> PP810610 24738 p10_DMSO C,A,T 0.011 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
> PP810610 24738 p10_K22 C,A,T 1 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
> PP810610 24738 p10_K7523 C,A,T 0.106 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
> PP810610 24903 hCoV229E_Rluc T,C 0 missense_variant 229T>C Phe77Leu 77 77 Gene_24674_24907 XBA84233.1
> PP810610 24903 p10_DMSO T,C 0 missense_variant 229T>C Phe77Leu 77 77 Gene_24674_24907 XBA84233.1
> PP810610 24903 p10_K22 T,C 0 missense_variant 229T>C Phe77Leu 77 77 Gene_24674_24907 XBA84233.1
> PP810610 24903 p10_K7523 T,C 0 missense_variant 229T>C Phe77Leu 77 77 Gene_24674_24907 XBA84233.1
*4* 154c251
T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
—
> PP810610 25025 p10_DMSO C,T 0.049 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
156a254,257
> PP810610 25432 hCoV229E_Rluc C,A 0 synonymous_variant 513C>A Ala171Ala 171 225 Gene_24919_25596 XBA84234.1
> PP810610 25432 p10_DMSO C,A 0 synonymous_variant 513C>A Ala171Ala 171 225 Gene_24919_25596 XBA84234.1
> PP810610 25432 p10_K22 C,A 0 synonymous_variant 513C>A Ala171Ala 171 225 Gene_24919_25596 XBA84234.1
> PP810610 25432 p10_K7523 C,A 0 synonymous_variant 513C>A Ala171Ala 171 225 Gene_24919_25596 XBA84234.1
161,164c262,289
< PP810610 26885 hCoV229E_Rluc T,A 0 intergenic_region
< PP810610 26885 p10_DMSO T,A 0 intergenic_region
< PP810610 26885 p10_K22 T,A 0.009 intergenic_region
PP810610 26104 hCoV229E_Rluc C,T,A 0 missense_variant 494C>A,494C>T Pro165His,Pro165Leu 165 389 Gene_25610_26779 XBA84235.1
> PP810610 26104 p10_DMSO C,T,A 0 missense_variant 494C>A,494C>T Pro165His,Pro165Leu 165 389 Gene_25610_26779 XBA84235.1
> PP810610 26104 p10_K22 C,T,A 0 missense_variant 494C>A,494C>T Pro165His,Pro165Leu 165 389 Gene_25610_26779 XBA84235.1
> PP810610 26104 p10_K7523 C,T,A 0 missense_variant 494C>A,494C>T Pro165His,Pro165Leu 165 389 Gene_25610_26779 XBA84235.1
> PP810610 26281 hCoV229E_Rluc C,T 0 missense_variant 671C>T Thr224Ile 224 389 Gene_25610_26779 XBA84235.1
> PP810610 26281 p10_DMSO C,T 0 missense_variant 671C>T Thr224Ile 224 389 Gene_25610_26779 XBA84235.1
> PP810610 26281 p10_K22 C,T 0 missense_variant 671C>T Thr224Ile 224 389 Gene_25610_26779 XBA84235.1
> PP810610 26281 p10_K7523 C,T 0 missense_variant 671C>T Thr224Ile 224 389 Gene_25610_26779 XBA84235.1
> PP810610 26307 hCoV229E_Rluc C,A 0 missense_variant 697C>A Gln233Lys 233 389 Gene_25610_26779 XBA84235.1
> PP810610 26307 p10_DMSO C,A 0 missense_variant 697C>A Gln233Lys 233 389 Gene_25610_26779 XBA84235.1
> PP810610 26307 p10_K22 C,A 0 missense_variant 697C>A Gln233Lys 233 389 Gene_25610_26779 XBA84235.1
> PP810610 26307 p10_K7523 C,A 0 missense_variant 697C>A Gln233Lys 233 389 Gene_25610_26779 XBA84235.1
> PP810610 26411 hCoV229E_Rluc C,A 0 synonymous_variant 801C>A Pro267Pro 267 389 Gene_25610_26779 XBA84235.1
> PP810610 26411 p10_DMSO C,A 0 synonymous_variant 801C>A Pro267Pro 267 389 Gene_25610_26779 XBA84235.1
> PP810610 26411 p10_K22 C,A 0 synonymous_variant 801C>A Pro267Pro 267 389 Gene_25610_26779 XBA84235.1
> PP810610 26411 p10_K7523 C,A 0 synonymous_variant 801C>A Pro267Pro 267 389 Gene_25610_26779 XBA84235.1
> PP810610 26500 hCoV229E_Rluc C,A 0 missense_variant 890C>A Pro297Gln 297 389 Gene_25610_26779 XBA84235.1
> PP810610 26500 p10_DMSO C,A 0 missense_variant 890C>A Pro297Gln 297 389 Gene_25610_26779 XBA84235.1
> PP810610 26500 p10_K22 C,A 0 missense_variant 890C>A Pro297Gln 297 389 Gene_25610_26779 XBA84235.1
> PP810610 26500 p10_K7523 C,A 0 missense_variant 890C>A Pro297Gln 297 389 Gene_25610_26779 XBA84235.1
> PP810610 26746 hCoV229E_Rluc C,A 0 missense_variant 1136C>A Ser379Tyr 379 389 Gene_25610_26779 XBA84235.1
> PP810610 26746 p10_DMSO C,A 0 missense_variant 1136C>A Ser379Tyr 379 389 Gene_25610_26779 XBA84235.1
> PP810610 26746 p10_K22 C,A 0 missense_variant 1136C>A Ser379Tyr 379 389 Gene_25610_26779 XBA84235.1
> PP810610 26746 p10_K7523 C,A 0 missense_variant 1136C>A Ser379Tyr 379 389 Gene_25610_26779 XBA84235.1
> PP810610 26885 hCoV229E_Rluc T,A 0 intergenic_region n.26885T>A Gene_25610_26779-CHR_END Gene_25610_26779-CHR_END
> PP810610 26885 p10_DMSO T,A 0 intergenic_region n.26885T>A Gene_25610_26779-CHR_END Gene_25610_26779-CHR_END
> PP810610 26885 p10_K22 T,A 0.009 intergenic_region n.26885T>A Gene_25610_26779-CHR_END Gene_25610_26779-CHR_END
> PP810610 26885 p10_K7523 T,A 0.011 intergenic_region n.26885T>A Gene_25610_26779-CHR_END Gene_25610_26779-CHR_END
TODOs:
Schnaps-Idee: we can organize the results with a additional column 2025, so at the end:
#chr pos n.a. alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
#chr pos sample2024 sample2025 alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
hCoV229E_Rluc
that means, we need to delete the SNP results of 2025 for hCoV229E_Rluc, p10_DMSO, p10_K22, p10_K7523. we have only three new samples of data.
If the SNP ist complete new in 2025, the 2024 data should be all ‘0’
For the
please generate the report according to the SNP-comparison between 2024 and 2025:
!!!!TODO_TOMORROW!!!!:
1. Using the following report, however copy the results of 2024 to new table so that we can unify the results! Marking all new added results yellow.
2. If the SNP ist complete new in 2025, the 2024 data should be all ‘0’, should all 7 mark yellow.
3. One for 0.01 and one for 0.05, in this way, we can also present the results 2024_0.01.
4. Copy the pipeline process to xgenes.com!
2024:
chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
PP810610 1492 hCoV229E_Rluc T,A 0.207 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p10_DMSO T,A 0.081 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p10_K22 T,A 0.854 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p10_K7523 T,A 0.229 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 8289 hCoV229E_Rluc C,A 0.325 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p10_DMSO C,A 0.028 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p10_K22 C,A 0 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p10_K7523 C,A 0.831 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8294 hCoV229E_Rluc A,G 0.179 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p10_DMSO A,G 0.024 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p10_K22 A,G 0.074 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p10_K7523 A,G 0 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 9174 hCoV229E_Rluc G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p10_DMSO G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p10_K22 G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p10_K7523 G,A 0.066 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 10239 hCoV229E_Rluc T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p10_DMSO T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p10_K22 T,G 0.055 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p10_K7523 T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10310 hCoV229E_Rluc G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p10_DMSO G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p10_K22 G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p10_K7523 G,A 0.156 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10871 hCoV229E_Rluc C,T 0.006 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_DMSO C,T 0.058 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K22 C,T 1 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K7523 C,T 0.823 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10898 hCoV229E_Rluc G,A 0.012 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p10_DMSO G,A 0.036 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p10_K22 G,A 0 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p10_K7523 G,A 0.064 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 11577 hCoV229E_Rluc A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p10_DMSO A,C 0.184 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p10_K22 A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p10_K7523 A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 18640 hCoV229E_Rluc T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p10_DMSO T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p10_K22 T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p10_K7523 T,G 0.055 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 21027 hCoV229E_Rluc C,T 0 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p10_DMSO C,T 0.186 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p10_K22 C,T 0 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p10_K7523 C,T 0.032 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 hCoV229E_Rluc T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p10_DMSO T,C 0.08 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p10_K22 T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p10_K7523 T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 hCoV229E_Rluc T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p10_DMSO T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p10_K22 T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p10_K7523 T,G 0.078 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 hCoV229E_Rluc C,T 0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_DMSO C,T 0.032 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K22 C,T 0.995 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K7523 C,T 0.835 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 25592 hCoV229E_Rluc T,C 0.012 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p10_DMSO T,C 0.925 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p10_K22 T,C 0 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p10_K7523 T,C 0 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
2025:
chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein
PP810610 1492 hCoV229E_Rluc T,A 0.207 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p10_DMSO T,A 0.081 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p10_K22 T,A 0.854 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p10_K7523 T,A 0.229 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p16_DMSO T,A 0.043 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p16_K22 T,A 0.893 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 1492 p16_X7523 T,A 0.179 synonymous_variant 1275T>A Thr425Thr 425 6758 Gene_217_20492 XBA84229.1
PP810610 8289 hCoV229E_Rluc C,A 0.325 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p10_DMSO C,A 0.028 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p10_K22 C,A 0 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p10_K7523 C,A 0.831 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p16_DMSO C,A 0 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p16_K22 C,A 0 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8289 p16_X7523 C,A 0.226 missense_variant 8072C>A Ala2691Asp 2691 6758 Gene_217_20492 XBA84229.1
PP810610 8294 hCoV229E_Rluc A,G 0.179 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p10_DMSO A,G 0.024 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p10_K22 A,G 0.074 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p10_K7523 A,G 0 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p16_DMSO A,G 0 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p16_K22 A,G 0.145 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 8294 p16_X7523 A,G 0 missense_variant 8077A>G Lys2693Glu 2693 6758 Gene_217_20492 XBA84229.1
PP810610 9174 hCoV229E_Rluc G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p10_DMSO G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p10_K22 G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p10_K7523 G,A 0.066 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p16_DMSO G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p16_K22 G,A 0 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 9174 p16_X7523 G,A 0.025 missense_variant 8957G>A Cys2986Tyr 2986 6758 Gene_217_20492 XBA84229.1
PP810610 10145 hCoV229E_Rluc A,G 0 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10145 p10_DMSO A,G 0 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10145 p10_K22 A,G 0 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10145 p10_K7523 A,G 0.045 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10145 p16_DMSO A,G 0 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10145 p16_K22 A,G 0 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10145 p16_X7523 A,G 0.064 missense_variant 9928A>G Met3310Val 3310 6758 Gene_217_20492 XBA84229.1
PP810610 10239 hCoV229E_Rluc T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p10_DMSO T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p10_K22 T,G 0.055 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p10_K7523 T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p16_DMSO T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p16_K22 T,G 0.08 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10239 p16_X7523 T,G 0 missense_variant 10022T>G Val3341Gly 3341 6758 Gene_217_20492 XBA84229.1
PP810610 10310 hCoV229E_Rluc G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p10_DMSO G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p10_K22 G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p10_K7523 G,A 0.156 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p16_DMSO G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p16_K22 G,A 0 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10310 p16_X7523 G,A 0.091 missense_variant 10093G>A Val3365Ile 3365 6758 Gene_217_20492 XBA84229.1
PP810610 10871 hCoV229E_Rluc C,T 0.006 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_DMSO C,T 0.058 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K22 C,T 1 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p10_K7523 C,T 0.823 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_DMSO C,T 0.029 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_K22 C,T 0.991 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10871 p16_X7523 C,T 0.878 missense_variant 10654C>T Leu3552Phe 3552 6758 Gene_217_20492 XBA84229.1
PP810610 10898 hCoV229E_Rluc G,A 0.012 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p10_DMSO G,A 0.036 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p10_K22 G,A 0 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p10_K7523 G,A 0.062 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p16_DMSO G,A 0.018 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p16_K22 G,A 0 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 10898 p16_X7523 G,A 0.044 missense_variant 10681G>A Gly3561Ser 3561 6758 Gene_217_20492 XBA84229.1
PP810610 11577 hCoV229E_Rluc A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p10_DMSO A,C 0.184 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p10_K22 A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p10_K7523 A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p16_DMSO A,C 0.946 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p16_K22 A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 11577 p16_X7523 A,C 0 missense_variant 11360A>C Glu3787Ala 3787 6758 Gene_217_20492 XBA84229.1
PP810610 18640 hCoV229E_Rluc T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p10_DMSO T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p10_K22 T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p10_K7523 T,G 0.055 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p16_DMSO T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p16_K22 T,G 0 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 18640 p16_X7523 T,G 0.183 missense_variant 18424T>G Phe6142Val 6142 6758 Gene_217_20492 XBA84229.1
PP810610 19289 hCoV229E_Rluc G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_DMSO G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_K22 G,T 1 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p10_K7523 G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_DMSO G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_K22 G,T 0.988 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 19289 p16_X7523 G,T 0 missense_variant 19073G>T Gly6358Val 6358 6758 Gene_217_20492 XBA84229.1
PP810610 21027 hCoV229E_Rluc C,T 0 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p10_DMSO C,T 0.186 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p10_K22 C,T 0 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p10_K7523 C,T 0.032 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p16_DMSO C,T 0.954 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p16_K22 C,T 0.009 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21027 p16_X7523 C,T 0.158 missense_variant 533C>T Thr178Ile 178 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 hCoV229E_Rluc T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p10_DMSO T,C 0.08 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p10_K22 T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p10_K7523 T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p16_DMSO T,C 0.015 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p16_K22 T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 21633 p16_X7523 T,C 0 missense_variant 1139T>C Val380Ala 380 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 hCoV229E_Rluc T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p10_DMSO T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p10_K22 T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p10_K7523 T,G 0.078 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p16_DMSO T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p16_K22 T,G 0 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 22215 p16_X7523 T,G 0.033 missense_variant 1721T>G Val574Gly 574 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 hCoV229E_Rluc C,T 0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_DMSO C,T 0.032 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K22 C,T 1 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p10_K7523 C,T 0.835 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_DMSO C,T 0 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_K22 C,T 0.958 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 23435 p16_X7523 C,T 0.132 missense_variant 2941C>T Leu981Phe 981 1173 Gene_20494_24015 XBA84230.1
PP810610 24738 hCoV229E_Rluc C,A,T 0 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 24738 p10_DMSO C,A,T 0.011 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 24738 p10_K22 C,A,T 1 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 24738 p10_K7523 C,A,T 0.106 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 24738 p16_DMSO C,A,T 0 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 24738 p16_K22 C,A,T 1 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 24738 p16_X7523 C,A,T 0.958 missense_variant 64C>A,64C>T Leu22Ile,Leu22Phe 22 77 Gene_24674_24907 XBA84233.1
PP810610 25025 hCoV229E_Rluc C,T 0 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25025 p10_DMSO C,T 0.049 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25025 p10_K22 C,T 0 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25025 p10_K7523 C,T 0 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25025 p16_DMSO C,T 0.057 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25025 p16_K22 C,T 0 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25025 p16_X7523 C,T 0.016 missense_variant 106C>T His36Tyr 36 225 Gene_24919_25596 XBA84234.1
PP810610 25592 hCoV229E_Rluc T,C 0.012 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p10_DMSO T,C 0.925 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p10_K22 T,C 0 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p10_K7523 T,C 0 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p16_DMSO T,C 0.935 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p16_K22 T,C 0 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
PP810610 25592 p16_X7523 T,C 0.013 missense_variant 673T>C Phe225Leu 225 225 Gene_24919_25596 XBA84234.1
the mail I generate answer:
Lieber Jiabin,
wir hatten im Sommer 2024 ein Projekt gemeinsam mit Thomas Pietschmann zu 229E Coronavirus, wo wir sequenziert hatten und die Adaption des Virus durch Variantenanalyse bestimmt hatten. Ich habe Dir die alten Auswertungen angehängt.
Die Viren sind nun ohne Selektionsdruck weiterpassagiert worden und die Frage ist ob die damals aufgefundenen Mutationen erhalten bleiben oder verloren gehen.
Könntest Du diese drei Proben (siehe link von Patrick https://public.leibniz-liv.de/sharing/tuBWPq3ca
) im Vergleich zu dem adaptierten Virus von 2024 analysieren?
Viele Grüße
Nicole
7. Merge intra- and inter-host variants, comparing the variants to the alignments of the assemblies to confirm its correctness.
cat NC_001348.fasta viralngs/data/02_assembly/VZV_20S.fasta viralngs/data/02_assembly/VZV_60S.fasta > aligned_1.fasta
mafft –clustalout aligned_1.fasta > aligned_1.aln
#~/Scripts/convert_fasta_to_clustal.py aligned_1.fasta_orig aligned_1.aln
~/Scripts/convert_clustal_to_clustal.py aligned_1.aln aligned_1_.aln
#manully delete the postion with all or ‘-‘ in aligned_1_.aln
~/Scripts/check_sequence_differences.py aligned_1_.aln
~/Scripts/check_sequence_differences.py aligned_1_.aln > aligned_1.res
grep -v ” = n” aligned_1.res > aligned_1_.res
cat NC_001348.fasta viralngs/tmp/02_assembly/VZV_20S.assembly4-refined.fasta viralngs/tmp/02_assembly/VZV_60S.assembly4-refined.fasta > aligned_1.fasta
mafft –clustalout aligned_1.fasta > aligned_1.aln
~/Scripts/convert_clustal_to_clustal.py aligned_1.aln aligned_1_.aln
~/Scripts/check_sequence_differences.py aligned_1_.aln > aligned_1.res
grep -v ” = n” aligned_1.res > aligned_1_.res
#Differences found at the following positions (150):
Position 8956: OP297860.1 = A, HSV1_S1-1 = A, HSV-Klinik_S2-1 = G
Position 8991: OP297860.1 = A, HSV1_S1-1 = A, HSV-Klinik_S2-1 = C
Position 8992: OP297860.1 = T, HSV1_S1-1 = C, HSV-Klinik_S2-1 = C
Position 8995: OP297860.1 = T, HSV1_S1-1 = T, HSV-Klinik_S2-1 = C
Position 9190: OP297860.1 = T, HSV1_S1-1 = A, HSV-Klinik_S2-1 = T
* Position 13659: OP297860.1 = G, HSV1_S1-1 = T, HSV-Klinik_S2-1 = G
* Position 47969: OP297860.1 = C, HSV1_S1-1 = T, HSV-Klinik_S2-1 = C
* Position 53691: OP297860.1 = G, HSV1_S1-1 = T, HSV-Klinik_S2-1 = G
* Position 55501: OP297860.1 = T, HSV1_S1-1 = C, HSV-Klinik_S2-1 = C
* Position 63248: OP297860.1 = G, HSV1_S1-1 = T, HSV-Klinik_S2-1 = G
Position 63799: OP297860.1 = T, HSV1_S1-1 = C, HSV-Klinik_S2-1 = T
* Position 64328: OP297860.1 = C, HSV1_S1-1 = A, HSV-Klinik_S2-1 = C
Position 65179: OP297860.1 = T, HSV1_S1-1 = T, HSV-Klinik_S2-1 = C
* Position 65225: OP297860.1 = G, HSV1_S1-1 = G, HSV-Klinik_S2-1 = A
* Position 95302: OP297860.1 = C, HSV1_S1-1 = A, HSV-Klinik_S2-1 = C
gunzip isnvs.annot.txt.gz
~/Scripts/filter_isnv.py isnvs.annot.txt 0.05
cut -d$’\t’ filtered_isnvs.annot.txt -f1-7
chr pos sample patient time alleles iSNV_freq
OP297860 13203 HSV1_S1 HSV1_S1 T,C,A 1.0
OP297860 13203 HSV-Klinik_S2 HSV-Klinik_S2 T,C,A 1.0
OP297860 13522 HSV1_S1 HSV1_S1 G,T 1.0
OP297860 13522 HSV-Klinik_S2 HSV-Klinik_S2 G,T 0.008905554253573941
OP297860 13659 HSV1_S1 HSV1_S1 G,T 1.0
OP297860 13659 HSV-Klinik_S2 HSV-Klinik_S2 G,T 0.008383233532934131
~/Scripts/convert_clustal_to_fasta.py aligned_1_.aln aligned_1.fasta
samtools faidx aligned_1.fasta
samtools faidx aligned_1.fasta OP297860.1 > OP297860.1.fasta
samtools faidx aligned_1.fasta HSV1_S1-1 > HSV1_S1-1.fasta
samtools faidx aligned_1.fasta HSV-Klinik_S2-1 > HSV-Klinik_S2-1.fasta
seqkit seq OP297860.1.fasta -w 70 > OP297860.1_w70.fasta
diff OP297860.1_w70.fasta ../../refsel_db/refsel.fasta
8. Consensus sequences of each and of all isolates
cp data/02_assembly/*.fasta ./
for sample in 838_S1 840_S2 820_S3 828_S4 815_S5 834_S6 808_S7 811_S8 837_S9 768_S10 773_S11 767_S12 810_S13 814_S14 10121-16_S15 7510-15_S16 828-17_S17 8806-15_S18 9881-16_S19 8981-14_S20; do
for sample in p953-84660-tsek p938-16972-nra p942-88507-nra p943-98523-nra p944-103323-nra p947-105565-nra p948-112830-nra; do \
mv ${sample}.fasta ${sample}.fa
cat all.fa ${sample}.fa >> all.fa
done
cat RSV_dedup.fa all.fa > RSV_all.fa
mafft –adjustdirection RSV_all.fa > RSV_all.aln
snp-sites RSV_all.aln -o RSV_all_.aln
9. Download all Human alphaherpesvirus 3 (Varicella-zoster virus) genomes
Human alphaherpesvirus 3
acronym: HHV-3 VZV
equivalent: Human herpes virus 3
Human alphaherpesvirus 3 (Varicella-zoster virus)
* Human herpesvirus 3 strain Dumas
* Human herpesvirus 3 strain Oka vaccine
* Human herpesvirus 3 VZV-32
#Taxonomy ID: 10335
esearch -db nucleotide -query “txid10335[Organism:exp]” | efetch -format fasta -email j.huang@uke.de > genome_10335_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_10335_ncbi.fasta complete_genome_10335_ncbi.fasta #2041–>165
# —- Download related genomes from ENA —-
https://www.ebi.ac.uk/ena/browser/view/10335
#Click “Sequence” and download “Counts” (2003) and “Taxon descendants count” (2005) if there is enough time! Downloading time points is 11.03.2025.
python ~/Scripts/filter_fasta.py ena_10335_sequence.fasta complete_genome_10335_ena_taxon_descendants_count.fasta #2005–>153
#python ~/Scripts/filter_fasta.py ena_10335_sequence_Counts.fasta complete_genome_10335_ena_Counts.fasta #xxx, 5.8G
https://www.ebi.ac.uk/ena/browser/view/10239
https://www.ebi.ac.uk/ena/browser/view/2497569
https://www.ebi.ac.uk/ena/browser/view/Taxon:2497569
ena_10239_sequence.fasta
esearch -db nucleotide -query “txid10239[Organism:exp]” | efetch -format fasta -email j.huang@uke.de > genome_10239_ncbi.fasta
10. Using Multi-CAR for scaffolding the contigs (If not useful, choose another scaffolding tool, e.g. https://github.com/malonge/RagTag)
All contigs over 500 bp were successfully scaffolded to the graft genome using Multi-CAR (13), resulting in a chromosomal assembly of 4,506,689 bp.
https://genome.cs.nthu.edu.tw/Multi-CAR/
https://github.com/ablab-nthu/Multi-CSAR
11. Using the bowtie of vrap to map the reads on ref_genome/reference.fasta (The reference refers to the closest related genome found from the list generated by vrap)
(vrap) vrap/vrap.py -1 trimmed/VZV_20S_trimmed_P_1.fastq -2 trimmed/VZV_20S_trimmed_P_2.fastq -o VZV_20S_on_X04370 –host /home/jhuang/DATA/Data_Huang_Human_herpesvirus_3/X04370.fasta -t 100 -l 200 -g
cd bowtie
mv mapped mapped.sam
samtools view -S -b mapped.sam > mapped.bam
samtools sort mapped.bam -o mapped_sorted.bam
samtools index mapped_sorted.bam
samtools view -H mapped_sorted.bam
samtools flagstat mapped_sorted.bam
12. Show the bw on IGV
13. Reports
diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly4-refined.fasta
diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly1-spades.fasta
diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly2-scaffolded.fasta
diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly2-gapfilled.fasta
diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly3-modify.fasta
diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly4-refined.fasta
./2040_04.assembly2-alternate_sequences.fasta
./2040_04.assembly2-scaffold_ref.fasta
# —————————————– END —————————————–
Lieber Jiabin,
wir hatten im Sommer 2024 ein Projekt gemeinsam mit Thomas Pietschmann zu 229E Coronavirus, wo wir sequenziert hatten und die Adaption des Virus durch Variantenanalyse bestimmt hatten. Ich habe Dir die alten Auswertungen angehängt.
Die Viren sind nun ohne Selektionsdruck weiterpassagiert worden und die Frage ist ob die damals aufgefundenen Mutationen erhalten bleiben oder verloren gehen.
Könntest Du diese drei Proben (siehe link von Patrick https://public.leibniz-liv.de/sharing/tuBWPq3ca) im Vergleich zu dem adaptierten Virus von 2024 analysieren?
Viele Grüße
Nicole
亲爱的Jiabin,
我们在2024年夏天与Thomas Pietschmann一起进行了一项关于229E冠状病毒的项目,我们进行了测序,并通过变异分析确定了病毒的适应性。我已经将之前的分析结果附在了邮件中。
这些病毒现在已经在没有选择压力的情况下继续通过传代,我们的问题是,之前发现的突变是否会保持下来还是会消失。
你能否分析一下这三份样本(请参阅Patrick提供的链接:https://public.leibniz-liv.de/sharing/tuBWPq3ca),并与2024年适应的病毒进行比较?
此致
Nicole
Von: Blümke, Patrick
Gesendet: Freitag, 9. Mai 2025 16:26
An: ‘nfischer@uke.de’
Betreff: [EXT] RE: Re: Re: [EXTERN] NGS von adaptiertem hCoV-229E Virus
Liebe Nicole,
wir haben die 3 RNA-Proben von Thomas Pietschmann mittlerweile sequenziert und du kannst die Daten hier herunterladen:
https://public.leibniz-liv.de/sharing/tuBWPq3ca
LG und ein schönes Wochenende,
Patrick
From: nfischer@uke.de
Sent: Mittwoch, 23. April 2025 12:42
To: Blümke, Patrick
Subject: WG: [EXT] Re: Re: [EXTERN] NGS von adaptiertem hCoV-229E Virus
Lieber Patrick,
Thomas Pietschmann/Twin Core Hannover hat drei weitere Proben geschickt, RNA aus dem Überstand von passagiertem CoV229E. Er würde sie gerne bei Euch nochmal sequenzieren und Jiabin würde die Auswertung entsprechend der vorherigen Analyse machen.
Ich weiß nicht, wie tief Ihr damals sequenziert habt. Ich denke 5Mio reads? Kann ich Euch die RNA bringen und habt Ihr die Kapazität diese Libraries zu machen und zu sequenzieren?
LG
Nicole
Von: KC6hler, Natalie
Gesendet: Freitag, 4. April 2025 11:21
An: Nicole Fischer ; Pietschmann, Thomas ; Sibylle Haid ; Nicole Fischer ; Pietschmann, Thomas ; Sibylle Haid
Betreff: [EXT] NGS von adaptiertem hCoV-229E Virus
Liebe Nicole,
vielen Dank nochmal für die Sequenzierung unseres adaptierten hCoV-229E Virus. Die Ergebnisse haben uns sehr weitergeholfen. Wir haben nun unsere adaptierten hCoV-229E Viruspopulationen ohne Selektionsdruck weiter passagiert, um zu schauen, ob unsere Mutationen und die Resistenz wieder verloren geht oder erhalten bleibt. Dazu haben wir auch bereits einen ersten Phänotyp zeigen können, den wir nochmal validieren wollen. Thomas und ich haben uns daher gefragt, ob Ihr nach der Validierung unsere neuen Virusstocks nochmal sequenzieren könntet (es wären drei Proben)?
Vielen Dank schon mal für die Hilfe und ich wünsche ein schönes sonniges Wochenende!
Liebe Grüße aus Hannover
Natalie
——–
Natalie Köhler, M. Sc.
PhD Student
AG Prof. Thomas Pietschmann
TWINCORE – Centre for Experimental and Clinical Infection Research
Institute for Experimental Virology
Feodor-Lynen-Str. 7 | D-30625 Hannover
Tel.: +49 (0)511 22002-7138
E-Mail: natalie.koehler@twincore.de
Von: “Pietschmann, Thomas”
Datum: Mittwoch, 26. Juni 2024 um 10:36
An: Nicole Fischer , “Haid, Sibylle (Twincore)” , Köhler, Natalie
Betreff: Re: [EXTERN] AW: [EXT] AW: NGS von adaptiertem CoV-229E Virus
Liebe Nicole, liebe alle,
Dir und deinen Teammitgliedern, die beteiligt waren vielen Dank für die Analyse der Viren.
Das sind sehr interessante Ergebnisse.
Wir finden nicht die vorbeschriebenen Mutationen, die nach Selektion mit K22 erzeugt wurden (siehe Lundin et al in der Anlage V..)
Wir finden zwei identische Mutationen, die sowohl bei K22 als auch bei K7523 selektiert wurden (Leu3552Phe, Leu981Phe)
Wir finden eine zusätzliche Mutation für K7523 (Ala2691Asp)
Keine der Mutationen ist direkt in Nsp6 (Leu2552Phe müsste der N-terminus von Nsp7 sein); habe auf die schnelle kein annotiertes 229E Genom mit unserer Sequenz gefunden.. stattdessen AGT21366.1 verwendet, dort ist allerdings 3552 ein Serin (https://www.ncbi.nlm.nih.gov/protein/AGT21366.1)
Die Mutation die nur durch K7523 (nicht durch K22) selektiert wurde sitzt in nsp4 (Ala 2691Asp; https://www.ncbi.nlm.nih.gov/protein/AGT21366.1)
Die gemeinsame zweite gemeinsame Mutation Leu981Phe sitzt in nsp3, Papain-like protease PLpro (https://www.ncbi.nlm.nih.gov/protein/AGT21366.1)
PLpro spaltet nur nsp1-4, die Spaltung zw. nsp6-7 wird durch CLpro gemacht.
Natalie, kannst Du all das bitte nachprüfen? Am besten mit einem annotierten hCoV229E Genom, das unserer Sequenz entspricht. Wir sollte nun sofort in Kooperation die Varianten klonieren und testen, welche Varienten-(Kombinationen) Resistenz vermitteln. Spannend wird auch die Selektion auf SARS-CoV-2. Es könnte auch interessant sein, die Selektion nochmal auf hCoV229E zu wiederholen (wieder selbige Mutation)… vielleicht auch eine Selektion in einer anderen Zelllinie… gibt es weitere Lösungen oder immer selbige?
Viele Grüße
Thomas
PS: Findest du weitere Papiere, die K22 Resistenzen beschreiben? Wenn ja, wo liegen die?
Von: “nfischer@uke.de”
Datum: Dienstag, 25. Juni 2024 um 18:58
An: Sibylle Haid
Cc: “‘KC6hler,Natalie'” , Thomas Pietschmann
Betreff: [EXTERN] AW: [EXT] AW: NGS von adaptiertem CoV-229E Virus
Liebe Sibylle, Liebe Natalie, lieber Thomas,
anbei die Ergebnisse des variant callings der Proben. Es gab etwas Probleme mit der Referenzsequenz assembly, weshalb wir auf eine andere accession number ausweichen mussten. Jiabin hat PP810610 als Referenz genommen aufgrund des unvollständigen de novo assemblies der mitgeschickten Referenz. PP810610 ist nur wenig abweichend von den Bereichen des Reportervirus, die assembled werden konnten.
Gerne beantworten wir Fragen zu den Analysen. Jiabin schaut sich nochmal eine Position genauer an, Ihr werdet es sehen, an Position xxx zeigt die DMSO Kontrolle eine Abweichung. Wir schauen uns gerade die IGV files dazu an, warum das so ist. Das könnt Ihr für jetzt erstmal ignorieren.
Herzliche Grüße
Nicole
# ——————————————————————-
# —- !! DEPRECATED mamba env configuration due to too many conflicts, use docker instead (see above) !! —-
#mamba activate /home/jhuang/miniconda3/envs/viral-ngs4
mkdir viralngs
ln -s ~/Tools/viral-ngs/Snakefile Snakefile
ln -s ~/Tools/viral-ngs/bin bin
cp ~/Tools/viral-ngs/refsel.acids refsel.acids
cp ~/Tools/viral-ngs/lastal.acids lastal.acids
cp ~/Tools/viral-ngs/config.yaml config.yaml
cp ~/Tools/viral-ngs/samples-runs.txt samples-runs.txt
cp ~/Tools/viral-ngs/samples-depletion.txt samples-depletion.txt
cp ~/Tools/viral-ngs/samples-metagenomics.txt samples-metagenomics.txt
cp ~/Tools/viral-ngs/samples-assembly.txt samples-assembly.txt
cp ~/Tools/viral-ngs/samples-assembly-failures.txt samples-assembly-failures.txt
# — DEBUG: If the env disappeared, reinstall the env viral-ngs4 —
# — Running time hints —
#Note that novoalign is not installed. The used Novoalign path: /home/jhuang/Tools/novocraft_v3/novoalign; the used gatk: /usr/local/bin/gatk using /home/jhuang/Tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar.
#Samtools path: #Why, the samtools in the env is v1.6?
#Novoalign path: /home/jhuang/Tools/novocraft_v3/novoalign
#GATK path: /usr/local/bin/gatk # jar_file in the file: jar_file = ‘/home/jhuang/Tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar’
# — in config.yaml —
#GATK_PATH: “/home/jhuang/Tools/GenomeAnalysisTK-3.6”
#NOVOALIGN_PATH: “/home/jhuang/Tools/novocraft_v3”
mamba list or mamba list blast
mamba create -n viral-ngs4 python=3.6
mamba activate viral-ngs4
mamba install blast=2.6.0 bmtagger biopython pysam pyyaml picard mvicuna pybedtools fastqc matplotlib spades last=876 -c conda-forge -c bioconda
#mafft=7.221 –> mafft since └─ mafft 7.221** is not installable because it conflicts with any installable versions previously reported.
mamba install cd-hit cd-hit-auxtools diamond gap2seq=2.1 mafft mummer4 muscle=3.8 parallel pigz prinseq samtools=1.6 tbl2asn trimmomatic trinity unzip vphaser2 bedtools -c r -c defaults -c conda-forge -c bioconda
mamba install bwa
mamba install vphaser2=2.0
# Sovle confilict between bowtie, bowtie2 and snpeff
mamba remove bowtie
mamba install bowtie2
mamba remove snpeff
mamba install snpeff=4.1l
#which snpEff
mamba install gatk=3.6
#DEBUG if FileNotFoundError: [Errno 2] No such file or directory: ‘/usr/local/bin/gatk’: ‘/usr/local/bin/gatk’
#IMPORTANT_UPDATE jar_file in the file /home/jhuang/mambaforge/envs/viral-ngs4/bin/gatk3 with “/home/jhuang/Tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar”
#IMPORTANT_REPLACE “sudo cp /home/jhuang/mambaforge/envs/viral-ngs4/bin/gatk3 /usr/local/bin/gatk”
#IMPORTANT_SET /home/jhuang/Tools/GenomeAnalysisTK-3.6 as GATK_PATH in config.yaml
#IMPORTANT_CHECK if it works
# java -jar /home/jhuang/Tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar -T RealignerTargetCreator –help
# /usr/local/bin/gatk -T RealignerTargetCreator –help
#IMPORTANT_NOTE that the env viral-ngs4 cannot logined from the base env due to the python3-conflict!
# —- BUG_2025_1 —-
bin/taxon_filter.py deplete data/00_raw/1762_04.bam tmp/01_cleaned/1762_04.raw.bam tmp/01_cleaned/1762_04.bmtagger_depleted.bam tmp/01_cleaned/1762_04.rmdup.bam data/01_cleaned/1762_04.cleaned.bam –bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 –blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters –threads 15 –srprismMemory 14250 –JVMmemory 50g –loglevel DEBUG
2025-05-22 12:10:18,313 – __init__:445:_attempt_install – DEBUG – Currently installed version of blast: 2.16.0-hc155240_3
2025-05-22 12:10:18,314 – __init__:448:_attempt_install – DEBUG – Expected version of blast: 2.6.0
2025-05-22 12:10:18,314 – __init__:449:_attempt_install – DEBUG – Incorrect version of blast installed. Removing it…
2025-05-23 09:58:43,151 – __init__:445:_attempt_install – DEBUG – Currently installed version of bmtagger: 3.101-h470a237_4
2025-05-23 09:58:45,326 – __init__:445:_attempt_install – DEBUG – Currently installed version of blast: 2.7.1-h4422958_6
2025-05-23 09:58:45,327 – __init__:448:_attempt_install – DEBUG – Expected version of blast: 2.6.0
2025-05-23 09:58:45,327 – __init__:449:_attempt_install – DEBUG – Incorrect version of blast installed. Removing it…
# —-
# # Some not errorous intermediate commands
# fastqc -f bam data/02_align_to_self/p10_K7523.bam -o reports/fastqc/p10_K7523
# bin/intrahost.py vphaser_one_sample data/02_align_to_self/p10_K7523.mapped.bam data/02_assembly/p10_K7523.fasta data/04_intrahost/vphaser2.p10_K7523.txt.gz –vphaserNumThreads 15 –removeDoublyMappedReads –minReadsEach 5 –maxBias 10
# bin/read_utils.py align_and_fix data/01_per_sample/p16_DMSO.cleaned.bam data/02_assembly/p16_DMSO.fasta –outBamAll data/02_align_to_self/p16_DMSO.bam –outBamFiltered data/02_align_to_self/p16_DMSO.mapped.bam –aligner novoalign –aligner_options ‘-r Random -l 20 -g 40 -x 20 -t 100 -k’ –threads 15
# bin/intrahost.py merge_to_vcf ref_genome/reference.fasta data/04_intrahost/isnvs.vcf.gz –samples 2039_04 2040_04 1762_04 1243_2 875_04 –isnvs data/04_intrahost/vphaser2.2039_04.txt.gz data/04_intrahost/vphaser2.2040_04.txt.gz data/04_intrahost/vphaser2.1762_04.txt.gz data/04_intrahost/vphaser2.1243_2.txt.gz data/04_intrahost/vphaser2.875_04.txt.gz –alignments data/03_multialign_to_ref/aligned_1.fasta –strip_chr_version –parse_accession –loglevel DEBUG Protected: Muster Angabe for Habilitation for shared first and last authorships
Pipeline: Whole-Genome Validation of *Acinetobacter baumannii* ATCC19606 Deletion Mutants (Data_Foong_RNAseq_2021_ATCC19606_Cm)
This workflow describes the assembly, scaffolding, variant calling, and structural variant validation used to confirm the genomic integrity of A. baumannii ATCC19606 WT and the ΔadeAB, ΔadeIJ, and ΔcraA mutants.
1. Raw Read Preparation and Assembly
Create project structure
mkdir bacto_DNAseq
cd bacto_DNAseq
mkdir raw_data
cd raw_data
Link raw FASTQ files
ln -s ../../X101SC26025981-Z02-J001/01.RawData/19606_adeAB/19606_adeAB_1.fq.gz 19606adeAB_R1.fastq.gz
ln -s ../../X101SC26025981-Z02-J001/01.RawData/19606_adeAB/19606_adeAB_2.fq.gz 19606adeAB_R2.fastq.gz
# Additional files present in the project
A10CraA_R1.fastq.gz
A10CraA_R2.fastq.gz
A6WT_R1.fastq.gz
A6WT_R2.fastq.gz
adeIJ_R1.fastq.gz
adeIJ_R2.fastq.gz
Install and run the bacto_DNAseq pipeline including assembly
git clone https://github.com/huang/bacto_DNAseq
mv bacto_DNAseq/* ./
rm -rf bacto_DNAseq
conda activate /home/jhuang/miniconda3/envs/bengal3_ac3
snakemake --printshellcmds
Notes
-
Edit
bacto_DNAseq-0.1.jsonto enable only:assemblytyping_mlst- optionally
pangenome variants_calling
- The pipeline requires access to:
/media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
Original commands
# ---------------------------- Assembly using bacto ----------------------------
mkdir bacto_DNAseq; cd bacto_DNAseq;
mkdir raw_data; cd raw_data;
ln -s ../../X101SC26025981-Z02-J001/01.RawData/19606_adeAB/19606_adeAB_1.fq.gz 19606adeAB_R1.fastq.gz
ln -s ../../X101SC26025981-Z02-J001/01.RawData/19606_adeAB/19606_adeAB_2.fq.gz 19606adeAB_R2.fastq.gz
./A10CraA_R1.fastq.gz
./A10CraA_R2.fastq.gz
./A6WT_R1.fastq.gz
./A6WT_R2.fastq.gz
./adeIJ_R1.fastq.gz
./adeIJ_R2.fastq.gz
git clone https://github.com/huang/bacto_DNAseq
mv bacto_DNAseq/* ./
rm -rf bacto_DNAseq
conda activate /home/jhuang/miniconda3/envs/bengal3_ac3
(bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ which snakemake
/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snakemake
(bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ snakemake -v
4.0.0 --> CORRECT!
#NOTE_1: modify bacto_DNAseq-0.1.json keeping only steps assembly, typing_mlst, possibly pangenome and variants_calling true!
#NOTE_2: needs disk Titisee since the pipeline needs /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
snakemake --printshellcmds
2. Contig Filtering and Chromosome Scaffolding
After SPAdes assembly, contigs shorter than 500 bp are removed:
seqkit seq -m 500 A6WT/contigs.fa > A6WT_contigs.min500.fasta
seqkit seq -m 500 19606adeAB/contigs.fa > adeAB_contigs.min500.fasta
seqkit seq -m 500 A10CraA/contigs.fa > A10CraA_contigs.min500.fasta
seqkit seq -m 500 adeIJ/contigs.fa > adeIJ_contigs.min500.fasta
Chromosomal contigs are then identified by alignment against the reference genome CP059040.fasta using minimap2. Contigs lacking alignment are interpreted as putative plasmids and excluded from scaffolding.
WT
Excluded plasmid contig:
contig00016
ΔadeAB
Excluded plasmid contigs:
contig00029
contig00030
contig00033
contig00039
ΔcraA
Excluded plasmid contig:
contig00096
ΔadeIJ
Excluded plasmid contigs:
contig00017
contig00019
contig00020
contig00021
contig00025
Scaffold chromosome using RagTag
Example for ΔadeAB:
ragtag.py scaffold CP059040.fasta adeAB_contigs.min500.no29_30_33_39.fasta -o ragtag_adeAB -C
The scaffolded chromosome is concatenated with excluded plasmid contigs to generate the final assembly.
Original commands
# ----------------------------- Scaffolding ------------------------------
cd shovill
seqkit seq -m 500 contigs.fa > contigs.min500.fasta
#seqkit seq -g -m 500 contigs.fa > contigs.min500_g.fasta
#For project 2:
seqkit seq -m 500 adeABadeIJ_contigs.fa > adeABadeIJ_contigs.min500.fasta
seqkit seq -m 500 adeIJK_contigs.fa > adeIJK_contigs.min500.fasta
#For project 1:
seqkit seq -m 500 A6WT/contigs.fa > A6WT_contigs.min500.fasta
seqkit seq -m 500 19606adeAB/contigs.fa > adeAB_contigs.min500.fasta
seqkit seq -m 500 A10CraA/contigs.fa > A10CraA_contigs.min500.fasta
seqkit seq -m 500 adeIJ/contigs.fa > adeIJ_contigs.min500.fasta
# NOT_NEED_ANYMORE: Perform online scaffolding with Multi-CSAR v1.1 (https://genome.cs.nthu.edu.tw/Multi-CSAR/) --> Using new methods minimap2 + RagTag!
#2
adeABadeIJ 29 contigs
adeIJK 22 contigs
#1
A6WT 22 contigs -1
adeAB 40 contigs -4
adeIJ 27 contigs -5
A10CraA 24 contigs -1
#2
minimap2 -cx asm20 --paf-no-hit ../CP059040.fasta adeABadeIJ_contigs.min500.fasta > asm20_all.paf
awk '$6=="*"{print $1}' asm20_all.paf
#-->contig00020
#-->contig00021
#-->contig00027
seqkit grep -v -r \
-p "^contig00020([[:space:]]|$)" \
-p "^contig00021([[:space:]]|$)" \
-p "^contig00027([[:space:]]|$)" \
adeABadeIJ_contigs.min500.fasta > adeABadeIJ_contigs.min500.no20_21_27.fasta
(ragtag_env) ragtag.py scaffold ../CP059040.fasta adeABadeIJ_contigs.min500.no20_21_27.fasta -o ragtag_adeABadeIJ -C
minimap2 -cx asm20 --paf-no-hit ../CP059040.fasta adeIJK_contigs.min500.fasta > asm20_all.paf
awk '$6=="*"{print $1}' asm20_all.paf
#-->contig00016
seqkit grep -v -r -p "^contig00016(\s|$)" adeIJK_contigs.min500.fasta > adeIJK_contigs.min500.no16.fasta
(ragtag_env) ragtag.py scaffold ../CP059040.fasta adeIJK_contigs.min500.no16.fasta -o ragtag_adeIJK -C
#1
(ragtag_env) minimap2 -cx asm20 --paf-no-hit ../CP059040.fasta A6WT_contigs.min500.fasta > asm20_all.paf
awk '$6=="*"{print $1}' asm20_all.paf
#-->contig00016
seqkit grep -v -r -p "^contig00016(\s|$)" A6WT_contigs.min500.fasta > A6WT_contigs.min500.no16.fasta
seqkit grep -r -p "^contig00016(\s|$)" A6WT_contigs.min500.fasta > A6WT_contigs.min500.16.fasta
(ragtag_env) ragtag.py scaffold ../CP059040.fasta A6WT_contigs.min500.no16.fasta -o ragtag_A6WT -C
cat ragtag_A6WT/ragtag.scaffold.fasta A6WT_contigs.min500.16.fasta > A6WT_chr_plasmids_.fasta
sed 's/^>Chr0_RagTag$/NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN/' A6WT_chr_plasmids_.fasta > A6WT_chr_plasmids__.fasta
seqkit seq A6WT_chr_plasmids__.fasta > A6WT_chr_plasmids.fasta
(ragtag_env) minimap2 -cx asm20 --paf-no-hit ../CP059040.fasta adeAB_contigs.min500.fasta > asm20_all.paf
awk '$6=="*"{print $1}' asm20_all.paf
#-->contig00029
#-->contig00030
#-->contig00033
#-->contig00039
seqkit grep -v -r \
-p "^contig00029([[:space:]]|$)" \
-p "^contig00030([[:space:]]|$)" \
-p "^contig00033([[:space:]]|$)" \
-p "^contig00039([[:space:]]|$)" \
adeAB_contigs.min500.fasta > adeAB_contigs.min500.no29_30_33_39.fasta
seqkit grep -r \
-p "^contig00029([[:space:]]|$)" \
-p "^contig00030([[:space:]]|$)" \
-p "^contig00033([[:space:]]|$)" \
-p "^contig00039([[:space:]]|$)" \
adeAB_contigs.min500.fasta > adeAB_contigs.min500.29_30_33_39.fasta
(ragtag_env) ragtag.py scaffold ../CP059040.fasta adeAB_contigs.min500.no29_30_33_39.fasta -o ragtag_adeAB -C
cat ragtag_adeAB/ragtag.scaffold.fasta adeAB_contigs.min500.29_30_33_39.fasta > adeAB_chr_plasmids_.fasta
sed 's/^>Chr0_RagTag$/NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN/' adeAB_chr_plasmids_.fasta > adeAB_chr_plasmids__.fasta
seqkit seq adeAB_chr_plasmids__.fasta > adeAB_chr_plasmids.fasta
samtools faidx adeAB_chr_plasmids.fasta
(ragtag_env) minimap2 -cx asm20 --paf-no-hit ../CP059040.fasta A10CraA_clean.fasta > asm20_all.paf
awk '$6=="*"{print $1}' asm20_all.paf
#-->contig00096
seqkit grep -v -r \
-p "^contig00096([[:space:]]|$)" \
A10CraA_clean.fasta > A10CraA_contigs.min500.no96.fasta
seqkit grep -r \
-p "^contig00096([[:space:]]|$)" \
A10CraA_clean.fasta > A10CraA_contigs.min500.96.fasta
(ragtag_env) ragtag.py scaffold ../CP059040.fasta A10CraA_contigs.min500.no96.fasta -o ragtag_A10CraA -C
cat ragtag_A10CraA/ragtag.scaffold.fasta A10CraA_contigs.min500.96.fasta > A10CraA_chr_plasmids_.fasta
sed 's/^>Chr0_RagTag$/NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN/' A10CraA_chr_plasmids_.fasta > A10CraA_chr_plasmids__.fasta
seqkit seq A10CraA_chr_plasmids__.fasta > A10CraA_chr_plasmids.fasta
(ragtag_env) minimap2 -cx asm20 --paf-no-hit ../CP059040.fasta adeIJ_contigs.min500.fasta > asm20_all.paf
awk '$6=="*"{print $1}' asm20_all.paf
#contig00017
#contig00019
#contig00020
#contig00021
#contig00025
seqkit grep -v -r \
-p "^contig00017([[:space:]]|$)" \
-p "^contig00019([[:space:]]|$)" \
-p "^contig00020([[:space:]]|$)" \
-p "^contig00021([[:space:]]|$)" \
-p "^contig00025([[:space:]]|$)" \
adeIJ_contigs.min500.fasta > adeIJ_contigs.min500.no17_19_20_21_25.fasta
seqkit grep -r \
-p "^contig00017([[:space:]]|$)" \
-p "^contig00019([[:space:]]|$)" \
-p "^contig00020([[:space:]]|$)" \
-p "^contig00021([[:space:]]|$)" \
-p "^contig00025([[:space:]]|$)" \
adeIJ_contigs.min500.fasta > adeIJ_contigs.min500.17_19_20_21_25.fasta
(ragtag_env) ragtag.py scaffold ../CP059040.fasta adeIJ_contigs.min500.no17_19_20_21_25.fasta -o ragtag_adeIJ -C
cat ragtag_adeIJ/ragtag.scaffold.fasta adeIJ_contigs.min500.17_19_20_21_25.fasta > adeIJ_chr_plasmids_.fasta
sed 's/^>Chr0_RagTag$/NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN/' adeIJ_chr_plasmids_.fasta > adeIJ_chr_plasmids__.fasta
seqkit seq adeIJ_chr_plasmids__.fasta > adeIJ_chr_plasmids.fasta
samtools faidx adeIJ_chr_plasmids.fasta
3. Final FASTA Header Format for NCBI Submission
Example headers:
>Chr [location=chromosome] [topology=circular] [completeness=partial]
>contig00029 [plasmid-name=pAdeAB1] [topology=circular] [completeness=partial]
Important: completeness=incomplete is not accepted by NCBI and must be replaced with:
completeness=partial
Automatic correction:
sed -i 's/completeness=incomplete/completeness=partial/g' *.fasta
Original commands
# IUPUT assembled and scaffolded files
#./shovill/A6WT_chr_plasmids.fasta
#./shovill/A10CraA_chr_plasmids.fasta
#./shovill/adeAB_chr_plasmids.fasta
#./shovill/adeIJ_chr_plasmids.fasta
# 备份原文件
cp A6WT_chr_plasmids.fasta A6WT_chr_plasmids.fasta.backup
# 替换错误的 completeness=incomplete 为 completeness=partial
sed -i 's/completeness=incomplete/completeness=partial/g' A6WT_chr_plasmids.fasta
## 或者移除所有 topology 和 completeness 标签(最安全)
#sed -i 's/ \[topology=[^]]*\]//g' A6WT_chr_plasmids.fasta
#sed -i 's/ \[completeness=[^]]*\]//g' A6WT_chr_plasmids.fasta
(bengal3_ac3) jhuang@WS-2290C:/mnt/md1/DATA/Data_Foong_RNAseq_2021_ATCC19606_Cm/bacto_DNAseq/shovill$ grep ">" A6WT_chr_plasmids.fasta
>Chr [location=chromosome] [topology=circular] [completeness=partial]
>contig00016 [plasmid-name=pWT1] [topology=circular] [completeness=partial]
(bengal3_ac3) jhuang@WS-2290C:/mnt/md1/DATA/Data_Foong_RNAseq_2021_ATCC19606_Cm/bacto_DNAseq/shovill$ grep ">" A10CraA_chr_plasmids.fasta
>Chr [location=chromosome] [topology=circular] [completeness=partial]
>contig00096 [plasmid-name=pCraA1] [topology=circular] [completeness=partial]
(bengal3_ac3) jhuang@WS-2290C:/mnt/md1/DATA/Data_Foong_RNAseq_2021_ATCC19606_Cm/bacto_DNAseq/shovill$ grep ">" adeAB_chr_plasmids.fasta
>Chr [location=chromosome] [topology=circular] [completeness=partial]
>contig00029 [plasmid-name=pAdeAB1] [topology=circular] [completeness=partial]
>contig00030 [plasmid-name=pAdeAB2] [topology=circular] [completeness=partial]
>contig00033 [plasmid-name=pAdeAB3] [topology=circular] [completeness=partial]
>contig00039 [plasmid-name=pAdeAB4] [topology=circular] [completeness=partial]
(bengal3_ac3) jhuang@WS-2290C:/mnt/md1/DATA/Data_Foong_RNAseq_2021_ATCC19606_Cm/bacto_DNAseq/shovill$ grep ">" adeIJ_chr_plasmids.fasta
>Chr [location=chromosome] [topology=circular] [completeness=partial]
>contig00017 [plasmid-name=pAdeIJ1] [topology=circular] [completeness=partial]
>contig00019 [plasmid-name=pAdeIJ2] [topology=circular] [completeness=partial]
>contig00020 [plasmid-name=pAdeIJ3] [topology=circular] [completeness=partial]
>contig00021 [plasmid-name=pAdeIJ4] [topology=circular] [completeness=partial]
>contig00025 [plasmid-name=pAdeIJ5] [topology=circular] [completeness=partial]
4. SNP and Small Indel Detection
Two independent pipelines are used:
- Snippy v4.6.0
- SPANDx v3.2
Only variants detected by both methods are retained.
Snippy summary
python3 ~/Scripts/summarize_snippy_res.py snippy
SPANDx run
nextflow run spandx/main.nf \
--fastq "*_P_{1,2}.fastq.gz" \
--ref CP059040.fasta \
--annotation \
--database CP059040 \
-resume
Merge final variant calls
python3 ~/Scripts/merge_snps_indels.py \
bacto_DNAseq/snippy/summary_snps_indels.csv \
spandx/Outputs/Phylogeny_and_annotation/f1_7___ \
merged_variants.csv
Result: no SNPs or small indels unique to any mutant outside the intended deletion loci.
Original commands
# ---------------------------- SNP+Indel using snippy ----------------------------
#Summarize all SNPs and Indels from the snippy result directory
#NOTE: need to adapt the isolate names in summarize_snippy_res.py
#Output: snippy_CP133676/snippy/summary_snps_indels.csv
#Aapt the sample names in ~/Scripts/summarize_snippy_res.py "A6WT", "A10CraA", "19606adeAB", "adeIJ"
python3 ~/Scripts/summarize_snippy_res.py snippy
#? in the record they are not 100% identical: CP059040,1527276,TTGAACC,del,TTGAACC,T,TTGAACC,TTGAACC,conservative_inframe_deletion c.1327_1332delGAACCT p.Glu443_Pro444del,,,,,,H0N29_07175,nan
# CP059040,3124917,T,snp,T,T,C,T,nan,,,,,,nan,nan --> gene AB!
# ---------------------------- SNP+Indel using spandx ----------------------------
mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/CP059040
cp CP059040.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/CP059040/genes.gbk
vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
/home/jhuang/miniconda3/envs/spandx/bin/snpEff build CP059040 -d
mkdir spandx
cp bacto_DNAseq/trimmed/*_P_*fastq spandx
cd spandx
gzip A6WT_trimmed_P_1.fastq A6WT_trimmed_P_2.fastq 19606adeAB_trimmed_P_1.fastq 19606adeAB_trimmed_P_2.fastq ...
cp CP059040.fasta spandx
#grep ">" CP059040.fasta
#>CP059040
conda activate /home/jhuang/miniconda3/envs/spandx
ln -s /home/jhuang/Tools/spandx/ spandx
(spandx) nextflow run spandx/main.nf --fastq "*_P_{1,2}.fastq.gz" --ref CP059040.fasta --annotation --database CP059040 -resume
# ---------------------------- post-processing of Outputs since BUG: all indels and snps are annotated as MODIFIER --> taking only position, the annotation take we from snippy ----------------------
#-- _CP133676 produced by SPANDx (temporary not necessary) --
cd Outputs/Phylogeny_and_annotation
#awk '{if($3!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
cut -d$'\t' -f1-7 All_SNPs_indels_annotated.txt > f1_7
grep -v "/" f1_7 > f1_7_
grep -v "\." f1_7_ > f1_7__
grep -v "*" f1_7__ > f1_7___ #(35 records)
# ---------------------------- merge the following two files summary_snps_indels.csv (192) and All_SNPs_indels_annotated.txt (248) –> merged_variants.csv (94) ----------------------------
#Note that the results for the project is from manually selected the common annotated snp and indels as bacto_DNAseq/snippy/summary_snps_indels.csv
CHROM,POS,REF,TYPE,A6WT,A10CraA,19606adeAB,adeIJ,Effect,Impact,Functional_Class,Codon_change,Protein_and_nucleotide_change,Amino_Acid_Length,Gene_name,Biotype
CP059040,136447,T,snp,A,A,A,A,nan,,,,,,nan,nan
CP059040,152318,T,snp,A,A,A,A,missense_variant c.106A>T p.Thr36Ser,,,,,,H0N29_00700,nan
CP059040,171965,T,snp,A,A,A,A,missense_variant c.167T>A p.Val56Asp,,,,,,H0N29_00780,nan
CP059040,194020,C,ins,CT,CT,CT,CT,frameshift_variant c.3260dupA p.Arg1088fs,,,,,,H0N29_00885,nan
CP059040,375297,A,snp,T,T,T,T,nan,,,,,,nan,nan
CP059040,468931,A,ins,AT,AT,AT,AT,nan,,,,,,nan,nan
CP059040,468976,A,ins,AT,AT,AT,AT,nan,,,,,,nan,nan
CP059040,609979,A,snp,T,T,T,T,stop_gained c.1009A>T p.Lys337*,,,,,,H0N29_02925,nan
CP059040,1036730,T,ins,TA,TA,TA,TA,nan,,,,,,nan,nan
CP059040,1059847,C,snp,T,T,T,T,missense_variant c.119G>A p.Gly40Asp,,,,,,H0N29_04880,nan
CP059040,1272639,T,snp,G,G,G,G,nan,,,,,,nan,nan
CP059040,1300706,A,snp,G,G,G,G,synonymous_variant c.351A>G p.Ser117Ser,,,,,,H0N29_06100,nan
CP059040,1970200,T,snp,C,C,C,C,missense_variant c.281T>C p.Val94Ala,,,,,,H0N29_09260,nan
CP059040,2383727,A,ins,AT,AT,AT,AT,nan,,,,,,nan,nan
CP059040,2477628,T,ins,TA,TA,TA,TA,nan,,,,,,nan,nan
CP059040,2525852,A,ins,AT,AT,AT,AT,frameshift_variant c.529dupA p.Ile177fs,,,,,,H0N29_11845,nan
CP059040,3016359,A,ins,AT,AT,AT,AT,frameshift_variant c.2136dupA p.Phe713fs,,,,,,H0N29_14350,cas3f
CP059040,3111299,A,snp,G,G,G,G,synonymous_variant c.957T>C p.Ile319Ile,,,,,,H0N29_14775,nan
CP059040,3124917,T,snp,T,T,C,T,nan,,,,,,nan,nan
CP059040,3310021,C,ins,CT,CT,CT,CT,nan,,,,,,nan,nan
CP059040,3542741,GC,del,G,G,G,G,frameshift_variant c.1746delC p.Arg583fs,,,,,,H0N29_16865,nan
CP059040,3542934,CT,del,C,C,C,C,frameshift_variant c.1938delT p.Met647fs,,,,,,H0N29_16865,nan
CP059040,3570717,AC,del,A,A,A,A,frameshift_variant c.187delC p.Gln63fs,,,,,,H0N29_16960,nan
CP059040,3629616,C,ins,CT,CT,CT,CT,nan,,,,,,nan,nan
CP059040,3873573,A,snp,G,G,G,G,missense_variant c.1705A>G p.Asn569Asp,,,,,,H0N29_18380,nan
CP059040,1527276,TTGAACC,del,TTGAACC,T,TTGAACC,TTGAACC,conservative_inframe_deletion c.1327_1332delGAACCT p.Glu443_Pro444del,,,,,,H0N29_07175,nan
Deleted: *CP059040,3124917,T,snp,T,C,nan,,,,,,nan,nan
* 1527276
# #python3 ~/Scripts/merge_snps_indels.py bacto_DNAseq/snippy/summary_snps_indels.csv spandx/Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt merged_variants.csv
# python3 ~/Scripts/merge_snps_indels.py bacto_DNAseq/snippy/summary_snps_indels.csv spandx/Outputs/Phylogeny_and_annotation/f1_7___ merged_variants.csv
# #check if the number of the output file is correct?
# comm -12 <(cut -d, -f2 bacto_DNAseq/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 spandx/Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq) | wc -l #26
# comm -12 <(cut -d, -f2 bacto_DNAseq/snippy/summary_snps_indels.csv | sort | uniq) <(cut -f2 spandx/Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt | sort | uniq)
5. Structural Variant Detection
Structural variants are identified by assembly-to-reference comparison using:
- MUMmer / nucmer
- delta-filter
- Assemblytics
Example workflow:
Original commands
# install Environment
mamba create -n sv_assembly \
python=3.10 \
minimap2 \
mummer4 \
samtools \
bcftools \
syri \
assemblytics \
-c conda-forge -c bioconda
mamba activate sv_assembly
mamba install numpy pandas -c conda-forge
which paftools.js
minimap2 --version
nucmer --version
syri -h
Assemblytics
#mamba install mummerplot igv -c bioconda
# 4 Tools, only the 4th works
#Scenario Tool
#reads vs reference Sniffles / SVIM --> NOT WORKING
#assembly vs reference SyRI (best) --> NOT WORKING
#quick & simple minimap2 + paftools --> NOT WORKING
#classic SV detection Assemblytics --> WORKING (see below)
#--> Choose option 4: Assemblytics (classic SV tool for assemblies)
mamba activate sv_assembly
# ---------------- A6WT vs CP059040 (-->Nothing) ----------------
#nucmer --maxmatch -l 100 -c 500 CP059040.fasta shovill/A6WT/contigs.fa -p A6WT
nucmer --maxmatch -l 100 -c 500 CP059040.fasta ./shovill/A6WT_chr_plasmids.fasta -p A6WT
delta-filter -1 -q A6WT.delta > A6WT.filtered.delta
Assemblytics A6WT.filtered.delta A6WT_assemblytics 1000 100 50000
grep -w "Insertion" A6WT_assemblytics.Assemblytics_structural_variants.bed > A6WT_insertions.bed
wc -l A6WT_assemblytics.Assemblytics_structural_variants.bed
cut -f4 A6WT_assemblytics.Assemblytics_structural_variants.bed | sort | uniq -c
cat A6WT_assemblytics.Assemblytics_assembly_stats.txt
show-coords -rcl A6WT.filtered.delta | head -20
Assemblytics A6WT.filtered.delta A6WT_assemblytics_v2 500 50 100000
CP059040 216043 226883 Assemblytics_b_1 10739 + Repeat_contraction 10840 101 CP059040_RagTag:192505-192606:+ between_alignments
CP059040 491041 496665 Assemblytics_b_2 5523 + Repeat_contraction 5624 101 CP059040_RagTag:456766-456867:+ between_alignments
CP059040 523243 528864 Assemblytics_b_3 5520 + Repeat_contraction 5621 101 CP059040_RagTag:483445-483546:+ between_alignments
CP059040 785771 794627 Assemblytics_b_4 8755 + Repeat_contraction 8856 101 CP059040_RagTag:740453-740554:+ between_alignments
CP059040 1327560 1328441 Assemblytics_b_5 780 + Repeat_contraction 881 101 CP059040_RagTag:1273488-1273589:+ between_alignments
CP059040 2190218 2191651 Assemblytics_b_7 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2135360-2135461:+ between_alignments
CP059040 2259736 2260384 Assemblytics_b_8 135 + Tandem_contraction -648 -783 CP059040_RagTag:2203411-2204194:- between_alignments
CP059040 2477722 2488562 Assemblytics_b_9 10739 + Repeat_contraction 10840 101 CP059040_RagTag:2421405-2421506:+ between_alignments
CP059040 2579725 2581158 Assemblytics_b_10 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2512670-2512771:+ between_alignments
CP059040 2810861 2863470 Assemblytics_b_11 52668 + Tandem_contraction 52609 -59 CP059040_RagTag:2742415-2742474:- between_alignments
CP059040 3089638 3090457 Assemblytics_b_12 718 + Repeat_contraction 819 101 CP059040_RagTag:2968584-2968685:+ between_alignments
CP059040 3124916 3125037 Assemblytics_b_13 198 + Tandem_contraction 121 -77 CP059040_RagTag:3003067-3003144:- between_alignments
CP059040 3282693 3288043 Assemblytics_b_14 5249 + Repeat_contraction 5350 101 CP059040_RagTag:3160723-3160824:+ between_alignments
CP059040 3642789 3643608 Assemblytics_b_15 718 + Repeat_contraction 819 101 CP059040_RagTag:3515569-3515670:+ between_alignments
CP059040 3732412 3737994 Assemblytics_b_16 5481 + Repeat_contraction 5582 101 CP059040_RagTag:3604474-3604575:+ between_alignments
CP059040 3881974 3882855 Assemblytics_b_17 780 + Repeat_contraction 881 101 CP059040_RagTag:3748555-3748656:+ between_alignments
CP059040 3942815 3948161 Assemblytics_b_18 5245 + Repeat_contraction 5346 101 CP059040_RagTag:3808616-3808717:+ between_alignments
# ---------------- A10CraA vs CP059040 (1157 nt deletion) ----------------
#nucmer --maxmatch -l 100 -c 500 CP059040.fasta shovill/A10CraA_clean.fasta -p A10CraA
nucmer --maxmatch -l 100 -c 500 CP059040.fasta ./shovill/A10CraA_chr_plasmids.fasta -p A10CraA
delta-filter -1 -q A10CraA.delta > A10CraA.filtered.delta
Assemblytics A10CraA.filtered.delta A10CraA_assemblytics 1000 100 50000
grep -w "Insertion" A10CraA_assemblytics.Assemblytics_structural_variants.bed > A10CraA_insertions.bed
wc -l A10CraA_assemblytics.Assemblytics_structural_variants.bed
cut -f4 A10CraA_assemblytics.Assemblytics_structural_variants.bed | sort | uniq -c
cat A10CraA_assemblytics.Assemblytics_assembly_stats.txt
show-coords -rcl A10CraA.filtered.delta | head -20
Assemblytics A10CraA.filtered.delta A10CraA_assemblytics_v2 500 50 100000
#reference ref_start ref_stop ID size strand type ref_gap_size query_gap_size query_coordinates method
#CP059040 2810861 2863470 Assemblytics_b_2 52668 + Tandem_contraction 52609 -59 contig00003:229645-229704:- between_alignments
#CP059040 3124916 3125037 Assemblytics_b_3 198 + Tandem_contraction 121 -77 contig00009:157580-157657:- between_alignments
CP059040 364652 365809 Assemblytics_b_4 1154 + Deletion 1157 3 contig00011:137770-137773:+ between_alignments
#reference ref_start ref_stop ID size strand type ref_gap_size query_gap_size query_coordinates method
CP059040 216043 226883 Assemblytics_b_1 10739 + Repeat_contraction 10840 101 CP059040_RagTag:192505-192606:+ between_alignments
* CP059040 364652 365809 Assemblytics_b_2 1154 + Deletion 1157 3 CP059040_RagTag:330375-330378:+ between_alignments
CP059040 414041 414151 Assemblytics_b_3 211 + Tandem_expansion -110 101 CP059040_RagTag:378720-378821:+ between_alignments
CP059040 465315 465425 Assemblytics_b_4 211 + Tandem_expansion -110 101 CP059040_RagTag:430205-430306:+ between_alignments
CP059040 491041 496665 Assemblytics_b_5 5523 + Repeat_contraction 5624 101 CP059040_RagTag:456034-456135:+ between_alignments
CP059040 523243 528864 Assemblytics_b_6 5520 + Repeat_contraction 5621 101 CP059040_RagTag:482713-482814:+ between_alignments
CP059040 785771 794627 Assemblytics_b_7 8755 + Repeat_contraction 8856 101 CP059040_RagTag:739721-739822:+ between_alignments
CP059040 1327560 1328441 Assemblytics_b_8 780 + Repeat_contraction 881 101 CP059040_RagTag:1272756-1272857:+ between_alignments
CP059040 2190218 2191651 Assemblytics_b_10 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2134628-2134729:+ between_alignments
CP059040 2477722 2488562 Assemblytics_b_11 10739 + Repeat_contraction 10840 101 CP059040_RagTag:2420802-2420903:+ between_alignments
CP059040 2579725 2581158 Assemblytics_b_12 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2512067-2512168:+ between_alignments
CP059040 2810861 2863470 Assemblytics_b_13 52668 + Tandem_contraction 52609 -59 CP059040_RagTag:2741812-2741871:- between_alignments
CP059040 3089638 3090457 Assemblytics_b_14 718 + Repeat_contraction 819 101 CP059040_RagTag:2967981-2968082:+ between_alignments
CP059040 3124916 3125037 Assemblytics_b_15 198 + Tandem_contraction 121 -77 CP059040_RagTag:3002464-3002541:- between_alignments
CP059040 3282693 3288043 Assemblytics_b_16 5249 + Repeat_contraction 5350 101 CP059040_RagTag:3160120-3160221:+ between_alignments
CP059040 3642789 3643608 Assemblytics_b_17 718 + Repeat_contraction 819 101 CP059040_RagTag:3514966-3515067:+ between_alignments
CP059040 3732412 3737887 Assemblytics_b_18 5374 + Repeat_contraction 5475 101 CP059040_RagTag:3603871-3603972:+ between_alignments
CP059040 3881974 3882855 Assemblytics_b_19 780 + Repeat_contraction 881 101 CP059040_RagTag:3748059-3748160:+ between_alignments
CP059040 3942815 3948161 Assemblytics_b_20 5245 + Repeat_contraction 5346 101 CP059040_RagTag:3808120-3808221:+ between_alignments
gene 364609..365838
/locus_tag="craA"
CDS 364609..365838
/locus_tag="H0N29_01695"
/inference="COORDINATES: similar to AA
sequence:RefSeq:YP_004996877.1"
/note="Derived by automated computational analysis using
gene prediction method: Protein Homology."
/codon_start=1
/transl_table=11
/product="MFS transporter"
/protein_id="QNT85352.1"
/db_xref="GI:1906908782"
/translation="MKNIQTTALNRTTLMFPLALVLFEFAVYIGNDLIQPAMLAITEDFGVSATWAPSSMSFYLLGGASVAWLLGPLSDRLGRKKVLLSGVLFFALCCFLILLTRQIEHFLTLRFLQGIGLSVISAVGYAAIQENFAERDAIKVMALMANISLLAPLLGPVLGAFLIDYVSWHWGFVAIALLALLSWVGLKKQMPSHKVSVTKQPFSYLFDDFKKVFSNRQFLGLTLALPLVGMPLMLWIALSPIILVDELKLTSVQYGLAQFPVFLGLIVGNIVLIKIIDRLALGKTVLIGLPIMLTGTLILILGVVWQAYLIPCLLIGMTLICFGEGISFSVLYRFALMSSEVSKGTVAAAVSMLLMTSFFAMIELVRYLYTQFHLWAFVLSAFAFIALWFTQPRLALKREMQERVAQDLH"
chloramphenicol efflux MFS transporter CraA [Acinetobacter pittii]
Sequence ID: WP_016142916.1Length: 409Number of Matches: 1
# ---------------- 19606adeAB vs CP059040 (4282 nt deletion) ----------------
# Step 1: Align assemblies to reference
#nucmer --maxmatch -l 100 -c 500 CP059040.fasta shovill/19606adeAB/contigs.fa -p 19606adeAB
nucmer --maxmatch -l 100 -c 500 CP059040.fasta ./shovill/adeAB_chr_plasmids.fasta -p 19606adeAB
# Step 2: Filter alignments (1-to-1 best matches)
delta-filter -1 -q 19606adeAB.delta > 19606adeAB.filtered.delta
# Note: Use -1 for 1-to-1, not -r -q
# Step 3: Run Assemblytics with ALL 5 parameters
Assemblytics 19606adeAB.filtered.delta 19606adeAB_assemblytics 1000 100 50000
# Step 5: Extract large insertions only
grep -w "Insertion" 19606adeAB_assemblytics.Assemblytics_structural_variants.bed > 19606adeAB_insertions.bed
# 6. Check if ANY variants were detected (any size)
wc -l 19606adeAB_assemblytics.Assemblytics_structural_variants.bed
# 7. View variant type distribution
cut -f4 19606adeAB_assemblytics.Assemblytics_structural_variants.bed | sort | uniq -c
# 8. Check alignment coverage (are contigs aligning well?)
cat 19606adeAB_assemblytics.Assemblytics_assembly_stats.txt
# 9. Check raw delta file for alignment blocks
show-coords -rcl 19606adeAB.filtered.delta | head -20
# 10. If bed file is empty, try relaxing parameters and re-run:
Assemblytics 19606adeAB.filtered.delta 19606adeAB_assemblytics_v2 500 50 100000
# └─unique─┘ └min┘ └──max──┘
CP059040 2810861 2863470 Assemblytics_b_1 52668 + Tandem_contraction 52609 -59 contig00005:229645-229704:- between_alignments
CP059040 3124916 3125037 Assemblytics_b_2 198 + Tandem_contraction 121 -77 contig00012:92096-92173:- between_alignments
3124917
--> DELETE!
CP059040 1844323 1848605 Assemblytics_b_5 4282 + Deletion 4282 0 contig00024:26693-26693:+ between_alignments
#reference ref_start ref_stop ID size strand type ref_gap_size query_gap_size query_coordinates method
CP059040 216043 226883 Assemblytics_b_1 10739 + Repeat_contraction 10840 101 CP059040_RagTag:192505-192606:+ between_alignments
CP059040 375124 375138 Assemblytics_b_2 115 + Insertion -14 101 CP059040_RagTag:340861-340962:+ between_alignments
CP059040 491041 496665 Assemblytics_b_5 5523 + Repeat_contraction 5624 101 CP059040_RagTag:456894-456995:+ between_alignments
CP059040 523243 528864 Assemblytics_b_6 5520 + Repeat_contraction 5621 101 CP059040_RagTag:483573-483674:+ between_alignments
CP059040 785771 794627 Assemblytics_b_8 8755 + Repeat_contraction 8856 101 CP059040_RagTag:740606-740707:+ between_alignments
CP059040 1327560 1328441 Assemblytics_b_9 780 + Repeat_contraction 881 101 CP059040_RagTag:1273641-1273742:+ between_alignments
CP059040 1607031 1607049 Assemblytics_b_12 83 + Insertion 18 101 CP059040_RagTag:1552326-1552427:+ between_alignments
* CP059040 1844323 1848605 Assemblytics_b_14 4282 + Deletion 4282 0 CP059040_RagTag:1789745-1789745:+ between_alignments
CP059040 1852044 1852303 Assemblytics_b_15 158 + Repeat_contraction 259 101 CP059040_RagTag:1793184-1793285:+ between_alignments
CP059040 2147239 2147252 Assemblytics_b_17 114 + Insertion -13 101 CP059040_RagTag:2088243-2088344:+ between_alignments
CP059040 2178043 2178056 Assemblytics_b_18 114 + Insertion -13 101 CP059040_RagTag:2119161-2119262:+ between_alignments
CP059040 2180008 2180010 Assemblytics_b_19 99 + Insertion 2 101 CP059040_RagTag:2121227-2121328:+ between_alignments
CP059040 2190218 2191651 Assemblytics_b_20 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2131536-2131637:+ between_alignments
CP059040 2488562 2488672 Assemblytics_b_22 211 + Tandem_expansion -110 101 CP059040_RagTag:2428616-2428717:+ between_alignments
CP059040 2579725 2581158 Assemblytics_b_23 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2519881-2519982:+ between_alignments
CP059040 2810861 2863470 Assemblytics_b_24 52668 + Tandem_contraction 52609 -59 CP059040_RagTag:2749626-2749685:- between_alignments
CP059040 2873400 2873644 Assemblytics_b_25 143 + Repeat_contraction 244 101 CP059040_RagTag:2759556-2759657:+ between_alignments
CP059040 3089638 3090457 Assemblytics_b_26 718 + Repeat_contraction 819 101 CP059040_RagTag:2975652-2975753:+ between_alignments
CP059040 3124916 3125037 Assemblytics_b_27 198 + Tandem_contraction 121 -77 CP059040_RagTag:3010135-3010212:- between_alignments
CP059040 3217209 3217578 Assemblytics_b_28 268 + Repeat_contraction 369 101 CP059040_RagTag:3102307-3102408:+ between_alignments
CP059040 3282677 3288043 Assemblytics_b_29 5265 + Repeat_contraction 5366 101 CP059040_RagTag:3167507-3167608:+ between_alignments
CP059040 3642789 3643608 Assemblytics_b_30 718 + Repeat_contraction 819 101 CP059040_RagTag:3522353-3522454:+ between_alignments
CP059040 3732428 3737887 Assemblytics_b_31 5358 + Repeat_contraction 5459 101 CP059040_RagTag:3611274-3611375:+ between_alignments
CP059040 3881974 3882855 Assemblytics_b_33 780 + Repeat_contraction 881 101 CP059040_RagTag:3755461-3755562:+ between_alignments
CP059040 3942815 3948161 Assemblytics_b_34 5245 + Repeat_contraction 5346 101 CP059040_RagTag:3815522-3815623:+ between_alignments
gene 1844319..1845509
/gene="adeA"
/locus_tag="H0N29_08675"
gene 1845506..1848616
/gene="adeB"
/locus_tag="H0N29_08680"
# Step 1: Align assemblies to reference
nucmer --maxmatch -l 100 -c 500 shovill/A6WT/contigs.fa shovill/19606adeAB/contigs.fa -p 19606adeAB_vs_A6WT
# Step 2: Filter alignments (1-to-1 best matches)
delta-filter -1 -q 19606adeAB_vs_A6WT.delta > 19606adeAB_vs_A6WT.filtered.delta
# Note: Use -1 for 1-to-1, not -r -q
# Step 3: Run Assemblytics with ALL 5 parameters
Assemblytics 19606adeAB_vs_A6WT.filtered.delta 19606adeAB_vs_A6WT_assemblytics 1000 100 50000
# Step 4: Extract large insertions only
grep -w "Insertion" 19606adeAB_vs_A6WT_assemblytics.Assemblytics_structural_variants.bed > 19606adeAB_vs_A6WT_insertions.bed
# Step 5. Check if ANY variants were detected (any size)
wc -l 19606adeAB_vs_A6WT_assemblytics.Assemblytics_structural_variants.bed
# Step 6. View variant type distribution
cut -f4 19606adeAB_vs_A6WT_assemblytics.Assemblytics_structural_variants.bed | sort | uniq -c
# Step 7. Check alignment coverage (are contigs aligning well?)
cat 19606adeAB_vs_A6WT_assemblytics.Assemblytics_assembly_stats.txt
# Step 8. Check raw delta file for alignment blocks
show-coords -rcl 19606adeAB_vs_A6WT.filtered.delta | head -20
# Step 9. If bed file is empty, try relaxing parameters and re-run:
Assemblytics 19606adeAB_vs_A6WT.filtered.delta 19606adeAB_vs_A6WT_assemblytics_v2 500 50 100000
# └─unique─┘ └min┘ └──max──┘
# ---------------- adeIJ vs CP059040 (4436 nt deletion) ----------------
#nucmer --maxmatch -l 100 -c 500 CP059040.fasta shovill/adeIJ/contigs.fa -p adeIJ
nucmer --maxmatch -l 100 -c 500 CP059040.fasta ./shovill/adeIJ_chr_plasmids.fasta -p adeIJ
delta-filter -1 -q adeIJ.delta > adeIJ.filtered.delta
Assemblytics adeIJ.filtered.delta adeIJ_assemblytics 1000 100 50000
grep -w "Insertion" adeIJ_assemblytics.Assemblytics_structural_variants.bed > adeIJ_insertions.bed
wc -l adeIJ_assemblytics.Assemblytics_structural_variants.bed
cut -f4 adeIJ_assemblytics.Assemblytics_structural_variants.bed | sort | uniq -c
cat adeIJ_assemblytics.Assemblytics_assembly_stats.txt
show-coords -rcl adeIJ.filtered.delta | head -20
Assemblytics adeIJ.filtered.delta adeIJ_assemblytics_v2 500 50 100000
#CP059040 2810861 2863470 Assemblytics_b_2 52668 + Tandem_contraction 52609 -59 contig00003:229645-229704:- between_alignments
#CP059040 2259736 2260384 Assemblytics_b_3 135 + Tandem_contraction -648 -783 contig00005:217212-217995:- between_alignments
CP059040 737224 741667 Assemblytics_b_4 4436 + Deletion 4443 7 contig00007:208361-208368:+ between_alignments
#CP059040 3124916 3125037 Assemblytics_b_5 198 + Tandem_contraction 121 -77 contig00009:157580-157657:- between_alignments
CP059040 216043 226883 Assemblytics_b_1 10739 + Repeat_contraction 10840 101 CP059040_RagTag:192505-192606:+ between_alignments
CP059040 491041 496665 Assemblytics_b_2 5523 + Repeat_contraction 5624 101 CP059040_RagTag:456766-456867:+ between_alignments
CP059040 523243 528864 Assemblytics_b_3 5520 + Repeat_contraction 5621 101 CP059040_RagTag:483445-483546:+ between_alignments
* CP059040 737224 741667 Assemblytics_b_4 4436 + Deletion 4443 7 CP059040_RagTag:691906-691913:+ between_alignments
CP059040 785771 794627 Assemblytics_b_5 8755 + Repeat_contraction 8856 101 CP059040_RagTag:736017-736118:+ between_alignments
CP059040 1327560 1328441 Assemblytics_b_6 780 + Repeat_contraction 881 101 CP059040_RagTag:1269052-1269153:+ between_alignments
CP059040 2190218 2191651 Assemblytics_b_8 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2130924-2131025:+ between_alignments
CP059040 2259736 2260384 Assemblytics_b_9 135 + Tandem_contraction -648 -783 CP059040_RagTag:2198975-2199758:- between_alignments
CP059040 2477722 2488562 Assemblytics_b_10 10739 + Repeat_contraction 10840 101 CP059040_RagTag:2416969-2417070:+ between_alignments
CP059040 2579725 2581158 Assemblytics_b_11 1332 + Repeat_contraction 1433 101 CP059040_RagTag:2508234-2508335:+ between_alignments
CP059040 3089638 3090457 Assemblytics_b_13 718 + Repeat_contraction 819 101 CP059040_RagTag:2964148-2964249:+ between_alignments
CP059040 3124916 3125037 Assemblytics_b_14 198 + Tandem_contraction 121 -77 CP059040_RagTag:2998631-2998708:- between_alignments
CP059040 3282693 3288043 Assemblytics_b_15 5249 + Repeat_contraction 5350 101 CP059040_RagTag:3156287-3156388:+ between_alignments
CP059040 3642789 3643608 Assemblytics_b_16 718 + Repeat_contraction 819 101 CP059040_RagTag:3511133-3511234:+ between_alignments
CP059040 3732412 3737994 Assemblytics_b_17 5481 + Repeat_contraction 5582 101 CP059040_RagTag:3600038-3600139:+ between_alignments
CP059040 3881974 3882855 Assemblytics_b_18 780 + Repeat_contraction 881 101 CP059040_RagTag:3744119-3744220:+ between_alignments
CP059040 3942815 3948161 Assemblytics_b_19 5245 + Repeat_contraction 5346 101 CP059040_RagTag:3804180-3804281:+ between_alignments
CP059040 788949 794737 Assemblytics_b_27 5889 + Tandem_expansion -5788 101 CP059040_RagTag:3876490-3876591:+ between_alignments
gene complement(737233..740409)
/gene="adeJ"
/locus_tag="H0N29_03545"
gene complement(740422..741672)
/gene="adeI"
/locus_tag="H0N29_03550"
6. Confirmed Targeted Deletions
ΔadeAB
- 4,282-bp deletion
- Corresponds to CP059040 positions 1,844,323–1,848,605
-
Removes nearly the entire adeAB operon:
- adeA (H0N29_08675; 1,844,319–1,845,509)
- adeB (H0N29_08680; 1,845,506–1,848,616)
ΔadeIJ
- 4,443-bp deletion
- Corresponds to CP059040 positions 737,224–741,667
-
Removes the reverse-strand adeIJ operon:
- adeJ (H0N29_03545; complement 737,233–740,409)
- adeI (H0N29_03550; complement 740,422–741,672)
ΔcraA
- 1,157-bp deletion
- Corresponds to CP059040 positions 364,652–365,809
- Removes the central portion of craA (H0N29_01695; 364,609–365,838)
Together, the structural variant and SNP/indel analyses confirm that each mutant differs from WT only at the intended deletion locus.
7. Manuscript-Ready Summary
Whole-genome sequencing and comparative assembly analysis confirmed the genomic integrity of the ΔadeAB, ΔadeIJ, and ΔcraA mutants. Structural variant analysis identified a single targeted deletion in each strain, while consensus SNP/indel calling using Snippy and SPANDx detected no additional variants relative to WT outside the engineered loci.
Habilitationsschrift
汉堡大学医学院博士后资格认定(教授资格)程序实施指南
(2004年10月13日版本,最近修订于2017年3月) 根据2017年3月22日学院理事会决议,自2018年4月1日起生效
1. 序言
根据2015年2月18日颁布的《汉堡大学医学院博士后资格认定条例》(以下简称《博士后条例》)第2条规定,博士后资格认定旨在证明申请人在医学院某一研究领域具备独立开展科研工作的特殊能力,且该研究已为相关学科带来具有重大意义的知识突破¹。本《博士后资格认定指南》所列标准旨在为《博士后条例》第7条规定的博士后资格认定委员会提供决策参考,以判断申请人的博士后论文及其他学术成果是否足以证明其已对所申请博士后资格认定的学科领域进行了科学深入的钻研,从而满足《博士后条例》第2条的要求。
本指南旨在促进学院内部的学术竞争,持续提升汉堡大学医学院的科研质量与产出水平。
此外,本指南还旨在提高程序执行的透明度与客观性,并为博士后申请人提供自我评估的依据,从而便于其决定是否提交博士后资格认定申请。需要强调的是,本指南仅规定最低要求;针对申请博士后资格认定的具体研究领域,可设定更高的标准。因此,满足本指南所列标准并不能保证博士后资格认定申请必然成功。
¹ 重大知识突破的示例包括:
- 通过可靠(即可重复)的新数据推翻原有科学假说,并提出新假说;
- 开发并验证具有开创性的新实验方法;
- 对传统假说进行修正,同时开辟新的研究领域;
- 激发对科学问题的新理解,从而推动新的科研工作。
2. 学术成果评估
需注意,博士后资格认定程序的评审对象仅限于博士后论文及其所基于的出版物;其他发表成果的证明仅用于佐证申请人的学术水准及博士后资格认定适格性。即使是非常高水平的发表成果,本身也不构成独立的博士后资格认定业绩。
A) 著作目录
a) 《博士后条例》第3条第2款第5项规定的著作目录应按以下结构编排:
- 原创研究论文
- 综述文章
- 病例报告
- 已发表的会议贡献(非摘要)
- 书籍章节
- 报告与海报(摘要)
- 继续教育讲座
b) 申请人应至少在专业领域内高影响力期刊上发表2篇原创研究论文,其中1篇须以第一作者或通讯作者(最后作者)身份发表。共同第一作者视同单独第一作者,共同通讯作者视同单独通讯作者。若某期刊在其所属专业领域的《期刊引证报告—科学引文索引》(JCR-SCI)或《期刊引证报告—社会科学引文索引》(JCR-SSCI)期刊列表中,按影响因子排名位于前33%,则视为该专业领域内的高影响力期刊。
c) 此外,申请人应至少在专业领域内优秀期刊上以第一作者或通讯作者身份发表4篇原创研究论文。共同第一作者视同单独第一作者,共同通讯作者视同单独通讯作者。若某期刊在其所属专业领域的JCR-SCI/JCR-SSCI期刊列表中,按影响因子排名位于前60%,则视为该专业领域内的优秀期刊。
d) 在b)和c)项要求的第一作者或通讯作者论文中,至少60%须为单独第一作者或单独通讯作者发表。若单独第一作者或通讯作者论文数量不足,可用两篇共同第一作者或共同通讯作者的原创论文替代一篇单独作者论文。共同第一作者或通讯作者人数不得超过两人。
e) 除此之外,申请人还应至少在国际公认期刊上发表5篇其他原创研究论文,且申请人须对论文作出决定性贡献²。通常,若某期刊被收录于JCR专业期刊列表中,即视为国际公认期刊。
f) 博士后申请的形式审查由学院办公室负责;审查结果将提交博士后资格认定委员会供进一步审议参考。
g) 博士后资格认定委员会负责评估博士后论文的学术内容。对已发表出版物及未发表的特殊学术成果的评价可作为辅助参考。
h) 博士后论文应能清晰归入申请人的研究领域。
i) 申请人应提交至少5篇以摘要形式证明的会议报告或海报,且申请人须以第一作者或通讯作者身份在学术会议上进行展示。
j) 专利可等同于一篇原创研究论文进行评价。
² 申请人须清晰说明其具体贡献内容。
k) 在特殊情况下,结合学科特点,某些专业领域的特殊成果可被认可为著作目录的有效组成部分。
B) 博士后论文
博士后论文篇幅通常不应超过120页(含图表、图注、表格、摘要,但不含参考文献)。论文结构应如下:目录(含页码)、引言、材料与方法、结果、讨论、总结、参考文献。
累积型博士后资格认定应基于至少3篇围绕同一主题领域、以第一作者或通讯作者(含共同作者)身份发表的原创论文,且这些论文通常应在获得博士学位后完成。申请人须提交一份约20页的书面综合论述,对上述原创论文进行系统总结。该总结应表明申请人有能力将各篇论文的研究成果置于更广阔的学术背景中加以整合,并阐明其对所申请学科领域带来的重大知识突破。
C) 评审意见
根据《博士后条例》第8条第1款,博士后资格认定委员会需借助外部专家意见,通过委托校外专家出具评审报告,以评估申请人的博士后资格认定适格性。在此过程中须注意³:
- 评审意见的目的是对博士后资格认定业绩进行专业评估,确保评审结果能准确对照考核目标。因此,受邀评审专家必须具备必要的专业资质,并需审慎遴选;
- 评审专家应基于完整、准确的事实进行判断,即向其提供评估博士后资格认定申请所需的全部信息。但需注意,评审专家应专注于博士后论文本身的评估,因为只有论文本身才能体现博士后资格认定业绩的学术价值。博士后论文的不足不能通过申请人(共同)撰写的其他出版物予以弥补;具备博士后资格认定价值的论文必须自身逻辑清晰、内容完整,构成一项独立自洽的学术成果;
- 评审专家在评估博士后资格认定业绩时,仅应依据《汉堡高等教育法》第71条及《博士后条例》第2条规定的要求。此举旨在确保评审标准的统一性。相关要求已在本指南序言部分予以说明。
³ 依据汉堡行政法院2004年4月21日判决(案号:15 K 3849/03)的阐述。
医学院
院长办公室
博士后资格认定指南
附件02,编号7.1.6,版本日期:2017年4月24日
Abbreviation: GENES-BASEL ISSN: N/A eISSN: 2073-4425 Category: GENETICS & HEREDITY – SCIE WoS Core Citation Indexes: Scientific literature search SCIE – Science Citation Index Expanded Journal Impact Factor (JIF): 2.8 5-year Impact Factor: 3.2 Best ranking: GENETICS & HEREDITY ║ Percentage rank: 58.6% Open Access Support: Fully Open Access ― It may take a publication fee. For more info, check it on DOAJ.ORG Country: SWITZERLAND Status in WoS core: Active Publisher: MDPI AG
Abbreviation: BIOLOGY-BASEL ISSN: N/A eISSN: 2079-7737 Category: BIOLOGY – SCIE WoS Core Citation Indexes: SCIE – Science Citation Index Expanded Journal Impact Factor (JIF): 3.5 5-year Impact Factor: 4 Best ranking: BIOLOGY ║ Percentage rank: 81.3% Open Access Support: Fully Open Access ― It may take a publication fee. For more info, check it on DOAJ.ORG Country: SWITZERLAND Status in WoS core: Active Publisher: MDPI AG
Abbreviation: MSYSTEMS ISSN: 2379-5077 eISSN: 2379-5077 Category: MICROBIOLOGY – SCIE WoS Core Citation Indexes: Scientific literature search SCIE – Science Citation Index Expanded Journal Impact Factor (JIF): 4.6 5-year Impact Factor: 5.7 Best ranking: MICROBIOLOGY (Q1) ║ Percentage rank: 79.1% Open Access Support: Fully Open Access ― It may take a publication fee. For more info, check it on DOAJ.ORG Country: UNITED STATES Status in WoS core: Active Publisher: American Society for Microbiology
Abbreviation: MICROBIOL RESOUR ANN ISSN: N/A eISSN: 2576-098X Category: MICROBIOLOGY – ESCI WoS Core Citation Indexes: ESCI – Emerging Sources Citation Index Journal Impact Factor (JIF): 0.6 5-year Impact Factor: 0.6 Best ranking: MICROBIOLOGY (Q4) ║ Percentage rank: 4.3% Open Access Support: Fully Open Access ― It may take a publication fee. For more info, check it on DOAJ.ORG Country: UNITED STATES Status in WoS core: Active Publisher: American Society for Microbiology
是的,文件里明确有对 Vortrag / Poster 的最低要求:
你至少需要 5 个有 abstract 证明的会议报告或海报,并且你必须是该 abstract 的第一作者或最后作者。
原文是:
“Es sollen mindestens 5 durch Abstracts belegte Vorträge oder Poster vorgelegt werden, die die Antragstellerin als Erst- oder Letztautorin/der Antragsteller als Erst- oder Letztautor auf einem Kongress vertreten hat.”
中文逐句翻译:
-
Es sollen mindestens 5 durch Abstracts belegte Vorträge oder Poster vorgelegt werden 应至少提交 5 个有摘要(abstract)作为证明的学术报告或海报。
-
die die Antragstellerin als Erst- oder Letztautorin / der Antragsteller als Erst- oder Letztautor auf einem Kongress vertreten hat. 并且申请人必须是在会议上以第一作者或最后作者身份展示这些报告或海报的人。
注意:
- 可以是 oral presentation,也可以是 poster;
- 只要会议 abstract 被正式接受、可提供会议册或 abstract book 即可;
- shared first / shared last 通常会被接受,但文件没有像论文那样明确写出,因此最好保存好会议 abstract 和作者顺序。
b) “Es sollen mindestens 2 Originalarbeiten in fachspezifisch hochrangigen Journalen vorliegen, von denen eine als Erst- oder Letztautorin/Erst- oder Letztautor publiziert sein soll.” 应至少有 2 篇发表在本专业高水平期刊上的原创论文,其中至少 1 篇申请人应为第一作者或最后作者。
“Dabei wird eine gleichberechtigte Erstautorschaft wie eine alleinige Erstautorschaft gewertet, eine gleichberechtigte Letztautorenschaft wird wie eine alleinige Letztautorschaft gewertet.” 其中,共同第一作者视同于单独第一作者;共同最后作者视同于单独最后作者。
“Fachspezifisch hochrangig ist ein Journal dann, wenn es sich unter den ersten 33% (nach Impact-Faktor) der den jeweiligen Fachgebieten der/des Antragstellers zugeordneten fachspezifischen Journallisten des Journal Citation Reports – Science Citation Index (JCR-SCI) bzw. dem Journal Citation Reports – Social Science Citation Index (JCR-SSCI) befindet.” 所谓“本专业高水平期刊”,是指该期刊按照影响因子排名,位于申请人所属学科的 JCR-SCI 或 JCR-SSCI 期刊列表前 33% 之内。
c) “Des Weiteren sollen mindestens 4 Originalarbeiten in fachspezifisch sehr guten Journalen vorliegen, bei denen die Antragstellerin Erst- oder Letztautorin/der Antragsteller Erst- oder Letztautor ist.” 此外,还应至少有 4 篇发表在本专业“非常优秀”期刊上的原创论文,并且申请人是第一作者或最后作者。
“Dabei wird eine gleichberechtigte Erstautorschaft wie eine alleinige Erstautorschaft gewertet, eine gleichberechtigte Letztautorenschaft wird wie eine alleinige Letztautorschaft gewertet.” 其中,共同第一作者视同于单独第一作者;共同最后作者视同于单独最后作者。
“Fachspezifisch sehr gut ist ein Journal dann, wenn es sich unter den vorderen 60% (nach Impact-Faktor) der dem jeweiligen Fachgebiet der/des Antragstellers zugeordneten fachspezifischen Journallisten des JCR-SCI/JCR-SSCI befindet.” 所谓“本专业非常优秀期刊”,是指该期刊按影响因子排名,位于申请人所属学科 JCR-SCI/JCR-SSCI 期刊列表前 60% 之内。
d) “Von den unter b) und c) geforderten Publikationen in Erst- oder Letztautorenschaft müssen mindestens 60% in alleiniger Erst- oder Letztautorschaft erschienen sein.” 在上述 b) 和 c) 要求的第一作者/最后作者论文中,至少 60% 必须是单独第一作者或单独最后作者。
“Dabei können fehlende alleinige Erst- oder Letztautorenschaften durch jeweils zwei Originalpublikationen in geteilter Erst- oder Letztautorenschaft ersetzt werden.” 如果缺少单独第一作者或单独最后作者论文,则每缺 1 篇,可以用 2 篇共同第一作者或共同最后作者论文来替代。
“Die Erst- oder Letztautorenschaften dürfen sich hierbei höchstens zwei Autoren oder Autorinnen teilen.” 这里的共同第一作者或共同最后作者,最多只能由两位作者共同承担。
Protected: Whole-genome sequencing, assembly, and variant calling
Elbinsel-Pokal 2026
- HSK Youth-Cup 2025: Schüler der Klassen 5-13 und Grundschüler mit DWZ oder ELO Die Bedenkzeit beträgt 20 Minuten pro Spieler und Partie.
https://hsk1830.de/termin/40-hsk-youth-cup (12. Oktober 2025)
https://hsk1830.de/termin/44-hsk-kids-cup (19. Juli 2025)
https://hsk1830.de/termin/45-hsk-kids-cup (11. Oktober 2025)
https://www.schachbund.de/turnierdatenbank-hamburg.html
https://www.schachbund.de/turnierdetails/1-hamburger-talente-cup-u12.html
Es werden 7 Runden nach Schweizer System gespielt. Es sind zwei getrennt voneinander durchgeführte Turniere:
- A) Klassen 5 – 8 und Grundschüler mit DWZ >1000
- B) Klassen 5 – 6 und Grundschüler mit DWZ <1001
Modus: 7 Runden Schweizer System, Start der 1. Runde ca. 10:00 Uhr Turnier für Teilnehmer wie folgt:
- Schüler der Klassen 5-13 und Grundschüler mit DWZ oder ELO
- Die Bedenkzeit beträgt 20 Minuten pro Spieler und Partie.
HSK Kids-Cup
findet mehrmals im Jahr an einem Wochenendtag statt. Er ist für Kinder ohne DWZ und geeignet für Einsteiger und gilt als zusätzliche Schulwertung.
Es werden 7 Runden nach Schweizer System gespielt. Es sind zwei getrennt voneinander durchgeführte Turniere:
A) offen bis Klasse 4, DWZ-gewertet B) KiGa bis Klasse 2
Die Bedenkzeit beträgt 20 Minuten pro Spieler und Partie. In den ersten 15 Minuten wird mit geschrieben. Damit die Trainer zwischen den Runden mit den Spielern die Partie analysieren können.
10,–€ Startgeld für HSK-Mitglieder. Gäste zahlen 15,–€ Startgeld.
Pokale gibt es für die ersten drei Plätze, für das beste Mädchen und den Besten jeder Klassenstufe. Medaillen und Urkunden bekommen alle anderen.
laden ein zum Schnellturnier im Schulschach um den Elbinsel-Pokal 2026
-
Für: Schüler:innen im schulpflichtigen Alter
-
Termin: Samstag, 25. April 2026 persönliche Anwesenheitsmeldung bis 11 Uhr, danach 1. Runde Preisverleihung gegen 16:30 Uhr
-
Spielmodus: 9 Runden Schweizer System 15 Minuten Bedenkzeit pro Spieler:in und Partie
-
Ort: Bildungszentrum Tor zur Welt, Krieterstraße 2d, 21109 Hamburg
-
Anreise: S3/S5 bis Wilhelmsburg und/oder 154er bis Thielenstraße (Ost) Parkmöglichkeiten für Autos vorhanden, wenn auch begrenzt
-
Preise: Es gibt Pokale für Platz eins bis drei und Medaillen für die beste Schüler:in einer Jahrgangsstufe. Es gibt weitere Pokale für die beste weiterführende Schule und die beste Grundschule. Zur Vergabe dieser Pokale werden die Punkte der vier besten Schüler:innen einer Schule addiert.
-
Startgeld: 7,‒ € bei Zahlungseingang bis 30. März, danach 10,‒ € bevorzugt per Überweisung, nötigenfalls bar vor Ort
-
Verpflegung: Es gibt ausreichend Snacks und Getränke zu günstigen Preisen.
-
Anmeldung: bis zum 23. April über die Webseite https://www.skw.one/elbinsel-pokal– 2026, danach vor Ort, wenn noch Startplätze frei sind. Ob noch welche frei sind, wird auf der Webseite veröffentlicht.
Die Anmeldung gilt nur nach Zahlung des Startgeldes als vollzogen!


