Daily Archives: 2026年7月3日

Variant Calling Results and Analysis Summary for the 6 Pairs of DNA-seq Data (Data_Holger_DNAseq_2026_Sepi_Pairs)

  1. Run nextflow bacass

     conda deactivate
    
     # Downlod k2_standard_08_GB_20251015.tar.gz from https://benlangmead.github.io/aws-indexes/k2#kraken2--bracken
     # Download 20190108_kmerfinder_stable_dirs.tar.gz from https://zenodo.org/records/13447056; 'tar xzf 20190108_kmerfinder_stable_dirs.tar.gz'  #The database does not work!
     # Download the kmerfinder database: https://www.genomicepidemiology.org/services/ --> https://cge.food.dtu.dk/services/KmerFinder/ --> https://cge.food.dtu.dk/services/KmerFinder/etc/kmerfinder_db.tar.gz  #The database works!
    
     # DEBUG: --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ not working!
    
     nextflow run nf-core/bacass -r 2.6.0 -profile docker --help
    
     # -- Short assembly --
     #Maybe BUG is from '--skip_kmerfinder for -r 2.6.0, using db in 2.5.0'
     nextflow run nf-core/bacass -r 2.5.0 -profile docker \
       --input samplesheet.tsv \
       --outdir bacass_out \
       --assembly_type short \
       --kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \
       --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ \
       -resume \
       -work-dir bacass_out/work
  2. Verify if the genome is pure

     # 1. Go up one level to the main 'bacass_out' directory
     cd ..
    
     # 2. Create directories for CheckM inputs and outputs
     mkdir -p checkm_input checkm_output
    
     # 3. Copy all .fna files into the 'checkm_input' folder
     # (CheckM cannot search subdirectories, so they must be in one folder)
     find ./Prokka -name "*.fna" -exec cp {} checkm_input/ \;
    
     # 4. Run CheckM on all 4 assemblies
     (checkm_env2) checkm lineage_wf -x fna checkm_input checkm_output
  3. mlst

     (bacto) mlst *.fna > mlst
    
     #Pair1
     2780_.fna   sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
     12835_.fna  sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
    
     #Pair2
     4667_.fna   sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
     9940_.fna   sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
    
     #Pair3
     3139_.fna   sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
     19296_.fna  sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
    
     #Pair4
     9849_.fna   sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
     11635_.fna  sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
    
     #Pair5
     3006_.fna   sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
     10393_.fna  sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
    
     #Pair6
     1736_.fna   sepidermidis    59  arcC(2) aroE(1) gtr(1)  mutS(1) pyrR(2) tpiA(1) yqiL(1)
     8583_.fna   sepidermidis    59  arcC(2) aroE(1) gtr(1)  mutS(1) pyrR(2) tpiA(1) yqiL(1)
    
    #Compared to other isolates from the lab: patient HD04 [ST5], HD21 [ST2], HD26 [ST5], HD27 [ST5], HD29 [ST5], HD59 [ST5], HD33 [ST87]
  4. Species Identification: 快速筛查用 Mash → 精确分类用 GTDB-Tk → 种级验证用 FastANI,三者结合可最大限度提高物种鉴定的准确性和可解释性。

     # 1. 创建环境(推荐 mamba)
     mamba create -n gtdbtk -c conda-forge -c bioconda gtdbtk
     mamba activate gtdbtk
    
     # 2. 下载数据库(仅需首次,约 60GB)
     gtdbtk download --data_dir ./gtdb_data --release 220
    
     wget https://data.gtdb.aau.ecogenomic.org/releases/release232/232.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r232_data.tar.g
     mamba env config vars set GTDBTK_DATA_PATH="/mnt/nvme4n1p1/gtdb_data/release232"
     # 先退出当前环境,再重新激活
     mamba deactivate
     mamba activate gtdbtk
    
     # 验证环境变量是否加载成功
     echo $GTDBTK_DATA_PATH
     # 应输出:/mnt/nvme4n1p1/gtdb_data/release232
    
     # 3. 运行分类(你提供的命令 + 实用参数)
     gtdbtk classify_wf \
       --genome_dir ./checkm_input \
       --out_dir gtdb_out \
       --cpus 64 \
       --extension .fna \
       --prefix mygenome
    
     # 4. 查看结果
     cat gtdb_out/classify/mygenome.bac120.summary.tsv   # 细菌结果
    
     #Pair1
     2780_.fna   sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
     12835_.fna  sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
    
     #Pair2
     4667_.fna   sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
     9940_.fna   sepidermidis    2   arcC(7) aroE(1) gtr(2)  mutS(2) pyrR(4) tpiA(1) yqiL(1)
    
     #Pair3
     3139_.fna   sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
     19296_.fna  sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
    
     #Pair4
     9849_.fna   sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
     11635_.fna  sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
    
     #Pair5
     3006_.fna   sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
     10393_.fna  sepidermidis    5   arcC(1) aroE(1) gtr(1)  mutS(2) pyrR(2) tpiA(1) yqiL(1)
    
     #Pair6
     1736_.fna   sepidermidis    59  arcC(2) aroE(1) gtr(1)  mutS(1) pyrR(2) tpiA(1) yqiL(1)
     8583_.fna   sepidermidis    59  arcC(2) aroE(1) gtr(1)  mutS(1) pyrR(2) tpiA(1) yqiL(1)
    
     ----------------------------------------------------------------------------------------------------------------------------------------------------------------
     Bin Id         Marker lineage         # genomes   # markers   # marker sets   0    1    2   3   4   5+   Completeness   Contamination   Strain heterogeneity
     ----------------------------------------------------------------------------------------------------------------------------------------------------------------
     2780_    g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
     12835_   g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
    
     4667_    g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
     9940_    g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
    
     3139_    g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
     19296_   g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
    
     9849_    g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
     11635_   g__Staphylococcus (UID294)       60         773           178        2   771   0   0   0   0       99.73            0.00               0.00
    
     3006_    g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
     10393_   g__Staphylococcus (UID294)       60         773           178        1   772   0   0   0   0       99.81            0.00               0.00
    
     1736_    g__Staphylococcus (UID294)       60         773           178        1   771   1   0   0   0       99.81            0.28               0.00
     8583_    g__Staphylococcus (UID294)       60         773           178        1   771   1   0   0   0       99.81            0.28               0.00
     ----------------------------------------------------------------------------------------------------------------------------------------------------------------
  5. Antimicrobial resistance gene profiling and Resistome and Virulence Profiling with Abricate and RGI (Reisistance Gene Identifier)

     conda activate /home/jhuang/miniconda3/envs/bengal3_ac3
     abricate --list
    
     conda deactivate
    
     # 1. Set both required Java variables to bypass the buggy conda activation script
     export JAVA_HOME=/home/jhuang/miniconda3/envs/bengal3_ac3
     export JAVA_LD_LIBRARY_PATH=$JAVA_HOME/lib/server
    
     # 2. Run your script
     for sample in 2780 12835 4667 9940 3139 19296 9849 11635 3006 10393 1736 8583; do
         ENV_NAME=/home/jhuang/miniconda3/envs/bengal3_ac3 \
         ASM=bacass_out/checkm_input/${sample}_.fna \
         SAMPLE=${sample} \
         OUTDIR=resistome_virulence_${sample} \
         MINID=80 MINCOV=60 \
         THREADS=32 \
         ~/Scripts/run_abricate_resistome_virulome_one_per_gene.sh
     done
    
     conda activate /home/jhuang/miniconda3/envs/bengal3_ac3
     #NEED_TO_ADAPT: OUTDIR = Path("resistome_virulence_An7")
     #NEED_TO_ADAPT: SAMPLE = "An7"
     #DEPRECATED_DUE_TO_NEED_MANULL_SETTING: python ~/Scripts/merge_amr_sources_by_gene.py
    
     for sample in 2780 12835 4667 9940 3139 19296 9849 11635 3006 10393 1736 8583; do
         python ~/Scripts/export_resistome_virulence_to_excel_py36.py \
         --workdir resistome_virulence_${sample} \
         --sample ${sample} \
         --out Resistome_Virulence_${sample}.xlsx
     done
    
     Wrote Excel: Resistome_Virulence_2780.xlsx
     AMR genes (one row per gene): 29
     VFDB hits: 0
     DB hit counts saved.
     Wrote Excel: Resistome_Virulence_12835.xlsx
     AMR genes (one row per gene): 30
     VFDB hits: 0
     DB hit counts saved.
    
     Wrote Excel: Resistome_Virulence_4667.xlsx
     AMR genes (one row per gene): 33
     VFDB hits: 0
     DB hit counts saved.
     Wrote Excel: Resistome_Virulence_9940.xlsx
     AMR genes (one row per gene): 30
     VFDB hits: 0
     DB hit counts saved.
    
     Wrote Excel: Resistome_Virulence_3139.xlsx
     AMR genes (one row per gene): 27
     VFDB hits: 1
     DB hit counts saved.
     Wrote Excel: Resistome_Virulence_19296.xlsx
     AMR genes (one row per gene): 27
     VFDB hits: 1
     DB hit counts saved.
    
     Wrote Excel: Resistome_Virulence_9849.xlsx
     AMR genes (one row per gene): 20
     VFDB hits: 1
     DB hit counts saved.
     Wrote Excel: Resistome_Virulence_11635.xlsx
     AMR genes (one row per gene): 20
     VFDB hits: 1
     DB hit counts saved.
    
     Wrote Excel: Resistome_Virulence_3006.xlsx
     AMR genes (one row per gene): 21
     VFDB hits: 1
     DB hit counts saved.
     Wrote Excel: Resistome_Virulence_10393.xlsx
     AMR genes (one row per gene): 24
     VFDB hits: 1
     DB hit counts saved.
    
     Wrote Excel: Resistome_Virulence_1736.xlsx
     AMR genes (one row per gene): 22
     VFDB hits: 0
     DB hit counts saved.
     Wrote Excel: Resistome_Virulence_8583.xlsx
     AMR genes (one row per gene): 22
     VFDB hits: 0
     DB hit counts saved.
    
     # TODO: Delete the column 'COVERAGE_MAP' in all 'Raw_*' sheets
  6. Rename the files

     mv ./raw_data/Pair1/2780_S71_1_sequence.fastq.gz ./raw_data/Pair1/2780_S71_R1.fastq.gz
     mv ./raw_data/Pair1/2780_S71_2_sequence.fastq.gz ./raw_data/Pair1/2780_S71_R2.fastq.gz
     mv ./raw_data/Pair1/12835_S72_1_sequence.fastq.gz ./raw_data/Pair1/12835_S72_R1.fastq.gz
     mv ./raw_data/Pair1/12835_S72_2_sequence.fastq.gz ./raw_data/Pair1/12835_S72_R2.fastq.gz
    
     mv ./raw_data/Pair2/4667_S75_1_sequence.fastq.gz ./raw_data/Pair2/4667_S75_R1.fastq.gz
     mv ./raw_data/Pair2/4667_S75_2_sequence.fastq.gz ./raw_data/Pair2/4667_S75_R2.fastq.gz
     mv ./raw_data/Pair2/9940_S78_1_sequence.fastq.gz ./raw_data/Pair2/9940_S78_R1.fastq.gz
     mv ./raw_data/Pair2/9940_S78_2_sequence.fastq.gz ./raw_data/Pair2/9940_S78_R2.fastq.gz
    
     mv ./raw_data/Pair3/3139_S74_1_sequence.fastq.gz ./raw_data/Pair3/3139_S74_R1.fastq.gz
     mv ./raw_data/Pair3/3139_S74_2_sequence.fastq.gz ./raw_data/Pair3/3139_S74_R2.fastq.gz
     mv ./raw_data/Pair3/19296_S81_1_sequence.fastq.gz ./raw_data/Pair3/19296_S81_R1.fastq.gz
     mv ./raw_data/Pair3/19296_S81_2_sequence.fastq.gz ./raw_data/Pair3/19296_S81_R2.fastq.gz
    
     mv ./raw_data/Pair4/11635_S80_1_sequence.fastq.gz ./raw_data/Pair4/11635_S80_R1.fastq.gz
     mv ./raw_data/Pair4/11635_S80_2_sequence.fastq.gz ./raw_data/Pair4/11635_S80_R2.fastq.gz
     mv ./raw_data/Pair4/9849_S77_1_sequence.fastq.gz ./raw_data/Pair4/9849_S77_R1.fastq.gz
     mv ./raw_data/Pair4/9849_S77_2_sequence.fastq.gz ./raw_data/Pair4/9849_S77_R2.fastq.gz
    
     mv ./raw_data/Pair5/3006_S73_1_sequence.fastq.gz ./raw_data/Pair5/3006_S73_R1.fastq.gz
     mv ./raw_data/Pair5/3006_S73_2_sequence.fastq.gz ./raw_data/Pair5/3006_S73_R2.fastq.gz
     mv ./raw_data/Pair5/10393_S79_1_sequence.fastq.gz ./raw_data/Pair5/10393_S79_R1.fastq.gz
     mv ./raw_data/Pair5/10393_S79_2_sequence.fastq.gz ./raw_data/Pair5/10393_S79_R2.fastq.gz
    
     mv ./raw_data/Pair6/1736_S70_1_sequence.fastq.gz ./raw_data/Pair6/1736_S70_R1.fastq.gz
     mv ./raw_data/Pair6/1736_S70_2_sequence.fastq.gz ./raw_data/Pair6/1736_S70_R2.fastq.gz
     mv ./raw_data/Pair6/8583_S76_1_sequence.fastq.gz ./raw_data/Pair6/8583_S76_R1.fastq.gz
     mv ./raw_data/Pair6/8583_S76_2_sequence.fastq.gz ./raw_data/Pair6/8583_S76_R2.fastq.gz
    
     mv ./Pair1/2780_S71_R1.fastq.gz ./Pair1_2780_S71_R1.fastq.gz
     mv ./Pair1/2780_S71_R2.fastq.gz ./Pair1_2780_S71_R2.fastq.gz
     mv ./Pair1/12835_S72_R1.fastq.gz ./Pair1_12835_S72_R1.fastq.gz
     mv ./Pair1/12835_S72_R2.fastq.gz ./Pair1_12835_S72_R2.fastq.gz
    
     mv ./Pair2/4667_S75_R1.fastq.gz ./Pair2_4667_S75_R1.fastq.gz
     mv ./Pair2/4667_S75_R2.fastq.gz ./Pair2_4667_S75_R2.fastq.gz
     mv ./Pair2/9940_S78_R1.fastq.gz ./Pair2_9940_S78_R1.fastq.gz
     mv ./Pair2/9940_S78_R2.fastq.gz ./Pair2_9940_S78_R2.fastq.gz
    
     mv ./Pair3/3139_S74_R1.fastq.gz ./Pair3_3139_S74_R1.fastq.gz
     mv ./Pair3/3139_S74_R2.fastq.gz ./Pair3_3139_S74_R2.fastq.gz
     mv ./Pair3/19296_S81_R1.fastq.gz ./Pair3_19296_S81_R1.fastq.gz
     mv ./Pair3/19296_S81_R2.fastq.gz ./Pair3_19296_S81_R2.fastq.gz
    
     mv ./Pair4/9849_S77_R1.fastq.gz ./Pair4_9849_S77_R1.fastq.gz
     mv ./Pair4/9849_S77_R2.fastq.gz ./Pair4_9849_S77_R2.fastq.gz
     mv ./Pair4/11635_S80_R1.fastq.gz ./Pair4_11635_S80_R1.fastq.gz
     mv ./Pair4/11635_S80_R2.fastq.gz ./Pair4_11635_S80_R2.fastq.gz
    
     mv ./Pair5/3006_S73_R1.fastq.gz ./Pair5_3006_S73_R1.fastq.gz
     mv ./Pair5/3006_S73_R2.fastq.gz ./Pair5_3006_S73_R2.fastq.gz
     mv ./Pair5/10393_S79_R1.fastq.gz ./Pair5_10393_S79_R1.fastq.gz
     mv ./Pair5/10393_S79_R2.fastq.gz ./Pair5_10393_S79_R2.fastq.gz
    
     mv ./Pair6/1736_S70_R1.fastq.gz ./Pair6_1736_S70_R1.fastq.gz
     mv ./Pair6/1736_S70_R2.fastq.gz ./Pair6_1736_S70_R2.fastq.gz
     mv ./Pair6/8583_S76_R1.fastq.gz ./Pair6_8583_S76_R1.fastq.gz
     mv ./Pair6/8583_S76_R2.fastq.gz ./Pair6_8583_S76_R2.fastq.gz

https://bioinformatics.cc/call-and-merge-snp-and-indel-results-from-using-two-variant-calling-methods/

  1. SNP-calling on AP019721 using snippy, but too far is the reference –> throw the results away!

     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
    
     #Prepare raw_data and bacto-0.1.json
     #Set "reference": "db/CP133***.gb" in bacto-0.1.json
    
     conda activate bengal3_ac3
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds

https://bioinformatics.cc/analysis-of-snps-indels-transposons-and-is-elements-in-5-a-baumannii-strains/

  1. Annotate a sequence as reference

     conda activate /home/jhuang/miniconda3/envs/bakta
     bakta --db /mnt/nvme0n1p1/bakta_db shovill/Pair1_2780_S71/contigs.fa --prefix 2780
     bakta --db /mnt/nvme0n1p1/bakta_db shovill/Pair2_4667_S75/contigs.fa --prefix 4667
     bakta --db /mnt/nvme0n1p1/bakta_db shovill/Pair3_3139_S74/contigs.fa --prefix 3139
     bakta --db /mnt/nvme0n1p1/bakta_db shovill/Pair4_9849_S77/contigs.fa --prefix 9849
     bakta --db /mnt/nvme0n1p1/bakta_db shovill/Pair5_3006_S73/contigs.fa --prefix 3006
     bakta --db /mnt/nvme0n1p1/bakta_db shovill/Pair6_1736_S70/contigs.fa --prefix 1736
     #END
  2. Since your organism is non-model, standard R databases (org.Hs.eg.db, etc.) won’t work. You’ll need to manually retrieve KEGG and GO annotations.

     * Preparing file 1 eggnog_out.emapper.annotations.txt for the R-code below: (KEGG Terms): EggNog based on orthology and phylogenies
    
         EggNOG-mapper assigns both KEGG Orthology (KO) IDs and GO terms.
    
         Install EggNOG-mapper:
    
             mamba create -n eggnog_env python=3.8 eggnog-mapper -c conda-forge -c bioconda  #eggnog-mapper_2.1.12
             mamba activate eggnog_env
    
         Run annotation:
    
             #diamond makedb --in eggnog6.prots.faa -d eggnog_proteins.dmnd
             mkdir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
             download_eggnog_data.py --dbname eggnog.db -y --data_dir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
             #NOT_WORKING: emapper.py -i CP059040_gene.fasta -o eggnog_dmnd_out --cpu 60 -m diamond[hmmer,mmseqs] --dmnd_db /home/jhuang/REFs/eggnog_data/data/eggnog_proteins.dmnd
    
             # (DO_NOT_NEED_HERE) Download CU459141_protein_.fasta from NCBI
             # python ~/Scripts/update_fasta_header.py CU459141_protein_.fasta CU459141_protein.fasta
    
             emapper.py -i 2780.faa -o 2780_eggnog_out --cpu 80    #--resume
             emapper.py -i 4667.faa -o 4667_eggnog_out --cpu 80
             emapper.py -i 3139.faa -o 3139_eggnog_out --cpu 80
             emapper.py -i 9849.faa -o 9849_eggnog_out --cpu 80
             emapper.py -i 3006.faa -o 3006_eggnog_out --cpu 80
             emapper.py -i 1736.faa -o 1736_eggnog_out --cpu 80
             #----> result annotations.tsv: Contains KEGG, GO, and other functional annotations.
             #---->  470.IX87_14445:
                 * 470 likely refers to the organism or strain (e.g., Acinetobacter baumannii ATCC 19606 or another related strain).
                 * IX87_14445 would refer to a specific gene or protein within that genome.
    
         Extract KEGG KO IDs from annotations.emapper.annotations.
  3. Call variant calling on self using snippy

     mkdir snippy_Pair1
     mv 2780* snippy_Pair1
     cd snippy_Pair1
    
     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
    
     #Prepare raw_data and bacto-0.1.json
     mkdir raw_data
     mv ../raw_data/Pair1_2780_S71_R1.fastq.gz raw_data
     mv ../raw_data/Pair1_2780_S71_R2.fastq.gz raw_data
     mv ../raw_data/Pair1_12835_S72_R1.fastq.gz raw_data
     mv ../raw_data/Pair1_12835_S72_R2.fastq.gz raw_data
    
     #Copy the reference 2780.gbff to db, and set "reference": "db/2780.gbff" in bacto-0.1.json
    
     conda activate /home/jhuang/miniconda3/envs/bengal3_ac3
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
     --
    
     mkdir snippy_Pair2
     mv 4667* snippy_Pair2
     cd snippy_Pair2
     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
     mkdir raw_data
     mv ../raw_data/Pair2_4667_S75_R1.fastq.gz raw_data
     mv ../raw_data/Pair2_4667_S75_R2.fastq.gz raw_data
     mv ../raw_data/Pair2_9940_S78_R1.fastq.gz raw_data
     mv ../raw_data/Pair2_9940_S78_R2.fastq.gz raw_data
     #Copy the reference 4667.gbff to db, and set "reference": "db/2780.gbff" in bacto-0.1.json
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
     --
    
     mkdir snippy_Pair3
     mv 3139* snippy_Pair3
     cd snippy_Pair3
     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
     mkdir raw_data
     mv ../raw_data/Pair3_3139_S74_R1.fastq.gz raw_data
     mv ../raw_data/Pair3_3139_S74_R2.fastq.gz raw_data
     mv ../raw_data/Pair3_19296_S81_R1.fastq.gz raw_data
     mv ../raw_data/Pair3_19296_S81_R2.fastq.gz raw_data
     #Copy the reference 3139.gbff to db, and set "reference": "db/3139.gbff" in bacto-0.1.json
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
     --
    
     mkdir snippy_Pair4
     mv 9849* snippy_Pair4
     cd snippy_Pair4
     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
     mkdir raw_data
     mv ../raw_data/Pair4_9849_S77_R1.fastq.gz ./raw_data
     mv ../raw_data/Pair4_9849_S77_R2.fastq.gz ./raw_data
     mv ../raw_data/Pair4_11635_S80_R1.fastq.gz ./raw_data
     mv ../raw_data/Pair4_11635_S80_R2.fastq.gz ./raw_data
     #Copy the reference 9849.gbff to db, and set "reference": "db/9849.gbff" in bacto-0.1.json
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
     --
    
     mkdir snippy_Pair5
     mv 3006* snippy_Pair5
     cd snippy_Pair5
     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
     mkdir raw_data
     mv ../raw_data/Pair5_3006_S73_R1.fastq.gz ./raw_data
     mv ../raw_data/Pair5_3006_S73_R2.fastq.gz ./raw_data
     mv ../raw_data/Pair5_10393_S79_R1.fastq.gz ./raw_data
     mv ../raw_data/Pair5_10393_S79_R2.fastq.gz ./raw_data
     #Copy the reference 3006.gbff to db, and set "reference": "db/3006.gbff" in bacto-0.1.json
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
     --
    
     mkdir snippy_Pair6
     mv 1736* snippy_Pair6
     cd snippy_Pair6
     git clone https://github.com/huang/bacto
     mv bacto/* ./
     rm -rf bacto
     mkdir raw_data
     mv ../raw_data/Pair6_1736_S70_R1.fastq.gz ./raw_data
     mv ../raw_data/Pair6_1736_S70_R2.fastq.gz ./raw_data
     mv ../raw_data/Pair6_8583_S76_R1.fastq.gz ./raw_data
     mv ../raw_data/Pair6_8583_S76_R2.fastq.gz ./raw_data
     #Copy the reference 1736.gbff to db, and set "reference": "db/1736.gbff" in bacto-0.1.json
     /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds

TODOs: # Merge the annotated information 2780_eggnog_out.emapper.annotations into the SNP-Indels-results after making concensus SNP-Indels-results merging results of SPANDx!

!!!! OPTIONALLY (for the project, 我觉得用snippy就够了: simply only taking the results of snippy, do not do the concensus of the results using snippy and SPANDx !!!!)

https://bioinformatics.cc/interhost-variant-calling-data_tam_dnaseq_2025_y1y2y3y4w1w2w3w4_tig1_tig2_dij_on_atcc19606/

  1. Summarize all SNPs and Indels from the snippy result directory.

    cp ~/Scripts/summarize_snippy_res_ordered.py .
    # IMPORTANT_ADAPT the array in script should be adapted
    isolates = ["W1", "W2", "W3", "W4", "Y1", "Y2", "Y3", "Y4"]
    mamba activate plot-numpy1
    python3 ./summarize_snippy_res_ordered.py snippy
    #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv
    cd snippy
    #REMOVE_the_line? I don't find the sence of the line:    grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
  2. Convert the results to Excel-format

    #REPLCE_HEAD: CHROM,POS,REF,TYPE,Pair1_2780_S71,Pair1_12835_S72,Functional_Class,Codon_change,Protein_and_nucleotide_change,Gene_ID,Gene_name
    #Replace ' ' --> ','; Replace ",,,,,," --> ','; Remove "nan"
    #Check the first sample is the same to the reference, otherwise delete it.
    #Sort according contig number and position
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair1/snippy/summary_snps_indels.csv -d',' -o SNPs_Indels_pair1_on_2780.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair2/snippy/summary_snps_indels.csv -d',' -o SNPs_Indels_pair2_on_4667.xls
    #EMPTY ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair3/snippy/summary_snps_indels.csv -d',' -o SNPs_Indels_pair3_on_3139.xls
    #EMPTY ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair4/snippy/summary_snps_indels.csv -d',' -o SNPs_Indels_pair4_on_9849.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair5/snippy/summary_snps_indels.csv -d',' -o SNPs_Indels_pair5_on_3006.xls
    #EMPTY ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair6/snippy/summary_snps_indels.csv -d',' -o SNPs_Indels_pair6_on_1736.xls
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair1/2780_eggnog_out.emapper.annotations -d$'\t' -o 2780_ORF_annotations.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair2/4667_eggnog_out.emapper.annotations -d$'\t' -o 4667_ORF_annotations.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair3/3139_eggnog_out.emapper.annotations -d$'\t' -o 3139_ORF_annotations.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair4/9849_eggnog_out.emapper.annotations -d$'\t' -o 9849_ORF_annotations.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair5/3006_eggnog_out.emapper.annotations -d$'\t' -o 3006_ORF_annotations.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py ./snippy_Pair6/1736_eggnog_out.emapper.annotations -d$'\t' -o 1736_ORF_annotations.xls
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py mlst.txt -d$'\t' -o mlst.xls
  3. Report

    I have completed the variant calling and functional annotation for the 6 pairs of Staphylococcus epidermidis DNA-seq data.
    
    Key Findings:
    Out of the 6 pairs, 3 pairs reported zero variants (Pairs 3, 4, and 6), while the other 3 pairs (Pairs 1, 2, and 5) yielded SNPs and Indels.
    This result aligns with the resistome and virulence profiling. For the pairs with no genetic variants (Pairs 3, 4, and 6), the isolates have identical AMR and VFDB gene profiles (e.g., both isolates in Pair 3 have 27 AMR genes and 1 VFDB hit; both in Pair 4 have 20 AMR genes). In contrast, the pairs that showed SNPs/Indels (Pairs 1, 2, and 5) also exhibit differences in their AMR gene counts (e.g., Pair 2 has 33 vs. 30 AMR genes; Pair 5 has 21 vs. 24 AMR genes).
    Attached Files:
    Please find the following results attached:
    
        MLST calling: mlst.xls.
        Resistome_Virulence results: 12 Excel files
        SNP/Indel Results (for the pairs with variants):
            SNPs_Indels_pair1_on_2780.xls
            SNPs_Indels_pair2_on_4667.xls
            SNPs_Indels_pair5_on_3006.xls
        ORF Annotations (EggNOG-mapper results with KEGG/GO terms for all 6 reference genomes in the 6 paired comparisons):
            2780_ORF_annotations.xls, 4667_ORF_annotations.xls, 3139_ORF_annotations.xls, 9849_ORF_annotations.xls, 3006_ORF_annotations.xls, 1736_ORF_annotations.xls
    
    Brief Methodology Summary:
    
        Assembly & QC: We used nf-core/bacass for short-read assembly. CheckM confirmed high genome purity (>99.7% completeness, ~0% contamination), and GTDB-Tk confirmed the species as Staphylococcus epidermidis.
        Typing & AMR: MLST was performed (identifying ST2, ST5, and ST59 lineages). Resistome and virulence profiling were done using Abricate and RGI.
        Variant Calling: We used snippy to map each pair against its own assembled reference genome to ensure high-confidence SNP/Indel calling.
        Functional Annotation: Bakta was used for structural annotation, and EggNOG-mapper was utilized to assign KEGG and GO terms to the ORFs for all 6 reference genomes in the 6 paired comparisons.

Annotation of Multi-Nucleotide Polymorphisms (MNPs): Dissecting the c.465_469delCATTGinsAATTT Complex Variant (Data_Holger_DNAseq_2026_Sepi_Pairs)

Based on the genetic notation provided, here is the detailed explanation for why it appears that “only one nucleotide changed” the protein, even though the DNA notation looks like a large 5-base substitution.

The Short Answer

At the protein level, the change from Glycine to Cysteine (p.Gly157Cys) is indeed caused by a single nucleotide change within that specific codon.

However, at the DNA level, two nucleotides actually changed within the 5-base window (CATTG $\rightarrow$ AATTT). The reason you only see one amino acid change is that the second DNA mutation is likely synonymous (silent)—meaning it does not change the amino acid. Bioinformatics tools often group adjacent DNA changes into a single “block” notation, which can make it look more complicated than it is.


Step-by-Step Breakdown of the Mutation

Let’s map the 5-base change (c.465_469delCATTGinsAATTT) directly to the codons (amino acid building blocks) to see exactly what happened.

Since each codon is 3 bases long:

  • Codon 155 consists of bases 463, 464, and 465.
  • Codon 156 consists of bases 466, 467, and 468.
  • Codon 157 consists of base 469, 470, and 471.

Now, let’s align your reference (CATTG) and alternate (AATTT) sequences across these positions:

DNA Position Codon Location Reference Base Alternate Base Effect
465 3rd base of Codon 155 C A Synonymous (Silent): Because this is the 3rd “wobble” position of the codon, this change likely does not alter the amino acid at position 155.
466 1st base of Codon 156 A A No change
467 2nd base of Codon 156 T T No change
468 3rd base of Codon 156 T T No change
469 1st base of Codon 157 G T Missense: This changes the first letter of Codon 157. Glycine codons start with GG... (e.g., GGT). Changing the G to a T makes it TG... (e.g., TGT), which codes for Cysteine.

Why is it written as a 5-base change?

You might wonder why the software wrote delCATTGinsAATTT (which looks like a massive 5-base deletion and insertion) instead of just listing the two single-nucleotide changes.

This is a standard behavior of variant calling and annotation pipelines (like GATK, SnpEff, or VEP). When multiple nucleotide changes occur very close to each other in the sequencing reads, the software groups them together into a single “Complex Variant” or Multi-Nucleotide Polymorphism (MNP).

Instead of outputting two separate lines:

  1. c.465C>A (Silent)
  2. c.469G>T (Missense)

The pipeline outputs them as one combined block event: c.465_469delCATTGinsAATTT.

Summary

  • Did only one nucleotide change the protein? Yes. The G $\rightarrow$ T mutation at position 469 is the sole reason the amino acid changed from Glycine to Cysteine.
  • What about the other change? The C $\rightarrow$ A mutation at position 465 is a “silent” passenger mutation that doesn’t affect the final protein structure.
  • Why the confusing notation? The software simply bundled the two adjacent DNA events into one line for simplicity, even though they affect different codons.

DWZ-Karteikarte XXXX

Die Daten enthalten die Spielerinformationen sowie die Turnierhistorie mit den zugehörigen DWZ-Wertungen (Deutsche Wertungszahl). Die Spalten “Wert 1” und “Wert 2” repräsentieren zusätzliche Kennzahlen (wie z. B. Start-DWZ oder Leistungszahl), die auf der DSB-Webseite nur bei bestimmten Turnieren angezeigt werden.

Spielerinformationen

  • Aktion: Foto senden
  • Name: XXXX
  • Geschlecht: (nicht angegeben)

Turnier- und DWZ-Historie

Nr. Jahr Turnier Punkte Partien Erwartung DWZ-Diff. Gegner-Schnitt Wert 1 Wert 2 Neue DWZ
1 2023 4. Kieler U12-Pokal 0 1 0,000 0 777
2 2023 30. DWZ Cup – sfwi.de – Gruppe 15 0 3 0,000 0 900
3 2023 31. DWZ Cup – sfwi.de – Gruppe 13 1 7 0,000 0 847 768 – 1
4 2024 32. DWZ Cup – sfwi.de – Gruppe 14 2 1,030 5 609 822 – 2
5 2024 Regionales Kinder Schach Turnier (U8) 0 2 0,221 29 1252 816 – 3
6 2024 Kieler U8-Wochenendturnier / RKST 0 2 0,894 30 762 794 – 4
7 2024 HJEM 2024 U8 0 2 1,007 34 742 772 – 5
8 2024 33. DWZ Cup – sfwi.de – Gruppe 15 2 3 1,583 5 695 814 – 6
9 2024 Jugendkreisliga Hamburg 2024 1 2 0,960 5 737 819 – 7
10 2024 34. DWZ Cup – sfwi.de – Gruppe 11 1 1 0,472 5 839 889 – 8
11 2024 U10 Sonderklasse 2024 0 5 0,727 20 1351 674 866 – 9
12 2024 35. DWZ Cup – sfwi.de – Gruppe 13 1 1 0,617 5 782 917 – 10
13 2024 36. DWZ Cup – sfwi.de – Gruppe 9 1 3 1,785 18 849 887 – 11
14 2025 Regionales Kinder Schachturnier 2025 A 0 3 1,071 20 1005 850 – 12
15 2025 1. Schachwerkstatt DWZ Challenge M 3 3 1,527 5 783 997 – 13
16 2025 HJEM 2025 U8 ½ 2 1,213 13 910 959 – 14
17 2025 39. DWZ Cup – sfwi.de – Gruppe 12 1 3 1,676 15 912 929 – 15
18 2025 Jugendkreisliga 0 4 1,364 17 1055 877 – 16
19 2025 40. DWZ-ELO-Cup Gruppe 15 ½ 2 1,296 21 769 849 – 17
20 2025 U10 Sonderklasse 2025 0 2 0,739 25 835 827 – 18
21 2025 2. Schachwerkstatt U12 DWZ Challenge I 1 1 0,499 5 828 894 – 19
22 2026 HJET 2026 – U10-1 4 8 3,350 5 957 956 934 – 20
23 2026 43. DWZ-Elo-Cup der Wilstermarsch u. I […] 2 3 1,528 5 828 981 – 21
AKT 2026 HJEM 2026 U10 7 2,144 5 1149 1145 1071 – 22

(Hinweis: “AKT” in der letzten Zeile steht für das aktuell laufende oder gerade beendete Turnier, bei dem die neue DWZ vorläufig oder endgültig berechnet wurde.)



https://schach-blankenese.de/2024/02/19/rkst-2024/

https://schach-blankenese.de/2025/02/25/rekordbeteiligung-beim-rkst/

https://schach-blankenese.de/2026/03/24/gut-besuchtes-rkst/

https://schach-blankenese.de/wp-content/uploads/2026/03/rkst-hamburg-22-03-2026-tabellen.pdf

Über diese RKST-Turniere, die auch in vielen anderen Orten Deutschlands ausgetragen werden, können sich Kinder der Altersklasse U8 direkt für die Deutschen Meisterschaften qualifizieren. Darüber hinaus werden besonders leistungsstarke Spieler:innen von Scouts gesichtet und gegebenenfalls für die DM vorgeschlagen. Diese Funktion übernahmen in diesem Jahr Olaf Wolna und Bernhard Jürgens. Vielen Dank! Um die Förderung der Kinder zu optimieren, konnten alle Kinder ihre Partien analysieren lassen. An dieser Aufgabe beteiligten sich unter der Führung von Berthold vor allem Sami, Nua, Noah, Rishi, Okke und Aaron sowie Kristina vom HSK. Es ist sehr schön zu sehen, dass auch unsere Jugend gern Aufgaben übernimmt und sich engagiert. Herzlichen Dank an alle Beteiligten!