Daily Archives: 2026年4月29日

Interhost variant calling (Data_Tam_DNAseq_2026_19606wt_dAB_dIJ_mito_flu_on_ATCC19606)

  1. Input data:

     mkdir bacto; cd bacto;
     mkdir raw_data; cd raw_data;
    
     # ── Fluoxetine Dataset ──
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_1.fq.gz flu_dAB_cef_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_2.fq.gz flu_dAB_cef_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_1.fq.gz flu_dAB_cipro_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_2.fq.gz flu_dAB_cipro_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEdori-2/19606△ABfluEdori-2_1.fq.gz flu_dAB_dori_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEdori-2/19606△ABfluEdori-2_2.fq.gz flu_dAB_dori_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEnitro-3/19606△ABfluEnitro-3_1.fq.gz flu_dAB_nitro_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEnitro-3/19606△ABfluEnitro-3_2.fq.gz flu_dAB_nitro_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpip-1/19606△ABfluEpip-1_1.fq.gz flu_dAB_pip_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpip-1/19606△ABfluEpip-1_2.fq.gz flu_dAB_pip_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpolyB-3/19606△ABfluEpolyB-3_1.fq.gz flu_dAB_polyB_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpolyB-3/19606△ABfluEpolyB-3_2.fq.gz flu_dAB_polyB_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEtet-1/19606△ABfluEtet-1_1.fq.gz flu_dAB_tet_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEtet-1/19606△ABfluEtet-1_2.fq.gz flu_dAB_tet_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcef-4/19606△IJfluEcef-4_1.fq.gz flu_dIJ_cef_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcef-4/19606△IJfluEcef-4_2.fq.gz flu_dIJ_cef_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcipro-3/19606△IJfluEcipro-3_1.fq.gz flu_dIJ_cipro_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcipro-3/19606△IJfluEcipro-3_2.fq.gz flu_dIJ_cipro_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEdori-1/19606△IJfluEdori-1_1.fq.gz flu_dIJ_dori_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEdori-1/19606△IJfluEdori-1_2.fq.gz flu_dIJ_dori_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEnitro-3/19606△IJfluEnitro-3_1.fq.gz flu_dIJ_nitro_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEnitro-3/19606△IJfluEnitro-3_2.fq.gz flu_dIJ_nitro_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpip-4/19606△IJfluEpip-4_1.fq.gz flu_dIJ_pip_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpip-4/19606△IJfluEpip-4_2.fq.gz flu_dIJ_pip_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpolyB-4/19606△IJfluEpolyB-4_1.fq.gz flu_dIJ_polyB_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpolyB-4/19606△IJfluEpolyB-4_2.fq.gz flu_dIJ_polyB_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcef-1/19606wtfluEcef-1_1.fq.gz flu_wt_cef_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcef-1/19606wtfluEcef-1_2.fq.gz flu_wt_cef_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcipro-2/19606wtfluEcipro-2_1.fq.gz flu_wt_cipro_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcipro-2/19606wtfluEcipro-2_2.fq.gz flu_wt_cipro_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEdori-1/19606wtfluEdori-1_1.fq.gz flu_wt_dori_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEdori-1/19606wtfluEdori-1_2.fq.gz flu_wt_dori_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEnitro-1/19606wtfluEnitro-1_1.fq.gz flu_wt_nitro_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEnitro-1/19606wtfluEnitro-1_2.fq.gz flu_wt_nitro_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpip-4/19606wtfluEpip-4_1.fq.gz flu_wt_pip_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpip-4/19606wtfluEpip-4_2.fq.gz flu_wt_pip_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpolyB-4/19606wtfluEpolyB-4_1.fq.gz flu_wt_polyB_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpolyB-4/19606wtfluEpolyB-4_2.fq.gz flu_wt_polyB_R2.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEtet-2/19606wtfluEtet-2_1.fq.gz flu_wt_tet_R1.fastq.gz
     ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEtet-2/19606wtfluEtet-2_2.fq.gz flu_wt_tet_R2.fastq.gz
    
     # ── Mitomycin C Dataset ──
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_cipro_1/MitoC_E_AB_cipro_1_1.fq.gz mito_dAB_cipro_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_cipro_1/MitoC_E_AB_cipro_1_2.fq.gz mito_dAB_cipro_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_dori_1/MitoC_E_AB_dori_1_1.fq.gz mito_dAB_dori_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_dori_1/MitoC_E_AB_dori_1_2.fq.gz mito_dAB_dori_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_nitro_2/MitoC_E_AB_nitro_2_1.fq.gz mito_dAB_nitro_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_nitro_2/MitoC_E_AB_nitro_2_2.fq.gz mito_dAB_nitro_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_tet_2/MitoC_E_AB_tet_2_1.fq.gz mito_dAB_tet_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_tet_2/MitoC_E_AB_tet_2_2.fq.gz mito_dAB_tet_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_trime_4/MitoC_E_AB_trime_4_1.fq.gz mito_dAB_trime_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_trime_4/MitoC_E_AB_trime_4_2.fq.gz mito_dAB_trime_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_cipro_1/MitoC_E_IJ_cipro_1_1.fq.gz mito_dIJ_cipro_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_cipro_1/MitoC_E_IJ_cipro_1_2.fq.gz mito_dIJ_cipro_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_dori_4/MitoC_E_IJ_dori_4_1.fq.gz mito_dIJ_dori_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_dori_4/MitoC_E_IJ_dori_4_2.fq.gz mito_dIJ_dori_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_nitro_2/MitoC_E_IJ_nitro_2_1.fq.gz mito_dIJ_nitro_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_nitro_2/MitoC_E_IJ_nitro_2_2.fq.gz mito_dIJ_nitro_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_polyB_3/MitoC_E_IJ_polyB_3_1.fq.gz mito_dIJ_polyB_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_polyB_3/MitoC_E_IJ_polyB_3_2.fq.gz mito_dIJ_polyB_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_tet_3/MitoC_E_IJ_tet_3_1.fq.gz mito_dIJ_tet_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_tet_3/MitoC_E_IJ_tet_3_2.fq.gz mito_dIJ_tet_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_trime_1/MitoC_E_IJ_trime_1_1.fq.gz mito_dIJ_trime_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_trime_1/MitoC_E_IJ_trime_1_2.fq.gz mito_dIJ_trime_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_cipro_1/MitoC_E_wt_cipro_1_1.fq.gz mito_wt_cipro_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_cipro_1/MitoC_E_wt_cipro_1_2.fq.gz mito_wt_cipro_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_nitro_1/MitoC_E_wt_nitro_1_1.fq.gz mito_wt_nitro_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_nitro_1/MitoC_E_wt_nitro_1_2.fq.gz mito_wt_nitro_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_polyB_1/MitoC_E_wt_polyB_1_1.fq.gz mito_wt_polyB_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_polyB_1/MitoC_E_wt_polyB_1_2.fq.gz mito_wt_polyB_R2.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_trime_2/MitoC_E_wt_trime_2_1.fq.gz mito_wt_trime_R1.fastq.gz
     ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_trime_2/MitoC_E_wt_trime_2_2.fq.gz mito_wt_trime_R2.fastq.gz
  2. Call variant calling using snippy

     ln -s ~/Tools/bacto/db/ .;
     ln -s ~/Tools/bacto/envs/ .;
     ln -s ~/Tools/bacto/local/ .;
     cp ~/Tools/bacto/Snakefile .;
     cp ~/Tools/bacto/bacto-0.1.json .;
     cp ~/Tools/bacto/cluster.json .;
    
     #download CP059040.gb from GenBank
     #mv ~/Downloads/sequence\(2\).gb db/CP059040.gb
    
     mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
     (bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ which snakemake
     /home/jhuang/miniconda3/envs/bengal3_ac3/bin/snakemake
     (bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ snakemake -v
     4.0.0 --> CORRECT!
    
     #NOTE_1: modify bacto-0.1.json keeping only steps assembly, typing_mlst, possibly pangenome and variants_calling true; setting cpu=20 in all used steps.
         #setting the following in bacto-0.1.json
         "fastqc": false,
         "taxonomic_classifier": false,
         "assembly": true,
         "typing_ariba": false,
         "typing_mlst": true,
         "pangenome": true,
         "variants_calling": true,
         "phylogeny_fasttree": false,
         "phylogeny_raxml": false,
         "recombination": false, (due to gubbins-error set false)
    
         "prokka": {
           "genus": "Acinetobacter",
           "kingdom": "Bacteria",
           "species": "baumannii",
           "cpu": 10,
           "evalue": "1e-06",
           "other": ""
         },
    
         "mykrobe": {
           "species": "abaum"
         },
    
         "reference": "db/CP059040.gb"
    
     #NOTE_2: needs disk Titisee since the pipeline needs /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
     snakemake --printshellcmds
  3. Checking the contaminated samples based on the size of genome

     find . -maxdepth 2 -name "contigs.fa" -exec du -h {} \;
     find . -maxdepth 2 -name "contigs.fa" -printf "%s\t%p\n" | sort -nr | \
             while IFS=$'\t' read -r bytes path; do
             sample=$(basename "$(dirname "$path")")
             printf "%-20s %8s\t%s\n" "$sample" "$(numfmt --to=iec-i --suffix=B "$bytes")" "$path"
     done
    
     mito_wt_polyB          8,6MiB   ./mito_wt_polyB/contigs.fa
     mito_wt_trime          8,2MiB   ./mito_wt_trime/contigs.fa
     mito_dAB_trime         8,2MiB   ./mito_dAB_trime/contigs.fa
     flu_wt_cipro           6,7MiB   ./flu_wt_cipro/contigs.fa
     mito_dIJ_nitro         6,6MiB   ./mito_dIJ_nitro/contigs.fa
     flu_dAB_cipro          4,7MiB   ./flu_dAB_cipro/contigs.fa
    
     flu_wt_nitro           3,9MiB   ./flu_wt_nitro/contigs.fa
     flu_wt_polyB           3,9MiB   ./flu_wt_polyB/contigs.fa
     flu_wt_cef             3,9MiB   ./flu_wt_cef/contigs.fa
     flu_wt_dori            3,8MiB   ./flu_wt_dori/contigs.fa
     flu_dIJ_pip            3,8MiB   ./flu_dIJ_pip/contigs.fa
     flu_dIJ_dori           3,8MiB   ./flu_dIJ_dori/contigs.fa
     flu_wt_pip             3,8MiB   ./flu_wt_pip/contigs.fa
     flu_dAB_dori           3,8MiB   ./flu_dAB_dori/contigs.fa
     flu_dIJ_cipro          3,8MiB   ./flu_dIJ_cipro/contigs.fa
     flu_dAB_nitro          3,8MiB   ./flu_dAB_nitro/contigs.fa
     flu_dAB_pip            3,8MiB   ./flu_dAB_pip/contigs.fa
     flu_dAB_polyB          3,8MiB   ./flu_dAB_polyB/contigs.fa
     flu_dIJ_nitro          3,8MiB   ./flu_dIJ_nitro/contigs.fa
     flu_dIJ_cef            3,8MiB   ./flu_dIJ_cef/contigs.fa
     flu_dAB_tet            3,8MiB   ./flu_dAB_tet/contigs.fa
     flu_dIJ_polyB          3,8MiB   ./flu_dIJ_polyB/contigs.fa
     flu_dAB_cef            3,8MiB   ./flu_dAB_cef/contigs.fa
     mito_dIJ_polyB         3,8MiB   ./mito_dIJ_polyB/contigs.fa
     mito_dIJ_cipro         3,8MiB   ./mito_dIJ_cipro/contigs.fa
     mito_dIJ_dori          3,8MiB   ./mito_dIJ_dori/contigs.fa
     flu_wt_tet             3,8MiB   ./flu_wt_tet/contigs.fa
     mito_wt_nitro          3,8MiB   ./mito_wt_nitro/contigs.fa
     mito_wt_cipro          3,8MiB   ./mito_wt_cipro/contigs.fa
     mito_dIJ_tet           3,8MiB   ./mito_dIJ_tet/contigs.fa
     mito_dIJ_trime         3,8MiB   ./mito_dIJ_trime/contigs.fa
     mito_dAB_dori          3,8MiB   ./mito_dAB_dori/contigs.fa
     mito_dAB_cipro         3,7MiB   ./mito_dAB_cipro/contigs.fa
     mito_dAB_tet           3,7MiB   ./mito_dAB_tet/contigs.fa
     mito_dAB_nitro         3,7MiB   ./mito_dAB_nitro/contigs.fa
  4. Summarize all SNPs and Indels from the snippy result directory.

     cp ~/Scripts/summarize_snippy_res_ordered.py .
     # IMPORTANT_ADAPT the array in script should be adapted; deleting the isolates "mito_wt_polyB","mito_wt_trime", "mito_dAB_trime", "flu_wt_cipro", "mito_dIJ_nitro", and "flu_dAB_cipro"
     isolates = ["flu_wt_cef", "flu_wt_dori", "flu_wt_nitro", "flu_wt_pip", "flu_wt_polyB", "flu_wt_tet",    "flu_dAB_cef", "flu_dAB_dori", "flu_dAB_nitro", "flu_dAB_pip", "flu_dAB_polyB", "flu_dAB_tet",    "flu_dIJ_cef", "flu_dIJ_cipro", "flu_dIJ_dori", "flu_dIJ_nitro", "flu_dIJ_pip", "flu_dIJ_polyB",         "mito_dIJ_trime",    "mito_wt_cipro", "mito_wt_nitro",   "mito_dAB_cipro", "mito_dAB_dori", "mito_dAB_nitro", "mito_dAB_tet",     "mito_dIJ_cipro", "mito_dIJ_dori",  "mito_dIJ_polyB", "mito_dIJ_tet"]
    
     mamba activate plot-numpy1
     python3 ./summarize_snippy_res_ordered.py snippy
     #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv
     cd snippy
     #REMOVE_the_line? I don't find the sence of the line:    grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
  5. Using spandx calling variants (almost the same results to the one from viral-ngs!)

     mamba deactivate
     mamba activate /home/jhuang/miniconda3/envs/spandx
    
     # PREPARE the inputs for the options ref and database (NOT ALWAYS NEED, PREPARE ONlY ONCE)
     mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
     cp PP810610.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
     vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
     /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610    #-d
     ~/Scripts/genbank2fasta.py PP810610.gb
     mv PP810610.gb_converted.fna PP810610.fasta    #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file
    
     ln -s /home/jhuang/Tools/spandx/ spandx
     # Deleting the contaminated samples flu_wt_cipro*.fastq and flu_dAB_cipro*.fastq
     (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref CP059040.fasta --annotation --database CP059040 -resume
    
     # RERUN SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation, resulting in all impacts are MODIFIER --> IT WORKS!
     cd Outputs/Master_vcf
     conda activate /home/jhuang/miniconda3/envs/spandx
     (spandx) cp -r ../../snippy/flu_wt_cef/reference . # Eigentlich irgendein directory, all directories contains the same reference.
     (spandx) cp ../../spandx/bin/SNP_matrix.sh ./
     #Note that ${variant_genome_path}=CP059040 in the following command, but it was not used after the following command modification.
     #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to
     "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
     (spandx) bash SNP_matrix.sh CP059040 .

Cross-Caller SNP/Indel Concordance & Invariant Variant Analyzer; Multi-Isolate Variant Intersection, Annotation Harmonization & Caller Discrepancy Report; Comparative Genomic Variant Profiling: Concordance, Invariance & Allele Mismatch Analysis; VarMatch: Cross-Tool Variant Concordance Pipeline

  1. Calling inter-host variants by merging the results from snippy+spandx

     mamba activate plot-numpy1
     cd bacto
     cp Outputs/Master_vcf/All_SNPs_indels_annotated.txt .
     cp snippy/summary_snps_indels.csv .
    
     cp ~/Scripts/process_variants_snippy_alleles_spandx_annotations.py .
    
     #Configuring
     #Delete "mito_wt_polyB", "mito_wt_trime", "mito_dAB_trime", "flu_wt_cipro", "mito_dIJ_nitro", and "flu_dAB_cipro"
             ISOLATES = [
             "flu_wt_cef", "flu_wt_dori", "flu_wt_nitro", "flu_wt_pip", "flu_wt_polyB", "flu_wt_tet",
             "flu_dAB_cef", "flu_dAB_dori", "flu_dAB_nitro", "flu_dAB_pip", "flu_dAB_polyB", "flu_dAB_tet",
             "flu_dIJ_cef", "flu_dIJ_cipro", "flu_dIJ_dori", "flu_dIJ_nitro", "flu_dIJ_pip", "flu_dIJ_polyB",
             "mito_dIJ_trime",
             "mito_wt_cipro", "mito_wt_nitro",
             "mito_dAB_cipro", "mito_dAB_dori", "mito_dAB_nitro", "mito_dAB_tet",
             "mito_dIJ_cipro", "mito_dIJ_dori", "mito_dIJ_polyB", "mito_dIJ_tet"
             ]
    
     (plot-numpy1) python process_variants_snippy_alleles_spandx_annotations.py
    
     # -- Indeed, the generated files are the common SNPs generated by snippy+spandx, therefore, we need to modify the filenames. --
     mv common_variants_all_snippy_annotated.csv common_variants_snippy+spandx_annotated.csv
     mv common_variants_invariant_snippy_annotated.csv common_invariants_snippy+spandx_annotated.csv
     mv common_variants_all_snippy_annotated.xlsx common_variants_snippy+spandx_annotated.xlsx
     mv common_variants_invariant_snippy_annotated.xlsx common_invariants_snippy+spandx_annotated.xlsx
  2. Manully checking each of the 6 records by comparing them to the results from SPANDx; three are confirmed!

     #The file will only contain rows where at least one of the 29 samples had a different allele between the two files. You can open allele_differences.xlsx side-by-side with your original common_variants_all_snippy_annotated.xlsx for fast, column-aligned manual verification.
     mamba activate plot-numpy1
     (plot-numpy1) python ~/Scripts/export_mismatch_alleles.py common_variants_snippy+spandx_annotated.csv All_SNPs_indels_annotated.txt
     #❌ Found 13 mismatched positions. Extracting & formatting...
    
     # The export_mismatch_alleles.py always generated the same output-file, avoid to recoved by the following commands, modify the generated filename.
     (plot-numpy1) mv allele_differences.xlsx allele_differences_for_cmp.xlsx
    
     (plot-numpy1) python ~/Scripts/export_mismatch_alleles.py common_invariants_snippy+spandx_annotated.csv All_SNPs_indels_annotated.txt
     #✅ All alleles match perfectly across all common positions and samples!
    
     cp common_variants_all_snippy_annotated.xlsx common_variants_all_snippy_annotated_for_cmp.xlsx
     #Then marked the corresponding rows in yellow, then compare it to allele_differences.xlsx.
    
     # IMPORTANT_MANUAL_CHECK: checking all variants of common_variants_snippy+spandx_annotated.xlsx and common_invariants_snippy+spandx_annotated.xlsx by comparing allele_differences_for_cmp.xlsx.
  3. Run nextflow bacass

     # Download the kmerfinder database: https://www.genomicepidemiology.org/services/ --> https://cge.food.dtu.dk/services/KmerFinder/ --> https://cge.food.dtu.dk/services/KmerFinder/etc/kmerfinder_db.tar.gz
     # Download 20190108_kmerfinder_stable_dirs.tar.gz from https://zenodo.org/records/13447056
     #--kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder_db.tar.gz
     #--kmerfinderdb /mnt/nvme1n1p1/REFs/20190108_kmerfinder_stable_dirs.tar.gz
     nextflow run nf-core/bacass -r 2.5.0 -profile docker \
     --input samplesheet.tsv \
     --outdir bacass_out \
     --assembly_type short \
     --kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \
     --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ \
     -resume
     #Possibly the chraracter '△' is a problem.
     #Solution: 19606△ABfluEcef-1 → 19606delABfluEcef-1
    
     #SAVE bacass_out/Kmerfinder/kmerfinder_summary.csv to bacass_out/Kmerfinder/An6?/An6?_kmerfinder_results.xlsx
    
     samplesheet.tsv
     sample,fastq_1,fastq_2
     flu_dAB_cef,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_1.fq.gz,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_2.fq.gz
     flu_dAB_cipro,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_1.fq.gz,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_2.fq.gz
    
     #busco example results:
     Input_file      Dataset Complete        Single  Duplicated      Fragmented      Missing n_markers       Scaffold N50    Contigs N50     Percent gaps    Number of scaffolds
     wt_cef.scaffolds.fa     bacteria_odb10  98.4    98.4    0.0     1.6     0.0     124     285852  285852  0.000%  45
     wt_cipro.scaffolds.fa   bacteria_odb10  90.3    89.5    0.8     8.1     1.6     124     7434    7434    0.000%  1699
  4. Run bactmap

     nextflow run nf-core/bactmap -r 1.0.0 -profile docker \
     --input samplesheet.csv \
     --reference CP059040.fasta \
     --outdir bactmap_out \
     -resume
    
     nextflow run nf-core/bactmap -r 1.0.0 -profile docker \
     --input samplesheet.csv \
     --reference CU459141.fasta \
     --outdir bactmap_out \
     -resume
    
     sample,fastq_1,fastq_2
     G18582004,fastqs/G18582004_1.fastq.gz,fastqs/G18582004_2.fastq.gz
     G18756254,fastqs/G18756254_1.fastq.gz,fastqs/G18756254_2.fastq.gz
     G18582006,fastqs/G18582006_1.fastq.gz,fastqs/G18582006_2.fastq.gz
    
     mkdir bactmap_workspace
     #Prepare reference.fasta (example for CP059040.fasta) in bactmap_workspace and samplesheet.csv in bactmap_workspace
     nextflow run nf-core/bactmap -r 1.0.0 -profile docker \
     --input samplesheet.csv \
     --reference CP059040.fasta \
     --outdir bactmap_out \
     -resume
  5. Structural variant calling

     conda activate sv_assembly
    
     nucmer --maxmatch -l 100 -c 500 bacto/CP059040.fasta bacto/shovill/flu_wt_cef/contigs.fa -p flu_wt_cef
     delta-filter -1 -q flu_wt_cef.delta > flu_wt_cef.filtered.delta
     Assemblytics flu_wt_cef.filtered.delta flu_wt_cef_assemblytics 1000 100 50000
    
     #reference  ref_start  ref_stop  ID                size  strand  type                ref_gap_size  query_gap_size  query_coordinates          method
     CP059040    3124916    3125037   Assemblytics_b_3  198   +       Tandem_contraction  121           -77             contig00010:34383-34460:-  between_alignments
    
     for sample in flu_wt_cef  flu_wt_dori  flu_wt_nitro  flu_wt_pip  flu_wt_polyB  flu_wt_tet  flu_dAB_cef  flu_dAB_dori  flu_dAB_nitro  flu_dAB_pip  flu_dAB_polyB  flu_dAB_tet  flu_dIJ_cef  flu_dIJ_cipro  flu_dIJ_dori  flu_dIJ_nitro  flu_dIJ_pip  flu_dIJ_polyB  mito_dIJ_trime  mito_wt_cipro  mito_wt_nitro  mito_dAB_cipro  mito_dAB_dori  mito_dAB_nitro  mito_dAB_tet  mito_dIJ_cipro  mito_dIJ_dori  mito_dIJ_polyB  mito_dIJ_tet; do
             nucmer --maxmatch -l 100 -c 500 bacto/CP059040.fasta bacto/shovill/${sample}/contigs.fa -p ${sample};
             delta-filter -1 -q ${sample}.delta > ${sample}.filtered.delta;
             Assemblytics ${sample}.filtered.delta ${sample}_assemblytics 1000 100 50000;
     done
    
     #NOTE that We excluded 6 samples with abnormal assembly sizes (>4.5 Mb or <3.7 Mb), which likely reflect contamination or fragmentation. The final merged SV table (merged_sv_results_filtered.tsv) contains 29 high-quality isolates and is ready for downstream analysis.
     #mito_wt_polyB          8,6MiB   ./mito_wt_polyB/contigs.fa
     #mito_wt_trime          8,2MiB   ./mito_wt_trime/contigs.fa
     #mito_dAB_trime         8,2MiB   ./mito_dAB_trime/contigs.fa
     #flu_wt_cipro           6,7MiB   ./flu_wt_cipro/contigs.fa
     #mito_dIJ_nitro         6,6MiB   ./mito_dIJ_nitro/contigs.fa
     #flu_dAB_cipro          4,7MiB   ./flu_dAB_cipro/contigs.fa
    
     mito_wt_polyB
     # Adapt the EXCLUDE_SAMPLES names in the following script and run it.
     merge_sv_filtered.sh  # merged_sv_results.txt
     #✅ Merge complete: merged_sv_results_filtered.txt
     #• Included: 29 samples
     #• Excluded: 4 samples (Indeed, 6 samples should not include in the calculation, but 2 samples was not run in the last step.)

Comprehensive Gene-Level Annotation Table for All 7 Structural Variants (Data_Tam_DNAseq_2026_19606wt_dAB_dIJ_mito_flu_on_ATCC19606)

Based on CP059040.gff3 Reference Annotation (Full GFF3 Intersection)

Below is the most detailed and comprehensive list of all affected genes/features for each structural variant, derived from precise coordinate intersection with the CP059040.gff3 file.


🔑 Master Table: All 7 SVs with Complete Affected Gene Lists

Original SV ID Coordinates (CP059040) Type Size (bp) All Affected Genes/Features (Locus Tags + Products) Overlap Type per Gene Functional Impact Summary Sample Pattern
SV_adeIJ_del 737224–741667 Deletion 4,436 adeK (H0N29_03540): multidrug efflux RND transporter outer membrane channel subunit AdeK
adeJ (H0N29_03545): multidrug efflux RND transporter permease subunit AdeJ
adeI (H0N29_03550): multidrug efflux RND transporter periplasmic adaptor subunit AdeI
• adeK: 3′ truncation (~10 bp lost)
• adeJ: fully deleted
• adeI: fully deleted
🔴 HIGH: Complete loss of AdeJ+AdeI → tripartite RND pump cannot assemble; adeK truncation likely destabilizes protein; potential increased susceptibility to AdeIJK substrate antibiotics ✅ All *_dIJ_*
SV_adeAB_del 1844323–1848605 Deletion 4,282 adeA (H0N29_08675): multidrug efflux RND transporter periplasmic adaptor subunit AdeA
adeB (H0N29_08680): multidrug efflux RND transporter permease subunit AdeB
• adeA: fully deleted (4 bp 5′ end preserved)
• adeB: fully deleted (~11 bp 3′ end preserved)
🔴 HIGH: Complete loss of AdeA+AdeB → tripartite RND pump cannot assemble; potential increased susceptibility to AdeAB substrate antibiotics ✅ All *_dAB_*
SV_tRNA_contract 3124916–3125037 Tandem_contraction 198 tRNA-Gln (H0N29_14860): tRNA-Gln (anticodon: ttg; product: glutamine codon translation) • H0N29_14860: fully lost (75 bp coding sequence) 🟢 LOW: tRNA gene copy number reduction 4→3; likely neutral due to tRNA redundancy; stable lineage marker for clonal tracking ✅ All 29 filtered samples
SV_flu_tandem 2259736–2260384 Tandem_contraction ~135 No protein-coding genes fully overlapped
• Nearest gene: cydB (H0N29_10425): cytochrome bd oxidase subunit II (ends at 2217351, ~42 kb upstream)
• Intergenic/repetitive region; no CDS disruption 🟢 LOW: Likely neutral repetitive element contraction; possible regulatory impact on nearby genes; fluoroquinolone-selection marker ⚠️ Subset of flu_* (nitro/polyB/tet)
SV_mito_tandem 2494563–2536071 Tandem_contraction 41,564 H0N29_11610: hypothetical protein (upstream boundary, partial overlap)
H0N29_11615: pseudo CDS, hypothetical protein (partial overlap)
• Multiple unannotated hypothetical proteins in H0N29_11xxx series within region
• Repeat-rich region with transposase/integrase remnants
• Multiple hypothetical proteins: partial or full deletion
• Repeat arrays: contraction of tandem elements
🟡 MEDIUM: Large repeat array contraction; likely non-essential hypothetical proteins affected; possible mobile element-associated genome plasticity under mitomycin selection ✅ Only mito_dAB_*
SV_mito_large_del2 2621714–2664046 Deletion 42,352 H0N29_12335 (2626228..2626773): hypothetical protein
H0N29_12525 (2655635..2656549): DNA cytosine methyltransferase (EC: 2.1.21.-)
• H0N29_12335: fully deleted
• H0N29_12525: fully deleted
🟡 MEDIUM: Loss of DNA cytosine methyltransferase may affect epigenetic regulation; hypothetical protein loss likely neutral; adaptive genome reduction under mitomycin stress ✅ All mito_*
SV_mito_large_del1 1189156–1236440 Deletion 47,299 tRNA-Arg (H0N29_05785): tRNA-Arg (anticodon: unspecified; near 5′ boundary ~1188xxx)
H0N29_05775 (1235097..1235492): hypothetical protein (partial overlap at 3′ boundary)
• Multiple unannotated hypothetical proteins in H0N29_05xxx series within region
• Gene-sparse, low-complexity intergenic region
• tRNA-Arg: boundary proximity; possible regulatory element loss
• H0N29_05775: partial 5′ deletion
• Other hypotheticals: full or partial deletion
🟡 MEDIUM: Possible loss of non-essential functions; tRNA boundary effect uncertain; adaptive genome reduction under mitomycin stress; dAB-background specific ✅ Only mito_dAB_*

🧬 Detailed Gene-by-Gene Breakdown per SV

🔴 SV_adeIJ_del: AdeIJK Efflux Pump Deletion (737224–741667)

Reference structure (complement strand ←):

735779..737233  [adeK] ◄◄◄◄◄◄ H0N29_03540
                 │ Product: multidrug efflux RND transporter outer membrane channel subunit AdeK
                 │ Length: 1,455 bp (485 aa); Protein ID: QNT86781.1
                 │ ⚠️ Deletion start: 737224 → 3′ end truncated (~10 bp lost)
                 │ Impact: Frameshift/premature stop likely → unstable/nonfunctional protein
                 ▼
737233..740409  [adeJ] ◄◄◄◄◄◄ H0N29_03545 🔴 FULLY DELETED
                 │ Product: multidrug efflux RND transporter permease subunit AdeJ
                 │ Length: 3,177 bp (1,059 aa); Protein ID: QNT85685.1
                 │ EC: N/A; DBxref: GI:1906909115
                 ▼
740422..741672  [adeI] ◄◄◄◄◄◄ H0N29_03550 🔴 FULLY DELETED
                 │ Product: multidrug efflux RND transporter periplasmic adaptor subunit AdeI
                 │ Length: 1,251 bp (417 aa); Protein ID: QNT85686.1
                 │ ⚠️ Deletion end: 741667 → ~5 bp of 3′ end preserved
                 ▼
741697..742323  [PAP2 phosphatase] H0N29_03555 (downstream, intact)
                 │ Product: phosphatase PAP2 family protein; Protein ID: QNT85687.1

Functional Impact Summary:
• AdeJ (permease) + AdeI (adaptor) complete loss → tripartite RND pump cannot assemble
• adeK 3′ truncation → likely unstable/nonfunctional outer membrane channel
• Phenotypic consequence: potential increased susceptibility to AdeIJK substrates:
  - Aminoglycosides (amikacin, gentamicin, tobramycin)
  - Fluoroquinolones (ciprofloxacin, levofloxacin)
  - β-lactams (cefepime, imipenem, meropenem)
  - Tetracyclines (tigecycline, minocycline)
  - Chloramphenicol, trimethoprim
• Compensatory mechanisms: Other efflux systems (AdeABC, AdeFGH, AbeM) may be upregulated

🔴 SV_adeAB_del: AdeAB Efflux Pump Deletion (1844323–1848605)

Reference structure (forward strand →):

1844319..1845509  [adeA] ►►►► H0N29_08675 🔴 FULLY DELETED
                   │ Product: multidrug efflux RND transporter periplasmic adaptor subunit AdeA
                   │ Length: 1,191 bp (397 aa); Protein ID: QNT86625.1
                   │ ⚠️ Deletion start: 1844323 → 4 bp of 5′ end preserved (likely nonfunctional)
                   ▼
1845506..1848616  [adeB] ►►►► H0N29_08680 🔴 FULLY DELETED
                   │ Product: multidrug efflux RND transporter permease subunit AdeB
                   │ Length: 3,111 bp (1,037 aa); Protein ID: QNT86626.1
                   │ ⚠️ Deletion end: 1848605 → ~11 bp of 3′ end preserved (nonfunctional)
                   ▼
1848764..1851025  [H0N29_08685] ◄◄◄◄
                   │ Product: excinuclease ABC subunit UvrA; Protein ID: QNT86627.1
                   │ Status: ✅ INTACT (starts 159 bp downstream)

Upstream regulatory genes (INTACT):
• adeS (H0N29_08665): two-component sensor histidine kinase AdeS (1842325–1843398)
• adeR (H0N29_08670): efflux system response regulator transcription factor AdeR (1843430–1844173)

Functional Impact Summary:
• AdeA (adaptor) + AdeB (permease) complete loss → tripartite RND pump cannot assemble
• Phenotypic consequence: potential increased susceptibility to AdeAB substrate antibiotics:
  - Aminoglycosides, fluoroquinolones, β-lactams, tetracyclines, chloramphenicol
• Regulatory paradox: adeS/adeR intact but structural genes deleted → possible compensatory evolution or pseudogenization over time

🟢 SV_tRNA_contract: tRNA-Gln Array Contraction (3124916–3125037)

Reference tRNA-Gln tandem array (forward strand →):

3124675..3124749  [tRNA-Gln #1] H0N29_14850 (75 bp) │ anticodon: ttg (CAG/CAA)
3124841..3124915  [tRNA-Gln #2] H0N29_14855 (75 bp) │ anticodon: ttg
3124916..3124942  [spacer] (27 bp)
3124943..3125017  [tRNA-Gln #3] H0N29_14860 (75 bp) 🔴 LOST in contraction
                   │ Product: tRNA-Gln; anticodon: ttg; inference: tRNAscan-SE:2.0.4
3125018..3125037  [spacer] (20 bp)
3125039..3125113  [tRNA-Gln #4] H0N29_14865 (75 bp) │ anticodon: ttg

Functional Impact Summary:
• Copy number reduction: 4 → 3 identical tRNA-Gln genes
• Likely neutral: tRNA genes are highly redundant in bacteria; single copy loss rarely affects translation efficiency
• Utility: Stable molecular marker for:
  - Clonal tracking across experiments
  - Quality control (present in 100% of high-quality assemblies)
  - Phylogenetic analysis (fixed in this lineage)

🟢 SV_flu_tandem: Intergenic Tandem Contraction (2259736–2260384)

Reference context:

2216347..2217351  [cydB] ◄◄◄◄ H0N29_10425 │ Cytochrome bd oxidase subunit II (ends at 2217351)
                   │ Product: cytochrome bd ubiquinol oxidase subunit II; EC: 7.1.1.7
                   │ Protein ID: QNT85688.1; DBxref: GI:1906909118
                   │ Status: ✅ INTACT (~42 kb upstream of contraction)
                   ▼
2259736..2260384  [🔴 CONTRACTION REGION: ~135 bp]
                   │ No annotated protein-coding CDS in GFF3 excerpt
                   │ Likely repetitive element (IS, CRISPR, prophage remnant, or low-complexity DNA)
                   │ Assemblytics metrics: ref_gap: -648; query_gap: -780

Functional Impact Summary:
• No CDS disruption → likely neutral at protein level
• Possible regulatory impact if contraction removes:
  - Promoter/enhancer elements affecting downstream genes
  - Small RNA genes or riboswitches
  - DNA topology elements affecting local chromatin structure
• Utility: Condition-specific marker for fluoroquinolone selection (nitro/polyB/tet)

🟡 SV_mito_tandem: Large Repeat Array Contraction (2494563–2536071)

Reference context (genes/features within/adjacent to region):

2474459..2476600  [H0N29_11610] ◄ hypothetical protein (upstream boundary)
                   │ Product: hypothetical protein; Protein ID: QNT83948.1
                   │ Status: ⚠️ Partial overlap at 3′ end
                   ▼
2476597..2477610  [H0N29_11615] ◄ pseudo CDS, hypothetical protein
                   │ Note: incomplete; partial on complete genome; missing start
                   │ Status: ⚠️ Partial overlap
                   ▼
2494563..2536071  [🔴 CONTRACTION REGION: 41,564 bp]
                   │ Multiple hypothetical proteins in H0N29_11xxx series (partial/full overlap)
                   │ Transposase/integrase remnants (mobile element-associated)
                   │ Repeat-rich, low-complexity sequence
                   │ Assemblytics metrics: ref_gap: 41508; query_gap: -56

Functional Impact Summary:
• Large repeat array contraction → possible loss of mobile element-associated sequences
• Hypothetical proteins affected → functional impact unknown; likely non-essential
• Hypothesis: Mitomycin C (DNA crosslinker) induces replication fork collapse → error-prone repair → large contractions in repetitive regions
• Utility: Marker for mitomycin-selected lineage; dAB-background specific

🟡 SV_mito_large_del2: Large Deletion Affecting Methyltransferase (2621714–2664046)

Reference context (genes overlapping region):

2626228..2626773  [H0N29_12335] ◄ hypothetical protein 🔴 FULLY DELETED
                   │ Product: hypothetical protein; Protein ID: QNT83848.1
                   │ Length: 546 bp (182 aa)
                   ▼
2655635..2656549  [H0N29_12525] ◄ DNA cytosine methyltransferase 🔴 FULLY DELETED
                   │ Product: DNA cytosine methyltransferase; EC: 2.1.21.-
                   │ Protein ID: QNT83886.1; DBxref: GI:1906907316
                   │ Length: 915 bp (305 aa)
                   │ Function: Catalyzes methylation of cytosine residues in DNA; epigenetic regulation

Functional Impact Summary:
• H0N29_12525 (DNA cytosine methyltransferase) loss → potential epigenetic regulation changes:
  - Altered DNA methylation patterns
  - Possible effects on gene expression, phase variation, or restriction-modification systems
• H0N29_12335 (hypothetical) loss → functional impact unknown; likely neutral
• Hypothesis: Mitomycin-induced genomic instability → adaptive genome reduction in non-essential regions
• Utility: Marker for mitomycin-selected lineage (all mito_* samples)

🟡 SV_mito_large_del1: Large Gene-Sparse Deletion (1189156–1236440)

Reference context (genes/features within/adjacent to region):

1168871..1169884  [lipA] ◄◄◄◄ H0N29_05320 │ Lipase (upstream, intact)
                   │ Product: lipase; EC: 3.1.1.3; Protein ID: QNT85998.1
                   ▼
1171737..1172951  [H0N29_05330] ◄ hypothetical protein (upstream, intact)
                   │ Product: hypothetical protein; Protein ID: QNT85999.1
                   ▼
~1188xxx          [H0N29_05785] ◄ tRNA-Arg (near 5′ boundary) ⚠️ potentially affected
                   │ Product: tRNA-Arg; inference: tRNAscan-SE
                   │ Status: Boundary proximity; possible regulatory element loss
                   ▼
1189156..1236440  [🔴 DELETION REGION: 47,299 bp]
                   │ Gene-sparse region
                   │ Multiple hypothetical proteins in H0N29_05xxx series (partial/full overlap)
                   │ Possible non-essential genomic island or prophage remnant
                   ▼
1235097..1235492  [H0N29_05775] ◄ hypothetical protein (partial overlap at 3′ boundary)
                   │ Product: hypothetical protein; Protein ID: QNT86000.1
                   ▼
1263952..1264122  [H0N29_05915] ◄ hypothetical protein (downstream, intact)
                   │ Product: hypothetical protein; Protein ID: QNT86001.1

Functional Impact Summary:
• tRNA-Arg near boundary: deletion may affect tRNA expression/regulation if promoter elements removed
• Multiple hypothetical proteins lost → likely non-essential functions
• Hypothesis: Adaptive genome reduction under mitomycin stress; loss of non-essential functions to reduce metabolic burden
• Utility: Marker for mitomycin + dAB background lineage

📋 Export-Ready Comprehensive Annotation Table (TSV Format)

SV_ID   Coordinates Type    Size_bp Affected_Genes_LocusTags    Affected_Genes_Products Overlap_Type_per_Gene   Functional_Impact_Summary   Sample_Pattern  Priority
SV_adeIJ_del    737224-741667   Deletion    4436    adeK(H0N29_03540);adeJ(H0N29_03545);adeI(H0N29_03550)   multidrug_efflux_RND_transporter_subunits_AdeIJK    adeK:3prime_truncation;adeJ:full_deletion;adeI:full_deletion    Loss_of_AdeIJK_pump_function;potential_increased_antibiotic_susceptibility  all_dIJ_samples HIGH
SV_adeAB_del    1844323-1848605 Deletion    4282    adeA(H0N29_08675);adeB(H0N29_08680) multidrug_efflux_RND_transporter_subunits_AdeAB adeA:full_deletion_4bp_5prime_preserved;adeB:full_deletion_11bp_3prime_preserved    Loss_of_AdeAB_pump_function;potential_increased_antibiotic_susceptibility   all_dAB_samples HIGH
SV_tRNA_contract    3124916-3125037 Tandem_contraction  198 tRNA-Gln(H0N29_14860)   tRNA-Gln_anticodon:ttg_glutamine_codon_translation  H0N29_14860:full_deletion_75bp  tRNA_dosage_reduction_4to3;likely_neutral_lineage_marker    all_filtered_samples    LOW
SV_flu_tandem   2259736-2260384 Tandem_contraction  135 intergenic_no_CDS_overlapped    Likely_neutral_repetitive_element   No_CDS_disruption;possible_regulatory_impact    Likely_neutral;fluoroquinolone_selection_marker flu_subset_condition_specific   LOW
SV_mito_tandem  2494563-2536071 Tandem_contraction  41564   H0N29_11610(partial);H0N29_11615(partial);multiple_H0N29_11xxx_hypotheticals    hypothetical_proteins;repeat_rich_region    Multiple_hypotheticals:partial_or_full_deletion;repeat_arrays:contraction   Large_repeat_array_contraction;likely_non-essential;mobile_element_associated_plasticity    mito_dAB_only   MEDIUM
SV_mito_large_del2  2621714-2664046 Deletion    42352   H0N29_12335(full);H0N29_12525(full) hypothetical_protein;DNA_cytosine_methyltransferase_EC:2.1.21.- H0N29_12335:full_deletion;H0N29_12525:full_deletion Loss_of_DNA_cytosine_methyltransferase;potential_epigenetic_regulation_changes  all_mito_samples    MEDIUM
SV_mito_large_del1  1189156-1236440 Deletion    47299   tRNA-Arg(H0N29_05785,boundary);H0N29_05775(partial);multiple_H0N29_05xxx_hypotheticals  tRNA-Arg;hypothetical_proteins  tRNA-Arg:boundary_proximity;H0N29_05775:partial_deletion;others:full/partial    Possible_loss_of_non-essential_functions;tRNA_boundary_effect_uncertain;adaptive_genome_reduction   mito_dAB_only   MEDIUM

🔬 Reproducible Validation Workflow (Command-Line)

# 1. Create BED file of SV coordinates (0-based, half-open for bedtools)
cat > sv_coords.bed << EOF
CP059040    737223  741667  SV_adeIJ_del    4436    Deletion
CP059040    1844322 1848605 SV_adeAB_del    4282    Deletion
CP059040    3124915 3125037 SV_tRNA_contract    198 Tandem_contraction
CP059040    2259735 2260384 SV_flu_tandem   135 Tandem_contraction
CP059040    2494562 2536071 SV_mito_tandem  41564   Tandem_contraction
CP059040    2621713 2664046 SV_mito_large_del2  42352   Deletion
CP059040    1189155 1236440 SV_mito_large_del1  47299   Deletion
EOF

# 2. Intersect with GFF3 annotation (requires bedtools)
bedtools intersect -a sv_coords.bed -b CP059040.gff3.txt -wa -wb -loj > sv_gene_overlap.tsv

# 3. Extract and summarize affected genes per SV
awk -F'\t' 'NR>1 {print $4, $10, $11, $12}' sv_gene_overlap.tsv | \
  sort | uniq -c | column -t > sv_gene_summary.txt

# 4. Extract sequences for breakpoint validation
while IFS=$'\t' read -r chr start end sv_id size sv_type; do
  samtools faidx bacto/CP059040.fasta ${chr}:${start}-${end} > ${sv_id}_region.fasta
done < sv_coords.bed

💡 Manuscript Interpretation Guidelines

High-Priority Findings (Results Section)

“Two mutually exclusive ~4.3-kb deletions define efflux pump backgrounds: SV_adeIJ_del (CP059040:737224–741667) abolishes the AdeIJK pump (adeJ, adeI, truncated adeK) in all dIJ strains, while SV_adeAB_del (CP059040:1844323–1848605) abolishes the AdeAB pump (adeA, adeB) in all dAB strains. Both variants result in complete loss of permease and adaptor subunits, likely conferring increased susceptibility to respective substrate antibiotics.”

Medium-Priority (Supplementary/Discussion)

“Mitomycin C-selected strains harbor large (>40 kb) structural variants in repeat-rich genomic regions (SV_mito_tandem, SV_mito_large_del1/2), absent in fluoroquinolone-selected isolates. Notably, SV_mito_large_del2 deletes a DNA cytosine methyltransferase (H0N29_12525), suggesting potential epigenetic adaptation under genotoxic stress.”

Low-Priority (Methods/QC)

“A universal 198-bp tandem contraction in a tRNA-Gln array (SV_tRNA_contract; copy number 4→3) and a condition-specific ~135-bp intergenic contraction (SV_flu_tandem) were detected, serving as stable lineage markers and selection-condition signatures, respectively.”


  • BED/GFF3 files for IGV visualization of all 7 SVs with gene tracks
  • Integration with SNP/InDel results for a unified variant report
  • R/Python scripts to generate publication-ready figures (SV distribution, genome maps, gene impact heatmaps)