-
Input data:
mkdir bacto; cd bacto; mkdir raw_data; cd raw_data; # ── Fluoxetine Dataset ── ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_1.fq.gz flu_dAB_cef_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_2.fq.gz flu_dAB_cef_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_1.fq.gz flu_dAB_cipro_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_2.fq.gz flu_dAB_cipro_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEdori-2/19606△ABfluEdori-2_1.fq.gz flu_dAB_dori_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEdori-2/19606△ABfluEdori-2_2.fq.gz flu_dAB_dori_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEnitro-3/19606△ABfluEnitro-3_1.fq.gz flu_dAB_nitro_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEnitro-3/19606△ABfluEnitro-3_2.fq.gz flu_dAB_nitro_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpip-1/19606△ABfluEpip-1_1.fq.gz flu_dAB_pip_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpip-1/19606△ABfluEpip-1_2.fq.gz flu_dAB_pip_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpolyB-3/19606△ABfluEpolyB-3_1.fq.gz flu_dAB_polyB_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEpolyB-3/19606△ABfluEpolyB-3_2.fq.gz flu_dAB_polyB_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEtet-1/19606△ABfluEtet-1_1.fq.gz flu_dAB_tet_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEtet-1/19606△ABfluEtet-1_2.fq.gz flu_dAB_tet_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcef-4/19606△IJfluEcef-4_1.fq.gz flu_dIJ_cef_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcef-4/19606△IJfluEcef-4_2.fq.gz flu_dIJ_cef_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcipro-3/19606△IJfluEcipro-3_1.fq.gz flu_dIJ_cipro_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEcipro-3/19606△IJfluEcipro-3_2.fq.gz flu_dIJ_cipro_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEdori-1/19606△IJfluEdori-1_1.fq.gz flu_dIJ_dori_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEdori-1/19606△IJfluEdori-1_2.fq.gz flu_dIJ_dori_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEnitro-3/19606△IJfluEnitro-3_1.fq.gz flu_dIJ_nitro_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEnitro-3/19606△IJfluEnitro-3_2.fq.gz flu_dIJ_nitro_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpip-4/19606△IJfluEpip-4_1.fq.gz flu_dIJ_pip_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpip-4/19606△IJfluEpip-4_2.fq.gz flu_dIJ_pip_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpolyB-4/19606△IJfluEpolyB-4_1.fq.gz flu_dIJ_polyB_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606△IJfluEpolyB-4/19606△IJfluEpolyB-4_2.fq.gz flu_dIJ_polyB_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcef-1/19606wtfluEcef-1_1.fq.gz flu_wt_cef_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcef-1/19606wtfluEcef-1_2.fq.gz flu_wt_cef_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcipro-2/19606wtfluEcipro-2_1.fq.gz flu_wt_cipro_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEcipro-2/19606wtfluEcipro-2_2.fq.gz flu_wt_cipro_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEdori-1/19606wtfluEdori-1_1.fq.gz flu_wt_dori_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEdori-1/19606wtfluEdori-1_2.fq.gz flu_wt_dori_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEnitro-1/19606wtfluEnitro-1_1.fq.gz flu_wt_nitro_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEnitro-1/19606wtfluEnitro-1_2.fq.gz flu_wt_nitro_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpip-4/19606wtfluEpip-4_1.fq.gz flu_wt_pip_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpip-4/19606wtfluEpip-4_2.fq.gz flu_wt_pip_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpolyB-4/19606wtfluEpolyB-4_1.fq.gz flu_wt_polyB_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEpolyB-4/19606wtfluEpolyB-4_2.fq.gz flu_wt_polyB_R2.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEtet-2/19606wtfluEtet-2_1.fq.gz flu_wt_tet_R1.fastq.gz ln -s ../../X101SC25116512-Z01-J003/01.RawData/19606wtfluEtet-2/19606wtfluEtet-2_2.fq.gz flu_wt_tet_R2.fastq.gz # ── Mitomycin C Dataset ── ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_cipro_1/MitoC_E_AB_cipro_1_1.fq.gz mito_dAB_cipro_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_cipro_1/MitoC_E_AB_cipro_1_2.fq.gz mito_dAB_cipro_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_dori_1/MitoC_E_AB_dori_1_1.fq.gz mito_dAB_dori_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_dori_1/MitoC_E_AB_dori_1_2.fq.gz mito_dAB_dori_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_nitro_2/MitoC_E_AB_nitro_2_1.fq.gz mito_dAB_nitro_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_nitro_2/MitoC_E_AB_nitro_2_2.fq.gz mito_dAB_nitro_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_tet_2/MitoC_E_AB_tet_2_1.fq.gz mito_dAB_tet_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_tet_2/MitoC_E_AB_tet_2_2.fq.gz mito_dAB_tet_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_trime_4/MitoC_E_AB_trime_4_1.fq.gz mito_dAB_trime_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_AB_trime_4/MitoC_E_AB_trime_4_2.fq.gz mito_dAB_trime_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_cipro_1/MitoC_E_IJ_cipro_1_1.fq.gz mito_dIJ_cipro_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_cipro_1/MitoC_E_IJ_cipro_1_2.fq.gz mito_dIJ_cipro_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_dori_4/MitoC_E_IJ_dori_4_1.fq.gz mito_dIJ_dori_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_dori_4/MitoC_E_IJ_dori_4_2.fq.gz mito_dIJ_dori_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_nitro_2/MitoC_E_IJ_nitro_2_1.fq.gz mito_dIJ_nitro_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_nitro_2/MitoC_E_IJ_nitro_2_2.fq.gz mito_dIJ_nitro_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_polyB_3/MitoC_E_IJ_polyB_3_1.fq.gz mito_dIJ_polyB_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_polyB_3/MitoC_E_IJ_polyB_3_2.fq.gz mito_dIJ_polyB_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_tet_3/MitoC_E_IJ_tet_3_1.fq.gz mito_dIJ_tet_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_tet_3/MitoC_E_IJ_tet_3_2.fq.gz mito_dIJ_tet_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_trime_1/MitoC_E_IJ_trime_1_1.fq.gz mito_dIJ_trime_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_IJ_trime_1/MitoC_E_IJ_trime_1_2.fq.gz mito_dIJ_trime_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_cipro_1/MitoC_E_wt_cipro_1_1.fq.gz mito_wt_cipro_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_cipro_1/MitoC_E_wt_cipro_1_2.fq.gz mito_wt_cipro_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_nitro_1/MitoC_E_wt_nitro_1_1.fq.gz mito_wt_nitro_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_nitro_1/MitoC_E_wt_nitro_1_2.fq.gz mito_wt_nitro_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_polyB_1/MitoC_E_wt_polyB_1_1.fq.gz mito_wt_polyB_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_polyB_1/MitoC_E_wt_polyB_1_2.fq.gz mito_wt_polyB_R2.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_trime_2/MitoC_E_wt_trime_2_1.fq.gz mito_wt_trime_R1.fastq.gz ln -s ../../X101SC26025981-Z02-J002/01.RawData/MitoC_E_wt_trime_2/MitoC_E_wt_trime_2_2.fq.gz mito_wt_trime_R2.fastq.gz -
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .; ln -s ~/Tools/bacto/envs/ .; ln -s ~/Tools/bacto/local/ .; cp ~/Tools/bacto/Snakefile .; cp ~/Tools/bacto/bacto-0.1.json .; cp ~/Tools/bacto/cluster.json .; #download CP059040.gb from GenBank #mv ~/Downloads/sequence\(2\).gb db/CP059040.gb mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3 (bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ which snakemake /home/jhuang/miniconda3/envs/bengal3_ac3/bin/snakemake (bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ snakemake -v 4.0.0 --> CORRECT! #NOTE_1: modify bacto-0.1.json keeping only steps assembly, typing_mlst, possibly pangenome and variants_calling true; setting cpu=20 in all used steps. #setting the following in bacto-0.1.json "fastqc": false, "taxonomic_classifier": false, "assembly": true, "typing_ariba": false, "typing_mlst": true, "pangenome": true, "variants_calling": true, "phylogeny_fasttree": false, "phylogeny_raxml": false, "recombination": false, (due to gubbins-error set false) "prokka": { "genus": "Acinetobacter", "kingdom": "Bacteria", "species": "baumannii", "cpu": 10, "evalue": "1e-06", "other": "" }, "mykrobe": { "species": "abaum" }, "reference": "db/CP059040.gb" #NOTE_2: needs disk Titisee since the pipeline needs /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB snakemake --printshellcmds -
Checking the contaminated samples based on the size of genome
find . -maxdepth 2 -name "contigs.fa" -exec du -h {} \; find . -maxdepth 2 -name "contigs.fa" -printf "%s\t%p\n" | sort -nr | \ while IFS=$'\t' read -r bytes path; do sample=$(basename "$(dirname "$path")") printf "%-20s %8s\t%s\n" "$sample" "$(numfmt --to=iec-i --suffix=B "$bytes")" "$path" done mito_wt_polyB 8,6MiB ./mito_wt_polyB/contigs.fa mito_wt_trime 8,2MiB ./mito_wt_trime/contigs.fa mito_dAB_trime 8,2MiB ./mito_dAB_trime/contigs.fa flu_wt_cipro 6,7MiB ./flu_wt_cipro/contigs.fa mito_dIJ_nitro 6,6MiB ./mito_dIJ_nitro/contigs.fa flu_dAB_cipro 4,7MiB ./flu_dAB_cipro/contigs.fa flu_wt_nitro 3,9MiB ./flu_wt_nitro/contigs.fa flu_wt_polyB 3,9MiB ./flu_wt_polyB/contigs.fa flu_wt_cef 3,9MiB ./flu_wt_cef/contigs.fa flu_wt_dori 3,8MiB ./flu_wt_dori/contigs.fa flu_dIJ_pip 3,8MiB ./flu_dIJ_pip/contigs.fa flu_dIJ_dori 3,8MiB ./flu_dIJ_dori/contigs.fa flu_wt_pip 3,8MiB ./flu_wt_pip/contigs.fa flu_dAB_dori 3,8MiB ./flu_dAB_dori/contigs.fa flu_dIJ_cipro 3,8MiB ./flu_dIJ_cipro/contigs.fa flu_dAB_nitro 3,8MiB ./flu_dAB_nitro/contigs.fa flu_dAB_pip 3,8MiB ./flu_dAB_pip/contigs.fa flu_dAB_polyB 3,8MiB ./flu_dAB_polyB/contigs.fa flu_dIJ_nitro 3,8MiB ./flu_dIJ_nitro/contigs.fa flu_dIJ_cef 3,8MiB ./flu_dIJ_cef/contigs.fa flu_dAB_tet 3,8MiB ./flu_dAB_tet/contigs.fa flu_dIJ_polyB 3,8MiB ./flu_dIJ_polyB/contigs.fa flu_dAB_cef 3,8MiB ./flu_dAB_cef/contigs.fa mito_dIJ_polyB 3,8MiB ./mito_dIJ_polyB/contigs.fa mito_dIJ_cipro 3,8MiB ./mito_dIJ_cipro/contigs.fa mito_dIJ_dori 3,8MiB ./mito_dIJ_dori/contigs.fa flu_wt_tet 3,8MiB ./flu_wt_tet/contigs.fa mito_wt_nitro 3,8MiB ./mito_wt_nitro/contigs.fa mito_wt_cipro 3,8MiB ./mito_wt_cipro/contigs.fa mito_dIJ_tet 3,8MiB ./mito_dIJ_tet/contigs.fa mito_dIJ_trime 3,8MiB ./mito_dIJ_trime/contigs.fa mito_dAB_dori 3,8MiB ./mito_dAB_dori/contigs.fa mito_dAB_cipro 3,7MiB ./mito_dAB_cipro/contigs.fa mito_dAB_tet 3,7MiB ./mito_dAB_tet/contigs.fa mito_dAB_nitro 3,7MiB ./mito_dAB_nitro/contigs.fa -
Summarize all SNPs and Indels from the snippy result directory.
cp ~/Scripts/summarize_snippy_res_ordered.py . # IMPORTANT_ADAPT the array in script should be adapted; deleting the isolates "mito_wt_polyB","mito_wt_trime", "mito_dAB_trime", "flu_wt_cipro", "mito_dIJ_nitro", and "flu_dAB_cipro" isolates = ["flu_wt_cef", "flu_wt_dori", "flu_wt_nitro", "flu_wt_pip", "flu_wt_polyB", "flu_wt_tet", "flu_dAB_cef", "flu_dAB_dori", "flu_dAB_nitro", "flu_dAB_pip", "flu_dAB_polyB", "flu_dAB_tet", "flu_dIJ_cef", "flu_dIJ_cipro", "flu_dIJ_dori", "flu_dIJ_nitro", "flu_dIJ_pip", "flu_dIJ_polyB", "mito_dIJ_trime", "mito_wt_cipro", "mito_wt_nitro", "mito_dAB_cipro", "mito_dAB_dori", "mito_dAB_nitro", "mito_dAB_tet", "mito_dIJ_cipro", "mito_dIJ_dori", "mito_dIJ_polyB", "mito_dIJ_tet"] mamba activate plot-numpy1 python3 ./summarize_snippy_res_ordered.py snippy #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv cd snippy #REMOVE_the_line? I don't find the sence of the line: grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv -
Using spandx calling variants (almost the same results to the one from viral-ngs!)
mamba deactivate mamba activate /home/jhuang/miniconda3/envs/spandx # PREPARE the inputs for the options ref and database (NOT ALWAYS NEED, PREPARE ONlY ONCE) mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610 cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 #-d ~/Scripts/genbank2fasta.py PP810610.gb mv PP810610.gb_converted.fna PP810610.fasta #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file ln -s /home/jhuang/Tools/spandx/ spandx # Deleting the contaminated samples flu_wt_cipro*.fastq and flu_dAB_cipro*.fastq (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref CP059040.fasta --annotation --database CP059040 -resume # RERUN SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation, resulting in all impacts are MODIFIER --> IT WORKS! cd Outputs/Master_vcf conda activate /home/jhuang/miniconda3/envs/spandx (spandx) cp -r ../../snippy/flu_wt_cef/reference . # Eigentlich irgendein directory, all directories contains the same reference. (spandx) cp ../../spandx/bin/SNP_matrix.sh ./ #Note that ${variant_genome_path}=CP059040 in the following command, but it was not used after the following command modification. #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh (spandx) bash SNP_matrix.sh CP059040 .
Cross-Caller SNP/Indel Concordance & Invariant Variant Analyzer; Multi-Isolate Variant Intersection, Annotation Harmonization & Caller Discrepancy Report; Comparative Genomic Variant Profiling: Concordance, Invariance & Allele Mismatch Analysis; VarMatch: Cross-Tool Variant Concordance Pipeline
-
Calling inter-host variants by merging the results from snippy+spandx
mamba activate plot-numpy1 cd bacto cp Outputs/Master_vcf/All_SNPs_indels_annotated.txt . cp snippy/summary_snps_indels.csv . cp ~/Scripts/process_variants_snippy_alleles_spandx_annotations.py . #Configuring #Delete "mito_wt_polyB", "mito_wt_trime", "mito_dAB_trime", "flu_wt_cipro", "mito_dIJ_nitro", and "flu_dAB_cipro" ISOLATES = [ "flu_wt_cef", "flu_wt_dori", "flu_wt_nitro", "flu_wt_pip", "flu_wt_polyB", "flu_wt_tet", "flu_dAB_cef", "flu_dAB_dori", "flu_dAB_nitro", "flu_dAB_pip", "flu_dAB_polyB", "flu_dAB_tet", "flu_dIJ_cef", "flu_dIJ_cipro", "flu_dIJ_dori", "flu_dIJ_nitro", "flu_dIJ_pip", "flu_dIJ_polyB", "mito_dIJ_trime", "mito_wt_cipro", "mito_wt_nitro", "mito_dAB_cipro", "mito_dAB_dori", "mito_dAB_nitro", "mito_dAB_tet", "mito_dIJ_cipro", "mito_dIJ_dori", "mito_dIJ_polyB", "mito_dIJ_tet" ] (plot-numpy1) python process_variants_snippy_alleles_spandx_annotations.py # -- Indeed, the generated files are the common SNPs generated by snippy+spandx, therefore, we need to modify the filenames. -- mv common_variants_all_snippy_annotated.csv common_variants_snippy+spandx_annotated.csv mv common_variants_invariant_snippy_annotated.csv common_invariants_snippy+spandx_annotated.csv mv common_variants_all_snippy_annotated.xlsx common_variants_snippy+spandx_annotated.xlsx mv common_variants_invariant_snippy_annotated.xlsx common_invariants_snippy+spandx_annotated.xlsx -
Manully checking each of the 6 records by comparing them to the results from SPANDx; three are confirmed!
#The file will only contain rows where at least one of the 29 samples had a different allele between the two files. You can open allele_differences.xlsx side-by-side with your original common_variants_all_snippy_annotated.xlsx for fast, column-aligned manual verification. mamba activate plot-numpy1 (plot-numpy1) python ~/Scripts/export_mismatch_alleles.py common_variants_snippy+spandx_annotated.csv All_SNPs_indels_annotated.txt #❌ Found 13 mismatched positions. Extracting & formatting... # The export_mismatch_alleles.py always generated the same output-file, avoid to recoved by the following commands, modify the generated filename. (plot-numpy1) mv allele_differences.xlsx allele_differences_for_cmp.xlsx (plot-numpy1) python ~/Scripts/export_mismatch_alleles.py common_invariants_snippy+spandx_annotated.csv All_SNPs_indels_annotated.txt #✅ All alleles match perfectly across all common positions and samples! cp common_variants_all_snippy_annotated.xlsx common_variants_all_snippy_annotated_for_cmp.xlsx #Then marked the corresponding rows in yellow, then compare it to allele_differences.xlsx. # IMPORTANT_MANUAL_CHECK: checking all variants of common_variants_snippy+spandx_annotated.xlsx and common_invariants_snippy+spandx_annotated.xlsx by comparing allele_differences_for_cmp.xlsx. -
Run nextflow bacass
# Download the kmerfinder database: https://www.genomicepidemiology.org/services/ --> https://cge.food.dtu.dk/services/KmerFinder/ --> https://cge.food.dtu.dk/services/KmerFinder/etc/kmerfinder_db.tar.gz # Download 20190108_kmerfinder_stable_dirs.tar.gz from https://zenodo.org/records/13447056 #--kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder_db.tar.gz #--kmerfinderdb /mnt/nvme1n1p1/REFs/20190108_kmerfinder_stable_dirs.tar.gz nextflow run nf-core/bacass -r 2.5.0 -profile docker \ --input samplesheet.tsv \ --outdir bacass_out \ --assembly_type short \ --kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \ --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ \ -resume #Possibly the chraracter '△' is a problem. #Solution: 19606△ABfluEcef-1 → 19606delABfluEcef-1 #SAVE bacass_out/Kmerfinder/kmerfinder_summary.csv to bacass_out/Kmerfinder/An6?/An6?_kmerfinder_results.xlsx samplesheet.tsv sample,fastq_1,fastq_2 flu_dAB_cef,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_1.fq.gz,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_2.fq.gz flu_dAB_cipro,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_1.fq.gz,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_2.fq.gz #busco example results: Input_file Dataset Complete Single Duplicated Fragmented Missing n_markers Scaffold N50 Contigs N50 Percent gaps Number of scaffolds wt_cef.scaffolds.fa bacteria_odb10 98.4 98.4 0.0 1.6 0.0 124 285852 285852 0.000% 45 wt_cipro.scaffolds.fa bacteria_odb10 90.3 89.5 0.8 8.1 1.6 124 7434 7434 0.000% 1699 -
Run bactmap
nextflow run nf-core/bactmap -r 1.0.0 -profile docker \ --input samplesheet.csv \ --reference CP059040.fasta \ --outdir bactmap_out \ -resume nextflow run nf-core/bactmap -r 1.0.0 -profile docker \ --input samplesheet.csv \ --reference CU459141.fasta \ --outdir bactmap_out \ -resume sample,fastq_1,fastq_2 G18582004,fastqs/G18582004_1.fastq.gz,fastqs/G18582004_2.fastq.gz G18756254,fastqs/G18756254_1.fastq.gz,fastqs/G18756254_2.fastq.gz G18582006,fastqs/G18582006_1.fastq.gz,fastqs/G18582006_2.fastq.gz mkdir bactmap_workspace #Prepare reference.fasta (example for CP059040.fasta) in bactmap_workspace and samplesheet.csv in bactmap_workspace nextflow run nf-core/bactmap -r 1.0.0 -profile docker \ --input samplesheet.csv \ --reference CP059040.fasta \ --outdir bactmap_out \ -resume -
Structural variant calling
conda activate sv_assembly nucmer --maxmatch -l 100 -c 500 bacto/CP059040.fasta bacto/shovill/flu_wt_cef/contigs.fa -p flu_wt_cef delta-filter -1 -q flu_wt_cef.delta > flu_wt_cef.filtered.delta Assemblytics flu_wt_cef.filtered.delta flu_wt_cef_assemblytics 1000 100 50000 #reference ref_start ref_stop ID size strand type ref_gap_size query_gap_size query_coordinates method CP059040 3124916 3125037 Assemblytics_b_3 198 + Tandem_contraction 121 -77 contig00010:34383-34460:- between_alignments for sample in flu_wt_cef flu_wt_dori flu_wt_nitro flu_wt_pip flu_wt_polyB flu_wt_tet flu_dAB_cef flu_dAB_dori flu_dAB_nitro flu_dAB_pip flu_dAB_polyB flu_dAB_tet flu_dIJ_cef flu_dIJ_cipro flu_dIJ_dori flu_dIJ_nitro flu_dIJ_pip flu_dIJ_polyB mito_dIJ_trime mito_wt_cipro mito_wt_nitro mito_dAB_cipro mito_dAB_dori mito_dAB_nitro mito_dAB_tet mito_dIJ_cipro mito_dIJ_dori mito_dIJ_polyB mito_dIJ_tet; do nucmer --maxmatch -l 100 -c 500 bacto/CP059040.fasta bacto/shovill/${sample}/contigs.fa -p ${sample}; delta-filter -1 -q ${sample}.delta > ${sample}.filtered.delta; Assemblytics ${sample}.filtered.delta ${sample}_assemblytics 1000 100 50000; done #NOTE that We excluded 6 samples with abnormal assembly sizes (>4.5 Mb or <3.7 Mb), which likely reflect contamination or fragmentation. The final merged SV table (merged_sv_results_filtered.tsv) contains 29 high-quality isolates and is ready for downstream analysis. #mito_wt_polyB 8,6MiB ./mito_wt_polyB/contigs.fa #mito_wt_trime 8,2MiB ./mito_wt_trime/contigs.fa #mito_dAB_trime 8,2MiB ./mito_dAB_trime/contigs.fa #flu_wt_cipro 6,7MiB ./flu_wt_cipro/contigs.fa #mito_dIJ_nitro 6,6MiB ./mito_dIJ_nitro/contigs.fa #flu_dAB_cipro 4,7MiB ./flu_dAB_cipro/contigs.fa mito_wt_polyB # Adapt the EXCLUDE_SAMPLES names in the following script and run it. merge_sv_filtered.sh # merged_sv_results.txt #✅ Merge complete: merged_sv_results_filtered.txt #• Included: 29 samples #• Excluded: 4 samples (Indeed, 6 samples should not include in the calculation, but 2 samples was not run in the last step.)