-
Targets
Attached are the DNA sequencing data for your review. Could you please compare these sequences with the A. baumannii AYE reference (accession CU459141) and let us know whether any genomic mutations are present? Sorry, I realize I made a mistake earlier. Could you confirm whether variant calling has been performed for these strains (X101SC25116512-Z01-J001, samples labeled as Dark or Light) to determine if they contain SNPs or InDels compared to CU459141? -
# Stage the raw reads as symlinks under raw_data/.
# Note: the link names must end with .fastq.gz (downstream tools expect that suffix).
mkdir raw_data
cd raw_data || exit 1
ln -s ../RSMD00304/X101SC25116512-Z01/X101SC25116512-Z01-J001/01.RawData/Light/Light_1.fq.gz Light_R1.fastq.gz
ln -s ../RSMD00304/X101SC25116512-Z01/X101SC25116512-Z01-J001/01.RawData/Light/Light_2.fq.gz Light_R2.fastq.gz
ln -s ../RSMD00304/X101SC25116512-Z01/X101SC25116512-Z01-J001/01.RawData/Dark/Dark_1.fq.gz Dark_R1.fastq.gz
ln -s ../RSMD00304/X101SC25116512-Z01/X101SC25116512-Z01-J001/01.RawData/Dark/Dark_2.fq.gz Dark_R2.fastq.gz
Perform variant calling using snippy
# Set up the bacto/snippy workspace (db, envs, Snakefile, configs).
ln -s ~/Tools/bacto/db/ .
ln -s ~/Tools/bacto/envs/ .
ln -s ~/Tools/bacto/local/ .
cp ~/Tools/bacto/Snakefile .
cp ~/Tools/bacto/bacto-0.1.json .
cp ~/Tools/bacto/cluster.json .
# Download CU459141.gb from GenBank, then move it into the db directory.
mv ~/Downloads/sequence\(1\).gb db/CU459141.gb
# Set the following keys in bacto-0.1.json:
#   "fastqc": false,
#   "taxonomic_classifier": false,
#   "assembly": true,
#   "typing_ariba": false,
#   "typing_mlst": true,
#   "pangenome": true,
#   "variants_calling": true,
#   "phylogeny_fasttree": true,
#   "phylogeny_raxml": true,
#   "recombination": false,        # set false due to a gubbins error
#   "genus": "Acinetobacter",
#   "kingdom": "Bacteria",
#   "species": "baumannii",        # in both prokka and mykrobe
#   "reference": "db/CU459141.gb"
mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
# Run inside the bengal3_ac3 environment:
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
Check for contaminated samples based on genome (assembly) size
# Report assembly sizes per sample; a genome far from ~3.9 MiB suggests contamination.
cd shovill || exit 1
find . -maxdepth 2 -name "contigs.fa" -exec du -h {} \;
# Same listing, sorted largest-first with human-readable sizes and sample names.
find . -maxdepth 2 -name "contigs.fa" -printf "%s\t%p\n" | sort -nr | \
  while IFS=$'\t' read -r bytes path; do
    sample=$(basename "$(dirname "$path")")
    printf "%-20s %8s\t%s\n" "$sample" "$(numfmt --to=iec-i --suffix=B "$bytes")" "$path"
  done
# Example output:
#   Dark   3.9MiB  ./Dark/contigs.fa
#   Light  3.9MiB  ./Light/contigs.fa
Summarize all SNPs and Indels from the snippy result directory.
# Summarize SNPs/indels across all snippy result directories.
cp ~/Scripts/summarize_snippy_res_ordered.py .
# IMPORTANT: adapt the isolate array in the script — keep only ["Dark", "Light"].
mamba activate plot-numpy1
python3 ./summarize_snippy_res_ordered.py snippy
# --> Summary CSV file created successfully at: snippy/summary_snps_indels.csv
cd snippy || exit 1
# NOTE(review): the purpose of this filter is unclear — it appears to drop
# placeholder rows with no annotation ("None,,,,,,None,None"); confirm before removing.
grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
Use SPANDx to call variants (results are almost identical to those from viral-ngs!)
mamba deactivate
mamba activate /home/jhuang/miniconda3/envs/spandx

# Prepare the snpEff inputs for the --ref and --database options
# (NOT ALWAYS NEEDED — prepare only once per reference).
mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
/home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610   # add -d for debug output
~/Scripts/genbank2fasta.py PP810610.gb
mv PP810610.gb_converted.fna PP810610.fasta
# Rename "NC_001348.1 xxxxx" to "NC_001348" in the FASTA header.
ln -s /home/jhuang/Tools/spandx/ spandx

# Delete the contaminated samples flu_wt_cipro*.fastq and flu_dAB_cipro*.fastq
# if contamination exists!
nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" \
  --ref CU459141.fasta --annotation --database CU459141 -resume

# RERUN SNP_matrix.sh: ERROR_CHROMOSOME_NOT_FOUND during variant annotation left
# every impact as MODIFIER; rerunning with the fix below resolves it --> IT WORKS!
cd Outputs/Master_vcf || exit 1
conda activate /home/jhuang/miniconda3/envs/spandx
# Any snippy sample directory works here — all contain the same reference.
cp -r ../../snippy/Dark/reference .
cp ../../spandx/bin/SNP_matrix.sh ./
# Note: ${variant_genome_path}=CP059040 in the original command, but it is unused
# after the modification below. In SNP_matrix.sh, change:
#   snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf
# to:
#   /home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf
bash SNP_matrix.sh CU459141 .
Cross-Caller SNP/Indel Concordance & Invariant Variant Analyzer; Multi-Isolate Variant Intersection, Annotation Harmonization & Caller Discrepancy Report; Comparative Genomic Variant Profiling: Concordance, Invariance & Allele Mismatch Analysis; VarMatch: Cross-Tool Variant Concordance Pipeline
-
Call inter-host variants by merging the results from snippy and SPANDx
mamba activate plot-numpy1
cp Outputs/Master_vcf/All_SNPs_indels_annotated.txt .
cp snippy/summary_snps_indels.csv .
cp ~/Scripts/process_variants_snippy_alleles_spandx_annotations.py .
# Configure the script: delete "mito_wt_polyB", "mito_wt_trime", "mito_dAB_trime",
# "flu_wt_cipro", "mito_dIJ_nitro", and "flu_dAB_cipro"; keep
#   ISOLATES = ["Dark", "Light"]
python process_variants_snippy_alleles_spandx_annotations.py
# Result: 5 invariant and 0 variant positions.
# The generated files are the SNPs common to snippy+spandx, so rename them accordingly.
mv common_variants_all_snippy_annotated.csv        common_variants_snippy+spandx_annotated.csv
mv common_variants_invariant_snippy_annotated.csv  common_invariants_snippy+spandx_annotated.csv
mv common_variants_all_snippy_annotated.xlsx       common_variants_snippy+spandx_annotated.xlsx
mv common_variants_invariant_snippy_annotated.xlsx common_invariants_snippy+spandx_annotated.xlsx
# Example records:
#   CU459141 1900834 T C SNP C C synonymous_variant LOW SILENT ttA/ttG p.Leu46Leu/c.138A>G 73 ABAYE1833 protein_coding
#   CU459141 1900838 T A SNP A A missense_variant MODERATE MISSENSE tAc/tTc p.Tyr45Phe/c.134A>T 73 ABAYE1833 protein_coding
Manually check each of the 6 records by comparing them to the SPANDx results; three are confirmed!
# The output file only contains rows where at least one of the samples had a
# different allele between the two input files. Open allele_differences.xlsx
# side-by-side with the original common_variants_all_snippy_annotated.xlsx for
# fast, column-aligned manual verification.
mamba activate plot-numpy1
python ~/Scripts/export_mismatch_alleles.py common_variants_snippy+spandx_annotated.csv All_SNPs_indels_annotated.txt
# ❌ Found 13 mismatched positions. Extracting & formatting...
# export_mismatch_alleles.py always writes to the same output filename; rename the
# result so the next run does not overwrite it.
mv allele_differences.xlsx allele_differences_for_cmp.xlsx
python ~/Scripts/export_mismatch_alleles.py common_invariants_snippy+spandx_annotated.csv All_SNPs_indels_annotated.txt
# ✅ All alleles match perfectly across all common positions and samples!
cp common_variants_all_snippy_annotated.xlsx common_variants_all_snippy_annotated_for_cmp.xlsx
# Mark the corresponding rows in yellow, then compare against allele_differences.xlsx.
# IMPORTANT MANUAL CHECK: verify all variants in
# common_variants_snippy+spandx_annotated.xlsx and
# common_invariants_snippy+spandx_annotated.xlsx against allele_differences_for_cmp.xlsx.
Structural variant calling
conda activate sv_assembly
# First pass: strict Assemblytics thresholds (anchor 1000 bp, events 100–50000 bp).
for sample in Dark Light; do
  nucmer --maxmatch -l 100 -c 500 CU459141.fasta "shovill/${sample}/contigs.fa" -p "${sample}"
  delta-filter -1 -q "${sample}.delta" > "${sample}.filtered.delta"
  Assemblytics "${sample}.filtered.delta" "${sample}_assemblytics" 1000 100 50000
done
# Example output:
# < CU459141 3244053 3244284 Assemblytics_b_1 120 + Tandem_contraction -231 -351 contig00003:66319-66670:- between_alignments
# ---
# > CU459141 873756 874618 Assemblytics_b_1 258 + Tandem_contraction -862 -1120 contig00004:40161-41281:- between_alignments
# ✅ If the output is empty, relax the Assemblytics parameters:
for sample in Dark Light; do
  nucmer --maxmatch -l 100 -c 500 CU459141.fasta "shovill/${sample}/contigs.fa" -p "${sample}"
  delta-filter -1 -q "${sample}.delta" > "${sample}.filtered.delta"
  Assemblytics "${sample}.filtered.delta" "${sample}_assemblytics" 500 50 100000
done
# Parameter meanings (relaxed run):
# 500    → minimum unique flanking/anchor length (bp). The alignment must contain at
#          least 500 bp of unique sequence on both sides of a putative gap/event,
#          preventing false positives in repetitive or low-complexity regions.
# 50     → minimum event/variant size (bp). Only SVs ≥50 bp are reported; smaller
#          differences (short indels, sequencing noise) are filtered out.
# 100000 → maximum event/variant size (bp). Variants >100 kb are excluded to avoid
#          reporting alignment artifacts, assembly breaks, or whole-chromosome differences.
# Example relaxed-parameter output:
# < CU459141 3244053 3244284 Assemblytics_b_1 120 + Tandem_contraction -231 -351 contig00003:66319-66670:- between_alignments
# < CU459141 3617424 3680584 Assemblytics_b_2 66532 + Tandem_contraction 63160 -3372 contig00009:61062-64434:- between_alignments
# ---
# > CU459141 873756 874618 Assemblytics_b_1 258 + Tandem_contraction -862 -1120 contig00004:40161-41281:- between_alignments
# > CU459141 3617424 3680584 Assemblytics_b_3 66532 + Tandem_contraction 63160 -3372 contig00013:44152-47524:- between_alignments
cp ~/Scripts/merge_sv_filtered.sh .
# ADAPT the EXCLUDE_SAMPLES names in the script when samples have abnormal assembly sizes.
bash merge_sv_filtered.sh   # generates merged_sv_results.txt
(Optional) Run nextflow bacass
# Download the kmerfinder database:
#   https://www.genomicepidemiology.org/services/ --> https://cge.food.dtu.dk/services/KmerFinder/
#   --> https://cge.food.dtu.dk/services/KmerFinder/etc/kmerfinder_db.tar.gz
# Or download 20190108_kmerfinder_stable_dirs.tar.gz from https://zenodo.org/records/13447056
# Alternative database options:
#   --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder_db.tar.gz
#   --kmerfinderdb /mnt/nvme1n1p1/REFs/20190108_kmerfinder_stable_dirs.tar.gz
nextflow run nf-core/bacass -r 2.5.0 -profile docker \
  --input samplesheet.tsv \
  --outdir bacass_out \
  --assembly_type short \
  --kraken2db /mnt/nvme1n1p1/REFs/k2_standard_08_GB_20251015.tar.gz \
  --kmerfinderdb /mnt/nvme1n1p1/REFs/kmerfinder/bacteria/ \
  -resume
# Possibly the character '△' in sample names is a problem.
# Solution: rename 19606△ABfluEcef-1 → 19606delABfluEcef-1
# SAVE bacass_out/Kmerfinder/kmerfinder_summary.csv to bacass_out/Kmerfinder/An6?/An6?_kmerfinder_results.xlsx
#
# samplesheet.tsv contents:
#   sample,fastq_1,fastq_2
#   flu_dAB_cef,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_1.fq.gz,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcef-1/19606△ABfluEcef-1_2.fq.gz
#   flu_dAB_cipro,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_1.fq.gz,../X101SC25116512-Z01-J003/01.RawData/19606△ABfluEcipro-2/19606△ABfluEcipro-2_2.fq.gz
#
# BUSCO example results:
#   Input_file            Dataset        Complete Single Duplicated Fragmented Missing n_markers Scaffold_N50 Contigs_N50 Percent_gaps Number_of_scaffolds
#   wt_cef.scaffolds.fa   bacteria_odb10 98.4     98.4   0.0        1.6        0.0     124       285852       285852      0.000%       45
#   wt_cipro.scaffolds.fa bacteria_odb10 90.3     89.5   0.8        8.1        1.6     124       7434         7434        0.000%       1699
(Optional) Run bactmap
# Prepare the workspace: reference FASTA (e.g. CP059040.fasta or CU459141.fasta)
# and samplesheet.csv must both be in bactmap_workspace/.
mkdir bactmap_workspace
# samplesheet.csv contents:
#   sample,fastq_1,fastq_2
#   G18582004,fastqs/G18582004_1.fastq.gz,fastqs/G18582004_2.fastq.gz
#   G18756254,fastqs/G18756254_1.fastq.gz,fastqs/G18756254_2.fastq.gz
#   G18582006,fastqs/G18582006_1.fastq.gz,fastqs/G18582006_2.fastq.gz
nextflow run nf-core/bactmap -r 1.0.0 -profile docker \
  --input samplesheet.csv \
  --reference CP059040.fasta \
  --outdir bactmap_out \
  -resume
# Same run against the A. baumannii AYE reference:
nextflow run nf-core/bactmap -r 1.0.0 -profile docker \
  --input samplesheet.csv \
  --reference CU459141.fasta \
  --outdir bactmap_out \
  -resume