-
Input data:
mkdir bacto; cd bacto; mkdir raw_data; cd raw_data; # ── W1-4 and Y1-4 ── ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J001/01.RawData/W/W_1.fq.gz W1_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J001/01.RawData/W/W_2.fq.gz W1_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W2/W2_1.fq.gz W2_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W2/W2_2.fq.gz W2_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W3/W3_1.fq.gz W3_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W3/W3_2.fq.gz W3_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W4/W4_1.fq.gz W4_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/W4/W4_2.fq.gz W4_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J001/01.RawData/Y/Y_1.fq.gz Y1_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J001/01.RawData/Y/Y_2.fq.gz Y1_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y2/Y2_1.fq.gz Y2_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y2/Y2_2.fq.gz Y2_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y3/Y3_1.fq.gz Y3_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y3/Y3_2.fq.gz Y3_R2.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y4/Y4_1.fq.gz Y4_R1.fastq.gz ln -s ../../short/RSMD00304/X101SC24065637-Z01/X101SC24065637-Z01-J002/01.RawData/Y4/Y4_2.fq.gz Y4_R2.fastq.gz -
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .; ln -s ~/Tools/bacto/envs/ .; ln -s ~/Tools/bacto/local/ .; cp ~/Tools/bacto/Snakefile .; cp ~/Tools/bacto/bacto-0.1.json .; cp ~/Tools/bacto/cluster.json .; #download CP059040.gb from GenBank #mv ~/Downloads/sequence\(2\).gb db/CP059040.gb mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3 (bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ which snakemake /home/jhuang/miniconda3/envs/bengal3_ac3/bin/snakemake (bengal3_ac3) jhuang@WS-2290C:~/DATA/Data_Tam_DNAseq_2023_A6WT_A10CraA_A12AYE_A1917978$ snakemake -v 4.0.0 --> CORRECT! #NOTE_1: modify bacto-0.1.json keeping only steps assembly, typing_mlst, possibly pangenome and variants_calling true; setting cpu=20 in all used steps. #setting the following in bacto-0.1.json "fastqc": false, "taxonomic_classifier": false, "assembly": true, "typing_ariba": false, "typing_mlst": true, "pangenome": true, "variants_calling": true, "phylogeny_fasttree": false, "phylogeny_raxml": false, "recombination": false, (due to gubbins-error set false) "prokka": { "genus": "Acinetobacter", "kingdom": "Bacteria", "species": "baumannii", "cpu": 10, "evalue": "1e-06", "other": "" }, "mykrobe": { "species": "abaum" }, "reference": "db/CP059040.gb" #NOTE_2: needs disk Titisee since the pipeline needs /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB snakemake --printshellcmds -
Summarize all SNPs and Indels from the snippy result directory.
cp ~/Scripts/summarize_snippy_res_ordered.py . # IMPORTANT_ADAPT the array in script should be adapted isolates = ["W1", "W2", "W3", "W4", "Y1", "Y2", "Y3", "Y4"] mamba activate plot-numpy1 python3 ./summarize_snippy_res_ordered.py snippy #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv cd snippy #REMOVE_the_line? I don't find the sence of the line: grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv -
Using spandx calling variants (almost the same results to the one from viral-ngs!)
mamba deactivate mamba activate /home/jhuang/miniconda3/envs/spandx # PREPARE the inputs for the options ref and database mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610 cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 #-d ~/Scripts/genbank2fasta.py PP810610.gb mv PP810610.gb_converted.fna PP810610.fasta #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file ln -s /home/jhuang/Tools/spandx/ spandx (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref CP059040.fasta --annotation --database CP059040 -resume # RERUN SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation, resulting in all impacts are MODIFIER --> IT WORKS! cd Outputs/Master_vcf conda activate /home/jhuang/miniconda3/envs/spandx (spandx) cp -r ../../snippy/Y1/reference . # Eigentlich irgendein directory, all directories contains the same reference. (spandx) cp ../../spandx/bin/SNP_matrix.sh ./ #Note that ${variant_genome_path}=CP059040 in the following command, but it was not used after the following command modification. #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh (spandx) bash SNP_matrix.sh CP059040 . -
Calling inter-host variants by merging the results from snippy+spandx
Cross-Caller SNP/Indel Concordance & Invariant Variant Analyzer; Multi-Isolate Variant Intersection, Annotation Harmonization & Caller Discrepancy Report; Comparative Genomic Variant Profiling: Concordance, Invariance & Allele Mismatch Analysis; VarMatch: Cross-Tool Variant Concordance Pipeline
mamba activate plot-numpy1 cd bacto cp Outputs/Master_vcf/All_SNPs_indels_annotated.txt . cp snippy/summary_snps_indels.csv . cp ~/Scripts/process_variants_snippy_alleles_spandx_annotations.py . #Configuring ISOLATES = [ "Y1", "Y2", "Y3", "Y4", "W1", "W2", "W3", "W4" ] (plot-numpy1) python process_variants_snippy_alleles_spandx_annotations.py # mv common_variants_all_snippy_annotated.xlsx common_variants_snippy+spandx_annotated_Y1Y2Y3Y4W1W2W3W4.xlsx # mv common_variants_invariant_snippy_annotated.xlsx common_invariants_snippy+spandx_annotated_Y1Y2Y3Y4W1W2W3W4.xlsx -
Manully checking each of the 6 records by comparing them to the results from SPANDx; three are confirmed!
#CHROM,POS,REF,ALT,TYPE,Y1,Y2,Y3,Y4,W1,W2,W3,W4,Effect,Impact,Functional_Class,Codon_change,Protein_and_nucleotide_change,Amino_Acid_Length,Gene_name,Biotype # -- Results from snippy -- #move: CP059040,1527276,TTGAACC,T,del,TTGAACC,TTGAACC,TTGAACC,T,TTGAACC,TTGAACC,T,T,conservative_inframe_deletion,MODERATE,,gaacct/,p.Glu443_Pro444del/c.1327_1332delGAACCT,704,H0N29_07175,protein_coding #confirmed: CP059040,1843289,G,T,snp,G,T,G,G,G,G,G,G,missense_variant,MODERATE,MISSENSE,gCg/gAg,p.Ala37Glu/c.110C>A,357,H0N29_08665,protein_coding #confirmed: CP059040,2019186,G,A,snp,A,G,G,G,G,G,G,G,upstream_gene_variant,MODIFIER,,59,c.-59C>T,144,H0N29_09480,protein_coding #delete_this? CP059040,3124917,T,"TAC,TACTTCATTACATACCAACCGCCAAGGGTGC",snp,C,T,C,C,T,T,T,C,upstream_gene_variant,MODIFIER,,25,c.-25_-24insAC,0,H0N29_00075,protein_coding #move: CP059040,3310021,C,CT,ins,CT,CT,CT,CT,C,CT,CT,CT,intragenic_variant,MODIFIER,,,n.3310021_3310022insT,,H0N29_00075, #confirmed: CP059040,3853714,G,A,snp,G,G,G,G,G,A,G,A,stop_gained,HIGH,NONSENSE,Cag/Tag,p.Gln91*/c.271C>T,338,H0N29_18290,protein_coding #--> Only three SNPs are confirmed --> means there is almost no variation in the genomic level! # -- Results from the SPANDx -- #CP059040 1527276 TTGAACC T INDEL TTGAACC/T T T T T T T T conservative_inframe_deletion MODERATE gaacct/ p.Glu443_Pro444del/c.1327_1332delGAACCT 704 H0N29_07175 protein_coding #CP059040 1843289 G T SNP G T G G G G G G missense_variant MODERATE MISSENSE gCg/gAg p.Ala37Glu/c.110C>A 357 H0N29_08665 protein_coding #CP059040 2019186 G A SNP A G G G G G G G upstream_gene_variant MODIFIER 59 c.-59C>T 144 H0N29_09480 protein_coding #Cmp to CP059040 3124917 T TAC,TACTTCATTACATACCAACCGCCAAGGGTGC INDEL . TACTTCATTACATACCAACCGCCAAGGGTGC TACTTCATTACATACCAACCGCCAAGGGTGC TAC . . . . upstream_gene_variant MODIFIER 25 c.-25_-24insAC 0 H0N29_00075 protein_coding #Cmp to CP059040 3124920 C CATTACATACCAACCGCCAAGGGTGCTTCATG INDEL . . . CATTACATACCAACCGCCAAGGGTGCTTCATG . . C . upstream_gene_variant MODIFIER 22 c.-22_-21insATTACATACCAACCGCCAAGGGTGCTTCATG 0 H0N29_00075 protein_coding #TODO: Move to invariant-file: CP059040 3310021 C CT INDEL CT CT CT CT CT CT CT CT intragenic_variant MODIFIER n.3310021_3310022insT H0N29_00075 #CP059040 3853714 G A SNP G G G G G A G A stop_gained HIGH NONSENSE Cag/Tag p.Gln91*/c.271C>T 338 H0N29_18290 protein_coding #-->For the Indel-report, more complicated, needs the following command to find the initial change and related codon-change. ## Check gene strand in your GFF/GenBank #grep "H0N29_07175" reference.gff # Extract 20 bp around the variant from reference samtools faidx CP059040.fasta CP059040:1527260-1527290 -
Annotation of the three confirmed SNPs
gene complement(3852968..3853984) /gene="galE" /locus_tag="H0N29_18290" CDS complement(3852968..3853984) /gene="galE" /locus_tag="H0N29_18290" /EC_number="5.1.3.2" /inference="COORDINATES: similar to AA sequence:RefSeq:WP_017725586.1" /note="Derived by automated computational analysis using gene prediction method: Protein Homology." /codon_start=1 /transl_table=11 /product="UDP-glucose 4-epimerase GalE" /protein_id="QNT84923.1" /translation="MAKILVTGGAGYIGSHTCVELLEAGHEVIVFDNLSNSSKESLNR VQEITQKGLTFVEGDIRNSGELDRVFQEHAIDAVIHFAGLKAVGESQEKPLIYFDNNI AGSIQLVKSMEKAGVYTLVFSSSATVYDEANTSPLNEEMPTGMPSNNYGYTKLIVEQL LQKLSVADSKWSIALLRYFNPVGAHKSGRIGEDPQGIPNNLMPYVTQVAVGRREKLSI YGNDYDTIDGTGVRDYIHVVDLANAHLCALNNRLQAQGCRAWNIGTGNGSSVLQVKNT FEQVNGVPVAFEFAPRRAGDVATSFADNARAVAELGWKPQYGLEDMLKDSWNWQKQNP NGYN" gene complement(1842325..1843398) /gene="adeS" /locus_tag="H0N29_08665" CDS complement(1842325..1843398) /gene="adeS" /locus_tag="H0N29_08665" /inference="COORDINATES: similar to AA sequence:RefSeq:WP_000837467.1" /note="Derived by automated computational analysis using gene prediction method: Protein Homology." /codon_start=1 /transl_table=11 /product="two-component sensor histidine kinase AdeS" /protein_id="QNT86623.1" /translation="MKSKLGISKQLFIALTIVNLSVTLFSVVLGYVIYNYAIEKGWIS LSSFQQEDWTSFHFVDWIWLATVIFCGCIISLVIGMRLAKRFIVPINFLAEAAKKISH GDLSARAYDNRIHSAEMSELLYNFNDMAQKLEVSVKNAQVWNAAIAHELRAPITILQG RLQGIIDGVFKPDEVLFKSLLNQVEGLSHLVEDLRTLSLVENQQLRLNYELFDLKAVV EKVLKAFEDRLDQAKLVPELDLTSTPVYCDRRRIEQVLIALIDNSIRYSNAGKLKISS EVVADNWILKIEDEGPGIATEFQDDLFKPFFRLEESRNKEFGGTGLGLAVVHAIVVAL KGTIQYSNQGSKSVFTIKISMNN" gene complement(2018693..2019127) /locus_tag="H0N29_09480" CDS complement(2018693..2019127) /locus_tag="H0N29_09480" /inference="COORDINATES: similar to AA sequence:RefSeq:YP_004995263.1" /note="Derived by automated computational analysis using gene prediction method: Protein Homology." /codon_start=1 /transl_table=11 /product="HIT domain-containing protein" /protein_id="QNT83319.1" /translation="MFSLHPQLAQDTFFVGDFPLSTCRLMNDMQFPWLILVPRVPGIT ELYELSQADQEQFLRESSWLSSQLSRVFRADKMNVAALGNMVPQLHFHHVVRYQNDVA WPKPVWGTPAVPYTSDVLAHMRQTLMLALRGQGDMPFDWRMD"
Interhost variant calling (Data_Tam_DNAseq_2025_Y1Y2Y3Y4W1W2W3W4_Tig1_Tig2_dIJ_on_ATCC19606)
Leave a reply