-
Input data:
# ---- Datasets 2024 (in total 4) ---- ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R1.fastq.gz hCoV229E_Rluc_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/hCoV229E_Rluc_R2.fastq.gz hCoV229E_Rluc_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R1.fastq.gz DMSO_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_DMSO_R2.fastq.gz DMSO_p10_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R1.fastq.gz K22_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K22_R2.fastq.gz K22_p10_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R1.fastq.gz X7523_p10_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2024/p10_K7523_R2.fastq.gz X7523_p10_R2.fastq.gz # ---- Datasets 2025 (in total 3) ---- ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R1_001.fastq.gz DMSO_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R2_001.fastq.gz DMSO_p16_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R1_001.fastq.gz K22_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R2_001.fastq.gz K22_p16_R2.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R1_001.fastq.gz X7523_p16_R1.fastq.gz ln -s ../../Data_Pietschmann_229ECoronavirus_Mutations_2025/raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R2_001.fastq.gz X7523_p16_R2.fastq.gz # ---- Datasets 2026 (in total 3) ---- ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R1.fastq.gz DMSO_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/02_DMSO_p26/02_DMSO_p26_R2.fastq.gz DMSO_p26_R2.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R1.fastq.gz K22_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/01_K22_p26/01_K22_p26_R2.fastq.gz K22_p26_R2.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R1.fastq.gz X7523_p26_R1.fastq.gz ln -s ../raw_data_2026/20260212_AV243904_0054_B/03_X723_p26/03_X723_p26_R2.fastq.gz X7523_p26_R2.fastq.gz -
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .; ln -s ~/Tools/bacto/envs/ .; ln -s ~/Tools/bacto/local/ .; cp ~/Tools/bacto/Snakefile .; cp ~/Tools/bacto/bacto-0.1.json .; cp ~/Tools/bacto/cluster.json .; #download CU459141.gb from GenBank mv ~/Downloads/sequence\(2\).gb db/PP810610.gb #setting the following in bacto-0.1.json "fastqc": false, "taxonomic_classifier": false, "assembly": true, "typing_ariba": false, "typing_mlst": true, "pangenome": true, "variants_calling": true, "phylogeny_fasttree": true, "phylogeny_raxml": true, "recombination": false, (due to gubbins-error set false) "genus": "Alphacoronavirus", "kingdom": "Viruses", "species": "Human coronavirus 229E", "mykrobe": { "species": "corona" }, "reference": "db/PP810610.gb" mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3 (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds -
Summarize all SNPs and Indels from the snippy result directory.
cp ~/Scripts/summarize_snippy_res_ordered.py . # IMPORTANT_ADAPT the array isolates = ["hCoV229E_Rluc", "DMSO_p10", "K22_p10", "X7523_p10", "DMSO_p16", "K22_p16", "X7523_p16", "DMSO_p26", "K22_p26", "X7523_p26"] mamba activate plot-numpy1 python3 ./summarize_snippy_res_ordered.py snippy #--> Summary CSV file created successfully at: snippy/summary_snps_indels.csv cd snippy #REMOVE_the_line? I don't find the sence of the line: grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv -
Using spandx calling variants (almost the same results to the one from viral-ngs!)
mamba deactivate mamba activate /home/jhuang/miniconda3/envs/spandx mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610 cp PP810610.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 #-d ~/Scripts/genbank2fasta.py PP810610.gb mv PP810610.gb_converted.fna PP810610.fasta #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file ln -s /home/jhuang/Tools/spandx/ spandx (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref PP810610.fasta --annotation --database PP810610 -resume # RERUN SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation --> IT WORKS! cd Outputs/Master_vcf conda activate /home/jhuang/miniconda3/envs/spandx (spandx) cp -r ../../snippy/hCoV229E_Rluc/reference . (spandx) cp ../../spandx/bin/SNP_matrix.sh ./ #Note that ${variant_genome_path}=NC_001348 in the following command, but it was not used after the following command modification. #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh (spandx) bash SNP_matrix.sh PP810610 . -
Calling inter-host variants by merging the results from snippy+spandx (Manually! Note that the sample order of two tools are different, we should reorder the SNP-columns of spandx-results.)
# Inter-host variants(宿主间变异):一种病毒在两个人之间有不同的基因变异,这些变异可能与宿主的免疫反应、疾病表现或病毒传播的方式相关。 cp All_SNPs_indels_annotated.txt All_SNPs_indels_annotated_backup.txt vim All_SNPs_indels_annotated.txt #in the file ids: grep "$(echo -e '\t')353$(echo -e '\t')" All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt #Replace \n with " All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt\ngrep " #Replace grep " --> grep "$(echo -e '\t') #Replace " All_ --> $(echo -e '\t')" All_ # Potential intra-host variants: 10871, 19289, 21027, 23435. CHROM POS REF ALT TYPE hCoV229E_Rluc DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26 Effect Impact Functional_Class Codon_change Protein_and_nucleotide_change Amino_Acid_Length Gene_name Biotype PP810610 1464 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE gTt/gCt p.Val416Ala/c.1247T>C 6757 CDS_1 protein_coding PP810610 1492 T A SNP T/A T/A T/A T/A T/A T/A T/A T T/A T/A synonymous_variant LOW SILENT acT/acA p.Thr425Thr/c.1275T>A 6757 CDS_1 protein_coding PP810610 1699 C T SNP T T T T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val494Val/c.1482C>T 6757 CDS_1 protein_coding PP810610 4809 A C SNP A A A A A A A/C A A A/C missense_variant MODERATE MISSENSE gAt/gCt p.Asp1531Ala/c.4592A>C 6757 CDS_1 protein_coding PP810610 6470 C T SNP C C C C C C C C C/T C synonymous_variant LOW SILENT Ctg/Ttg p.Leu2085Leu/c.6253C>T 6757 CDS_1 protein_coding PP810610 6691 C T SNP T T T T T T T T T T synonymous_variant LOW SILENT tgC/tgT p.Cys2158Cys/c.6474C>T 6757 CDS_1 protein_coding PP810610 6919 C G SNP G G G G G G G G G G synonymous_variant LOW SILENT ggC/ggG p.Gly2234Gly/c.6702C>G 6757 CDS_1 protein_coding PP810610 7294 T A SNP A A A A A A A A A A missense_variant MODERATE MISSENSE agT/agA p.Ser2359Arg/c.7077T>A 6757 CDS_1 protein_coding PP810610 8289 C A SNP C/A C/A C C/A C C C/A C C C/A missense_variant MODERATE MISSENSE gCc/gAc p.Ala2691Asp/c.8072C>A 6757 CDS_1 protein_coding PP810610 8294 A G SNP A/G A A/G A A A/G A A A/G A missense_variant MODERATE MISSENSE Aaa/Gaa p.Lys2693Glu/c.8077A>G 6757 CDS_1 protein_coding PP810610 8376 G T SNP G/T G G G G G G G G G missense_variant MODERATE MISSENSE cGt/cTt p.Arg2720Leu/c.8159G>T 6757 CDS_1 protein_coding PP810610 9146 T C SNP T T T T/C T T T/C T T T missense_variant MODERATE MISSENSE Ttt/Ctt p.Phe2977Leu/c.8929T>C 6757 CDS_1 protein_coding PP810610 9174 G A SNP G G G G/A G G G/A G G G missense_variant MODERATE MISSENSE tGc/tAc p.Cys2986Tyr/c.8957G>A 6757 CDS_1 protein_coding PP810610 9261 C T SNP C C C C C C C C C/T C missense_variant MODERATE MISSENSE gCt/gTt p.Ala3015Val/c.9044C>T 6757 CDS_1 protein_coding PP810610 10145 A G SNP A A A A/G A A A/G A A A/G missense_variant MODERATE MISSENSE Atg/Gtg p.Met3310Val/c.9928A>G 6757 CDS_1 protein_coding PP810610 10239 T G SNP T T T/G T T T/G T T T T missense_variant MODERATE MISSENSE gTg/gGg p.Val3341Gly/c.10022T>G 6757 CDS_1 protein_coding PP810610 10310 G A SNP G G G G/A G G G/A G G G/A missense_variant MODERATE MISSENSE Gtt/Att p.Val3365Ile/c.10093G>A 6757 CDS_1 protein_coding PP810610 10432 C T SNP C C C C C C C C C C/T synonymous_variant LOW SILENT ttC/ttT p.Phe3405Phe/c.10215C>T 6757 CDS_1 protein_coding PP810610 10871 C T SNP C C/T T C/T C/T T C/T C T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding PP810610 10898 G A SNP G G/A G G/A G G G/A G G G/A missense_variant MODERATE MISSENSE Ggc/Agc p.Gly3561Ser/c.10681G>A 6757 CDS_1 protein_coding PP810610 11418 C A SNP C C C C C C C C C/A C missense_variant MODERATE MISSENSE aCt/aAt p.Thr3734Asn/c.11201C>A 6757 CDS_1 protein_coding PP810610 11577 A C SNP A A/C A A A/C A A A/C A A missense_variant MODERATE MISSENSE gAa/gCa p.Glu3787Ala/c.11360A>C 6757 CDS_1 protein_coding PP810610 14472 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE aTg/aCg p.Met4752Thr/c.14255T>C 6757 CDS_1 protein_coding PP810610 15270 G A SNP G G G G G G G G G G/A missense_variant MODERATE MISSENSE cGa/cAa p.Arg5018Gln/c.15053G>A 6757 CDS_1 protein_coding PP810610 15458 T C SNP C C C C C C C C C C synonymous_variant LOW SILENT Ttg/Ctg p.Leu5081Leu/c.15241T>C 6757 CDS_1 protein_coding PP810610 16035 C A SNP A A A A A A A A A A stop_gained HIGH NONSENSE tCa/tAa p.Ser5273*/c.15818C>A 6757 CDS_1 protein_coding PP810610 17119 A G SNP A A A A A A A A A A/G synonymous_variant LOW SILENT aaA/aaG p.Lys5634Lys/c.16902A>G 6757 CDS_1 protein_coding PP810610 17430 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu5738Ser/c.17213T>C 6757 CDS_1 protein_coding PP810610 18640 T G SNP T T T T/G T T T/G T T T/G missense_variant MODERATE MISSENSE tgT/tgG p.Cys6141Trp/c.18423T>G 6757 CDS_1 protein_coding PP810610 18646 C T SNP C C C C/T C C C C C C synonymous_variant LOW SILENT taC/taT p.Tyr6143Tyr/c.18429C>T 6757 CDS_1 protein_coding PP810610 18701 A G SNP A A A A/G A A A/G A A A missense_variant MODERATE MISSENSE Aca/Gca p.Thr6162Ala/c.18484A>G 6757 CDS_1 protein_coding PP810610 19028 C T SNP C C C C/T C C C C C C stop_gained HIGH NONSENSE Caa/Taa p.Gln6271*/c.18811C>T 6757 CDS_1 protein_coding PP810610 19289 G T SNP G G T G G G/T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding PP810610 19294 C T SNP C C C C C C/T C/T C C/T C synonymous_variant LOW SILENT taC/taT p.Tyr6359Tyr/c.19077C>T 6757 CDS_1 protein_coding PP810610 21027 C T SNP C C/T C C/T C/T C C/T T C C/T missense_variant MODERATE MISSENSE aCt/aTt p.Thr178Ile/c.533C>T 1173 CDS_2 protein_coding PP810610 21183 T G SNP G G G G G G G G G G missense_variant MODERATE MISSENSE tTt/tGt p.Phe230Cys/c.689T>G 1173 CDS_2 protein_coding PP810610 21633 T C SNP T T/C T T T T T T T T missense_variant MODERATE MISSENSE gTt/gCt p.Val380Ala/c.1139T>C 1173 CDS_2 protein_coding PP810610 22215 T G SNP T T T T/G T T T/G T T T missense_variant MODERATE MISSENSE gTt/gGt p.Val574Gly/c.1721T>G 1173 CDS_2 protein_coding PP810610 22487 A G SNP A A A A A A/G A A A/G A missense_variant MODERATE MISSENSE Agt/Ggt p.Ser665Gly/c.1993A>G 1173 CDS_2 protein_coding PP810610 22630 C T SNP C C C C C C C C C C/T synonymous_variant LOW SILENT taC/taT p.Tyr712Tyr/c.2136C>T 1173 CDS_2 protein_coding PP810610 22636 T G SNP G G G G G G G G G G missense_variant MODERATE MISSENSE aaT/aaG p.Asn714Lys/c.2142T>G 1173 CDS_2 protein_coding PP810610 22763 G T SNP G G G G G G G G G/T G missense_variant MODERATE MISSENSE Gct/Tct p.Ala757Ser/c.2269G>T 1173 CDS_2 protein_coding PP810610 22788 T C SNP T T T T T T/C T T/C T/C T missense_variant MODERATE MISSENSE gTt/gCt p.Val765Ala/c.2294T>C 1173 CDS_2 protein_coding PP810610 23022 T C SNP C C C C C C C C C C missense_variant MODERATE MISSENSE tTa/tCa p.Leu843Ser/c.2528T>C 1173 CDS_2 protein_coding PP810610 23435 C T SNP C C T C/T C C/T C/T C C/T C missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding PP810610 24781 C T,G SNP T T T T T T T T T T/G missense_variant MODERATE MISSENSE aCt/aGt p.Thr36Ser/c.107C>G 77 CDS_5 protein_coding PP810610 24971 A G SNP A A A A A A A A A/G A missense_variant MODERATE MISSENSE Aat/Gat p.Asn18Asp/c.52A>G 225 CDS_6 protein_coding PP810610 25025 C T SNP C C/T C C C/T C C C/T C C/T missense_variant MODERATE MISSENSE Cac/Tac p.His36Tyr/c.106C>T 225 CDS_6 protein_coding PP810610 25163 C T SNP T T T T T T T T T T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu82Phe/c.244C>T 225 CDS_6 protein_coding PP810610 25264 C T SNP T T T T T T T T T T synonymous_variant LOW SILENT gtC/gtT p.Val115Val/c.345C>T 225 CDS_6 protein_coding PP810610 25592 T C SNP T T/C T T T/C T T T/C T T/C missense_variant MODERATE MISSENSE Ttc/Ctc p.Phe225Leu/c.673T>C 225 CDS_6 protein_coding PP810610 26104 C T SNP C C C C C C C C C C/T missense_variant MODERATE MISSENSE cCt/cTt p.Pro165Leu/c.494C>T 389 CDS_7 protein_coding PP810610 26838 G T SNP T T T T T T T T T T PP810610 24731 TGTGGTGCTTATA T DEL TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA TGTGGTGCTTATA T TGTGGTGCTTATA conservative_inframe_deletion c.61_72delGTGCTTATAGTG p.Val21_Val24del -
Calling intra-host variants using viral-ngs
# #Install Gap2Seq in the docker-env --> generated the new docker-env viral-ngs-fixed:la # bin/assembly.py gapfill_gap2seq tmp/02_assembly/hCoV229E_Rluc.assembly2-scaffolded.fasta data/01_per_sample/hCoV229E_Rluc.cleaned.bam tmp/02_assembly/# hCoV229E_Rluc.assembly2-gapfilled.fasta --memLimitGb 12 --maskErrors --randomSeed 0 # ----> go to the script tools/gap2seq.py, install the required tool (for example gap2seq) and adapt the correct version. # conda install -y -c bioconda gap2seq # root@f47daf7c44ee:/work# Gap2Seq -h # #>Gap2Seq 3.1 # vim /home/jhuang/Tools/viral-ngs_docker/bin/tools/gap2seq.py # #Adapt the TOOL_NAME and TOOL_VERSION in the script # # #Save the docker-env with newly installed Gap2Seq # exit # docker ps -a # docker commit f47daf7c44ee viral-ngs-fixed:la #调用新的 docker-env having Gap2Seq docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2026/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk viral-ngs-fixed:la bash cd /work conda activate viral-ngs-env snakemake --directory /work --printshellcmds --cores 80 # The complete running commands for the complete data in 2026 as follows: #-1- Run several times of docker viral-ngs-fixed:la so that all ${sample}.assembly2-scaffolded.fasta generated by running each time only a isolate (one record in samples-assembly.txt and empty in samples-runs.txt) docker run ... cd /work conda activate viral-ngs-env snakemake --directory /work --printshellcmds --cores 80 # --> Note that tmp/02_assembly/${sample}.assembly2-scaffolded.fasta and tmp/02_assembly/${sample}.assembly2-gapfilled.fasta were generated! #-2- Then run the following commands: tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --> tmp/02_assembly/${sample}.assembly3-modify.fasta for sample in DMSO_p10 K22_p10 X7523_p10 DMSO_p16 K22_p16 X7523_p16 DMSO_p26 K22_p26 X7523_p26; do #MANUALLY running the following commands! bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index done #-3- Run docker for all isolates (full in amples-assembly.txt and samples-runs.txt). docker run ... cd /work conda activate viral-ngs-env snakemake --directory /work --printshellcmds --cores 80 -
Generate variant_annot.xls and coverages.xls
sudo chown -R jhuang:jhuang data # -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost cp isnvs.annot.txt isnvs.annot_complete.txt ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls #delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file. awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5 cut -f1-4 isnvs.annot_complete.csv > f1_4 cut -f6- isnvs.annot_complete.csv > f6_ paste f1_4 f5 > f1_5 paste f1_5 f6_ > isnvs_annot_complete_.txt #correct f5 in header of isnvs_annot_complete_.txt to iSNV_freq #Add header: chr pos sample alleles iSNV_freq eff_type eff_codon_dna eff_aa eff_aa_pos eff_prot_len eff_gene eff_protein ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete_.txt -d$'\t' -o variant_annot.xls #MANUALLY generate variant_annot_0.01.csv variant_annot_0.05.csv awk ' $5 >= 0.05 ' isnvs_annot_complete_.txt > 0.05.csv cut -f2 0.05.csv | uniq > ids_0.05 awk ' $5 >= 0.02 ' isnvs_annot_complete_.txt > 0.02.csv cut -f2 0.02.csv | uniq > ids_0.02 awk ' $5 >= 0.01 ' isnvs_annot_complete_.txt > 0.01.csv cut -f2 0.01.csv | uniq > ids_0.01 #Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.05.txt\ngrep -P "PP810610\\t' in ids_0.05 and then deleting the 'pos' line #Replace '\n' with '\\t" isnvs_annot_complete_.txt >> isnvs_annot_0.01.txt\ngrep -P "PP810610\\t' in ids_0.01 and then deleting the 'pos' line #Run ids_0.05 and ids_0.01 cp ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt ../../Outputs/Master_vcf/All_SNPs_indels_annotated.txt hCoV229E_Rluc_variants #Manully adding the 3-way SNPs by getting original data from isnvs.annot.csv, for example, PP810610 8289 hCoV229E_Rluc C,A,T 0.325, 0.439 missense_variant 8072C>A,8072C>T Ala2691Asp,Ala2691Val 2691 6758 Gene_217_20492 XBA84229.1 # Delete the three records which already reported in intra-host results hCoV229E_Rluc_variants: they are 10871, 19289, 23435. PP810610 10871 C T SNP C C/T T C/T C/T T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757 CDS_1 protein_coding PP810610 19289 G T SNP G G T G G G/T G missense_variant MODERATE MISSENSE Gtt/Ttt p.Val6358Phe/c.19072G>T 6757 CDS_1 protein_coding PP810610 23435 C T SNP C C T C/T C C/T C/T missense_variant MODERATE MISSENSE Ctt/Ttt p.Leu981Phe/c.2941C>T 1173 CDS_2 protein_coding ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_0.02.txt All_SNPs_indels_annotated.txt -d$'\t' -o variant_annot_2026.xls #Modify sheetname to variant_annot_0.05 and variant_annot_0.01 and add the header in Excel file. #Note in the complete list, Set 2024 is NOT a subset of Set 2025 because the element 26283 is in set 2024 but missing from set 2025. # -- calculate the coverage samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt ~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls #draw coverage and see if they are continuous? samtools depth ./data/02_align_to_self/p16_DMSO.mapped.bam > p16_DMSO_cov.txt samtools depth ./data/02_align_to_self/p16_K22.mapped.bam > p16_K22_cov.txt samtools depth ./data/02_align_to_self/p16_X7523.mapped.bam > p16_K7523_cov.txt ~/Tools/csv2xls-0.4/csv_to_xls.py p16_DMSO_cov.txt p16_K22_cov.txt p16_K7523_cov.txt -d$'\t' -o coverages_p16.xls # Load required packages library(ggplot2) library(dplyr) # Read the coverage data cov_data <- read.table("p16_K7523_cov.txt", header = FALSE, sep = "\t", col.names = c("Chromosome", "Position", "Coverage")) # Create full position range for the given chromosome full_range <- data.frame(Position = seq(min(cov_data$Position), max(cov_data$Position))) # Merge with actual coverage data and fill missing positions with 0 cov_full <- full_range %>% left_join(cov_data[, c("Position", "Coverage")], by = "Position") %>% mutate(Coverage = ifelse(is.na(Coverage), 0, Coverage)) # Save the plot to PNG png("p16_K7523_coverage_filled.png", width = 1200, height = 600) ggplot(cov_full, aes(x = Position, y = Coverage)) + geom_line(color = "steelblue", size = 0.3) + labs(title = "Coverage Plot for p16_K7523 (Missing = 0)", x = "Genomic Position", y = "Coverage Depth") + theme_minimal() + theme( plot.title = element_text(hjust = 0.5), axis.text = element_text(size = 10), axis.title = element_text(size = 12) ) dev.off() -
Comparing intra- and inter-host variants (40 positions are common between spandx and vphaser2)
Please find attached the complete variant calling results for our 229E passaging experiment (2024–2026 datasets): variant_annot_2026__final.xls. • Sheet 1 (All_SNPs_Indels_annotated): Inter-host variants called by spandx. Shows consensus alleles at each variant position across all 10 samples relative to the PP810610 reference. • Sheet 2 (isnvs_annot): Intra-host variants called by vphaser2. Provides precise allele frequencies (iSNV_freq) per sample to quantify within-host diversity. Overlap: Green-highlighted rows indicate variants consistently detected by both pipelines. Sheet 2 is a subset of Sheet 1, focusing only on positions with intra-host heterogeneity. Fixed differences from the reference (where all 10 isolates share the same non-reference allele) remain in Sheet 1 (white rows). A few positions in Sheet 2 show iSNV_freq ≈ 1.0 (e.g., POS 15458, 22636). These reflect positions where vphaser2 detected minor variation in at least one sample, even if most samples show near-fixation. -
(Optional) Consensus sequences of each and of all isolates
cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa cp data/02_assembly/*.fasta ./ for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \ mv ${sample}.fasta ${sample}.fa cat all.fa ${sample}.fa >> all.fa done cat RSV_dedup.fa all.fa > RSV_all.fa mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln snp-sites RSV_all.aln -o RSV_all_.aln
Variant Calling: 229E Passaging Dataset (2024–2026) – Inter- & Intra-host Analysis using SPANDx and VPhaser2, only one deletion generated by snippy
Leave a reply