-
Prepare input raw data
~/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R1_001.fastq.gz p20534_7448_R1.fastq.gz ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R2_001.fastq.gz p20534_7448_R2.fastq.gz
-
Prepare virus database and select 8 representatives for the eight given viruses from the database
# -- Download genomes -- # ---- Date is 13.06.2025. ---- #Taxonomy ID: 3044782 #Die Gattung Orthoflavivirus (früher Flavivirus) umfasst behüllte Viren mit einem positivsträngigen RNA-Einzelstrang als Genom, die durch Arthropoden (Zecken und Stechmücken) als Vektoren auf Vögel und Säugetiere übertragen werden. #The English name for Flavivirus is simply: Flavivirus #It is both the scientific and common name for the genus of viruses in the family Flaviviridae. This genus includes several well-known viruses such as: * Dengue virus * Zika virus * West Nile virus * Yellow fever virus * Tick-borne encephalitis virus (TBEV / FSME virus) esearch -db nucleotide -query "txid3044782[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_3044782_ncbi.fasta python ~/Scripts/filter_fasta.py genome_3044782_ncbi.fasta complete_genome_3044782_ncbi.fasta #96579-->9431 #https://www.ebi.ac.uk/ena/browser/view/3044782 #Download FMSE esearch -db nucleotide -query "txid11084[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_11084_ncbi.fasta python ~/Scripts/filter_fasta.py genome_11084_ncbi.fasta complete_genome_11084_ncbi.fasta #3426-->219 #https://www.ebi.ac.uk/ena/browser/view/11084 samtools faidx complete_genome_11084_ncbi.fasta PV626569.1 > PV626569.fasta
-
Run the second round of vrap (–host==${virus}.fasta)
#cat FluB_PB1.fasta FluB_PB2.fasta FluB_PA.fasta FluB_HA.fasta FluB_NP.fasta FluB_NB_NA.fasta FluB_M1_BM2.fasta FluB_NEP_NS1.fasta > FluB.fasta # Run vrap (second round): selecte some representative viruses from the generated Excel-files generated by the last step as --host (vrap) for sample in p20534_7448; do vrap/vrap_until_bowtie2.py -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz -o vrap_${sample}_on_FSME --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/PV626569.fasta -t 100 -l 200 --gbt2 --noblast done (vrap) for sample in p20534_7448; do vrap/vrap_until_bowtie2.py -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz -o vrap_${sample}_on_Flavivirus --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/complete_genome_3044782_ncbi.fasta -t 100 -l 200 --gbt2 --noblast done
-
Generate the mapping statistics for the sam-files generated from last step
for sample in p20534_7448; do echo "-----${sample}_on_representatives------" >> LOG_mapping #cd vrap_${sample}_on_${virus}/bowtie cd vrap_${sample}_on_Flavivirus/bowtie # Rename and convert SAM to BAM mv mapped mapped.sam 2>> ../../LOG_mapping samtools view -S -b mapped.sam > mapped.bam 2>> ../../LOG_mapping samtools sort mapped.bam -o mapped_sorted.bam 2>> ../../LOG_mapping samtools index mapped_sorted.bam 2>> ../../LOG_mapping # Write flagstat output to log (go up two levels to write correctly) samtools flagstat mapped_sorted.bam >> ../../LOG_mapping 2>&1 #samtools idxstats mapped_sorted.bam >> ../../LOG_mapping 2>&1 cd ../.. done (bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_FSME/bowtie$ samtools flagstat mapped_sorted.bam 7836046 + 0 in total (QC-passed reads + QC-failed reads) 7836046 + 0 primary 0 + 0 secondary 0 + 0 supplementary 0 + 0 duplicates 0 + 0 primary duplicates 0 + 0 mapped (0.00% : N/A) 0 + 0 primary mapped (0.00% : N/A) 5539082 + 0 paired in sequencing 2769541 + 0 read1 2769541 + 0 read2 0 + 0 properly paired (0.00% : N/A) 0 + 0 with itself and mate mapped 0 + 0 singletons (0.00% : N/A) 0 + 0 with mate mapped to a different chr 0 + 0 with mate mapped to a different chr (mapQ>=5) (bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_Flavivirus/bowtie$ samtools flagstat mapped_sorted.bam 7836234 + 0 in total (QC-passed reads + QC-failed reads) 7836234 + 0 primary 0 + 0 secondary 0 + 0 supplementary 0 + 0 duplicates 0 + 0 primary duplicates 52 + 0 mapped (0.00% : N/A) 52 + 0 primary mapped (0.00% : N/A) 5539458 + 0 paired in sequencing 2769729 + 0 read1 2769729 + 0 read2 0 + 0 properly paired (0.00% : N/A) 4 + 0 with itself and mate mapped 13 + 0 singletons (0.00% : N/A) 0 + 0 with mate mapped to a different chr 0 + 0 with mate mapped to a different chr (mapQ>=5) samtools view -F 4 mapped_sorted.bam > mapped_reads.sam awk '{print $3}' mapped_reads.sam | sort | uniq -c 52 KY766069.1 Zika virus isolate Pf13/251013-18, complete genome # ------------------ DEBUG ---------------------- samtools idxstats mapped_sorted.bam | cut -f 1 for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do echo "Reference: $ref" samtools view -b mapped_sorted.bam "$ref" | samtools flagstat - done When I run samtools flagstat mapped_sorted.bam 49572521 + 0 in total (QC-passed reads + QC-failed reads) 0 + 0 secondary 0 + 0 supplementary 0 + 0 duplicates 1169 + 0 mapped (0.00% : N/A) 38247374 + 0 paired in sequencing 19123687 + 0 read1 19123687 + 0 read2 884 + 0 properly paired (0.00% : N/A) 934 + 0 with itself and mate mapped 227 + 0 singletons (0.00% : N/A) 0 + 0 with mate mapped to a different chr 0 + 0 with mate mapped to a different chr (mapQ>=5) however, wenn I run for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do echo "Reference: $ref" samtools view -b mapped_sorted.bam "$ref" | samtools flagstat - done Reference: PV424647.1 83 + 0 in total (QC-passed reads + QC-failed reads) 0 + 0 secondary 0 + 0 supplementary 0 + 0 duplicates 72 + 0 mapped (86.75% : N/A) 82 + 0 paired in sequencing 41 + 0 read1 41 + 0 read2 56 + 0 properly paired (68.29% : N/A) 60 + 0 with itself and mate mapped 11 + 0 singletons (13.41% : N/A) 0 + 0 with mate mapped to a different chr 0 + 0 with mate mapped to a different chr (mapQ>=5) I want to also the same total name as "samtools flagstat mapped_sorted.bam". How? samtools view -b mapped_sorted.bam PV424649.1 for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do echo "Reference: $ref" samtools view -h mapped_sorted.bam | grep -E "^@|$ref" | samtools view -Sb - | samtools flagstat - done # ---- DEBUG END ---- #draw some plots for some representative isolates which found in the first round (see Excel-file). samtools depth -m 0 -a mapped_sorted.bam > coverage.txt #grep "PV424649.1" coverage.txt > FluB_PB1_coverage.txt #grep "PV424650.1" coverage.txt > FluB_PB2_coverage.txt #grep "PV424648.1" coverage.txt > FluB_PA_coverage.txt #grep "PV424643.1" coverage.txt > FluB_HA_coverage.txt #grep "PV424646.1" coverage.txt > FluB_NP_coverage.txt #grep "PV424645.1" coverage.txt > FluB_NB_NA_coverage.txt #grep "PV424644.1" coverage.txt > FluB_M1_BM2_coverage.txt #grep "PV424647.1" coverage.txt > FluB_NEP_NS1_coverage.txt import pandas as pd import matplotlib.pyplot as plt # Load coverage data df = pd.read_csv("coverage.txt", sep="\t", header=None, names=["chr", "pos", "coverage"]) # Plot plt.figure(figsize=(10,4)) plt.plot(df["pos"], df["coverage"], color="blue", linewidth=0.5) plt.xlabel("Genomic Position") plt.ylabel("Coverage Depth") plt.title("BAM Coverage Plot") plt.show()
-
Report
Subject: Mapping Results for FluB Representative Isolate I have re-analyzed sample P20534 (7448) with a focus on Flaviviruses and FSME. Using curated reference sets from NCBI (Taxonomy ID 3044782 for Flavivirus, comprising 9,431 complete genomes—see attached flavivirus_names.txt for details; and Taxonomy ID 11084 for FSME, with 219 complete genomes), I performed targeted mapping. The key findings are summarized below: * Total reads: 7,836,234 * Mapped to Flavivirus: 52 reads All 52 reads mapped specifically to Zika virus (KY766069.1, complete genome of isolate Pf13/251013-18) * Mapped to FSME: No significant hits detected Please find attached a coverage plot for the Zika virus genome (KY766069).
-
Preparing a database containing all representative viruses from NCBI Virus #https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/ #Download All Records (18,708) am 26.05.2025
# ------------ Manually update the internal viral databases -------------- ##https://www.ebi.ac.uk/ena/browser/view/10239 #esearch -db nucleotide -query "txid10239[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10239_ncbi.fasta #esearch -db protein -query "txid11520[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > protein_11520_ncbi.fasta #mv ~/Tools/vrap/database/viral_db/nucleotide.fa ~/Tools/vrap/database/viral_db/nucleotide_Human_alphaherpesvirus_1.fa #mv ~/Tools/vrap/database/viral_db/protein.fa ~/Tools/vrap/database/viral_db/protein_Human_alphaherpesvirus_1.fa #cp genome_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/nucleotide.fa #cp protein_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/protein.fa #cd ~/Tools/vrap/database/viral_db #~/Tools/vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide #~/Tools/vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein #vrap/vrap_noassembly.py -1 AW005486_R1.fastq.gz -2 AW005486_R2.fastq.gz -o vrap_AW005486_on_InfluB --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/complete_11520_ncbi.fasta -t 20 -l 200 -g # ----------- Three databases ---------- #db is [virus_user_db] /home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/db/virus >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log 2>> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log #db is ~/Tools/vrap/database/viral_db/nucleotide.fa [Human alphaherpesvirus 1] [virus_nt_db] /home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 20 -query /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/db/virus" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastn.csv >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log Warning: [blastn] Examining 5 or more matches is recommended #db is ~/Tools/vrap/database/viral_db/protein.fa [Human alphaherpesvirus 1] [virus_aa_db] /home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 20 -query /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein" -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/blast/blastx.csv >> /mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB/vrap.log
DAMIAN Post-processing for Flavivirus and FSME
Leave a reply