gene_x 0 like s 166 view s
Tags: pipeline
Prepare input raw data
/mnt/md1/DATA/Data_Sophie_HDV_Sequences/raw_data
for f in *_R[12]_001.fastq.gz; do newname="$(echo "$f" | awk -F_ '{print $1 "_" $4 ".fastq.gz"}')"; echo mv "$f" "$newname"; done
for f in *_R[12]_001.fastq.gz; do newname="$(echo "$f" | awk -F_ '{print $1 "_" $4 ".fastq.gz"}')"; mv "$f" "$newname"; done
Call variant calling using snippy
ln -s ~/Tools/bacto/db/ .;
ln -s ~/Tools/bacto/envs/ .;
ln -s ~/Tools/bacto/local/ .;
cp ~/Tools/bacto/Snakefile .;
cp ~/Tools/bacto/bacto-0.1.json .;
cp ~/Tools/bacto/cluster.json .;
#download CU459141.gb from GenBank
mv ~/Downloads/sequence\(2\).gb db/NC_001653.gb
#setting the following in bacto-0.1.json
"fastqc": false,
"taxonomic_classifier": false,
"assembly": true,
"typing_ariba": false,
"typing_mlst": true,
"pangenome": true,
"variants_calling": true,
"phylogeny_fasttree": true,
"phylogeny_raxml": true,
"recombination": false, (due to gubbins-error set false)
"genus": "Alphacoronavirus",
"kingdom": "Viruses",
"species": "Human coronavirus 229E",
"mykrobe": {
"species": "corona"
},
"reference": "db/PP810610.gb"
mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
(bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
Prepare virus database
# ---- Date is 16.06.2025. ----
#Taxonomy ID: 12475
esearch -db nucleotide -query "txid12475[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_12475_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_12475_ncbi.fasta complete_genome_12475_ncbi.fasta #4208-->760
#https://de.wikipedia.org/wiki/Hepatitis-D-Virus
Hepatitis delta virus, complete genome
NCBI Reference Sequence: NC_001653.2
(Deprecated) Calling intra-host variants using viral-ngs
#How to run and debug the viral-ngs docker?
mkdir viralngs; cd viralngs
ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile
ln -s ~/Tools/viral-ngs_docker/bin bin
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/refsel.acids refsel.acids
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/lastal.acids lastal.acids
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/config.yaml config.yaml
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-runs.txt samples-runs.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-depletion.txt samples-depletion.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-metagenomics.txt samples-metagenomics.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly.txt samples-assembly.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly-failures.txt samples-assembly-failures.txt
# Adapt the sample-*.txt
mkdir viralngs/data
mkdir viralngs/data/00_raw
mkdir bams
ref_fa="NC_001653.fasta";
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bwa index ${ref_fa}; \
bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
done
conda activate viral-ngs4
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=~/DATA/Data_Sophie_HDV_Sequences/viralngs/data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
done
conda deactivate
# Activate the docker viralngs environment
docker run -it --rm -v /mnt/md1/DATA/Data_Sophie_HDV_Sequences/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk own_viral_ngs_gap2seq bash
cd /work
# -- ! Firstly manully run for generating all files ${sample}.cleaned.bam and ${sample}.taxfilt.bam in 01_cleaned and 01_per_sample
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
# -- generating data/01_cleaned/${sample}.cleaned.bam --
bin/taxon_filter.py deplete data/00_raw/${sample}.bam tmp/01_cleaned/${sample}.raw.bam tmp/01_cleaned/${sample}.bmtagger_depleted.bam tmp/01_cleaned/${sample}.rmdup.bam data/01_cleaned/${sample}.cleaned.bam --bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 --blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus --threads 60 --srprismMemory 14250 --JVMmemory 50g
# -- data/01_cleaned/073.cleaned.bam --> data/01_cleaned/073.taxfilt.bam --
bin/taxon_filter.py filter_lastal_bam data/01_cleaned/${sample}.cleaned.bam lastal_db/lastal.fasta data/01_cleaned/${sample}.taxfilt.bam
bin/read_utils.py bwamem_idxstats data/01_cleaned/${sample}.cleaned.bam /home/jhuang/REFs/viral_ngs_dbs/spikeins/ercc_spike-ins.fasta --outStats reports/spike_count/${sample}.spike_count.txt --minScoreToFilter 60
fastqc -f bam data/01_cleaned/${sample}.cleaned.bam -o reports/fastqc/${sample}
unzip reports/fastqc/${sample}/${sample}.cleaned_fastqc.zip -d reports/fastqc/${sample}
fastqc -f bam data/01_cleaned/${sample}.taxfilt.bam -o reports/fastqc/${sample}
unzip reports/fastqc/${sample}/${sample}.taxfilt_fastqc.zip -d reports/fastqc/${sample}
# -- data/01_cleaned/${sample}.cleaned.bam --> data/01_per_sample/${sample}.cleaned.bam --
bin/read_utils.py merge_bams data/01_cleaned/${sample}.cleaned.bam tmp/01_cleaned/${sample}.cleaned.bam --picardOptions SORT_ORDER=queryname
bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.cleaned.bam data/01_per_sample/${sample}.cleaned.bam --JVMmemory 30g
# -- data/01_cleaned/${sample}.taxfilt.bam --> data/01_per_sample/${sample}.taxfilt.bam --
bin/read_utils.py merge_bams data/01_cleaned/${sample}.taxfilt.bam tmp/01_cleaned/${sample}.taxfilt.bam --picardOptions SORT_ORDER=queryname
bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.taxfilt.bam data/01_per_sample/${sample}.taxfilt.bam --JVMmemory 30g
done
# -- ! Secondly --
#If direct use snakemake --directory /work --printshellcmds --cores 40, has the following error, using bash commands instead.
#Error in rule orient_and_impute:
#jobid: 0
#output: tmp/02_assembly/HE290.assembly3-modify.fasta
#DEBUG: --memLimitGb 12 --> --memLimitGb 960, if threads=60: 256M / 58G, how big the memory needed when threads=120: 492M / 468G.
##ASSEMBLY_1_SPADES: data/01_per_sample/010.taxfilt.bam ----> 010.assembly1-spades.fasta in tmp/02_assembly/
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py assemble_spades data/01_per_sample/${sample}.taxfilt.bam /home/jhuang/REFs/viral_ngs_dbs/trim_clip/contaminants.fasta tmp/02_assembly/${sample}.assembly1-spades.fasta --nReads 10000000 --threads 120 --memLimitGb 960
done
#ASSEMBLY_2_SCAFFOLDED: 010.assembly1-spades.fasta ----> 010.assembly2-scaffolded[_ref].fasta + 010.assembly2-alternate_sequences.fasta
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py order_and_orient tmp/02_assembly/${sample}.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 60
done
#DEBUG: the tool gap2seq is missing, installing package gap2seq to root@bcd2c36b083c:/opt/miniconda/envs/viral-ngs-env/bin
#
# #https://www.cs.helsinki.fi/u/lmsalmel/Gap2Seq/
# apt-get update
# apt-get install -y cmake
# mkdir build; cd build; cmake ..; make
#
# #cp /work/Gap2Seq-2.1/build/Gap2Seq /opt/miniconda/envs/viral-ngs-env/bin/gap2seq
# cp /work/Gap2Seq-2.1/build/Gap2Seq.sh /opt/miniconda/envs/viral-ngs-env/bin/
# cp /work/Gap2Seq-2.1/build/Gap2Seq /opt/miniconda/envs/viral-ngs-env/bin/
# cp /work/Gap2Seq-2.1/build/GapCutter /opt/miniconda/envs/viral-ngs-env/bin/
# cp /work/Gap2Seq-2.1/build/GapMerger /opt/miniconda/envs/viral-ngs-env/bin/
# cp -r /work/Gap2Seq-2.1/build/ext /opt/miniconda/envs/viral-ngs-env/bin/
# Gap2Seq.sh --help
#
# #MOFIFIED1 in bin/tools/gap2seq.py
# #TOOL_VERSION = '2.1'
# TOOL_VERSION = '3.1.1a2'
#
# root@544789adb8b6:/work# for sample in 010; do bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG; done
# 2025-06-20 11:12:41,165 - gap2seq:44:execute - DEBUG - running gap2seq: /opt/miniconda/envs/viral-ngs-env/bin/Gap2Seq.sh -scaffolds /work/tmp/02_assembly/010.assembly2-scaffolded.fasta -filled /tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpkt8508du_gap2seq_dir/gap2seq-filled.s3.k90.fasta -reads /tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpfj62s6n5.1.fq,/tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpu5eu1n1r.2.fq -all-upper -verbose -solid 3 -k 90 -nb-cores 0 -max-mem 960 -randseed 0
# /opt/miniconda/envs/viral-ngs-env/bin/Gap2Seq.sh: Unrecognized option -randseed
#
# #MODIFIED2 in bin/tools/gap2seq.py: delete 'randseed=random_seed' in solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb, randseed=random_seed) so that solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb)
#
# docker commit 3f9f9507ab31 viral_ngs_with_gap2seq
# docker image ls or docker images
#
# #NOTE that the image cannot be deleted, since linke to other images!
# docker rmi own_viral_ngs_with_gap2seq
# #Error response from daemon: conflict: unable to remove repository reference "own_viral_ngs_gap2seq" (must force) - container 3f9f9507ab31 is using its referenced image 7ffc275c57cc
#NOTE: --memLimitGb 12 --> --memLimitGb 960
#ASSEMBLY_2_GAPFILLED: 010.assembly2-scaffolded.fasta + data/01_per_sample/010.cleaned.bam ----> 010.assembly2-gapfilled.fasta
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG
done
#ASSEMBLY_3_MOFIFY: 010.assembly2-gapfilled.fasta + 010.assembly2-scaffold_ref.fasta ----> 010.assembly3-modify.[fasta|fasta.fai|dict|nix]
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
done
#ASSEMBLY_4_REFINED: 010.assembly3-modify.fasta + data/01_per_sample/010.cleaned.bam ----> 010.assembly4-refined.[fasta|fasta.fai|dict|nix] + 010.assembly3.[vcf.gz|vcf.gz.tbi]
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 60
done
#ASSEMBLY_5_REFINED2_GENERATE_ASSEMBLY_IN_DATA_DIR: tmp/02_assembly/010.assembly4-refined.fasta + data/01_per_sample/010.cleaned.bam ----> data/02_assembly/010.[fasta|fasta.fai|dict|nix] + tmp/02_assembly/010.assembly4.[vcf.gz|vcf.gz.tbi]
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 60
done
#ALIGN_CLEANED_BAM_GENERATE_MAPPED_BAM: data/02_assembly/010.fasta + data/01_per_sample/010.cleaned.bam ----> data/02_align_to_self/010.bam + data/02_align_to_self/010.mapped.bam
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/read_utils.py align_and_fix data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outBamAll data/02_align_to_self/${sample}.bam --outBamFiltered data/02_align_to_self/${sample}.mapped.bam --aligner novoalign --aligner_options '-r Random -l 20 -g 40 -x 20 -t 100 -k' --threads 60
done
# -- ! Thirdly set the samples-assembly.txt full
snakemake --directory /work --printshellcmds --cores 40
#Error in rule orient_and_impute:
#jobid: 0
#output: tmp/02_assembly/HE290.assembly3-modify.fast
# # ---- The snakemake pipeline contains the following remaining steps ----
#
# fastqc -f bam data/02_align_to_self/093.bam -o reports/fastqc/093
# unzip reports/fastqc/093/093_fastqc.zip -d reports/fastqc/093
# fastqc -f bam data/01_cleaned/093.cleaned.bam -o reports/fastqc/093
# unzip reports/fastqc/093/093.cleaned_fastqc.zip -d reports/fastqc/093
# fastqc -f bam data/01_cleaned/093.taxfilt.bam -o reports/fastqc/093
# unzip reports/fastqc/093/093.taxfilt_fastqc.zip -d reports/fastqc/093
#
# bin/intrahost.py vphaser_one_sample data/02_align_to_self/093.mapped.bam data/02_assembly/093.fasta data/04_intrahost/vphaser2.093.txt.gz --vphaserNumThreads 15 --removeDoublyMappedReads --minReadsEach 5 --maxBias 10
# bin/reports.py consolidate_fastqc reports/fastqc/093/taxfilt reports/summary.fastqc.taxfilt.txt
# bin/reports.py consolidate_fastqc reports/fastqc/093/align_to_self reports/summary.fastqc.align_to_self.txt
# bin/reports.py consolidate_fastqc reports/fastqc/093/cleaned reports/summary.fastqc.cleaned.txt
# bin/interhost.py multichr_mafft ref_genome/reference.fasta data/02_assembly/093.fasta data/03_multialign_to_ref --ep 0.123 --maxiters 1000 --preservecase --localpair --outFilePrefix aligned --sampleNameListFile data/03_multialign_to_ref/sampleNameList.txt --threads 60
# bin/intrahost.py merge_to_vcf ref_genome/reference.fasta data/04_intrahost/isnvs.vcf.gz --samples 093 --isnvs data/04_intrahost/vphaser2.093.txt.gz --alignments data/03_multialign_to_ref/aligned_1.fasta --strip_chr_version --parse_accession
# bin/interhost.py snpEff data/04_intrahost/isnvs.vcf.gz NC_001653.2 data/04_intrahost/isnvs.annot.vcf.gz j.huang@uke.de
# bin/intrahost.py iSNV_table data/04_intrahost/isnvs.annot.vcf.gz data/04_intrahost/isnvs.annot.txt.gz
# bin/reports.py consolidate_spike_count reports/spike_count reports/summary.spike_count.txt
vrap-calling
ln -s /home/jhuang/Tools/vrap/ .
mamba activate /home/jhuang/miniconda3/envs/vrap
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
vrap/vrap.py -1 trimmed/${sample}_trimmed_P_1.fastq -2 trimmed/${sample}_trimmed_P_2.fastq -o vrap_${sample} --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Sophie_HDV_Sequences/complete_genome_12475_ncbi.fasta --nt=/mnt/nvme0n1p1/blast/nt --nr=/mnt/nvme0n1p1/blast/nr -t 100 -l 200 -g
done
(Deprecated) Using docker viral-ngs scripts processing the vrap-results. Be carefual since it doesn't release good results.
mv vrap_010 viralngs/
mkdir tmp/02_assembly data/02_assembly
#refsel_db/refsel.fasta
The longest contig genome-calling is
010: HQ005371
# Using viral-ngs to improve the assembly, the results are not so good due to the too diverser reference --> not used!
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
for sample in 010; do
bin/assembly.py order_and_orient vrap_${sample}/virus_user_db_contigs.fasta HQ005371.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 60
bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 60
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 60
bin/read_utils.py align_and_fix data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outBamAll data/02_align_to_self/${sample}.bam --outBamFiltered data/02_align_to_self/${sample}.mapped.bam --aligner novoalign --aligner_options '-r Random -l 20 -g 40 -x 20 -t 100 -k' --threads 60
done
Filtering the contig from the vrap results
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
python ~/Scripts/extract_virus_user_db_contigs.py vrap_${sample}
done
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
python ~/Scripts/extract_longest_contig.py vrap_${sample}/virus_user_db_contigs.fasta ${sample}_raw.fasta
done
Circularity checking of the contigs: method_1 using ccfind
#Install ccfind
#Detects circular genomes via terminal redundancy using BLAST or FASTA Smith–Waterman.
git clone https://github.com/yosuken/ccfind
cd ccfind
# Ensure ssearch36, blastn, prodigal are installed
./ccfind <input.fasta> <output_dir>
#https://github.com/wrpearson/fasta3
wget http://faculty.virginia.edu/wrpearson/fasta/fasta36/fasta-36.3.8h.tar.gz
tar -xzf fasta-36.3.8h.tar.gz
cd fasta-36.3.8h
make -f Makefile.linux64_sse2
After compiling, add it to your path: export PATH=$PWD:$PATH
# Using ccfind check circularity of the contigs
#docker pull sangerpathogens/circlator
#docker run -it --rm -v /home/jhuang/DATA/Data_Sophie_HDV_Sequences:/data sangerpathogens/circlator bash
#cd /data
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
~/Tools/ccfind/ccfind ${sample}_raw.fasta ccfind_${sample}
#seqtk mergepe trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq > ${sample}_interleaved.fq
#circlator all --threads 16 vrap_010/virus_user_db_contigs.fasta 010_interleaved.fq 010_circlator_out
done
#print file size of all files circ.noTR.fasta under "ccfind_*/result/"
find ccfind_*/result/ -name "circ.noTR.fasta" -exec ls -lh {} \; | awk '{print $5, $9}'
find ccfind_*/result/ -name "circ.noTR.fasta" -exec stat -c "%s %n" {} \;
Circularity checking of the contigs: method_2 using blastn
# Manual inspection of the mate-pair read mapping at the start and end confirmed that three of the contigs were circular plasmids.
# -- Circularity_varification: Confirm the contigs are circular using blastn --
makeblastdb -in virus_user_db_contigs.fasta -dbtype nucl -out contigs_db
blastn -query virus_user_db_contigs.fasta -db contigs_db -outfmt 6 -evalue 1e-10 > blast_results.txt
CAP_1_length_1851 CAP_1_length_1851 100.000 166 0 0 1686 1851 1 166 4.14e-86 307
CAP_1_length_1851 CAP_1_length_1851 100.000 166 0 0 1 166 1686 1851 4.14e-86 307
samtools faidx virus_user_db_contigs.fasta CAP_1_length_1851 > CAP_1.fasta
python3 ~/Scripts/process_circular.py CAP_1.fasta CAP_1_circular.fasta --overlap_len 166
#Modify the sequence header to 010
Calculate coverage
bwa index CAP_1_circular.fasta
bwa mem CAP_1_circular.fasta ../trimmed/010_trimmed_P_1.fastq ../trimmed/010_trimmed_P_2.fastq > aligned_reads.sam
samtools view -bS aligned_reads.sam | samtools sort -o aligned_reads.sorted.bam
samtools index aligned_reads.sorted.bam
samtools depth aligned_reads.sorted.bam > coverage.txt
awk '{sum+=$3} END {print "Average coverage:", sum/NR}' coverage.txt
#Average coverage: 7754.49
bwa index 010.fasta
bwa mem 010.fasta trimmed/010_trimmed_P_1.fastq trimmed/010_trimmed_P_2.fastq > 010_aligned_reads.sam
samtools view -bS 010_aligned_reads.sam | samtools sort -o 010_aligned_reads.sorted.bam
samtools index 010_aligned_reads.sorted.bam
bwa index 068.fasta
bwa mem 068.fasta trimmed/068_trimmed_P_1.fastq trimmed/068_trimmed_P_2.fastq > 068_aligned_reads.sam
samtools view -bS 068_aligned_reads.sam | samtools sort -o 068_aligned_reads.sorted.bam
samtools index 068_aligned_reads.sorted.bam
Copy and update the fasta-headers
#!/bin/bash
# List of IDs to process
ids=(
010 020 048 073 079 083 093 104 253 279 282 341 383 394 405 446 449 503 550 69 738 879 88
HE290 HE511 HE554 HE695 HE748
)
for id in "${ids[@]}"; do
src="ccfind_${id}/result/circ.noTR.fasta"
dst="${id}.fasta"
if [[ -f "$src" ]]; then
cp "$src" "$dst"
# Update header in the copied fasta
ruby -e "
filename = '${id}'
seq = ''
File.foreach('${dst}') do |line|
next if line.start_with?('>')
seq += line.strip
end
File.open('${dst}', 'w') do |f|
f.puts '>' + filename
f.puts seq
end
"
echo "Processed $id"
else
echo "Warning: source file $src not found!"
fi
done
Generate coverage plot filtering some sequences, then align qualified sequences as input of RDP4
#69 88 HE290 HE511 HE554 HE695 HE748 --> 069 088 290 511 554 695 748
mv 69_my.fasta 069_my.fasta
mv 88_my.fasta 088_my.fasta
mv HE290_my.fasta 290_my.fasta
mv HE511_my.fasta 511_my.fasta
mv HE554_my.fasta 554_my.fasta
mv HE695_my.fasta 695_my.fasta
mv HE748_my.fasta 748_my.fasta
for sample in 010 020 048 073 079 083 093 104 253 279 282 341 383 394 405 446 449 503 550 738 879 069 088 290 511 554 695 748; do
mv ~/Downloads/${sample}.fasta .
done
cd trimmed_
ln -s 69_trimmed_P_1.fastq 069_trimmed_P_1.fastq
ln -s 69_trimmed_P_2.fastq 069_trimmed_P_2.fastq
ln -s 88_trimmed_P_1.fastq 088_trimmed_P_1.fastq
ln -s 88_trimmed_P_2.fastq 088_trimmed_P_2.fastq
ln -s HE290_trimmed_P_1.fastq 290_trimmed_P_1.fastq
ln -s HE290_trimmed_P_2.fastq 290_trimmed_P_2.fastq
ln -s HE511_trimmed_P_1.fastq 511_trimmed_P_1.fastq
ln -s HE511_trimmed_P_2.fastq 511_trimmed_P_2.fastq
ln -s HE554_trimmed_P_1.fastq 554_trimmed_P_1.fastq
ln -s HE554_trimmed_P_2.fastq 554_trimmed_P_2.fastq
ln -s HE695_trimmed_P_1.fastq 695_trimmed_P_1.fastq
ln -s HE695_trimmed_P_2.fastq 695_trimmed_P_2.fastq
ln -s HE748_trimmed_P_1.fastq 748_trimmed_P_1.fastq
ln -s HE748_trimmed_P_2.fastq 748_trimmed_P_2.fastq
ln -s 81_trimmed_P_1.fastq 081_trimmed_P_1.fastq
ln -s 81_trimmed_P_2.fastq 081_trimmed_P_2.fastq
ln -s 82_trimmed_P_1.fastq 082_trimmed_P_1.fastq
ln -s 82_trimmed_P_2.fastq 082_trimmed_P_2.fastq
ln -s 94_trimmed_P_1.fastq 094_trimmed_P_1.fastq
ln -s 94_trimmed_P_2.fastq 094_trimmed_P_2.fastq
ln -s HE284_trimmed_P_1.fastq 284_trimmed_P_1.fastq
ln -s HE284_trimmed_P_2.fastq 284_trimmed_P_2.fastq
ln -s HE566_trimmed_P_1.fastq 566_trimmed_P_1.fastq
ln -s HE566_trimmed_P_2.fastq 566_trimmed_P_2.fastq
#update_file_header.sh
update_fasta.py # generate the HDV_genomes_
conda activate plot-numpy1
for sample in 010 020 048 073 079 083 093 104 253 279 282 341 383 394 405 446 449 503 550 738 879 069 088 290 511 554 695 748 068 081 082 090 094 097 103 107 108 109 129 141 284 357 370 442 444 445 497 566 691 771 973 995 1021 293 450 478 494; do
bwa index HDV_genomes_/${sample}.fasta
bwa mem HDV_genomes_/${sample}.fasta trimmed_/${sample}_trimmed_P_1.fastq trimmed_/${sample}_trimmed_P_2.fastq > ${sample}_aligned_reads.sam
samtools view -bS ${sample}_aligned_reads.sam | samtools sort -o ${sample}_aligned_reads.sorted.bam
samtools index ${sample}_aligned_reads.sorted.bam
samtools depth -m 0 -a ${sample}_aligned_reads.sorted.bam > ${sample}_coverage.txt
python ~/Scripts/plot_coverage.py ${sample}_coverage.txt ${sample}_cov.png
done
for sample in 010 020 048 073 079 083 093 104 253 279 282 341 383 394 405 446 449 503 550 738 879 069 088 290 511 554 695 748; do
bwa index HDV_genomes_/${sample}_my.fasta
bwa mem HDV_genomes_/${sample}_my.fasta trimmed_/${sample}_trimmed_P_1.fastq trimmed_/${sample}_trimmed_P_2.fastq > ${sample}_my_aligned_reads.sam
samtools view -bS ${sample}_my_aligned_reads.sam | samtools sort -o ${sample}_my_aligned_reads.sorted.bam
samtools index ${sample}_my_aligned_reads.sorted.bam
samtools depth -m 0 -a ${sample}_my_aligned_reads.sorted.bam > ${sample}_my_coverage.txt
python ~/Scripts/plot_coverage.py ${sample}_my_coverage.txt ${sample}_my_cov.png
done
#-- Note that The following two assembly were not sent to me due to the bad coverage --
#301_coverage.txt
#HSVM_coverage.txt
for file in *_cov.png; do
convert "$file" "${file%.png}.pdf"
done
pdftk 010_cov.pdf 010_my_cov.pdf 020_cov.pdf 020_my_cov.pdf 048_cov.pdf 048_my_cov.pdf 073_cov.pdf 073_my_cov.pdf 079_cov.pdf 079_my_cov.pdf 083_cov.pdf 083_my_cov.pdf 093_cov.pdf 093_my_cov.pdf 104_cov.pdf 104_my_cov.pdf 253_cov.pdf 253_my_cov.pdf 279_cov.pdf 279_my_cov.pdf 282_cov.pdf 282_my_cov.pdf 341_cov.pdf 341_my_cov.pdf 383_cov.pdf 383_my_cov.pdf 394_cov.pdf 394_my_cov.pdf 405_cov.pdf 405_my_cov.pdf 446_cov.pdf 446_my_cov.pdf 449_cov.pdf 449_my_cov.pdf 503_cov.pdf 503_my_cov.pdf 550_cov.pdf 550_my_cov.pdf 738_cov.pdf 738_my_cov.pdf 879_cov.pdf 879_my_cov.pdf 069_cov.pdf 069_my_cov.pdf 088_cov.pdf 088_my_cov.pdf 290_cov.pdf 290_my_cov.pdf 511_cov.pdf 511_my_cov.pdf 554_cov.pdf 554_my_cov.pdf 695_cov.pdf 695_my_cov.pdf 748_cov.pdf 748_my_cov.pdf 068_cov.pdf 081_cov.pdf 082_cov.pdf 090_cov.pdf 094_cov.pdf 097_cov.pdf 103_cov.pdf 107_cov.pdf 108_cov.pdf 109_cov.pdf 129_cov.pdf 141_cov.pdf 284_cov.pdf 357_cov.pdf 370_cov.pdf 442_cov.pdf 444_cov.pdf 445_cov.pdf 497_cov.pdf 566_cov.pdf 691_cov.pdf 771_cov.pdf 973_cov.pdf 995_cov.pdf 1021_cov.pdf 293_cov.pdf 450_cov.pdf 478_cov.pdf 494_cov.pdf cat output coverges_all.pdf
pdftk 010_cov.pdf 020_cov.pdf 048_cov.pdf 073_cov.pdf 079_cov.pdf 083_cov.pdf 093_cov.pdf 104_cov.pdf 253_cov.pdf 279_cov.pdf 282_cov.pdf 341_cov.pdf 383_cov.pdf 394_cov.pdf 405_cov.pdf 446_cov.pdf 449_cov.pdf 503_cov.pdf 550_cov.pdf 738_cov.pdf 879_cov.pdf 069_cov.pdf 088_cov.pdf 290_cov.pdf 511_cov.pdf 554_cov.pdf 695_cov.pdf 748_cov.pdf 068_cov.pdf 081_cov.pdf 082_cov.pdf 090_cov.pdf 094_cov.pdf 097_cov.pdf 103_cov.pdf 107_cov.pdf 108_cov.pdf 109_cov.pdf 129_cov.pdf 141_cov.pdf 284_cov.pdf 357_cov.pdf 370_cov.pdf 442_cov.pdf 444_cov.pdf 445_cov.pdf 497_cov.pdf 566_cov.pdf 691_cov.pdf 771_cov.pdf 973_cov.pdf 995_cov.pdf 1021_cov.pdf 293_cov.pdf 450_cov.pdf 478_cov.pdf 494_cov.pdf cat output coverages.pdf
#Not good quality: 082,094,129,284,566,691,293,450,478,494
cat 010.fasta 020.fasta 048.fasta 073.fasta 079.fasta 083.fasta 093.fasta 104.fasta 253.fasta 279.fasta 282.fasta 341.fasta 383.fasta 394.fasta 405.fasta 446.fasta 449.fasta 503.fasta 550.fasta 738.fasta 879.fasta 069.fasta 088.fasta 290.fasta 511.fasta 554.fasta 695.fasta 748.fasta 068.fasta 081.fasta 090.fasta 097.fasta 103.fasta 107.fasta 108.fasta 109.fasta 141.fasta 357.fasta 370.fasta 442.fasta 444.fasta 445.fasta 497.fasta 771.fasta 973.fasta 995.fasta 1021.fasta > all.fasta
awk '/^>/ {print $1} !/^>/ {print}' all.fasta > all_.fasta
mafft --adjustdirection --clustalout all_.fasta > all.aln
mafft --auto all_.fasta > aligned.fasta
#iqtree -s aligned.fasta -m GTR+G -bb 1000 -nt AUTO
FastTree -gtr -gamma aligned.fasta > tree.nwk
(NOT_NEEDED) rotate (fixstart) the genomes (not needed, since the genome provided has the same starting point)
#python rotate_circular_genome.py ${sample}.fasta ${sample}_rotated.fasta ATGAGC
(TODO) Draw plotTreeHeatmap
#http://xgenes.com/article/article-content/383/presence-absence-table-and-graphics-for-selected-genes-in-data-patricia-sepi-7samples/#
Report
Several assemblies (082, 094, 129, 284, 566, 691, 293, 450, 478, 494) show poor quality. Please refer to the attached coverage.pdf for an overview of read mapping. I excluded these from the recombination analysis, which was carried out using RDP4, applying nine detection methods (RDP, GENECONV, Bootscan, MaxChi, Chimaera, SiScan, PhylPro, LARD, and 3Seq).
Please note that the analysis assumes accurate assemblies—misassemblies can lead to false positives.
The results, summarized in the attached Excel files, identify three sequences (503, 109, 394) as potential recombinants. However, I recommend interpreting these findings with caution.
* Events 2 and 3: Breakpoints occur near the genome ends (1602–12 and 1451–134), where alignment artifacts are common.
* Event 1: The recombinant region is less than 100 nucleotides, which may be below biological relevance.
* Method flags: RDP flags Events 1 and 2 (~) as possibly caused by other evolutionary processes. All events are flagged (^) to indicate that the recombinant sequence may have been misidentified (one of the identified parents might be the recombinant).
点赞本文的读者
还没有人对此文章表态
没有评论
Analysis of the RNA binding protein (RBP) motifs for RNA-Seq and miRNAs (v3, simplied)
Processing RNAseq_2025_WT_vs_ΔIJ_on_ATCC19606
© 2023 XGenes.com Impressum