Tags: pipeline
Prepare input raw data
cd /mnt/md1/DATA/Data_Sophie_HDV_Sequences/raw_data
# Dry run: print the planned renames first
for f in *_R[12]_001.fastq.gz; do newname="$(echo "$f" | awk -F_ '{print $1 "_" $4 ".fastq.gz"}')"; echo mv "$f" "$newname"; done
# Then perform the renaming
for f in *_R[12]_001.fastq.gz; do newname="$(echo "$f" | awk -F_ '{print $1 "_" $4 ".fastq.gz"}')"; mv "$f" "$newname"; done
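For reference, a hedged example of what the rename above does, assuming Illumina-style names with exactly three underscore-separated fields before the read token (the sample name is hypothetical):
#   SampleA_S1_L001_R1_001.fastq.gz  ->  SampleA_R1.fastq.gz
#   SampleA_S1_L001_R2_001.fastq.gz  ->  SampleA_R2.fastq.gz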
Variant calling using snippy
ln -s ~/Tools/bacto/db/ .;
ln -s ~/Tools/bacto/envs/ .;
ln -s ~/Tools/bacto/local/ .;
cp ~/Tools/bacto/Snakefile .;
cp ~/Tools/bacto/bacto-0.1.json .;
cp ~/Tools/bacto/cluster.json .;
#download the HDV reference NC_001653 (GenBank format) from GenBank
mv ~/Downloads/sequence\(2\).gb db/NC_001653.gb
#set the following in bacto-0.1.json
"fastqc": false,
"taxonomic_classifier": false,
"assembly": true,
"typing_ariba": false,
"typing_mlst": true,
"pangenome": true,
"variants_calling": true,
"phylogeny_fasttree": true,
"phylogeny_raxml": true,
"recombination": false, (due to gubbins-error set false)
"genus": "Alphacoronavirus",
"kingdom": "Viruses",
"species": "Human coronavirus 229E",
"mykrobe": {
"species": "corona"
},
"reference": "db/PP810610.gb"
mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
(bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
Prepare virus database
# ---- Date is 16.06.2025. ----
#Taxonomy ID: 12475
esearch -db nucleotide -query "txid12475[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_12475_ncbi.fasta
python ~/Scripts/filter_fasta.py genome_12475_ncbi.fasta complete_genome_12475_ncbi.fasta  #keep complete genomes only: 4208 --> 760 sequences
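The content of ~/Scripts/filter_fasta.py is not shown here; as a rough sketch (assuming the filter simply keeps records whose header contains "complete genome"), the same selection could be done with awk:
awk 'BEGIN{RS=">"; ORS=""} NR>1 && tolower($0) ~ /complete genome/ {print ">"$0}' genome_12475_ncbi.fasta > complete_genome_12475_ncbi.fasta
grep -c "^>" complete_genome_12475_ncbi.fasta   # should report roughly 760 records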
#https://de.wikipedia.org/wiki/Hepatitis-D-Virus
Hepatitis delta virus, complete genome
NCBI Reference Sequence: NC_001653.2
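If preferred, the HDV reference can also be fetched directly with Entrez Direct instead of via the browser download above (a minimal sketch; the db/ path matches the snippy setup):
efetch -db nuccore -id NC_001653.2 -format gb > db/NC_001653.gb
efetch -db nuccore -id NC_001653.2 -format fasta > NC_001653.fasta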
(Deprecated) Calling intra-host variants using viral-ngs
#How to run and debug the viral-ngs docker?
mkdir viralngs; cd viralngs
ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile
ln -s ~/Tools/viral-ngs_docker/bin bin
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/refsel.acids refsel.acids
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/lastal.acids lastal.acids
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/config.yaml config.yaml
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-runs.txt samples-runs.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-depletion.txt samples-depletion.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-metagenomics.txt samples-metagenomics.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly.txt samples-assembly.txt
cp ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly-failures.txt samples-assembly-failures.txt
# Adapt the samples-*.txt files
mkdir -p viralngs/data/00_raw
mkdir bams
ref_fa="NC_001653.fasta";
# Index the reference once, then map each sample
bwa index ${ref_fa}
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
done
conda activate viral-ngs4
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=~/DATA/Data_Sophie_HDV_Sequences/viralngs/data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
done
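An optional mapping-rate check on the read-grouped BAMs before handing them to viral-ngs (a minimal sketch; assumes samtools is available in the current environment):
for bam in ~/DATA/Data_Sophie_HDV_Sequences/viralngs/data/00_raw/*.bam; do
    echo "== ${bam} =="
    samtools flagstat "${bam}"
done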
conda deactivate
# Start the viral-ngs Docker container
docker run -it -v /mnt/md1/DATA/Data_Sophie_HDV_Sequences/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk own_viral_ngs_gap2seq bash
cd /work
# -- ! First, manually run the following to generate all ${sample}.cleaned.bam and ${sample}.taxfilt.bam files in 01_cleaned and 01_per_sample
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
# -- generating data/01_cleaned/${sample}.cleaned.bam --
bin/taxon_filter.py deplete data/00_raw/${sample}.bam tmp/01_cleaned/${sample}.raw.bam tmp/01_cleaned/${sample}.bmtagger_depleted.bam tmp/01_cleaned/${sample}.rmdup.bam data/01_cleaned/${sample}.cleaned.bam --bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 --blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus --threads 60 --srprismMemory 14250 --JVMmemory 50g
# -- data/01_cleaned/073.cleaned.bam --> data/01_cleaned/073.taxfilt.bam --
bin/taxon_filter.py filter_lastal_bam data/01_cleaned/${sample}.cleaned.bam lastal_db/lastal.fasta data/01_cleaned/${sample}.taxfilt.bam
bin/read_utils.py bwamem_idxstats data/01_cleaned/${sample}.cleaned.bam /home/jhuang/REFs/viral_ngs_dbs/spikeins/ercc_spike-ins.fasta --outStats reports/spike_count/${sample}.spike_count.txt --minScoreToFilter 60
fastqc -f bam data/01_cleaned/${sample}.cleaned.bam -o reports/fastqc/${sample}
unzip reports/fastqc/${sample}/${sample}.cleaned_fastqc.zip -d reports/fastqc/${sample}
fastqc -f bam data/01_cleaned/${sample}.taxfilt.bam -o reports/fastqc/${sample}
unzip reports/fastqc/${sample}/${sample}.taxfilt_fastqc.zip -d reports/fastqc/${sample}
# -- data/01_cleaned/${sample}.cleaned.bam --> data/01_per_sample/${sample}.cleaned.bam --
bin/read_utils.py merge_bams data/01_cleaned/${sample}.cleaned.bam tmp/01_cleaned/${sample}.cleaned.bam --picardOptions SORT_ORDER=queryname
bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.cleaned.bam data/01_per_sample/${sample}.cleaned.bam --JVMmemory 30g
# -- data/01_cleaned/${sample}.taxfilt.bam --> data/01_per_sample/${sample}.taxfilt.bam --
bin/read_utils.py merge_bams data/01_cleaned/${sample}.taxfilt.bam tmp/01_cleaned/${sample}.taxfilt.bam --picardOptions SORT_ORDER=queryname
bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.taxfilt.bam data/01_per_sample/${sample}.taxfilt.bam --JVMmemory 30g
done
# -- ! Second --
#Running "snakemake --directory /work --printshellcmds --cores 40" directly fails with the following error, so the bash commands below are used instead.
#Error in rule orient_and_impute:
#jobid: 0
#output: tmp/02_assembly/HE290.assembly3-modify.fasta
#DEBUG: --memLimitGb 12 --> --memLimitGb 960; with threads=60 memory usage was 256M / 58G; how much memory is needed with threads=120: 492M / 468G.
##ASSEMBLY_1_SPADES: data/01_per_sample/010.taxfilt.bam ----> 010.assembly1-spades.fasta in tmp/02_assembly/
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py assemble_spades data/01_per_sample/${sample}.taxfilt.bam /home/jhuang/REFs/viral_ngs_dbs/trim_clip/contaminants.fasta tmp/02_assembly/${sample}.assembly1-spades.fasta --nReads 10000000 --threads 120 --memLimitGb 960
done
#ASSEMBLY_2_SCAFFOLDED: 010.assembly1-spades.fasta ----> 010.assembly2-scaffolded[_ref].fasta + 010.assembly2-alternate_sequences.fasta
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py order_and_orient tmp/02_assembly/${sample}.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 60
done
#DEBUG: the tool gap2seq is missing; install Gap2Seq into /opt/miniconda/envs/viral-ngs-env/bin inside the container (root@bcd2c36b083c)
#
# #https://www.cs.helsinki.fi/u/lmsalmel/Gap2Seq/
# apt-get update
# apt-get install -y cmake
# mkdir build; cd build; cmake ..; make
#
# #cp /work/Gap2Seq-2.1/build/Gap2Seq /opt/miniconda/envs/viral-ngs-env/bin/gap2seq
# cp /work/Gap2Seq-2.1/build/Gap2Seq.sh /opt/miniconda/envs/viral-ngs-env/bin/
# cp /work/Gap2Seq-2.1/build/Gap2Seq /opt/miniconda/envs/viral-ngs-env/bin/
# cp /work/Gap2Seq-2.1/build/GapCutter /opt/miniconda/envs/viral-ngs-env/bin/
# cp /work/Gap2Seq-2.1/build/GapMerger /opt/miniconda/envs/viral-ngs-env/bin/
# cp -r /work/Gap2Seq-2.1/build/ext /opt/miniconda/envs/viral-ngs-env/bin/
# Gap2Seq.sh --help
#
# #MODIFIED1 in bin/tools/gap2seq.py
# #TOOL_VERSION = '2.1'
# TOOL_VERSION = '3.1.1a2'
#
# root@544789adb8b6:/work# for sample in 010; do bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG; done
# 2025-06-20 11:12:41,165 - gap2seq:44:execute - DEBUG - running gap2seq: /opt/miniconda/envs/viral-ngs-env/bin/Gap2Seq.sh -scaffolds /work/tmp/02_assembly/010.assembly2-scaffolded.fasta -filled /tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpkt8508du_gap2seq_dir/gap2seq-filled.s3.k90.fasta -reads /tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpfj62s6n5.1.fq,/tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpu5eu1n1r.2.fq -all-upper -verbose -solid 3 -k 90 -nb-cores 0 -max-mem 960 -randseed 0
# /opt/miniconda/envs/viral-ngs-env/bin/Gap2Seq.sh: Unrecognized option -randseed
#
# #MODIFIED2 in bin/tools/gap2seq.py: remove the 'randseed=random_seed' argument, i.e. change (solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb, randseed=random_seed) to (solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb)
#
# docker commit 3f9f9507ab31 viral_ngs_with_gap2seq
# docker image ls or docker images
#
# #NOTE: the image cannot be deleted while a container still references it!
# docker rmi own_viral_ngs_with_gap2seq
# #Error response from daemon: conflict: unable to remove repository reference "own_viral_ngs_gap2seq" (must force) - container 3f9f9507ab31 is using its referenced image 7ffc275c57cc
#NOTE: --memLimitGb 12 --> --memLimitGb 960
#ASSEMBLY_2_GAPFILLED: 010.assembly2-scaffolded.fasta + data/01_per_sample/010.cleaned.bam ----> 010.assembly2-gapfilled.fasta
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG
done
#ASSEMBLY_3_MODIFY: 010.assembly2-gapfilled.fasta + 010.assembly2-scaffold_ref.fasta ----> 010.assembly3-modify.[fasta|fasta.fai|dict|nix]
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
done
#ASSEMBLY_4_REFINED: 010.assembly3-modify.fasta + data/01_per_sample/010.cleaned.bam ----> 010.assembly4-refined.[fasta|fasta.fai|dict|nix] + 010.assembly3.[vcf.gz|vcf.gz.tbi]
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 60
done
#ASSEMBLY_5_REFINED2_GENERATE_ASSEMBLY_IN_DATA_DIR: tmp/02_assembly/010.assembly4-refined.fasta + data/01_per_sample/010.cleaned.bam ----> data/02_assembly/010.[fasta|fasta.fai|dict|nix] + tmp/02_assembly/010.assembly4.[vcf.gz|vcf.gz.tbi]
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 60
done
#ALIGN_CLEANED_BAM_GENERATE_MAPPED_BAM: data/02_assembly/010.fasta + data/01_per_sample/010.cleaned.bam ----> data/02_align_to_self/010.bam + data/02_align_to_self/010.mapped.bam
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
bin/read_utils.py align_and_fix data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outBamAll data/02_align_to_self/${sample}.bam --outBamFiltered data/02_align_to_self/${sample}.mapped.bam --aligner novoalign --aligner_options '-r Random -l 20 -g 40 -x 20 -t 100 -k' --threads 60
done
# -- ! Third, fill samples-assembly.txt with the full sample list
snakemake --directory /work --printshellcmds --cores 40
#Error in rule orient_and_impute:
#jobid: 0
#output: tmp/02_assembly/HE290.assembly3-modify.fasta
# # ---- The snakemake pipeline contains the following remaining steps ----
#
# fastqc -f bam data/02_align_to_self/093.bam -o reports/fastqc/093
# unzip reports/fastqc/093/093_fastqc.zip -d reports/fastqc/093
# fastqc -f bam data/01_cleaned/093.cleaned.bam -o reports/fastqc/093
# unzip reports/fastqc/093/093.cleaned_fastqc.zip -d reports/fastqc/093
# fastqc -f bam data/01_cleaned/093.taxfilt.bam -o reports/fastqc/093
# unzip reports/fastqc/093/093.taxfilt_fastqc.zip -d reports/fastqc/093
#
# bin/intrahost.py vphaser_one_sample data/02_align_to_self/093.mapped.bam data/02_assembly/093.fasta data/04_intrahost/vphaser2.093.txt.gz --vphaserNumThreads 15 --removeDoublyMappedReads --minReadsEach 5 --maxBias 10
# bin/reports.py consolidate_fastqc reports/fastqc/093/taxfilt reports/summary.fastqc.taxfilt.txt
# bin/reports.py consolidate_fastqc reports/fastqc/093/align_to_self reports/summary.fastqc.align_to_self.txt
# bin/reports.py consolidate_fastqc reports/fastqc/093/cleaned reports/summary.fastqc.cleaned.txt
# bin/interhost.py multichr_mafft ref_genome/reference.fasta data/02_assembly/093.fasta data/03_multialign_to_ref --ep 0.123 --maxiters 1000 --preservecase --localpair --outFilePrefix aligned --sampleNameListFile data/03_multialign_to_ref/sampleNameList.txt --threads 60
# bin/intrahost.py merge_to_vcf ref_genome/reference.fasta data/04_intrahost/isnvs.vcf.gz --samples 093 --isnvs data/04_intrahost/vphaser2.093.txt.gz --alignments data/03_multialign_to_ref/aligned_1.fasta --strip_chr_version --parse_accession
# bin/interhost.py snpEff data/04_intrahost/isnvs.vcf.gz NC_001653.2 data/04_intrahost/isnvs.annot.vcf.gz j.huang@uke.de
# bin/intrahost.py iSNV_table data/04_intrahost/isnvs.annot.vcf.gz data/04_intrahost/isnvs.annot.txt.gz
# bin/reports.py consolidate_spike_count reports/spike_count reports/summary.spike_count.txt
Contig calling using vrap
ln -s /home/jhuang/Tools/vrap/ .
mamba activate /home/jhuang/miniconda3/envs/vrap
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
vrap/vrap.py -1 trimmed/${sample}_trimmed_P_1.fastq -2 trimmed/${sample}_trimmed_P_2.fastq -o vrap_${sample} --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Sophie_HDV_Sequences/complete_genome_12475_ncbi.fasta --nt=/mnt/nvme0n1p1/blast/nt --nr=/mnt/nvme0n1p1/blast/nr -t 100 -l 200 -g
done
(Deprecated) Using the docker viral-ngs scripts to process the vrap results. Be careful: it does not produce good results.
mv vrap_010 viralngs/
mkdir tmp/02_assembly data/02_assembly
#refsel_db/refsel.fasta
The reference selected for genome calling of the longest contig is:
010: HQ005371
# Using viral-ngs to improve the assembly: the results are not good due to the overly divergent reference --> not used!
#for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
for sample in 010; do   # tested on sample 010 only
bin/assembly.py order_and_orient vrap_${sample}/virus_user_db_contigs.fasta HQ005371.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 60
bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG
bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 60
bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 60
bin/read_utils.py align_and_fix data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outBamAll data/02_align_to_self/${sample}.bam --outBamFiltered data/02_align_to_self/${sample}.mapped.bam --aligner novoalign --aligner_options '-r Random -l 20 -g 40 -x 20 -t 100 -k' --threads 60
done
Filtering the contigs from the vrap results
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
python ~/Scripts/extract_virus_user_db_contigs.py vrap_${sample}
done
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
python ~/Scripts/extract_longest_contig.py vrap_${sample}/virus_user_db_contigs.fasta ${sample}_raw.fasta
done
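~/Scripts/extract_longest_contig.py is not shown here; if seqkit is installed, an equivalent hedged one-liner per sample would be (sort contigs by length, keep the longest):
sample=010   # repeat for each sample in the list above
seqkit sort --by-length --reverse vrap_${sample}/virus_user_db_contigs.fasta | seqkit head -n 1 > ${sample}_raw.fasta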
Circularity checking of the contigs: method_1 using ccfind
#Install ccfind
#Detects circular genomes via terminal redundancy using BLAST or FASTA Smith–Waterman.
git clone https://github.com/yosuken/ccfind
cd ccfind
# Ensure ssearch36, blastn, prodigal are installed
./ccfind <input.fasta> <output_dir>
#https://github.com/wrpearson/fasta3
wget http://faculty.virginia.edu/wrpearson/fasta/fasta36/fasta-36.3.8h.tar.gz
tar -xzf fasta-36.3.8h.tar.gz
cd fasta-36.3.8h
make -f Makefile.linux64_sse2
After compiling, add it to your path: export PATH=$PWD:$PATH
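Before running ccfind, it may help to confirm that the required external tools are on PATH (a minimal check):
for t in ssearch36 blastn prodigal; do command -v "$t" >/dev/null || echo "missing: $t"; done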
# Use ccfind to check the circularity of the contigs
#docker pull sangerpathogens/circlator
#docker run -it -v /home/jhuang/DATA/Data_Sophie_HDV_Sequences:/data sangerpathogens/circlator bash
#cd /data
for sample in 010 048 073 083 093 1021 104 108 129 253 282 301 357 383 405 444 446 450 494 503 69 738 81 879 94 995 HE290 HE554 HE695 HSVM 020 068 079 090 097 103 107 109 141 279 293 341 370 394 442 445 449 478 497 550 691 771 82 88 973 HE284 HE511 HE566 HE748; do
~/Tools/ccfind/ccfind ${sample}_raw.fasta ccfind_${sample}
#seqtk mergepe trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq > ${sample}_interleaved.fq
#circlator all --threads 16 vrap_010/virus_user_db_contigs.fasta 010_interleaved.fq 010_circlator_out
done
#Print the file sizes of all circ.noTR.fasta files under ccfind_*/result/
find ccfind_*/result/ -name "circ.noTR.fasta" -exec ls -lh {} \; | awk '{print $5, $9}'
find ccfind_*/result/ -name "circ.noTR.fasta" -exec stat -c "%s %n" {} \;
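To list only the samples for which ccfind produced a non-empty circ.noTR.fasta (i.e. a candidate circular contig; this assumes an empty file means no circularity was detected), a small helper:
for d in ccfind_*/result; do
    f="$d/circ.noTR.fasta"
    [[ -s "$f" ]] && echo "circular: ${d%/result}"
done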
Circularity checking of the contigs: method_2 using blastn
# Manual inspection of the mate-pair read mapping at the start and end confirmed that three of the contigs were circular plasmids.
# -- Circularity_verification: Confirm the contigs are circular using blastn --
makeblastdb -in virus_user_db_contigs.fasta -dbtype nucl -out contigs_db
blastn -query virus_user_db_contigs.fasta -db contigs_db -outfmt 6 -evalue 1e-10 > blast_results.txt
#Example self-hits (outfmt 6) showing a 166-bp repeat shared by both contig ends:
#CAP_1_length_1851 CAP_1_length_1851 100.000 166 0 0 1686 1851 1 166 4.14e-86 307
#CAP_1_length_1851 CAP_1_length_1851 100.000 166 0 0 1 166 1686 1851 4.14e-86 307
samtools faidx virus_user_db_contigs.fasta CAP_1_length_1851 > CAP_1.fasta
python3 ~/Scripts/process_circular.py CAP_1.fasta CAP_1_circular.fasta --overlap_len 166
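~/Scripts/process_circular.py is not shown here; conceptually it removes one copy of the 166-bp terminal repeat so the contig can be treated as circular. A hedged equivalent using samtools region extraction (contig length 1851, overlap 166, so keep positions 1-1685; the output name is chosen to avoid clashing with the script's output):
samtools faidx CAP_1.fasta CAP_1_length_1851:1-1685 > CAP_1_circular_manual.fasta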
#Modify the sequence header to 010
Calculate coverage
bwa index CAP_1_circular.fasta
bwa mem CAP_1_circular.fasta ../trimmed/010_trimmed_P_1.fastq ../trimmed/010_trimmed_P_2.fastq > aligned_reads.sam
samtools view -bS aligned_reads.sam | samtools sort -o aligned_reads.sorted.bam
samtools index aligned_reads.sorted.bam
samtools depth aligned_reads.sorted.bam > coverage.txt
awk '{sum+=$3} END {print "Average coverage:", sum/NR}' coverage.txt
#Average coverage: 7754.49
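Note that samtools depth without -a skips zero-coverage positions, so the average above is taken over covered positions only. A hedged variant that averages over every reference position and also reports the breadth of coverage:
samtools depth -a aligned_reads.sorted.bam | awk '{sum+=$3; if($3>0) covered++} END {print "Mean depth:", sum/NR; print "Breadth (>0x):", covered/NR}'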
Copy and update the FASTA headers
#!/bin/bash
# List of IDs to process
ids=(
010 020 048 073 079 083 093 104 253 279 282 341 383 394 405 446 449 503 550 69 738 879 88
HE290 HE511 HE554 HE695 HE748
)
for id in "${ids[@]}"; do
src="ccfind_${id}/result/circ.noTR.fasta"
dst="${id}.fasta"
if [[ -f "$src" ]]; then
cp "$src" "$dst"
# Update header in the copied fasta
ruby -e "
filename = '${id}'
seq = ''
File.foreach('${dst}') do |line|
next if line.start_with?('>')
seq += line.strip
end
File.open('${dst}', 'w') do |f|
f.puts '>' + filename
f.puts seq
end
"
echo "Processed $id"
else
echo "Warning: source file $src not found!"
fi
done
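A quick sanity check (run in the same shell, right after the loop above) that each copied file now carries the sample ID as its only header:
for id in "${ids[@]}"; do [[ -f "${id}.fasta" ]] && head -n 1 "${id}.fasta"; done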
TODOs
#TODO_TOMORROW: based on the pipeline above, process all samples through assembly, then run RDP4 in the VirtualBox. Add the procedure to xgenes.com.
# TODO: send all assemblies; report that all viruses could be successfully assembled and that the assembly quality was checked.
# TODO: draw plotTreeHeatmap after the RDP4 calculation.
# NOTE: the viral-ngs results are sometimes not good; the process above produces more reliable, good-quality virus assemblies, and each intermediate step can be controlled.
TODO_AFTERNOON: sort the methods in this file; start the analysis for Karoline.
TODO_TOMORROW: Tam's manuscript.
(TODO) Multiple alignment and tree construction
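The snippet below uses shell variables that are not defined in this note; example placeholder values (the file names are assumptions, adjust to your data):
THREADS=16
INPUT_FASTA="all_genomes.fasta"        # e.g. the per-sample assemblies concatenated with the reference
ALIGNED_FASTA="all_genomes.aln.fasta"
TREE_FILE="all_genomes.fasttree.nwk"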
echo ">>> Running MAFFT (auto + adjust direction)…"
mafft --auto --adjustdirectionaccurately --thread "$THREADS" "$INPUT_FASTA" > "$ALIGNED_FASTA"
# Step 2: Build phylogenetic tree with FastTree (nucleotide)
echo ">>> Running FastTree…"
FastTree -nt "$ALIGNED_FASTA" > "$TREE_FILE"
(TODO) Draw plotTreeHeatmap
#http://xgenes.com/article/article-content/383/presence-absence-table-and-graphics-for-selected-genes-in-data-patricia-sepi-7samples/#