Viral genome assembly and recombination analysis for Data_Sophie_HDV_Sequences


Tags: pipeline

  1. Prepare input raw data

    cd /mnt/md1/DATA/Data_Sophie_HDV_Sequences/raw_data
    # Dry run: preview the new names first
    for f in *_R[12]_001.fastq.gz; do newname="$(echo "$f" | awk -F_ '{print $1 "_" $4 ".fastq.gz"}')"; echo mv "$f" "$newname"; done
    # Apply the renaming
    for f in *_R[12]_001.fastq.gz; do newname="$(echo "$f" | awk -F_ '{print $1 "_" $4 ".fastq.gz"}')"; mv "$f" "$newname"; done
    
  2. Variant calling using snippy

    ln -s ~/Tools/bacto/db/ .;
    ln -s ~/Tools/bacto/envs/ .;
    ln -s ~/Tools/bacto/local/ .;
    cp ~/Tools/bacto/Snakefile .;
    cp ~/Tools/bacto/bacto-0.1.json .;
    cp ~/Tools/bacto/cluster.json .;
    #download NC_001653.gb from GenBank
    mv ~/Downloads/sequence\(2\).gb db/NC_001653.gb
    #setting the following in bacto-0.1.json
        "fastqc": false,
        "taxonomic_classifier": false,
        "assembly": true,
        "typing_ariba": false,
        "typing_mlst": true,
        "pangenome": true,
        "variants_calling": true,
        "phylogeny_fasttree": true,
        "phylogeny_raxml": true,
        "recombination": false, (due to gubbins-error set false)
        "genus": "Alphacoronavirus",
        "kingdom": "Viruses",
        "species": "Human coronavirus 229E",
        "mykrobe": {
            "species": "corona"
        },
        "reference": "db/PP810610.gb"
    mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
    (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
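    Before launching the full run, a dry run can be used to check which rules will be executed (standard snakemake flags; an optional check, not part of the original protocol):
    # Show the planned jobs and their shell commands without running anything
    (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --dryrun --printshellcmds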
    
  3. Prepare virus database

    # ---- Date is 16.06.2025. ----
    #Taxonomy ID: 12475
    esearch -db nucleotide -query "txid12475[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_12475_ncbi.fasta
    python ~/Scripts/filter_fasta.py genome_12475_ncbi.fasta complete_genome_12475_ncbi.fasta  #4208-->760
    
    #https://de.wikipedia.org/wiki/Hepatitis-D-Virus
    Hepatitis delta virus, complete genome
    NCBI Reference Sequence: NC_001653.2
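    
    The filtering above relies on a local script (~/Scripts/filter_fasta.py). A minimal awk sketch with the same intent, assuming the kept records are those whose FASTA header contains "complete genome", would be:
    # Keep only FASTA records that mention "complete genome" (assumption about the filter criterion)
    awk 'BEGIN{RS=">"; ORS=""} NR>1 && $0 ~ /complete genome/ {print ">" $0}' genome_12475_ncbi.fasta > complete_genome_12475_ncbi_sketch.fasta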
    
  4. (Deprecated) Calling intra-host variants using viral-ngs

    #How to run and debug the viral-ngs docker?
    
    mkdir viralngs; cd viralngs
    ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile
    ln -s  ~/Tools/viral-ngs_docker/bin bin
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/refsel.acids refsel.acids
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/lastal.acids lastal.acids
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/config.yaml config.yaml
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-runs.txt samples-runs.txt
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-depletion.txt samples-depletion.txt
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-metagenomics.txt samples-metagenomics.txt
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly.txt samples-assembly.txt
    cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly-failures.txt samples-assembly-failures.txt
    
    # Adapt the sample-*.txt
    mkdir viralngs/data
    mkdir viralngs/data/00_raw
    
    mkdir bams
    ref_fa="NC_001653.fasta";
    
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bwa index ${ref_fa}; \
        bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
    done
    conda activate viral-ngs4
    
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=~/DATA/Data_Sophie_HDV_Sequences/viralngs/data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
    done
    conda deactivate
    
    # Activate the docker viralngs environment
    docker run -it -v /mnt/md1/DATA/Data_Sophie_HDV_Sequences/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk   own_viral_ngs_gap2seq bash
    cd /work
    
    # -- ! First, run the following manually to generate all files ${sample}.cleaned.bam and ${sample}.taxfilt.bam in 01_cleaned and 01_per_sample
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
    
        # -- generating data/01_cleaned/${sample}.cleaned.bam --
        bin/taxon_filter.py deplete data/00_raw/${sample}.bam tmp/01_cleaned/${sample}.raw.bam tmp/01_cleaned/${sample}.bmtagger_depleted.bam tmp/01_cleaned/${sample}.rmdup.bam data/01_cleaned/${sample}.cleaned.bam --bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 --blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus --threads 60 --srprismMemory 14250 --JVMmemory 50g
    
        # -- data/01_cleaned/073.cleaned.bam --> data/01_cleaned/073.taxfilt.bam --
        bin/taxon_filter.py filter_lastal_bam data/01_cleaned/${sample}.cleaned.bam lastal_db/lastal.fasta data/01_cleaned/${sample}.taxfilt.bam
        bin/read_utils.py bwamem_idxstats data/01_cleaned/${sample}.cleaned.bam /home/jhuang/REFs/viral_ngs_dbs/spikeins/ercc_spike-ins.fasta --outStats reports/spike_count/${sample}.spike_count.txt --minScoreToFilter 60
    
        fastqc -f bam data/01_cleaned/${sample}.cleaned.bam -o reports/fastqc/${sample}
        unzip reports/fastqc/${sample}/${sample}.cleaned_fastqc.zip -d reports/fastqc/${sample}
        fastqc -f bam data/01_cleaned/${sample}.taxfilt.bam -o reports/fastqc/${sample}
        unzip reports/fastqc/${sample}/${sample}.taxfilt_fastqc.zip -d reports/fastqc/${sample}
    
        # -- data/01_cleaned/${sample}.cleaned.bam --> data/01_per_sample/${sample}.cleaned.bam --
        bin/read_utils.py merge_bams data/01_cleaned/${sample}.cleaned.bam tmp/01_cleaned/${sample}.cleaned.bam --picardOptions SORT_ORDER=queryname
        bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.cleaned.bam data/01_per_sample/${sample}.cleaned.bam --JVMmemory 30g
    
        # -- data/01_cleaned/${sample}.taxfilt.bam --> data/01_per_sample/${sample}.taxfilt.bam --
        bin/read_utils.py merge_bams data/01_cleaned/${sample}.taxfilt.bam tmp/01_cleaned/${sample}.taxfilt.bam --picardOptions SORT_ORDER=queryname
        bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.taxfilt.bam data/01_per_sample/${sample}.taxfilt.bam --JVMmemory 30g
    done
    
    # -- ! Secondly --
    #Running snakemake directly (snakemake --directory /work --printshellcmds --cores 40) fails with the following error, so the bash commands below are used instead.
    #Error in rule orient_and_impute:
    #jobid: 0
    #output: tmp/02_assembly/HE290.assembly3-modify.fasta
    #DEBUG: --memLimitGb 12 --> --memLimitGb 960; with threads=60: 256M / 58G; with threads=120: 492M / 468G.
    ##ASSEMBLY_1_SPADES: data/01_per_sample/010.taxfilt.bam ----> 010.assembly1-spades.fasta in tmp/02_assembly/
    
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/assembly.py assemble_spades data/01_per_sample/${sample}.taxfilt.bam /home/jhuang/REFs/viral_ngs_dbs/trim_clip/contaminants.fasta tmp/02_assembly/${sample}.assembly1-spades.fasta --nReads 10000000 --threads 120 --memLimitGb 960
    done
    
    #ASSEMBLY_2_SCAFFOLDED: 010.assembly1-spades.fasta ----> 010.assembly2-scaffolded[_ref].fasta + 010.assembly2-alternate_sequences.fasta
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/assembly.py order_and_orient tmp/02_assembly/${sample}.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 60
    done
    
    #DEBUG: the tool gap2seq is missing, installing package gap2seq to root@bcd2c36b083c:/opt/miniconda/envs/viral-ngs-env/bin
    #
    #        #https://www.cs.helsinki.fi/u/lmsalmel/Gap2Seq/
    #        apt-get update
    #        apt-get install -y cmake
    #        mkdir build;  cd build;  cmake ..;  make
    #
    #        #cp /work/Gap2Seq-2.1/build/Gap2Seq /opt/miniconda/envs/viral-ngs-env/bin/gap2seq
    #        cp /work/Gap2Seq-2.1/build/Gap2Seq.sh /opt/miniconda/envs/viral-ngs-env/bin/
    #        cp /work/Gap2Seq-2.1/build/Gap2Seq /opt/miniconda/envs/viral-ngs-env/bin/
    #        cp /work/Gap2Seq-2.1/build/GapCutter /opt/miniconda/envs/viral-ngs-env/bin/
    #        cp /work/Gap2Seq-2.1/build/GapMerger /opt/miniconda/envs/viral-ngs-env/bin/
    #        cp -r /work/Gap2Seq-2.1/build/ext /opt/miniconda/envs/viral-ngs-env/bin/
    #        Gap2Seq.sh --help
    #
    #        #MODIFIED1 in bin/tools/gap2seq.py
    #        #TOOL_VERSION = '2.1'
    #        TOOL_VERSION = '3.1.1a2'
    #
    #        root@544789adb8b6:/work# for sample in 010; do             bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG;         done
    #        2025-06-20 11:12:41,165 - gap2seq:44:execute - DEBUG - running gap2seq: /opt/miniconda/envs/viral-ngs-env/bin/Gap2Seq.sh -scaffolds /work/tmp/02_assembly/010.assembly2-scaffolded.fasta -filled /tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpkt8508du_gap2seq_dir/gap2seq-filled.s3.k90.fasta -reads /tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpfj62s6n5.1.fq,/tmp/tmp-assembly-gapfill_gap2seq-vz_n3tkp/tmpu5eu1n1r.2.fq -all-upper -verbose -solid 3 -k 90 -nb-cores 0 -max-mem 960 -randseed 0
    #        /opt/miniconda/envs/viral-ngs-env/bin/Gap2Seq.sh: Unrecognized option -randseed
    #
    #        #MODIFIED2 in bin/tools/gap2seq.py: remove 'randseed=random_seed' from the call, i.e. change 'solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb, randseed=random_seed' to 'solid=solid_kmer_threshold, k=kmer_size, nb_cores=threads, max_mem=mem_limit_gb'
    #
    #        docker commit 3f9f9507ab31 viral_ngs_with_gap2seq
    #        docker image ls or docker images
    #
    #        #NOTE that the image cannot be deleted, since it is linked to other images!
    #        docker rmi own_viral_ngs_with_gap2seq
    #        #Error response from daemon: conflict: unable to remove repository reference "own_viral_ngs_gap2seq" (must force) - container 3f9f9507ab31 is using its referenced image 7ffc275c57cc
    
    #NOTE: --memLimitGb 12 --> --memLimitGb 960
    #ASSEMBLY_2_GAPFILLED: 010.assembly2-scaffolded.fasta + data/01_per_sample/010.cleaned.bam ----> 010.assembly2-gapfilled.fasta
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG
    done
    
    #ASSEMBLY_3_MODIFY: 010.assembly2-gapfilled.fasta + 010.assembly2-scaffold_ref.fasta ----> 010.assembly3-modify.[fasta|fasta.fai|dict|nix]
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
    done
    
    #ASSEMBLY_4_REFINED: 010.assembly3-modify.fasta + data/01_per_sample/010.cleaned.bam ----> 010.assembly4-refined.[fasta|fasta.fai|dict|nix] + 010.assembly3.[vcf.gz|vcf.gz.tbi]
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 60
    done
    
    #ASSEMBLY_5_REFINED2_GENERATE_ASSEMBLY_IN_DATA_DIR: tmp/02_assembly/010.assembly4-refined.fasta + data/01_per_sample/010.cleaned.bam ----> data/02_assembly/010.[fasta|fasta.fai|dict|nix] + tmp/02_assembly/010.assembly4.[vcf.gz|vcf.gz.tbi]
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 60
    done
    
    #ALIGN_CLEANED_BAM_GENERATE_MAPPED_BAM: data/02_assembly/010.fasta + data/01_per_sample/010.cleaned.bam ----> data/02_align_to_self/010.bam + data/02_align_to_self/010.mapped.bam
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        bin/read_utils.py align_and_fix data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outBamAll data/02_align_to_self/${sample}.bam --outBamFiltered data/02_align_to_self/${sample}.mapped.bam --aligner novoalign --aligner_options '-r Random -l 20 -g 40 -x 20 -t 100 -k' --threads 60
    done
    
    # -- ! Third, fill samples-assembly.txt with all samples and run the remaining steps via snakemake
    snakemake --directory /work --printshellcmds --cores 40
    #Error in rule orient_and_impute:
    #jobid: 0
    #output: tmp/02_assembly/HE290.assembly3-modify.fasta
    
                        # # ---- The snakemake pipeline contains the following remaining steps ----
                        #
                        # fastqc -f bam data/02_align_to_self/093.bam -o reports/fastqc/093
                        # unzip reports/fastqc/093/093_fastqc.zip -d reports/fastqc/093
                        # fastqc -f bam data/01_cleaned/093.cleaned.bam -o reports/fastqc/093
                        # unzip reports/fastqc/093/093.cleaned_fastqc.zip -d reports/fastqc/093
                        # fastqc -f bam data/01_cleaned/093.taxfilt.bam -o reports/fastqc/093
                        # unzip reports/fastqc/093/093.taxfilt_fastqc.zip -d reports/fastqc/093
                        #
                        # bin/intrahost.py vphaser_one_sample data/02_align_to_self/093.mapped.bam data/02_assembly/093.fasta data/04_intrahost/vphaser2.093.txt.gz --vphaserNumThreads 15 --removeDoublyMappedReads --minReadsEach 5 --maxBias 10
                        # bin/reports.py consolidate_fastqc reports/fastqc/093/taxfilt reports/summary.fastqc.taxfilt.txt
                        # bin/reports.py consolidate_fastqc reports/fastqc/093/align_to_self reports/summary.fastqc.align_to_self.txt
                        # bin/reports.py consolidate_fastqc reports/fastqc/093/cleaned reports/summary.fastqc.cleaned.txt
                        # bin/interhost.py multichr_mafft ref_genome/reference.fasta data/02_assembly/093.fasta data/03_multialign_to_ref --ep 0.123 --maxiters 1000 --preservecase --localpair --outFilePrefix aligned --sampleNameListFile data/03_multialign_to_ref/sampleNameList.txt --threads 60
                        # bin/intrahost.py merge_to_vcf ref_genome/reference.fasta data/04_intrahost/isnvs.vcf.gz --samples 093 --isnvs data/04_intrahost/vphaser2.093.txt.gz --alignments data/03_multialign_to_ref/aligned_1.fasta --strip_chr_version --parse_accession
                        # bin/interhost.py snpEff data/04_intrahost/isnvs.vcf.gz NC_001653.2 data/04_intrahost/isnvs.annot.vcf.gz j.huang@uke.de
                        # bin/intrahost.py iSNV_table data/04_intrahost/isnvs.annot.vcf.gz data/04_intrahost/isnvs.annot.txt.gz
                        # bin/reports.py consolidate_spike_count reports/spike_count reports/summary.spike_count.txt
    
  5. vrap-calling

    ln -s /home/jhuang/Tools/vrap/ .
    mamba activate /home/jhuang/miniconda3/envs/vrap
    
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
            vrap/vrap.py  -1 trimmed/${sample}_trimmed_P_1.fastq -2 trimmed/${sample}_trimmed_P_2.fastq -o vrap_${sample}  --bt2idx=/home/jhuang/REFs/genome  --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Sophie_HDV_Sequences/complete_genome_12475_ncbi.fasta --nt=/mnt/nvme0n1p1/blast/nt --nr=/mnt/nvme0n1p1/blast/nr  -t 100 -l 200  -g
    done
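    
    A quick sanity check of how many viral contigs each vrap run produced (this assumes the output file vrap_${sample}/virus_user_db_contigs.fasta used in the later steps):
    # Count FASTA records per vrap output directory; prints 0 if the file is missing
    for d in vrap_*/; do
        n=$(grep -c '^>' "${d}virus_user_db_contigs.fasta" 2>/dev/null)
        printf '%s\t%s\n' "${d%/}" "${n:-0}"
    done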
    
  6. (Deprecated) Using the docker viral-ngs scripts to process the vrap results. Be careful, since this does not produce good results.

    mv vrap_010 viralngs/
    mkdir tmp/02_assembly data/02_assembly
    
    #refsel_db/refsel.fasta
    
    The reference matched by the longest contig (used for genome calling) is:
    010: HQ005371
    
    # Using viral-ngs to improve the assembly; the results are not good because the reference is too divergent --> not used!
    #for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do   # full sample list (commented out; the block below was only run for sample 010)
    for sample in 010; do
        bin/assembly.py order_and_orient vrap_${sample}/virus_user_db_contigs.fasta HQ005371.fasta  tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 60
        bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 960 --maskErrors --randomSeed 0 --loglevel DEBUG
        bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index
        bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 60
        bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 60
    
        bin/read_utils.py align_and_fix data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outBamAll data/02_align_to_self/${sample}.bam --outBamFiltered data/02_align_to_self/${sample}.mapped.bam --aligner novoalign --aligner_options '-r Random -l 20 -g 40 -x 20 -t 100 -k' --threads 60
    done
    
  7. Filtering contigs from the vrap results

    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        python ~/Scripts/extract_virus_user_db_contigs.py vrap_${sample}
    done
    
    for sample in 010  048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        python ~/Scripts/extract_longest_contig.py vrap_${sample}/virus_user_db_contigs.fasta ${sample}_raw.fasta
    done
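    
    extract_longest_contig.py is a local script; an equivalent sketch using seqkit (assuming seqkit is installed) would be:
    # Sort contigs by length (longest first) and keep only the first record
    seqkit sort --by-length --reverse vrap_${sample}/virus_user_db_contigs.fasta | seqkit head -n 1 > ${sample}_raw.fasta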
    
  8. Circularity checking of the contigs: method_1 using ccfind

    #Install ccfind
            #Detects circular genomes via terminal redundancy using BLAST or FASTA Smith–Waterman.
            git clone https://github.com/yosuken/ccfind
            cd ccfind
            # Ensure ssearch36, blastn, prodigal are installed
            ./ccfind <input.fasta> <output_dir>
    
            #https://github.com/wrpearson/fasta3
            wget http://faculty.virginia.edu/wrpearson/fasta/fasta36/fasta-36.3.8h.tar.gz
            tar -xzf fasta-36.3.8h.tar.gz
            cd fasta-36.3.8h
            make -f Makefile.linux64_sse2
            # After compiling, add it to your PATH
            export PATH=$PWD:$PATH
    
    # Using ccfind check circularity of the contigs
    #docker pull sangerpathogens/circlator
    #docker run -it -v /home/jhuang/DATA/Data_Sophie_HDV_Sequences:/data sangerpathogens/circlator bash
    #cd /data
    for sample in 010 048  073  083  093  1021  104  108  129  253  282  301  357  383  405  444  446  450  494  503  69   738  81  879  94   995    HE290  HE554  HE695  HSVM 020  068  079  090  097  103   107  109  141  279  293  341  370  394  442  445  449  478  497  550  691  771  82  88   973  HE284  HE511  HE566  HE748; do
        ~/Tools/ccfind/ccfind ${sample}_raw.fasta ccfind_${sample}
        #seqtk mergepe trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq > ${sample}_interleaved.fq
        #circlator all --threads 16 vrap_010/virus_user_db_contigs.fasta 010_interleaved.fq 010_circlator_out
    done
    
    #Print the file size of every circ.noTR.fasta under ccfind_*/result/
    find ccfind_*/result/ -name "circ.noTR.fasta" -exec ls -lh {} \; | awk '{print $5, $9}'
    find ccfind_*/result/ -name "circ.noTR.fasta" -exec stat -c "%s %n" {} \;
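    
    To summarize which samples yielded a circular contig (assuming an empty circ.noTR.fasta means no terminal redundancy was detected):
    # List the ccfind runs with a non-empty circ.noTR.fasta
    for d in ccfind_*/; do
        f="${d}result/circ.noTR.fasta"
        [[ -s "$f" ]] && echo "circular: ${d%/}"
    done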
    
  9. Circularity checking of the contigs: method_2 using blastn

    # Manual inspection of the mate-pair read mapping at the start and end confirmed that three of the contigs were circular plasmids.
    
    # -- Circularity_varification: Confirm the contigs are circular using blastn --
    makeblastdb -in virus_user_db_contigs.fasta -dbtype nucl -out contigs_db
    blastn -query virus_user_db_contigs.fasta -db contigs_db -outfmt 6 -evalue 1e-10 > blast_results.txt
    
        CAP_1_length_1851       CAP_1_length_1851       100.000 166     0       0       1686    1851    1       166     4.14e-86        307
        CAP_1_length_1851       CAP_1_length_1851       100.000 166     0       0       1       166     1686    1851    4.14e-86        307
    
    samtools faidx virus_user_db_contigs.fasta CAP_1_length_1851 > CAP_1.fasta
    python3 ~/Scripts/process_circular.py CAP_1.fasta CAP_1_circular.fasta --overlap_len 166
    #Modify the sequence header to 010
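    
    process_circular.py is a local script; given the self-hit above (positions 1686-1851 duplicate positions 1-166), a minimal sketch with the same effect, assuming the script simply trims the duplicated terminal overlap, is:
    # Keep positions 1..1685 (1851 - 166), i.e. drop the 166 bp terminal repeat; rename the header afterwards
    samtools faidx CAP_1.fasta CAP_1_length_1851:1-1685 > CAP_1_circular_sketch.fasta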
    
  10. Calculate coverage

    bwa index CAP_1_circular.fasta
    bwa mem CAP_1_circular.fasta ../trimmed/010_trimmed_P_1.fastq ../trimmed/010_trimmed_P_2.fastq > aligned_reads.sam
    samtools view -bS aligned_reads.sam | samtools sort -o aligned_reads.sorted.bam
    samtools index aligned_reads.sorted.bam
    samtools depth aligned_reads.sorted.bam > coverage.txt
    awk '{sum+=$3} END {print "Average coverage:", sum/NR}' coverage.txt
    #Average coverage: 7754.49
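    
    Note that samtools depth skips zero-coverage positions by default, so the average above is computed over covered positions only. To average over every reference position, -a can be added:
    # -a reports all positions, including those with zero coverage
    samtools depth -a aligned_reads.sorted.bam > coverage_all.txt
    awk '{sum+=$3} END {print "Average coverage (all positions):", sum/NR}' coverage_all.txt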
    
  11. Copy and update the fasta-headers

    #!/bin/bash
    
    # List of IDs to process
    ids=(
    010 020 048 073 079 083 093 104 253 279 282 341 383 394 405 446 449 503 550 69 738 879 88
    HE290 HE511 HE554 HE695 HE748
    )
    
    for id in "${ids[@]}"; do
        src="ccfind_${id}/result/circ.noTR.fasta"
        dst="${id}.fasta"
    
        if [[ -f "$src" ]]; then
            cp "$src" "$dst"
    
            # Update header in the copied fasta
            ruby -e "
            filename = '${id}'
            seq = ''
            File.foreach('${dst}') do |line|
                next if line.start_with?('>')
                seq += line.strip
            end
            File.open('${dst}', 'w') do |f|
                f.puts '>' + filename
                f.puts seq
            end
            "
    
            echo "Processed $id"
        else
            echo "Warning: source file $src not found!"
        fi
    done
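    
    For reference, inside the same loop the cp plus the Ruby rewrite could be replaced by a single awk call (a sketch using the same $id/$src/$dst variables, not part of the original script):
    # Read the ccfind output ($src) and write ${id}.fasta with the sample ID as header and the sequence on one line
    awk -v id="$id" 'BEGIN{printf(">%s\n", id)} !/^>/ {printf("%s", $0)} END{print ""}' "$src" > "$dst"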
    
  12. TODOs

    #TODO_TOMORROW: based on the pipeline above, process all samples for assembly, then perform RDP4 in the VirtualBox. Add the process to xgenes.com.
    # TODO: Send all assemblies; report that all viruses could be assembled successfully and that the assembly quality was checked successfully!
    # TODO: draw plotTreeHeatmap after the RDP4 calculation.
    # NOTE: the viral-ngs results are sometimes not good; the process above yields more reliable, good-quality virus assemblies, and each intermediate step can be controlled!
    TODO_AFTERNOON: sort the methods in this file; start the analysis of Karoline
    TODO_TOMORROW: Tam's manuscript.
    
  13. (TODO) Multiple alignment and tree construction
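
    The snippet below assumes the usual shell variables are already defined; example values (placeholder names, adjust to the actual input and output files):
    THREADS=16                              # number of threads for MAFFT
    INPUT_FASTA="all_samples.fasta"         # concatenated assembled genomes (placeholder name)
    ALIGNED_FASTA="all_samples_aln.fasta"   # MAFFT output alignment
    TREE_FILE="all_samples.nwk"             # FastTree output tree (Newick)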

    echo ">>> Running MAFFT (auto + adjust direction)…"
    mafft --auto --adjustdirectionaccurately --thread "$THREADS" "$INPUT_FASTA" > "$ALIGNED_FASTA"
    
    # Step 2: Build phylogenetic tree with FastTree (nucleotide)
    echo ">>> Running FastTree…"
    FastTree -nt "$ALIGNED_FASTA" > "$TREE_FILE"
    
  14. (TODO) Draw plotTreeHeatmap

    #http://xgenes.com/article/article-content/383/presence-absence-table-and-graphics-for-selected-genes-in-data-patricia-sepi-7samples/#
    
