Assembly and Variant Calling in Acinetobacter baumannii Genomes

gene_x 0 like s 422 view s

Tags: pipeline

  1. input data

    #Deciphering Evolutionary Trajectories in Acinetobacter baumannii: A Comparative Study of Isolates ATCC 19606, AYE, and ATCC 17978
    
    #PRJNA1041744
    
    A6 WT - Acinetobacter baumannii ATCC 19606, CP059040
    A10 CraA - Acinetobacter baumannii ATCC 19606, CP059040
    BIT33_RS17375: adeJ
    BIT33_RS17370: adeI
    BIT33_RS11855: craA (MKNIQTTALNRTTLMFPLALVLFEFAVYIGNDLIQPAMLAITED
                 FGVSATWAPSSMSFYLLGGASVAWLLGPLSDRLGRKKVLLSGVLFFALCCFLILLTRQ
                 IEHFLTLRFLQGIGLSVISAVGYAAIQENFAERDAIKVMALMANISLLAPLLGPVLGA
                 FLIDYVSWHWGFVAIALLALLSWVGLKKQMPSHKVSVTKQPFSYLFDDFKKVFSNRQF
                 LGLTLALPLVGMPLMLWIALSPIILVDELKLTSVQYGLAQFPVFLGLIVGNIVLIKII
                 DRLALGKTVLIGLPIMLTGTLILILGVVWQAYLIPCLLIGMTLICFGEGISFSVLYRF
                 ALMSSEVSKGTVAAAVSMLLMTSFFAMIELVRYLYTQFHLWAFVLSAFAFIALWFTQP
                 RLALKREMQERVAQDLH) MFS transporter --> H0N29_01695
    364663 - 365770
    
    A12 AYE - Acinetobacter baumannii AYE,         CU459141
    A19 17978 - Acinetobacter baumannii ATCC 17978  CP033110
    A1S_2736: adeJ
    A1S_2735: adeI
    A1S_3146: craA
    
    mv A6WT_HQ_R1.fq.gz A6_WT_HQ_R1.fastq.gz 
    mv A6WT_HQ_R2.fq.gz A6_WT_HQ_R2.fastq.gz 
    mv A10CraA_HQ_R1.fq.gz A10_CraA_HQ_R1.fastq.gz 
    mv A10CraA_HQ_R2.fq.gz A10_CraA_HQ_R2.fastq.gz 
    mv A12AYE_HQ_R1.fq.gz A12_AYE_HQ_R1.fastq.gz 
    mv A12AYE_HQ_R2.fq.gz A12_AYE_HQ_R2.fastq.gz 
    mv A1917978_HQ_R1.fq.gz A19_17978_HQ_R1.fastq.gz 
    mv A1917978_HQ_R2.fq.gz A19_17978_HQ_R2.fastq.gz
    
    #Nucleotide Accession Prefixes
    #AP,BS                                                      DDBJ           Genome project data
    #AL,BX,CR,CT,CU,FP,FQ,FR                                    EMBL           Genome project data
    #AE,CP,CY                                                   GenBank        Genome project data
    
    #transposon detection using gubbins
    
    #TODOs: Submit A6, A12 and A19 to NCBI, A10 not, since the assembly is very bad! Could do mapping plot of A10 on A6, and three genebank-files to Tam!     
    #Wuen Ee Foong
    #Jiabin Huang
    #Heng-Keat Tam, Hengyang Medical College, University of South China
    #pA12AYE1
    
  2. prokka

    for sample in A19_17978_HQ; do
    prokka --force --outdir prokka/${sample} --cpus 2 --usegenus --genus Acinetobacter --kingdom Bacteria --species baumannii --addgenes --addmrna --prefix ${sample} --locustag ${sample} shovill/${sample}/contigs.fa -hmm /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
    done
    
    (prokka on titisee) for the remaining steps (f,f,f,f,f,t,f(variants_calling),t,t,t in bacto-0.1.json)
    
    #TODO: annotate the SNPs with antimicrobial gene database!
    #TODO: ask Qianjie die sample are directly gotten from the patient, oder did some motification on the reference genome?
    

3.1. vrap (TODO: deploy vrap on xgenes.com, it does not use the database. under (vrap) jhuang@hamm[hamburg])

    #In summary: --(optional) host --> bowtie/host (bowtie_database for host-substraction)
    #            --(optional) virus --> blast/db/virus (blast1, in nt_dbs, custom blastn database before downloaded_db!)
    #            --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (blast2, in nt_dbs)
    #                                                                                                 --> database/viral_db/viral_protein (blast3, in prot_db)
    #            --(optional) nt+nr (blast4 and blast5)

    #Under (vrap) jhuang@hamm or (bengal3_ac3) jhuang@hamburg
    #DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
    #DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py

    #gi|1690449040|gb|CP029454.1|    Bacillus cereus strain FORC087 chromosome, complete genome
    #gi|1621560499|gb|CP039269.1|    Bacillus cereus strain MH19 chromosome, complete genome
    #gi|1783154430|gb|CP040340.1|    Bacillus cereus strain DLOU-Tangshan chromosome, complete genome
    #CP059040        3980852 10      70      71
    #CU459141        3936291 4037743 70      71
    #CP033110        3902037 8030278 70      71

    #CP059040 --> A6 and A10
    #CU459141 --> A12 
    #CP033110 --> A19

    #Command line: /home/jhuang/miniconda3/envs/bengal3_ac3/bin/spades.py -1 A6_WT_HQ_R1.fastq.gz -2 A6_WT_HQ_R2.fastq.gz --careful --trusted-contigs CP059040.fasta -o spades_A6_ATCC19606

    #CP059040_RagTag 3866625 17      3866625 3866626
    #CAP_1_length_17300      17300   3866663 17300   17301
    #CAP_2_length_928        928     3883982 928     929
    #NODE_17_length_11061_cov_141.020669     11061   3884948 11061   11062
    #NODE_19_length_1654_cov_105.893255      1654    3896046 1654    1655
    #NODE_20_length_1102_cov_153.073846      1102    3897737 1102    1103
    #NODE_21_length_1040_cov_172.078861      1040    3898876 1040    1041

    #NOTE that we need to adjust the taxid in download_db.py to download viral_nucleotide and viral_protein before running vrap/vrap.py.

    #A10 
    #Blast with 5 DBs when --virus (blast/db/virus), viral_nucleotide (download_db.rb), viral_protein (download_db.rb), nt, nr
    #Bowtie2 with 1 DB when --host (bowtie/host)
    #NOTE the fasta files given with --host and --virus should be deduplicated!
    #NOTE that in vrap_CP059040.py the trusted-contigs is used "--trusted-contigs /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/CP074710.fasta"
    vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A6
    vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A12
    vrap/vrap_CU459141.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A19 
    vrap/vrap_CP033110.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    # -- A10 LOG --
    START VrAP
    START checking blast db
    START lighter
    START estimate k-mer size
    k-mer size: 15037634
    /home/jhuang/miniconda3/envs/vrap/bin/lighter -r A10_CraA_HQ_trimmed_P_1.fastq.gz -r A10_CraA_HQ_trimmed_P_2.fastq.gz -K 20 15037634 -od A10_vrap_out/lighter -t 55
    START flash
    /home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_1.cor.fq.gz A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_2.cor.fq.gz -o flash -d A10_vrap_out/flash -M 1000

    START bowtie2-build
    START bowtie2 pe
    #MAPPING to 1 DB: bowtie/host

    START spades    
    START cap3
    #/home/jhuang/Tools/vrap/external_tools/cap3 A10_vrap_out/spades/contigs.fasta -y 100  
    START calculating orf density
    START HMMER

    START blast
    #grep ">" A10_vrap_out/blast/custom_viral_seq.fa == "--virus Acinetobacter_baumannii.fasta" 
    #MAPPING to 5 DBs: blast/db/virus[1], viral_nucleotide (defined in the download_db.py)[2], viral_protein (defined in the download_db.py)[3], nt[4], nr[5]
    #INPUT_SEQ: 
    #(Assuming, if annotated with the previous database, will not translate to the next level). For the example below: we have no record finding in [2,1], then we have 1317-0 in blast/blastn.fa; In [3] we find 'cut -f1 blastx.csv | sort -u | wc -l = 567', then we have 1317-567=750; In [4], we 'cut -f1 blastn_nt.csv | sort -u | wc -l=710', then we have 750-710=40 in blast/blastn_nt.fa; In [5], we find 'cut -f1 blastn_nr.csv | sort -u | wc -l=39', then we remain 40-39=1 record in blastx_nr.fa.
    #grep ">" vrap_contig.fasta | wc -l   #1317 
    #grep ">" blast/blastn.fa | wc -l     #1317
    #grep ">" blast/blastx.fa | wc -l     #750
    #grep ">" blast/blastn_nt.fa | wc -l  #40
    #grep ">" blast/blastx_nr.fa | wc -l  #1
    #--
    #grep ">" vrap_contig.fasta | wc -l   #2370 
    #grep ">" blast/blastn.fa | wc -l     #2306 --> 64 contigs should belong to A. baumannii! can be annotated with --custom_virus (3 speicies of Acinetobacter baumannii) and downloaded [virus_nt_db] (2.6 G Acinetobacter baumannii)
    #grep ">" blast/blastx.fa | wc -l     #1245: 1061 contigs can be annotated with downloaded [virus_aa_db]
    #grep ">" blast/blastn_nt.fa | wc -l  #41: 1204 contigs can be annotated with nt
    #grep ">" blast/blastx_nr.fa | wc -l  #1: 40 contigs can be annotated with nr 
    #--> extract 64 contigs using 'python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_extracted_contigs.fa'

    [1_INDEX]/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [2,1]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [3]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [4]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.fa -db "/mnt/h1/jhuang/blast/nt"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [5]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.fa -db "/mnt/h1/jhuang/blast/nr"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx_nr.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    VrAP pipeline finished.
    Thank you for using VrAP!

3.2. ragtag.py ((bengal3_ac3) jhuang@hamm)

    #ragtag.py patch ragtag_output_scaffold/ragtag.scaffold.fasta CP059040.fasta

    ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
        Acinetobacter baumannii strain ATCC 19606 plasmid unnamed, complete sequence    Acinetobacter baumannii     19653   39248   62%     0.0     99.97%  18598   CP074586.1

    python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
    ragtag.py scaffold CP059040.fasta A10_extracted_contigs.fa
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
    # Check the converage to determine if they are chr or plasmids!

    ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
        Acinetobacter baumannii str. AYE plasmid p2ABAYE, complete genome   Acinetobacter baumannii AYE     16835   18307   100%    0.0     99.99%  9661    CU459138.1
        Acinetobacter baumannii str. AYE plasmid p1ABAYE, complete genome   Acinetobacter baumannii AYE     9494    10856   100%    0.0     100.00%     5644    CU459137.1
        Acinetobacter baumannii str. AYE plasmid p4ABAYE, complete genome   Acinetobacter baumannii AYE     3709    5265    100%    0.0     99.95%  2726    CU459139.1
        Acinetobacter baumannii str. AYE plasmid p3ABAYE, complete genome   Acinetobacter baumannii AYE     1.384e+05   1.838e+05   100%    0.0     100.00%     94413   CU459140.1

    ragtag.py scaffold CP033110.fasta A19_vrap_out/vrap_contig.fasta
    #using CP074710.1 in the round2 (see below)
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed1, complete sequence   Acinetobacter baumannii     2.373e+05   2.463e+05   100%    0.0     100.00%     148956  CP074709.1
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence   Acinetobacter baumannii     24842   50362   100%    0.0     100.00%     38118   CP074711.1
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence   Acinetobacter baumannii     20953   37147   100%    0.0     100.00%     20351   CP074707.1
        Acinetobacter baumannii ATCC 17978 chromosome, complete genome  Acinetobacter baumannii ATCC 17978  13245   16527   100%    0.0     100.00%     4005343     CP053098.1

3.3 vrap round2

    #A10 
    vrap/vrap_A6_chr.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A19 
    vrap/vrap_CP074710.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

3.4 ragtag round2

    python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_vrap_out/A10_extracted_contigs.fa
    ragtag.py scaffold A6_chr.fasta A10_vrap_out/A10_extracted_contigs.fa
    mv ragtag_output A10_ragtag_output
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa

    ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
    Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence   Acinetobacter baumannii     24842   50362   100%    0.0     100.00%     38118   CP074711.1
    Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence   Acinetobacter baumannii     20953   37147   100%    0.0     100.00%     20351   CP074707.1

3.5 ragtag round3

    ragtag.py patch CP059040.fasta A6_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta

    python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
    ragtag.py scaffold CP059040.fasta A10_vrap_out/A10_extracted_contigs.fa
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
    # Check the converage to determine if they are chr or plasmids!
    #>CAP_12_length_2972
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   4691    5472    100%    0.0     99.73%  5304537     CP119629.1
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   9762    11557   97%     0.0     99.87%  5304537     CP119629.1
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   14883   1.353e+05   100%    0.0     99.98%  5304537     CP119629.1
    #CP119629.1 Submitted (10-MAR-2023) College of Animal Science, Ahau, Chagnjianxilu, Anhui 230000, China

    ragtag.py patch CU459141.fasta A12_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta

    ragtag.py patch CP074710.fasta A19_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta

    A10:
    CP059040_RagTag 3941335 17      60      61
    CAP_11_length_2972      2972    4007061 60      61
    CAP_1_length_17300      17300   4010103 60      61
    samtools faidx A10_ragtag_contigs_2000.fa CAP_1_length_17300 > A10_plasmid.fasta

3.6 To map reads from one isolate (isolate 2) to the genome of another isolate (isolate 1), check coverage, and generate a consensus FASTA from the alignment, with regions not covered by the mapping masked as 'N', you'll need to follow a series of steps using bioinformatics tools.

    # -- Mapping to CP059040 --
    5447038 + 0 in total (QC-passed reads + QC-failed reads)
    5447038 + 0 primary
    0 + 0 secondary
    0 + 0 supplementary
    0 + 0 duplicates
    0 + 0 primary duplicates
    3346945 + 0 mapped (61.45% : N/A)
    3346945 + 0 primary mapped (61.45% : N/A)
    4361512 + 0 paired in sequencing
    2180756 + 0 read1
    2180756 + 0 read2
    2579038 + 0 properly paired (59.13% : N/A)
    2681114 + 0 with itself and mate mapped
    21988 + 0 singletons (0.50% : N/A)
    0 + 0 with mate mapped to a different chr
    0 + 0 with mate mapped to a different chr (mapQ>=5)

    # -- Mapping to A6 --
    5447044 + 0 in total (QC-passed reads + QC-failed reads)
    5447044 + 0 primary
    0 + 0 secondary
    0 + 0 supplementary
    0 + 0 duplicates
    0 + 0 primary duplicates
    3437574 + 0 mapped (63.11% : N/A)
    3437574 + 0 primary mapped (63.11% : N/A)
    4361524 + 0 paired in sequencing
    2180762 + 0 read1
    2180762 + 0 read2
    2646686 + 0 properly paired (60.68% : N/A)
    2753738 + 0 with itself and mate mapped
    22055 + 0 singletons (0.51% : N/A)
    1484 + 0 with mate mapped to a different chr
    1390 + 0 with mate mapped to a different chr (mapQ>=5)

    #- Sort and Index the BAM File:
    #    samtools sort -o sorted.bam input.bam
    #    samtools index sorted.bam
    #- Call Variants:
    #    bcftools mpileup -Ou -f reference.fasta sorted.bam | bcftools call -mv -Oz -o calls.vcf.gz
    #    bcftools index calls.vcf.gz
    #- Generate Consensus FASTA:
    #    bcftools consensus -f reference.fasta calls.vcf.gz -o consensus.fasta

    Step 1: Align Reads to Reference Genome
        First, align the reads from isolate 2 to the genome of isolate 1. For example, using BWA:

            bwa mem reference_genome.fasta isolate2_reads.fastq > aligned.sam

        or, use vrap to generate input.bam

            #for bowtie (1 DB): --host Bacillus_cereus.fasta 
            #for blast (5 DBs): --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr, and two default download_db.py
            #A6_chr_plasmid.fasta reads filtered, should only the contamination reads remaining! 
            vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_host_A6 --host A6_chr_plasmid.fasta --noblast -t 55

    Step 2: Convert SAM to BAM, Sort and Index
        Convert the SAM file to BAM, sort it, and create an index:

            samtools view -Sb mapped > aligned.bam
            samtools sort aligned.bam -o sorted.bam
            samtools index sorted.bam

    Step 3: Coverage Analysis
        Check the coverage of the genome regions. Regions with no coverage will be masked later.

            #bedtools genomecov -ibam sorted.bam -bg > coverage.bed
            #BUG above: By default, Bedtools doesn't explicitly list regions with zero coverage in the output of genomecov. To get around this, you can use the -bga option with genomecov. This option ensures that regions with zero coverage are also included in the output. Here's how you can modify your command:
            bedtools genomecov -ibam sorted.bam -bga > coverage.bed

    Step 4: Create a Bed File for Masking
        Create a BED file that specifies regions with no or low coverage. Assume that any region with coverage less than a certain threshold (e.g., 1x) should be masked:

            awk '$4 < 1 {print $1 "\t" $2 "\t" $3}' coverage.bed > low_coverage.bed

    Step 5: Generate Consensus Sequence
        Generate a consensus sequence, using bcftools, and mask regions with low or no coverage:

            bcftools mpileup -Ou -f ../../CP059040.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
            bcftools index calls.vcf.gz
            bcftools consensus -f ../../CP059040.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
            #The site CP059040:4 overlaps with another variant, skipping...
            #The site CP059040:3125036 overlaps with another variant, skipping...
            #Applied 58 variants

            bcftools mpileup -Ou -f ../../A6_chr_plasmid.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
            bcftools index calls.vcf.gz
            bcftools consensus -f ../../A6_chr_plasmid.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
            #The site seq00000000:4 overlaps with another variant, skipping...
            #The site seq00000000:3125036 overlaps with another variant, skipping...
            #Applied 61 variants
            #--> TODO: could report the following regions are not covered in A10 comparing to A6.
            #seq00000000     364666  365765
            #seq00000000     2810907 2813323
            #seq00000000     2813552 2813779
            #seq00000000     2813923 2819663
            #seq00000000     2821839 2831921
            #seq00000000     2832750 2832753
            #seq00000000     2833977 2860354
            #seq00000000     2860505 2861780
            #seq00000000     2861931 2861951
            #seq00000000     2862102 2862234
            #seq00000000     2862385 2863434
            #seq00000000     3124939 3124943
            #seq00000000     3125009 3125010

        In this command, the -m option in bcftools consensus is used to specify a BED file for masking low-coverage regions.

4.1, generate genebank in snpEff

    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP059040
    cp db/CP059040.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP059040/genes.gbk
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CU459141 
    cp db/CU459141.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CU459141/genes.gbk
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP079931 
    cp db/CP079931.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP079931/genes.gbk

    vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CP059040      -d
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CU459141      -d
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CP079931      -d

4.2, using spandx calling variants

    gzip A6_WT_HQ_trimmed_P_1.fastq A6_WT_HQ_trimmed_P_2.fastq A10_CraA_HQ_trimmed_P_1.fastq A10_CraA_HQ_trimmed_P_2.fastq
    gzip A19_17978_HQ_trimmed_P_1.fastq A19_17978_HQ_trimmed_P_2.fastq

    ln -s /home/jhuang/Tools/spandx/ spandx
    #nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC19606.fasta --annotation  --database ATCC19606 -resume
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CP059040.fasta --annotation --database CP059040 -resume

    #TODO:   --structural   Set to true if you would like to identify structural variants
    #         Note that this step can take a considerable amount of time if
    #         you have deep sequencing data (default: false).
    #Done! Result files are in --> ./Outputs
    #If further analysis is required, bam alignments are in --> ./Outputs/bams
    #Phylogenetic tree and annotated merged variants are in --> ./Outputs/Phylogeny_and_annotation
    #Individual variant files are in --> ./Outputs/Variants/VCFs

    gzip A12_AYE_HQ_trimmed_P_1.fastq A12_AYE_HQ_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    #nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref AYE.fasta --annotation --database AYE -resume
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CU459141.fasta --annotation --database CU459141 -resume
    cd ../A19_bacto_out/
    ln -s /home/jhuang/Tools/spandx/ spandx
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CP079931.fasta --annotation --database CP079931 -resume

    # LOG
    #-t
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf

    # DEBUG: "--mixtures --structural" does not work well!
    #pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out                 
    #pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out

5, manually compare the results of snippy and spandx

    Variant Types
    Type    Name    Example
    snp     Single Nucleotide Polymorphism  A => T
    mnp     Multiple Nuclotide Polymorphism     GC => AT
    ins     Insertion   ATT => AGTT
    del     Deletion    ACGG => ACG
    complex     Combination of snp/mnp  ATTC => GTTA

    #in snippy/A6_WT_HQ/A6_WT_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CP059040,136447,snp,T,A,A:338 T:0
    CP059040,152318,snp,T,A,A:337 T:0,CDS,-,106/213,36/70,missense_variant c.106A>T p.Thr36Ser,H0N29_00700,,DUF1737 domain-containing protein
    CP059040,171965,snp,T,A,A:312 T:0,CDS,+,167/798,56/265,missense_variant c.167T>A p.Val56Asp,H0N29_00780,,MBL fold metallo-hydrolase
    CP059040,194020,ins,C,CT,CT:317 C:0,CDS,-,3260/3497,1087/1164,frameshift_variant c.3260dupA p.Arg1088fs,H0N29_00885,,PAS-domain containing protein
    CP059040,375297,snp,A,T,T:268 A:0

    CP059040,468939,complex,ATAATATATTCTTTATAGATGAATACATTTTTTCTTTA,TATAATATATTCTTTATAGATGAATACATTTTTTCTTTAT,TATAATATATTCTTTATAGATGAATACATTTTTTCTTTAT:111 ATAATATATTCTTTATAGATGAATACATTTTTTCTTTA:0

    CP059040,609979,snp,A,T,T:301 A:0,CDS,+,1009/1212,337/403,stop_gained c.1009A>T p.Lys337*,H0N29_02925,,SAM-dependent methyltransferase
    CP059040,1036730,ins,T,TA,TA:206 T:1
    CP059040,1059847,snp,C,T,T:245 C:0,CDS,-,119/642,40/213,missense_variant c.119G>A p.Gly40Asp,H0N29_04880,,DUF2238 domain-containing protein
    CP059040,1272639,snp,T,G,G:195 T:0
    CP059040,1300706,snp,A,G,G:202 A:0,CDS,+,351/1428,117/475,synonymous_variant c.351A>G p.Ser117Ser,H0N29_06100,,M48 family metalloprotease

    CP059040,1970200,snp,T,C,C:163 T:0,CDS,+,281/2136,94/711,missense_variant c.281T>C p.Val94Ala,H0N29_09260,,TonB-dependent siderophore receptor
    CP059040,2383727,ins,A,AT,AT:129 A:0
    CP059040,2477628,ins,T,TA,TA:90 T:0
    CP059040,2525852,ins,A,AT,AT:189 A:0,CDS,-,529/836,177/277,frameshift_variant c.529dupA p.Ile177fs,H0N29_11845,,hypothetical protein
    CP059040,3016359,ins,A,AT,AT:178 A:0,CDS,-,2136/3236,712/1077,frameshift_variant c.2136dupA p.Phe713fs,H0N29_14350,cas3f,type I-F CRISPR-associated helicase Cas3
    CP059040,3111299,snp,A,G,G:263 A:0,CDS,-,957/1185,319/394,synonymous_variant c.957T>C p.Ile319Ile,H0N29_14775,,hypothetical protein
    CP059040,3310021,ins,C,CT,CT:197 C:0
    CP059040,3542741,del,GC,G,G:307 GC:0,CDS,+,1746/2231,582/742,frameshift_variant c.1746delC p.Arg583fs,H0N29_16865,,primosomal protein N'
    CP059040,3542934,del,CT,C,C:308 CT:0,CDS,+,1938/2231,646/742,frameshift_variant c.1938delT p.Met647fs,H0N29_16865,,primosomal protein N'
    CP059040,3570717,del,AC,A,A:292 AC:0,CDS,+,187/1315,63/437,frameshift_variant c.187delC p.Gln63fs,H0N29_16960,,FAD-dependent monooxygenase
    CP059040,3629616,ins,C,CT,CT:212 C:0
    CP059040,3873573,snp,A,G,G:363 A:0,CDS,+,1705/2187,569/728,missense_variant c.1705A>G p.Asn569Asp,H0N29_18380,,polysaccharide biosynthesis tyrosine autokinase

    #in snippy/A10_CraA_HQ/A10_CraA_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CP059040,136447,snp,T,A,A:201 T:0
    CP059040,152318,snp,T,A,A:181 T:0,CDS,-,106/213,36/70,missense_variant c.106A>T p.Thr36Ser,H0N29_00700,,DUF1737 domain-containing protein
    CP059040,171965,snp,T,A,A:183 T:0,CDS,+,167/798,56/265,missense_variant c.167T>A p.Val56Asp,H0N29_00780,,MBL fold metallo-hydrolase
    CP059040,194020,ins,C,CT,CT:160 C:0,CDS,-,3260/3497,1087/1164,frameshift_variant c.3260dupA p.Arg1088fs,H0N29_00885,,PAS-domain containing protein
    CP059040,375297,snp,A,T,T:127 A:0

    CP059040,468931,ins,A,AT,AT:65 A:1
    CP059040,468976,ins,A,AT,AT:72 A:0

    CP059040,609979,snp,A,T,T:139 A:0,CDS,+,1009/1212,337/403,stop_gained c.1009A>T p.Lys337*,H0N29_02925,,SAM-dependent methyltransferase
    CP059040,1036730,ins,T,TA,TA:101 T:0
    CP059040,1059847,snp,C,T,T:109 C:0,CDS,-,119/642,40/213,missense_variant c.119G>A p.Gly40Asp,H0N29_04880,,DUF2238 domain-containing protein
    CP059040,1272639,snp,T,G,G:94 T:0
    CP059040,1300706,snp,A,G,G:136 A:0,CDS,+,351/1428,117/475,synonymous_variant c.351A>G p.Ser117Ser,H0N29_06100,,M48 family metalloprotease

    CP059040,1527276,del,TTGAACC,T,T:29 TTGAACC:1,CDS,+,1327/2115,443/704,conservative_inframe_deletion c.1327_1332delGAACCT p.Glu443_Pro444del,H0N29_07175,,DNA polymerase III subunit gamma/tau

    CP059040,1970200,snp,T,C,C:92 T:0,CDS,+,281/2136,94/711,missense_variant c.281T>C p.Val94Ala,H0N29_09260,,TonB-dependent siderophore receptor
    CP059040,2383727,ins,A,AT,AT:56 A:0
    CP059040,2477628,ins,T,TA,TA:27 T:1
    CP059040,2525852,ins,A,AT,AT:124 A:0,CDS,-,529/836,177/277,frameshift_variant c.529dupA p.Ile177fs,H0N29_11845,,hypothetical protein
    CP059040,3016359,ins,A,AT,AT:112 A:0,CDS,-,2136/3236,712/1077,frameshift_variant c.2136dupA p.Phe713fs,H0N29_14350,cas3f,type I-F CRISPR-associated helicase Cas3
    CP059040,3111299,snp,A,G,G:152 A:0,CDS,-,957/1185,319/394,synonymous_variant c.957T>C p.Ile319Ile,H0N29_14775,,hypothetical protein
    CP059040,3310021,ins,C,CT,CT:94 C:0
    CP059040,3542741,del,GC,G,G:152 GC:0,CDS,+,1746/2231,582/742,frameshift_variant c.1746delC p.Arg583fs,H0N29_16865,,primosomal protein N'
    CP059040,3542934,del,CT,C,C:165 CT:0,CDS,+,1938/2231,646/742,frameshift_variant c.1938delT p.Met647fs,H0N29_16865,,primosomal protein N'
    CP059040,3570717,del,AC,A,A:177 AC:0,CDS,+,187/1315,63/437,frameshift_variant c.187delC p.Gln63fs,H0N29_16960,,FAD-dependent monooxygenase
    CP059040,3629616,ins,C,CT,CT:125 C:0
    CP059040,3873573,snp,A,G,G:190 A:0,CDS,+,1705/2187,569/728,missense_variant c.1705A>G p.Asn569Asp,H0N29_18380,,polysaccharide biosynthesis tyrosine autokinase




    #in snippy/A12_AYE_HQ/A12_AYE_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CU459141,169174,snp,G,A,A:296 G:0,CDS,-,74/441,25/146,missense_variant c.74C>T p.Thr25Met,ABAYE0155,,conserved hypothetical protein

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A12_AYE_HQ_trimmed
    CU459141        169174  .       G       A       11566.04        PASS    AC=1;AF=1.00;AN=1;DP=307;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=31.04;SOR=0.763;ANN=A|missense_variant|MODERATE|ABAYE0155|ABAYE0155|transcript|ABAYE0155|protein_coding|1/1|c.74C>T|p.Thr25Met|74/441|74/441|25/146||  GT:AD:DP:GQ:PL  1:0,294:294:99:11576,0

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A12_AYE_HQ_trimmed
    CU459141        519645  .       TA      T       175.01  PASS    AC=1;AF=1.00;AN=1;DP=15;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=31.88;QD=29.17;SOR=3.912 GT:AD:DP:GQ:PL  1:0,6:6:99:185,0
    CU459141        1213004 .       C       CACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA   3656.01 PASS    AC=1;AF=1.00;AN=1;BaseQRankSum=-0.754;DP=160;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=59.15;MQRankSum=-0.555;QD=29.30;ReadPosRankSum=-1.188;SOR=0.846;ANN=CACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA|intragenic_variant|MODIFIER|ABAYEtRNA60|ABAYEtRNA60|gene_variant|ABAYEtRNA60|||n.1213004_1213005insACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA||||||     GT:AD:DP:GQ:PL  1:2,93:95:99:3666,0



    #in snippy/A19_17978_HQ/A19_17978_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CP079931,697999,snp,A,C,C:319 A:0,CDS,-,370/723,124/240,missense_variant c.370T>G p.Cys124Gly,KZA74_03385,,folate-binding Fe/S cluster repair protein
    CP079931,1594519,snp,A,C,C:232 A:0,CDS,-,558/1122,186/373,missense_variant c.558T>G p.Asn186Lys,KZA74_07445,,ATP-binding protein
    CP079931,2140669,snp,A,C,C:180 A:0,CDS,-,1412/1791,471/596,missense_variant c.1412T>G p.Val471Gly,KZA74_10170,,ABC transporter substrate-binding protein
    CP079931,2415049,del,CA,C,C:190 CA:0
    CP079931,3583600,snp,T,C,C:402 T:0



    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A19_17978_HQ_trimmed
    CP079931        499908  .       G       A       3372.04 PASS    AC=1;AF=1.00;AN=1;DP=95;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=35.85;QD=28.47;SOR=9.087 GT:AD:DP:GQ:PL  1:0,93:93:99:3382,0
    CP079931        499966  .       C       T       5744.04 PASS    AC=1;AF=1.00;AN=1;DP=150;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=37.54;QD=28.10;SOR=10.008       GT:AD:DP:GQ:PL  1:0,148:148:99:5754,0
    CP079931        697999  .       A       C       12568.04        PASS    AC=1;AF=1.00;AN=1;DP=336;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=26.58;SOR=0.712;ANN=C|missense_variant|MODERATE|KZA74_03385|KZA74_03385|transcript|KZA74_03385|protein_coding|1/1|c.370T>G|p.Cys124Gly|370/723|370/723|124/240||       GT:AD:DP:GQ:PL  1:0,323:323:99:12578,0
    CP079931        1594519 .       A       C       9115.04 PASS    AC=1;AF=1.00;AN=1;DP=239;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=28.16;SOR=0.782;ANN=C|missense_variant|MODERATE|KZA74_07445|KZA74_07445|transcript|KZA74_07445|protein_coding|1/1|c.558T>G|p.Asn186Lys|558/1122|558/1122|186/373||     GT:AD:DP:GQ:PL  1:0,232:232:99:9125,0
    CP079931        2140669 .       A       C       7145.04 PASS    AC=1;AF=1.00;AN=1;DP=184;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=25.70;SOR=0.965;ANN=C|missense_variant|MODERATE|KZA74_10170|KZA74_10170|transcript|KZA74_10170|protein_coding|1/1|c.1412T>G|p.Val471Gly|1412/1791|1412/1791|471/596||WARNING_TRANSCRIPT_NO_START_CODON GT:AD:DP:GQ:PL  1:0,180:180:99:7155,0
    CP079931        3583600 .       T       C       16124.04        PASS    AC=1;AF=1.00;AN=1;DP=426;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=28.42;SOR=0.779;ANN=C|upstream_gene_variant|MODIFIER|rplL|KZA74_16990|transcript|KZA74_16990|protein_coding||c.-23A>G|||||23|  GT:AD:DP:GQ:PL  1:0,409:409:99:16134,0

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A19_17978_HQ_trimmed
    CP079931        499924  .       AT      A       4174.01 PASS    AC=1;AF=1.00;AN=1;DP=118;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=36.44;QD=30.23;SOR=9.455        GT:AD:DP:GQ:PL  1:0,112:112:99:4184,0
    CP079931        2415049 .       CA      C       6579.01 PASS    AC=1;AF=1.00;AN=1;DP=213;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=32.57;SOR=1.085        GT:AD:DP:GQ:PL  1:0,202:202:99:6589,0
    CP079931        3695212 .       AT      A       342.01  PASS    AC=1;AF=1.00;AN=1;DP=10;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=35.05;QD=34.20;SOR=4.804 GT:AD:DP:GQ:PL  1:0,10:10:99:352,0



    ./Outputs/bams/A10_CraA_HQ_trimmed.dedup.bam
    ./Outputs/bams/A6_WT_HQ_trimmed.dedup.bam
    ./snippy/A10_CraA_HQ/A10_CraA_HQ.bam
    ./snippy/A6_WT_HQ/A6_WT_HQ.bam

    TODO after meeting: import the two bam files and annotation of genes into IGV, go to the position craA gene and screenshot show if the gene regions are completely removed? Send the table and screenshot to Tam, say the annotation of genom of NCBI is still pending!

7 (optional), filtering process of SPANDx results

    #Input file is Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt (61203)
    #- Step1: filtering $6!=$7 (330)
    awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    #- Step2: restricted the first 7 columns (240)
    cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    #- Step3: filtering examples as "G/C" (1450)
    grep -v "/" f1_7 > f1_7_
    #- Step4: filtering examples as ".       A" (170)
    grep -v "\." f1_7_ > f1_7__
    #- Step5: filtering examples as "T       *" (162), since we want to have only SNPs differing between BK20399 and GE3138.
    grep -v "*" f1_7__ > f1_7___
    #- Step6: only SNPs (50).
    grep -v "INDEL" f1_7___ > f1_7____
    #- Step7: get the complete rows with the positions
    cut -d$'\t' -f2-2 f1_7____ > f2
    replace "\n" with " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
    grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    grep "46936" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    ....
    #- Step8: Missense mutations ()
    grep "missense_variant" All_SNPs_annotated.txt

8 (optional), merge results from two variant calling methods!

    #-- post-processing --
    awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    grep -v "/" f1_7 > f1_7_
    grep -v "\." f1_7_ > f1_7__
    grep -v "*" f1_7__ > f1_7___

    grep -v "INDEL" f1_7___ > f1_7_SNPs
    cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
    \n --> " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
    #grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    #grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    #grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt

    grep -v "SNP" f1_7___ > f1_7_indels
    cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
    \n --> " All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt\ngrep "
    #grep "CHROM" All_SNPs_indels_annotated_.txt > All_indels_annotated.txt
    #grep "49848" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
    #grep "53872" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt

9 (optional), PubMLST, ResFinder or RGI calling https://cge.cbs.dtu.dk/services/ https://cge.cbs.dtu.dk/services/MLST/ https://pubmlst.org/bigsdb?db=pubmlst_pacnes_seqdef&page=batchProfiles&scheme_id=3 https://cge.food.dtu.dk/services/ResFinder/ https://card.mcmaster.ca/home https://www.pseudomonas.com/ https://www.pseudomonas.com/amr/list https://card.mcmaster.ca/analyze/rgi The antimicrobial resistance genes were predicted by the tool RGI 6.0.0 which utilizes the database CARD 3.2.5 (PMID: 31665441). !!!!#check the difference of SNPs in the two files, and then look for the SNPs in my tables comparing the two isolates!!!!

    cut -d$'\t' -f9-21 BK20399_contigs.fa.txt > f9_21
    cut -d$'\t' -f2-6 BK20399_contigs.fa.txt > f2_6
    #Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
    paste f9_21 f2_6 > BK20399_.txt
    #grep "CARD_Protein_Sequence" BK20399_.txt > header
    grep -v "CARD_Protein_Sequence" BK20399_.txt > BK20399__.txt
    sort BK20399__.txt > BK20399_sorted.txt

    cut -d$'\t' -f9-21 GE3138_contigs.fa.txt > f9_21
    cut -d$'\t' -f2-6 GE3138_contigs.fa.txt > f2_6
    paste f9_21 f2_6 > GE3138_.txt
    grep -v "CARD_Protein_Sequence" GE3138_.txt > GE3138__.txt
    sort GE3138__.txt > GE3138_sorted.txt

    cut -d$'\t' -f1-6 BK20399_sorted.txt > BK20399_f1_6.txt  
    cut -d$'\t' -f1-6 GE3138_sorted.txt > GE3138_f1_6.txt
    diff BK20399_f1_6.txt GE3138_f1_6.txt
    #-->< APH(3'')-Ib   99.63   3002639 protein homolog model   n/a n/a

    cat header BK20399_sorted.txt > BK20399.txt
    cat header GE3138_sorted.txt > GE3138.txt

    ~/Tools/csv2xls-0.4/csv_to_xls.py BK20399.txt GE3138.txt -d$'\t' -o ARG_calling.xls

    #---- another example ----
    cut -d$'\t' -f9-21 AW27149_before.txt > f9_21
    cut -d$'\t' -f2-6 AW27149_before.txt > f2_6
    #Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
    paste f9_21 f2_6 > AW27149_before_.txt
    grep "CARD_Protein_Sequence" AW27149_before_.txt > header
    grep -v "CARD_Protein_Sequence" AW27149_before_.txt > AW27149_before__.txt
    sort AW27149_before__.txt > AW27149_before_sorted.txt

    cut -d$'\t' -f9-21 AW48641_after.txt > f9_21
    cut -d$'\t' -f2-6 AW48641_after.txt > f2_6
    paste f9_21 f2_6 > AW48641_after_.txt
    grep -v "CARD_Protein_Sequence" AW48641_after_.txt > AW48641_after__.txt
    sort AW48641_after__.txt > AW48641_after_sorted.txt

    cut -d$'\t' -f1-6 AW27149_before_sorted.txt > AW27149_before_f1_6.txt  
    cut -d$'\t' -f1-6 AW48641_after_sorted.txt > AW48641_after_f1_6.txt
    diff AW27149_before_f1_6.txt AW48641_after_f1_6.txt
    #48c48
    #< Pseudomonas aeruginosa CpxR   100.0   3004054 protein homolog model   n/a     n/a
    #---
    #> Pseudomonas aeruginosa CpxR   99.56   3004054 protein homolog model   n/a     n/a

    cat header AW27149_before_sorted.txt > ../RGI_calling_AW27149_before.txt
    cat header AW48641_after_sorted.txt > ../RGI_calling_AW48641_after.txt

    #mlst add header "isolate   species ST  acsA    aroE    guaA    mutL    nuoD    ppsA    trpE"
    mv 
    ~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_annotated.csv RGI_calling_AW27149_before.txt RGI_calling_AW48641_after.txt mlst.txt -d$'\t' -o SNP_ARG_calling.xls

    #Find the common SNP between SNP calling and RGI calling, since the SNP calling has bad annotation, marked yellow.

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum