gene_x 0 like s 574 view s
Tags: pipeline, DNA-seq
input data
#Deciphering Evolutionary Trajectories in Acinetobacter baumannii: A Comparative Study of Isolates ATCC 19606, AYE, and ATCC 17978
#PRJNA1041744
A6 WT - Acinetobacter baumannii ATCC 19606, CP059040
A10 CraA - Acinetobacter baumannii ATCC 19606, CP059040
BIT33_RS17375: adeJ
BIT33_RS17370: adeI
BIT33_RS11855: craA (MKNIQTTALNRTTLMFPLALVLFEFAVYIGNDLIQPAMLAITED
FGVSATWAPSSMSFYLLGGASVAWLLGPLSDRLGRKKVLLSGVLFFALCCFLILLTRQ
IEHFLTLRFLQGIGLSVISAVGYAAIQENFAERDAIKVMALMANISLLAPLLGPVLGA
FLIDYVSWHWGFVAIALLALLSWVGLKKQMPSHKVSVTKQPFSYLFDDFKKVFSNRQF
LGLTLALPLVGMPLMLWIALSPIILVDELKLTSVQYGLAQFPVFLGLIVGNIVLIKII
DRLALGKTVLIGLPIMLTGTLILILGVVWQAYLIPCLLIGMTLICFGEGISFSVLYRF
ALMSSEVSKGTVAAAVSMLLMTSFFAMIELVRYLYTQFHLWAFVLSAFAFIALWFTQP
RLALKREMQERVAQDLH) MFS transporter --> H0N29_01695
364663 - 365770
A12 AYE - Acinetobacter baumannii AYE, CU459141
A19 17978 - Acinetobacter baumannii ATCC 17978 CP033110
A1S_2736: adeJ
A1S_2735: adeI
A1S_3146: craA
mv A6WT_HQ_R1.fq.gz A6_WT_HQ_R1.fastq.gz
mv A6WT_HQ_R2.fq.gz A6_WT_HQ_R2.fastq.gz
mv A10CraA_HQ_R1.fq.gz A10_CraA_HQ_R1.fastq.gz
mv A10CraA_HQ_R2.fq.gz A10_CraA_HQ_R2.fastq.gz
mv A12AYE_HQ_R1.fq.gz A12_AYE_HQ_R1.fastq.gz
mv A12AYE_HQ_R2.fq.gz A12_AYE_HQ_R2.fastq.gz
mv A1917978_HQ_R1.fq.gz A19_17978_HQ_R1.fastq.gz
mv A1917978_HQ_R2.fq.gz A19_17978_HQ_R2.fastq.gz
#Nucleotide Accession Prefixes
#AP,BS DDBJ Genome project data
#AL,BX,CR,CT,CU,FP,FQ,FR EMBL Genome project data
#AE,CP,CY GenBank Genome project data
#transposon detection using gubbins
#TODOs: Submit A6, A12 and A19 to NCBI, A10 not, since the assembly is very bad! Could do mapping plot of A10 on A6, and three genebank-files to Tam!
#Wuen Ee Foong
#Jiabin Huang
#Heng-Keat Tam, Hengyang Medical College, University of South China
#pA12AYE1
prokka
for sample in A19_17978_HQ; do
prokka --force --outdir prokka/${sample} --cpus 2 --usegenus --genus Acinetobacter --kingdom Bacteria --species baumannii --addgenes --addmrna --prefix ${sample} --locustag ${sample} shovill/${sample}/contigs.fa -hmm /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
done
(prokka on titisee) for the remaining steps (f,f,f,f,f,t,f(variants_calling),t,t,t in bacto-0.1.json)
#TODO: annotate the SNPs with antimicrobial gene database!
#TODO: ask Qianjie die sample are directly gotten from the patient, oder did some motification on the reference genome?
3.1. vrap (TODO: deploy vrap on xgenes.com, it does not use the database. under (vrap) jhuang@hamm[hamburg])
#In summary: --(optional) host --> bowtie/host (bowtie_database for host-substraction)
# --(optional) virus --> blast/db/virus (blast1, in nt_dbs, custom blastn database before downloaded_db!)
# --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (blast2, in nt_dbs)
# --> database/viral_db/viral_protein (blast3, in prot_db)
# --(optional) nt+nr (blast4 and blast5)
#Under (vrap) jhuang@hamm or (bengal3_ac3) jhuang@hamburg
#DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
#DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py
#gi|1690449040|gb|CP029454.1| Bacillus cereus strain FORC087 chromosome, complete genome
#gi|1621560499|gb|CP039269.1| Bacillus cereus strain MH19 chromosome, complete genome
#gi|1783154430|gb|CP040340.1| Bacillus cereus strain DLOU-Tangshan chromosome, complete genome
#CP059040 3980852 10 70 71
#CU459141 3936291 4037743 70 71
#CP033110 3902037 8030278 70 71
#CP059040 --> A6 and A10
#CU459141 --> A12
#CP033110 --> A19
#Command line: /home/jhuang/miniconda3/envs/bengal3_ac3/bin/spades.py -1 A6_WT_HQ_R1.fastq.gz -2 A6_WT_HQ_R2.fastq.gz --careful --trusted-contigs CP059040.fasta -o spades_A6_ATCC19606
#CP059040_RagTag 3866625 17 3866625 3866626
#CAP_1_length_17300 17300 3866663 17300 17301
#CAP_2_length_928 928 3883982 928 929
#NODE_17_length_11061_cov_141.020669 11061 3884948 11061 11062
#NODE_19_length_1654_cov_105.893255 1654 3896046 1654 1655
#NODE_20_length_1102_cov_153.073846 1102 3897737 1102 1103
#NODE_21_length_1040_cov_172.078861 1040 3898876 1040 1041
#NOTE that we need to adjust the taxid in download_db.py to download viral_nucleotide and viral_protein before running vrap/vrap.py.
#A10
#Blast with 5 DBs when --virus (blast/db/virus), viral_nucleotide (download_db.rb), viral_protein (download_db.rb), nt, nr
#Bowtie2 with 1 DB when --host (bowtie/host)
#NOTE the fasta files given with --host and --virus should be deduplicated!
#NOTE that in vrap_CP059040.py the trusted-contigs is used "--trusted-contigs /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/CP074710.fasta"
vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A6
vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A12
vrap/vrap_CU459141.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A19
vrap/vrap_CP033110.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
# -- A10 LOG --
START VrAP
START checking blast db
START lighter
START estimate k-mer size
k-mer size: 15037634
/home/jhuang/miniconda3/envs/vrap/bin/lighter -r A10_CraA_HQ_trimmed_P_1.fastq.gz -r A10_CraA_HQ_trimmed_P_2.fastq.gz -K 20 15037634 -od A10_vrap_out/lighter -t 55
START flash
/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_1.cor.fq.gz A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_2.cor.fq.gz -o flash -d A10_vrap_out/flash -M 1000
START bowtie2-build
START bowtie2 pe
#MAPPING to 1 DB: bowtie/host
START spades
START cap3
#/home/jhuang/Tools/vrap/external_tools/cap3 A10_vrap_out/spades/contigs.fasta -y 100
START calculating orf density
START HMMER
START blast
#grep ">" A10_vrap_out/blast/custom_viral_seq.fa == "--virus Acinetobacter_baumannii.fasta"
#MAPPING to 5 DBs: blast/db/virus[1], viral_nucleotide (defined in the download_db.py)[2], viral_protein (defined in the download_db.py)[3], nt[4], nr[5]
#INPUT_SEQ:
#(Assuming, if annotated with the previous database, will not translate to the next level). For the example below: we have no record finding in [2,1], then we have 1317-0 in blast/blastn.fa; In [3] we find 'cut -f1 blastx.csv | sort -u | wc -l = 567', then we have 1317-567=750; In [4], we 'cut -f1 blastn_nt.csv | sort -u | wc -l=710', then we have 750-710=40 in blast/blastn_nt.fa; In [5], we find 'cut -f1 blastn_nr.csv | sort -u | wc -l=39', then we remain 40-39=1 record in blastx_nr.fa.
#grep ">" vrap_contig.fasta | wc -l #1317
#grep ">" blast/blastn.fa | wc -l #1317
#grep ">" blast/blastx.fa | wc -l #750
#grep ">" blast/blastn_nt.fa | wc -l #40
#grep ">" blast/blastx_nr.fa | wc -l #1
#--
#grep ">" vrap_contig.fasta | wc -l #2370
#grep ">" blast/blastn.fa | wc -l #2306 --> 64 contigs should belong to A. baumannii! can be annotated with --custom_virus (3 speicies of Acinetobacter baumannii) and downloaded [virus_nt_db] (2.6 G Acinetobacter baumannii)
#grep ">" blast/blastx.fa | wc -l #1245: 1061 contigs can be annotated with downloaded [virus_aa_db]
#grep ">" blast/blastn_nt.fa | wc -l #41: 1204 contigs can be annotated with nt
#grep ">" blast/blastx_nr.fa | wc -l #1: 40 contigs can be annotated with nr
#--> extract 64 contigs using 'python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_extracted_contigs.fa'
[1_INDEX]/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[2,1]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[3]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein" -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[4]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.fa -db "/mnt/h1/jhuang/blast/nt" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[5]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.fa -db "/mnt/h1/jhuang/blast/nr" -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx_nr.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
VrAP pipeline finished.
Thank you for using VrAP!
3.2. ragtag.py ((bengal3_ac3) jhuang@hamm)
#ragtag.py patch ragtag_output_scaffold/ragtag.scaffold.fasta CP059040.fasta
ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
Acinetobacter baumannii strain ATCC 19606 plasmid unnamed, complete sequence Acinetobacter baumannii 19653 39248 62% 0.0 99.97% 18598 CP074586.1
python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
ragtag.py scaffold CP059040.fasta A10_extracted_contigs.fa
python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
# Check the converage to determine if they are chr or plasmids!
ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
Acinetobacter baumannii str. AYE plasmid p2ABAYE, complete genome Acinetobacter baumannii AYE 16835 18307 100% 0.0 99.99% 9661 CU459138.1
Acinetobacter baumannii str. AYE plasmid p1ABAYE, complete genome Acinetobacter baumannii AYE 9494 10856 100% 0.0 100.00% 5644 CU459137.1
Acinetobacter baumannii str. AYE plasmid p4ABAYE, complete genome Acinetobacter baumannii AYE 3709 5265 100% 0.0 99.95% 2726 CU459139.1
Acinetobacter baumannii str. AYE plasmid p3ABAYE, complete genome Acinetobacter baumannii AYE 1.384e+05 1.838e+05 100% 0.0 100.00% 94413 CU459140.1
ragtag.py scaffold CP033110.fasta A19_vrap_out/vrap_contig.fasta
#using CP074710.1 in the round2 (see below)
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed1, complete sequence Acinetobacter baumannii 2.373e+05 2.463e+05 100% 0.0 100.00% 148956 CP074709.1
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence Acinetobacter baumannii 24842 50362 100% 0.0 100.00% 38118 CP074711.1
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence Acinetobacter baumannii 20953 37147 100% 0.0 100.00% 20351 CP074707.1
Acinetobacter baumannii ATCC 17978 chromosome, complete genome Acinetobacter baumannii ATCC 17978 13245 16527 100% 0.0 100.00% 4005343 CP053098.1
3.3 vrap round2
#A10
vrap/vrap_A6_chr.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A19
vrap/vrap_CP074710.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
3.4 ragtag round2
python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_vrap_out/A10_extracted_contigs.fa
ragtag.py scaffold A6_chr.fasta A10_vrap_out/A10_extracted_contigs.fa
mv ragtag_output A10_ragtag_output
python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence Acinetobacter baumannii 24842 50362 100% 0.0 100.00% 38118 CP074711.1
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence Acinetobacter baumannii 20953 37147 100% 0.0 100.00% 20351 CP074707.1
3.5 ragtag round3
ragtag.py patch CP059040.fasta A6_vrap_out/vrap_contig.fasta
ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
ragtag.py scaffold CP059040.fasta A10_vrap_out/A10_extracted_contigs.fa
python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
# Check the converage to determine if they are chr or plasmids!
#>CAP_12_length_2972
#Bacillus paranthracis strain Bc006 chromosome, complete genome Bacillus paranthracis 4691 5472 100% 0.0 99.73% 5304537 CP119629.1
#Bacillus paranthracis strain Bc006 chromosome, complete genome Bacillus paranthracis 9762 11557 97% 0.0 99.87% 5304537 CP119629.1
#Bacillus paranthracis strain Bc006 chromosome, complete genome Bacillus paranthracis 14883 1.353e+05 100% 0.0 99.98% 5304537 CP119629.1
#CP119629.1 Submitted (10-MAR-2023) College of Animal Science, Ahau, Chagnjianxilu, Anhui 230000, China
ragtag.py patch CU459141.fasta A12_vrap_out/vrap_contig.fasta
ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
ragtag.py patch CP074710.fasta A19_vrap_out/vrap_contig.fasta
ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
A10:
CP059040_RagTag 3941335 17 60 61
CAP_11_length_2972 2972 4007061 60 61
CAP_1_length_17300 17300 4010103 60 61
samtools faidx A10_ragtag_contigs_2000.fa CAP_1_length_17300 > A10_plasmid.fasta
3.6 To map reads from one isolate (isolate 2) to the genome of another isolate (isolate 1), check coverage, and generate a consensus FASTA from the alignment, with regions not covered by the mapping masked as 'N', you'll need to follow a series of steps using bioinformatics tools.
# -- Mapping to CP059040 --
5447038 + 0 in total (QC-passed reads + QC-failed reads)
5447038 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
3346945 + 0 mapped (61.45% : N/A)
3346945 + 0 primary mapped (61.45% : N/A)
4361512 + 0 paired in sequencing
2180756 + 0 read1
2180756 + 0 read2
2579038 + 0 properly paired (59.13% : N/A)
2681114 + 0 with itself and mate mapped
21988 + 0 singletons (0.50% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
# -- Mapping to A6 --
5447044 + 0 in total (QC-passed reads + QC-failed reads)
5447044 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
3437574 + 0 mapped (63.11% : N/A)
3437574 + 0 primary mapped (63.11% : N/A)
4361524 + 0 paired in sequencing
2180762 + 0 read1
2180762 + 0 read2
2646686 + 0 properly paired (60.68% : N/A)
2753738 + 0 with itself and mate mapped
22055 + 0 singletons (0.51% : N/A)
1484 + 0 with mate mapped to a different chr
1390 + 0 with mate mapped to a different chr (mapQ>=5)
#- Sort and Index the BAM File:
# samtools sort -o sorted.bam input.bam
# samtools index sorted.bam
#- Call Variants:
# bcftools mpileup -Ou -f reference.fasta sorted.bam | bcftools call -mv -Oz -o calls.vcf.gz
# bcftools index calls.vcf.gz
#- Generate Consensus FASTA:
# bcftools consensus -f reference.fasta calls.vcf.gz -o consensus.fasta
Step 1: Align Reads to Reference Genome
First, align the reads from isolate 2 to the genome of isolate 1. For example, using BWA:
bwa mem reference_genome.fasta isolate2_reads.fastq > aligned.sam
or, use vrap to generate input.bam
#for bowtie (1 DB): --host Bacillus_cereus.fasta
#for blast (5 DBs): --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr, and two default download_db.py
#A6_chr_plasmid.fasta reads filtered, should only the contamination reads remaining!
vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_host_A6 --host A6_chr_plasmid.fasta --noblast -t 55
Step 2: Convert SAM to BAM, Sort and Index
Convert the SAM file to BAM, sort it, and create an index:
samtools view -Sb mapped > aligned.bam
samtools sort aligned.bam -o sorted.bam
samtools index sorted.bam
Step 3: Coverage Analysis
Check the coverage of the genome regions. Regions with no coverage will be masked later.
#bedtools genomecov -ibam sorted.bam -bg > coverage.bed
#BUG above: By default, Bedtools doesn't explicitly list regions with zero coverage in the output of genomecov. To get around this, you can use the -bga option with genomecov. This option ensures that regions with zero coverage are also included in the output. Here's how you can modify your command:
bedtools genomecov -ibam sorted.bam -bga > coverage.bed
Step 4: Create a Bed File for Masking
Create a BED file that specifies regions with no or low coverage. Assume that any region with coverage less than a certain threshold (e.g., 1x) should be masked:
awk '$4 < 1 {print $1 "\t" $2 "\t" $3}' coverage.bed > low_coverage.bed
Step 5: Generate Consensus Sequence
Generate a consensus sequence, using bcftools, and mask regions with low or no coverage:
bcftools mpileup -Ou -f ../../CP059040.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
bcftools index calls.vcf.gz
bcftools consensus -f ../../CP059040.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
#The site CP059040:4 overlaps with another variant, skipping...
#The site CP059040:3125036 overlaps with another variant, skipping...
#Applied 58 variants
bcftools mpileup -Ou -f ../../A6_chr_plasmid.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
bcftools index calls.vcf.gz
bcftools consensus -f ../../A6_chr_plasmid.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
#The site seq00000000:4 overlaps with another variant, skipping...
#The site seq00000000:3125036 overlaps with another variant, skipping...
#Applied 61 variants
#--> TODO: could report the following regions are not covered in A10 comparing to A6.
#seq00000000 364666 365765
#seq00000000 2810907 2813323
#seq00000000 2813552 2813779
#seq00000000 2813923 2819663
#seq00000000 2821839 2831921
#seq00000000 2832750 2832753
#seq00000000 2833977 2860354
#seq00000000 2860505 2861780
#seq00000000 2861931 2861951
#seq00000000 2862102 2862234
#seq00000000 2862385 2863434
#seq00000000 3124939 3124943
#seq00000000 3125009 3125010
In this command, the -m option in bcftools consensus is used to specify a BED file for masking low-coverage regions.
4.2-1. generate genebank in snpEff
mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC17978
cp ATCC17978.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC17978/genes.gbk
mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC19606
cp ATCC19606.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC19606/genes.gbk
mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/AYE
cp AYE.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/AYE/genes.gbk
vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC17978 -d
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC19606 -d
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank AYE -d
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC17978
# Protein check: ATCC17978 OK: 3644 Not found: 81 Errors: 0 Error percentage: 0.0%
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC19606
# Protein check: ATCC19606 OK: 3702 Not found: 0 Errors: 0 Error percentage: 0.0%
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank AYE
# Protein check: AYE OK: 3607 Not found: 52 Errors: 0 Error percentage: 0.0%
4.3. using spandx calling variants
#Note that we recall the wild type (isolate 150) SNPs to its own assembly to increase the precision.
gzip A19_17978_HQ_trimmed_P_1.fastq A19_17978_HQ_trimmed_P_2.fastq
#ln -s /home/jhuang/Tools/spandx/ spandx
#-t
snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf
#
# 'CP040849' 2659111 Standard
(spandx) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii/results_ATCC17978$ nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC17978.fasta --annotation --database ATCC17978 -resume
gzip A12_AYE_HQ_trimmed_P_1.fastq A12_AYE_HQ_trimmed_P_2.fastq
ln -s /home/jhuang/Tools/spandx/ spandx
nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref AYE.fasta --annotation --database AYE -resume
gzip A6_WT_HQ_trimmed_P_1.fastq A6_WT_HQ_trimmed_P_2.fastq A10_CraA_HQ_trimmed_P_1.fastq A10_CraA_HQ_trimmed_P_2.fastq
ln -s /home/jhuang/Tools/spandx/ spandx
nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC19606.fasta --annotation --database ATCC19606 -resume
# DEBUG: "--mixtures --structural" does not work well!
#pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out
#pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out
6, filtering process of SPANDx results
#Input file is Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt (61203)
#- Step1: filtering $6!=$7 (330)
awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
#- Step2: restricted the first 7 columns (240)
cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
#- Step3: filtering examples as "G/C" (1450)
grep -v "/" f1_7 > f1_7_
#- Step4: filtering examples as ". A" (170)
grep -v "\." f1_7_ > f1_7__
#- Step5: filtering examples as "T *" (162), since we want to have only SNPs differing between BK20399 and GE3138.
grep -v "*" f1_7__ > f1_7___
#- Step6: only SNPs (50).
grep -v "INDEL" f1_7___ > f1_7____
#- Step7: get the complete rows with the positions
cut -d$'\t' -f2-2 f1_7____ > f2
replace "\n" with " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
grep "46936" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
....
#- Step8: Missense mutations ()
grep "missense_variant" All_SNPs_annotated.txt
7, indel calling using freebayes from merged bam
mkdir freebayes
cd freebayes
#run picard unter java17
(java17) picard-tools AddOrReplaceReadGroups I=../Outputs/bams/A12_AYE_HQ_trimmed.dedup.bam O=A12_AYE_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=A12_AYE RGSM=A12_AYE RGLB=standard RGPU=A12_AYE VALIDATION_STRINGENCY=LENIENT
#(java17) picard-tools AddOrReplaceReadGroups I=../Outputs/bams/148_trimmed.dedup.bam O=148_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=148 RGSM=148 RGLB=standard RGPU=148 VALIDATION_STRINGENCY=LENIENT
#(java17) picard-tools AddOrReplaceReadGroups I=../Outputs/bams/149_trimmed.dedup.bam O=149_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=149 RGSM=149 RGLB=standard RGPU=149 VALIDATION_STRINGENCY=LENIENT
#(java17) picard-tools MergeSamFiles INPUT=150_readgroup-added.bam INPUT=148_readgroup-added.bam INPUT=149_readgroup-added.bam OUTPUT=merged_picard.bam
#(java17) picard-tools CreateSequenceDictionary R=wildtype_150.fasta O=wildtype_150.dict
#Note that the reference is 'wildtype_150'.
# Direkt SNP calling is not suggested. It is better if we take a realigning step before SNP and Indel-calling.
#conda install -c bioconda gatk4 gatk
#gatk-register /home/jhuang/Tools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar
#need wildtype_150.fasta.fai and wildtype_150.dict
#under the bengal3_ac3 environment
(bengal3_ac3) samtools index merged_picard.bam
(bengal3_ac3) gatk -T RealignerTargetCreator -I merged_picard.bam -R /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP/wildtype_150.fasta -o realigned.merged.bam.list
(bengal3_ac3) gatk -T IndelRealigner -I merged_picard.bam -R /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP/wildtype_150.fasta -targetIntervals realigned.merged.bam.list -o merged_realigned.bam
freebayes --fasta-reference /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP_PUBLISHING/wildtype_150.fasta -b merged_realigned.bam -p 1 -i -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.realigned.snps_filtered_freebayes.vcf
freebayes --fasta-reference /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP_PUBLISHING/wildtype_150.fasta -b merged_realigned.bam -p 1 -I -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.realigned.indels_filtered_freebayes.vcf
#BETTER with realigned bam (see commands above): freebayes --fasta-reference /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP_PUBLISHING/wildtype_150.fasta -b merged_picard.bam -p 1 -I -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.indels_filtered_freebayes.vcf
#freebayes -f {params.reference} \
# --ploidy 1 \
# --min-coverage {params.mincov} \
# --min-mapping-quality {params.mapqual} \
# --min-base-quality {params.minbasequal} \
# {input.forward} {input.reverse} > {output.vcf}
#To call structural variants (indels) and single nucleotide polymorphisms (SNPs) using FreeBayes, you'll need to follow a series of steps. FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a read.
#FreeBayes is primarily designed for calling small-scale variants like SNPs and indels (insertions and deletions) but is not typically used for large-scale structural variant detection (like large insertions, deletions, inversions, or translocations). Structural variants are usually larger in size, often exceeding the length of a single read, which is beyond the scope of what FreeBayes is optimized to handle.
#For calling structural variants, you would generally use other tools that are specifically designed for this purpose, such as:
Delly: For large deletions, tandem duplications, inversions, and translocations.
LUMPY: A probabilistic framework for structural variant discovery.
Manta: Designed for efficient calling of structural variants in cancer and germline studies.
CNVnator: For discovering copy number variants.
#If you're specifically looking to use FreeBayes for larger indels that might be on the edge of what is considered a small-scale variant, you can adjust the parameters to be more permissive with respect to variant size. However, this is not typically recommended for true structural variants.
For small indels and SNPs, FreeBayes is quite effective, and you would follow the general procedure of aligning your reads, processing the BAM files, and then running FreeBayes with appropriate parameters set for your study's requirements. But again, for structural variant detection, other tools would be more suitable.
8, merge results from two variant calling methods!
#-- post-processing --
awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
grep -v "/" f1_7 > f1_7_
grep -v "\." f1_7_ > f1_7__
grep -v "*" f1_7__ > f1_7___
grep -v "INDEL" f1_7___ > f1_7_SNPs
cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
\n --> " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
#grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
#grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
#grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
grep -v "SNP" f1_7___ > f1_7_indels
cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
\n --> " All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt\ngrep "
#grep "CHROM" All_SNPs_indels_annotated_.txt > All_indels_annotated.txt
#grep "49848" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
#grep "53872" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
9 (optional), PubMLST, ResFinder or RGI calling https://cge.cbs.dtu.dk/services/ https://cge.cbs.dtu.dk/services/MLST/ https://pubmlst.org/bigsdb?db=pubmlst_pacnes_seqdef&page=batchProfiles&scheme_id=3 https://cge.food.dtu.dk/services/ResFinder/ https://card.mcmaster.ca/home https://www.pseudomonas.com/ https://www.pseudomonas.com/amr/list https://card.mcmaster.ca/analyze/rgi The antimicrobial resistance genes were predicted by the tool RGI 6.0.0 which utilizes the database CARD 3.2.5 (PMID: 31665441). !!!!#check the difference of SNPs in the two files, and then look for the SNPs in my tables comparing the two isolates!!!!
cut -d$'\t' -f9-21 BK20399_contigs.fa.txt > f9_21
cut -d$'\t' -f2-6 BK20399_contigs.fa.txt > f2_6
#Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
paste f9_21 f2_6 > BK20399_.txt
#grep "CARD_Protein_Sequence" BK20399_.txt > header
grep -v "CARD_Protein_Sequence" BK20399_.txt > BK20399__.txt
sort BK20399__.txt > BK20399_sorted.txt
cut -d$'\t' -f9-21 GE3138_contigs.fa.txt > f9_21
cut -d$'\t' -f2-6 GE3138_contigs.fa.txt > f2_6
paste f9_21 f2_6 > GE3138_.txt
grep -v "CARD_Protein_Sequence" GE3138_.txt > GE3138__.txt
sort GE3138__.txt > GE3138_sorted.txt
cut -d$'\t' -f1-6 BK20399_sorted.txt > BK20399_f1_6.txt
cut -d$'\t' -f1-6 GE3138_sorted.txt > GE3138_f1_6.txt
diff BK20399_f1_6.txt GE3138_f1_6.txt
#-->< APH(3'')-Ib 99.63 3002639 protein homolog model n/a n/a
cat header BK20399_sorted.txt > BK20399.txt
cat header GE3138_sorted.txt > GE3138.txt
~/Tools/csv2xls-0.4/csv_to_xls.py BK20399.txt GE3138.txt -d$'\t' -o ARG_calling.xls
#---- another example ----
cut -d$'\t' -f9-21 AW27149_before.txt > f9_21
cut -d$'\t' -f2-6 AW27149_before.txt > f2_6
#Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
paste f9_21 f2_6 > AW27149_before_.txt
grep "CARD_Protein_Sequence" AW27149_before_.txt > header
grep -v "CARD_Protein_Sequence" AW27149_before_.txt > AW27149_before__.txt
sort AW27149_before__.txt > AW27149_before_sorted.txt
cut -d$'\t' -f9-21 AW48641_after.txt > f9_21
cut -d$'\t' -f2-6 AW48641_after.txt > f2_6
paste f9_21 f2_6 > AW48641_after_.txt
grep -v "CARD_Protein_Sequence" AW48641_after_.txt > AW48641_after__.txt
sort AW48641_after__.txt > AW48641_after_sorted.txt
cut -d$'\t' -f1-6 AW27149_before_sorted.txt > AW27149_before_f1_6.txt
cut -d$'\t' -f1-6 AW48641_after_sorted.txt > AW48641_after_f1_6.txt
diff AW27149_before_f1_6.txt AW48641_after_f1_6.txt
#48c48
#< Pseudomonas aeruginosa CpxR 100.0 3004054 protein homolog model n/a n/a
#---
#> Pseudomonas aeruginosa CpxR 99.56 3004054 protein homolog model n/a n/a
cat header AW27149_before_sorted.txt > ../RGI_calling_AW27149_before.txt
cat header AW48641_after_sorted.txt > ../RGI_calling_AW48641_after.txt
#mlst add header "isolate species ST acsA aroE guaA mutL nuoD ppsA trpE"
mv
~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_annotated.csv RGI_calling_AW27149_before.txt RGI_calling_AW48641_after.txt mlst.txt -d$'\t' -o SNP_ARG_calling.xls
#Find the common SNP between SNP calling and RGI calling, since the SNP calling has bad annotation, marked yellow.
点赞本文的读者
还没有人对此文章表态
没有评论
Typing of S. epidermidis samples (MD P.B.)
Transposon analyses for the nanopore sequencing
Updated List of nf-core Pipelines (Released) Sorted by Stars (as of November 22, 2024)
© 2023 XGenes.com Impressum