gene_x 0 like s 408 view s
Tags: pipeline
input data
#Deciphering Evolutionary Trajectories in Acinetobacter baumannii: A Comparative Study of Isolates ATCC 19606, AYE, and ATCC 17978
#PRJNA1041744
A6 WT - Acinetobacter baumannii ATCC 19606, CP059040
A10 CraA - Acinetobacter baumannii ATCC 19606, CP059040
BIT33_RS17375: adeJ
BIT33_RS17370: adeI
BIT33_RS11855: craA (MKNIQTTALNRTTLMFPLALVLFEFAVYIGNDLIQPAMLAITED
FGVSATWAPSSMSFYLLGGASVAWLLGPLSDRLGRKKVLLSGVLFFALCCFLILLTRQ
IEHFLTLRFLQGIGLSVISAVGYAAIQENFAERDAIKVMALMANISLLAPLLGPVLGA
FLIDYVSWHWGFVAIALLALLSWVGLKKQMPSHKVSVTKQPFSYLFDDFKKVFSNRQF
LGLTLALPLVGMPLMLWIALSPIILVDELKLTSVQYGLAQFPVFLGLIVGNIVLIKII
DRLALGKTVLIGLPIMLTGTLILILGVVWQAYLIPCLLIGMTLICFGEGISFSVLYRF
ALMSSEVSKGTVAAAVSMLLMTSFFAMIELVRYLYTQFHLWAFVLSAFAFIALWFTQP
RLALKREMQERVAQDLH) MFS transporter --> H0N29_01695
364663 - 365770
A12 AYE - Acinetobacter baumannii AYE, CU459141
A19 17978 - Acinetobacter baumannii ATCC 17978 CP033110
A1S_2736: adeJ
A1S_2735: adeI
A1S_3146: craA
mv A6WT_HQ_R1.fq.gz A6_WT_HQ_R1.fastq.gz
mv A6WT_HQ_R2.fq.gz A6_WT_HQ_R2.fastq.gz
mv A10CraA_HQ_R1.fq.gz A10_CraA_HQ_R1.fastq.gz
mv A10CraA_HQ_R2.fq.gz A10_CraA_HQ_R2.fastq.gz
mv A12AYE_HQ_R1.fq.gz A12_AYE_HQ_R1.fastq.gz
mv A12AYE_HQ_R2.fq.gz A12_AYE_HQ_R2.fastq.gz
mv A1917978_HQ_R1.fq.gz A19_17978_HQ_R1.fastq.gz
mv A1917978_HQ_R2.fq.gz A19_17978_HQ_R2.fastq.gz
#Nucleotide Accession Prefixes
#AP,BS DDBJ Genome project data
#AL,BX,CR,CT,CU,FP,FQ,FR EMBL Genome project data
#AE,CP,CY GenBank Genome project data
#transposon detection using gubbins
#TODOs: Submit A6, A12 and A19 to NCBI, A10 not, since the assembly is very bad! Could do mapping plot of A10 on A6, and three genebank-files to Tam!
#Wuen Ee Foong
#Jiabin Huang
#Heng-Keat Tam, Hengyang Medical College, University of South China
#pA12AYE1
prokka
for sample in A19_17978_HQ; do
prokka --force --outdir prokka/${sample} --cpus 2 --usegenus --genus Acinetobacter --kingdom Bacteria --species baumannii --addgenes --addmrna --prefix ${sample} --locustag ${sample} shovill/${sample}/contigs.fa -hmm /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
done
(prokka on titisee) for the remaining steps (f,f,f,f,f,t,f(variants_calling),t,t,t in bacto-0.1.json)
#TODO: annotate the SNPs with antimicrobial gene database!
#TODO: ask Qianjie die sample are directly gotten from the patient, oder did some motification on the reference genome?
3.1. vrap (TODO: deploy vrap on xgenes.com, it does not use the database. under (vrap) jhuang@hamm[hamburg])
#In summary: --(optional) host --> bowtie/host (bowtie_database for host-substraction)
# --(optional) virus --> blast/db/virus (blast1, in nt_dbs, custom blastn database before downloaded_db!)
# --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (blast2, in nt_dbs)
# --> database/viral_db/viral_protein (blast3, in prot_db)
# --(optional) nt+nr (blast4 and blast5)
#Under (vrap) jhuang@hamm or (bengal3_ac3) jhuang@hamburg
#DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
#DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py
#gi|1690449040|gb|CP029454.1| Bacillus cereus strain FORC087 chromosome, complete genome
#gi|1621560499|gb|CP039269.1| Bacillus cereus strain MH19 chromosome, complete genome
#gi|1783154430|gb|CP040340.1| Bacillus cereus strain DLOU-Tangshan chromosome, complete genome
#CP059040 3980852 10 70 71
#CU459141 3936291 4037743 70 71
#CP033110 3902037 8030278 70 71
#CP059040 --> A6 and A10
#CU459141 --> A12
#CP033110 --> A19
#Command line: /home/jhuang/miniconda3/envs/bengal3_ac3/bin/spades.py -1 A6_WT_HQ_R1.fastq.gz -2 A6_WT_HQ_R2.fastq.gz --careful --trusted-contigs CP059040.fasta -o spades_A6_ATCC19606
#CP059040_RagTag 3866625 17 3866625 3866626
#CAP_1_length_17300 17300 3866663 17300 17301
#CAP_2_length_928 928 3883982 928 929
#NODE_17_length_11061_cov_141.020669 11061 3884948 11061 11062
#NODE_19_length_1654_cov_105.893255 1654 3896046 1654 1655
#NODE_20_length_1102_cov_153.073846 1102 3897737 1102 1103
#NODE_21_length_1040_cov_172.078861 1040 3898876 1040 1041
#NOTE that we need to adjust the taxid in download_db.py to download viral_nucleotide and viral_protein before running vrap/vrap.py.
#A10
#Blast with 5 DBs when --virus (blast/db/virus), viral_nucleotide (download_db.rb), viral_protein (download_db.rb), nt, nr
#Bowtie2 with 1 DB when --host (bowtie/host)
#NOTE the fasta files given with --host and --virus should be deduplicated!
#NOTE that in vrap_CP059040.py the trusted-contigs is used "--trusted-contigs /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/CP074710.fasta"
vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A6
vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A12
vrap/vrap_CU459141.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A19
vrap/vrap_CP033110.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
# -- A10 LOG --
START VrAP
START checking blast db
START lighter
START estimate k-mer size
k-mer size: 15037634
/home/jhuang/miniconda3/envs/vrap/bin/lighter -r A10_CraA_HQ_trimmed_P_1.fastq.gz -r A10_CraA_HQ_trimmed_P_2.fastq.gz -K 20 15037634 -od A10_vrap_out/lighter -t 55
START flash
/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_1.cor.fq.gz A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_2.cor.fq.gz -o flash -d A10_vrap_out/flash -M 1000
START bowtie2-build
START bowtie2 pe
#MAPPING to 1 DB: bowtie/host
START spades
START cap3
#/home/jhuang/Tools/vrap/external_tools/cap3 A10_vrap_out/spades/contigs.fasta -y 100
START calculating orf density
START HMMER
START blast
#grep ">" A10_vrap_out/blast/custom_viral_seq.fa == "--virus Acinetobacter_baumannii.fasta"
#MAPPING to 5 DBs: blast/db/virus[1], viral_nucleotide (defined in the download_db.py)[2], viral_protein (defined in the download_db.py)[3], nt[4], nr[5]
#INPUT_SEQ:
#(Assuming, if annotated with the previous database, will not translate to the next level). For the example below: we have no record finding in [2,1], then we have 1317-0 in blast/blastn.fa; In [3] we find 'cut -f1 blastx.csv | sort -u | wc -l = 567', then we have 1317-567=750; In [4], we 'cut -f1 blastn_nt.csv | sort -u | wc -l=710', then we have 750-710=40 in blast/blastn_nt.fa; In [5], we find 'cut -f1 blastn_nr.csv | sort -u | wc -l=39', then we remain 40-39=1 record in blastx_nr.fa.
#grep ">" vrap_contig.fasta | wc -l #1317
#grep ">" blast/blastn.fa | wc -l #1317
#grep ">" blast/blastx.fa | wc -l #750
#grep ">" blast/blastn_nt.fa | wc -l #40
#grep ">" blast/blastx_nr.fa | wc -l #1
#--
#grep ">" vrap_contig.fasta | wc -l #2370
#grep ">" blast/blastn.fa | wc -l #2306 --> 64 contigs should belong to A. baumannii! can be annotated with --custom_virus (3 speicies of Acinetobacter baumannii) and downloaded [virus_nt_db] (2.6 G Acinetobacter baumannii)
#grep ">" blast/blastx.fa | wc -l #1245: 1061 contigs can be annotated with downloaded [virus_aa_db]
#grep ">" blast/blastn_nt.fa | wc -l #41: 1204 contigs can be annotated with nt
#grep ">" blast/blastx_nr.fa | wc -l #1: 40 contigs can be annotated with nr
#--> extract 64 contigs using 'python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_extracted_contigs.fa'
[1_INDEX]/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[2,1]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[3]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein" -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[4]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.fa -db "/mnt/h1/jhuang/blast/nt" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
[5]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.fa -db "/mnt/h1/jhuang/blast/nr" -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx_nr.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log
VrAP pipeline finished.
Thank you for using VrAP!
3.2. ragtag.py ((bengal3_ac3) jhuang@hamm)
#ragtag.py patch ragtag_output_scaffold/ragtag.scaffold.fasta CP059040.fasta
ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
Acinetobacter baumannii strain ATCC 19606 plasmid unnamed, complete sequence Acinetobacter baumannii 19653 39248 62% 0.0 99.97% 18598 CP074586.1
python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
ragtag.py scaffold CP059040.fasta A10_extracted_contigs.fa
python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
# Check the converage to determine if they are chr or plasmids!
ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
Acinetobacter baumannii str. AYE plasmid p2ABAYE, complete genome Acinetobacter baumannii AYE 16835 18307 100% 0.0 99.99% 9661 CU459138.1
Acinetobacter baumannii str. AYE plasmid p1ABAYE, complete genome Acinetobacter baumannii AYE 9494 10856 100% 0.0 100.00% 5644 CU459137.1
Acinetobacter baumannii str. AYE plasmid p4ABAYE, complete genome Acinetobacter baumannii AYE 3709 5265 100% 0.0 99.95% 2726 CU459139.1
Acinetobacter baumannii str. AYE plasmid p3ABAYE, complete genome Acinetobacter baumannii AYE 1.384e+05 1.838e+05 100% 0.0 100.00% 94413 CU459140.1
ragtag.py scaffold CP033110.fasta A19_vrap_out/vrap_contig.fasta
#using CP074710.1 in the round2 (see below)
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed1, complete sequence Acinetobacter baumannii 2.373e+05 2.463e+05 100% 0.0 100.00% 148956 CP074709.1
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence Acinetobacter baumannii 24842 50362 100% 0.0 100.00% 38118 CP074711.1
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence Acinetobacter baumannii 20953 37147 100% 0.0 100.00% 20351 CP074707.1
Acinetobacter baumannii ATCC 17978 chromosome, complete genome Acinetobacter baumannii ATCC 17978 13245 16527 100% 0.0 100.00% 4005343 CP053098.1
3.3 vrap round2
#A10
vrap/vrap_A6_chr.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A19
vrap/vrap_CP074710.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
3.4 ragtag round2
python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_vrap_out/A10_extracted_contigs.fa
ragtag.py scaffold A6_chr.fasta A10_vrap_out/A10_extracted_contigs.fa
mv ragtag_output A10_ragtag_output
python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence Acinetobacter baumannii 24842 50362 100% 0.0 100.00% 38118 CP074711.1
Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence Acinetobacter baumannii 20953 37147 100% 0.0 100.00% 20351 CP074707.1
3.5 ragtag round3
ragtag.py patch CP059040.fasta A6_vrap_out/vrap_contig.fasta
ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
ragtag.py scaffold CP059040.fasta A10_vrap_out/A10_extracted_contigs.fa
python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
# Check the converage to determine if they are chr or plasmids!
#>CAP_12_length_2972
#Bacillus paranthracis strain Bc006 chromosome, complete genome Bacillus paranthracis 4691 5472 100% 0.0 99.73% 5304537 CP119629.1
#Bacillus paranthracis strain Bc006 chromosome, complete genome Bacillus paranthracis 9762 11557 97% 0.0 99.87% 5304537 CP119629.1
#Bacillus paranthracis strain Bc006 chromosome, complete genome Bacillus paranthracis 14883 1.353e+05 100% 0.0 99.98% 5304537 CP119629.1
#CP119629.1 Submitted (10-MAR-2023) College of Animal Science, Ahau, Chagnjianxilu, Anhui 230000, China
ragtag.py patch CU459141.fasta A12_vrap_out/vrap_contig.fasta
ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
ragtag.py patch CP074710.fasta A19_vrap_out/vrap_contig.fasta
ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
A10:
CP059040_RagTag 3941335 17 60 61
CAP_11_length_2972 2972 4007061 60 61
CAP_1_length_17300 17300 4010103 60 61
samtools faidx A10_ragtag_contigs_2000.fa CAP_1_length_17300 > A10_plasmid.fasta
3.6 To map reads from one isolate (isolate 2) to the genome of another isolate (isolate 1), check coverage, and generate a consensus FASTA from the alignment, with regions not covered by the mapping masked as 'N', you'll need to follow a series of steps using bioinformatics tools.
# -- Mapping to CP059040 --
5447038 + 0 in total (QC-passed reads + QC-failed reads)
5447038 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
3346945 + 0 mapped (61.45% : N/A)
3346945 + 0 primary mapped (61.45% : N/A)
4361512 + 0 paired in sequencing
2180756 + 0 read1
2180756 + 0 read2
2579038 + 0 properly paired (59.13% : N/A)
2681114 + 0 with itself and mate mapped
21988 + 0 singletons (0.50% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)
# -- Mapping to A6 --
5447044 + 0 in total (QC-passed reads + QC-failed reads)
5447044 + 0 primary
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
3437574 + 0 mapped (63.11% : N/A)
3437574 + 0 primary mapped (63.11% : N/A)
4361524 + 0 paired in sequencing
2180762 + 0 read1
2180762 + 0 read2
2646686 + 0 properly paired (60.68% : N/A)
2753738 + 0 with itself and mate mapped
22055 + 0 singletons (0.51% : N/A)
1484 + 0 with mate mapped to a different chr
1390 + 0 with mate mapped to a different chr (mapQ>=5)
#- Sort and Index the BAM File:
# samtools sort -o sorted.bam input.bam
# samtools index sorted.bam
#- Call Variants:
# bcftools mpileup -Ou -f reference.fasta sorted.bam | bcftools call -mv -Oz -o calls.vcf.gz
# bcftools index calls.vcf.gz
#- Generate Consensus FASTA:
# bcftools consensus -f reference.fasta calls.vcf.gz -o consensus.fasta
Step 1: Align Reads to Reference Genome
First, align the reads from isolate 2 to the genome of isolate 1. For example, using BWA:
bwa mem reference_genome.fasta isolate2_reads.fastq > aligned.sam
or, use vrap to generate input.bam
#for bowtie (1 DB): --host Bacillus_cereus.fasta
#for blast (5 DBs): --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr, and two default download_db.py
#A6_chr_plasmid.fasta reads filtered, should only the contamination reads remaining!
vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_host_A6 --host A6_chr_plasmid.fasta --noblast -t 55
Step 2: Convert SAM to BAM, Sort and Index
Convert the SAM file to BAM, sort it, and create an index:
samtools view -Sb mapped > aligned.bam
samtools sort aligned.bam -o sorted.bam
samtools index sorted.bam
Step 3: Coverage Analysis
Check the coverage of the genome regions. Regions with no coverage will be masked later.
#bedtools genomecov -ibam sorted.bam -bg > coverage.bed
#BUG above: By default, Bedtools doesn't explicitly list regions with zero coverage in the output of genomecov. To get around this, you can use the -bga option with genomecov. This option ensures that regions with zero coverage are also included in the output. Here's how you can modify your command:
bedtools genomecov -ibam sorted.bam -bga > coverage.bed
Step 4: Create a Bed File for Masking
Create a BED file that specifies regions with no or low coverage. Assume that any region with coverage less than a certain threshold (e.g., 1x) should be masked:
awk '$4 < 1 {print $1 "\t" $2 "\t" $3}' coverage.bed > low_coverage.bed
Step 5: Generate Consensus Sequence
Generate a consensus sequence, using bcftools, and mask regions with low or no coverage:
bcftools mpileup -Ou -f ../../CP059040.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
bcftools index calls.vcf.gz
bcftools consensus -f ../../CP059040.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
#The site CP059040:4 overlaps with another variant, skipping...
#The site CP059040:3125036 overlaps with another variant, skipping...
#Applied 58 variants
bcftools mpileup -Ou -f ../../A6_chr_plasmid.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
bcftools index calls.vcf.gz
bcftools consensus -f ../../A6_chr_plasmid.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
#The site seq00000000:4 overlaps with another variant, skipping...
#The site seq00000000:3125036 overlaps with another variant, skipping...
#Applied 61 variants
#--> TODO: could report the following regions are not covered in A10 comparing to A6.
#seq00000000 364666 365765
#seq00000000 2810907 2813323
#seq00000000 2813552 2813779
#seq00000000 2813923 2819663
#seq00000000 2821839 2831921
#seq00000000 2832750 2832753
#seq00000000 2833977 2860354
#seq00000000 2860505 2861780
#seq00000000 2861931 2861951
#seq00000000 2862102 2862234
#seq00000000 2862385 2863434
#seq00000000 3124939 3124943
#seq00000000 3125009 3125010
In this command, the -m option in bcftools consensus is used to specify a BED file for masking low-coverage regions.
4.1, generate genebank in snpEff
mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP059040
cp db/CP059040.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP059040/genes.gbk
mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CU459141
cp db/CU459141.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CU459141/genes.gbk
mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP079931
cp db/CP079931.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP079931/genes.gbk
vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CP059040 -d
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CU459141 -d
/home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CP079931 -d
4.2, using spandx calling variants
gzip A6_WT_HQ_trimmed_P_1.fastq A6_WT_HQ_trimmed_P_2.fastq A10_CraA_HQ_trimmed_P_1.fastq A10_CraA_HQ_trimmed_P_2.fastq
gzip A19_17978_HQ_trimmed_P_1.fastq A19_17978_HQ_trimmed_P_2.fastq
ln -s /home/jhuang/Tools/spandx/ spandx
#nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC19606.fasta --annotation --database ATCC19606 -resume
nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CP059040.fasta --annotation --database CP059040 -resume
#TODO: --structural Set to true if you would like to identify structural variants
# Note that this step can take a considerable amount of time if
# you have deep sequencing data (default: false).
#Done! Result files are in --> ./Outputs
#If further analysis is required, bam alignments are in --> ./Outputs/bams
#Phylogenetic tree and annotated merged variants are in --> ./Outputs/Phylogeny_and_annotation
#Individual variant files are in --> ./Outputs/Variants/VCFs
gzip A12_AYE_HQ_trimmed_P_1.fastq A12_AYE_HQ_trimmed_P_2.fastq
ln -s /home/jhuang/Tools/spandx/ spandx
#nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref AYE.fasta --annotation --database AYE -resume
nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CU459141.fasta --annotation --database CU459141 -resume
cd ../A19_bacto_out/
ln -s /home/jhuang/Tools/spandx/ spandx
nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CP079931.fasta --annotation --database CP079931 -resume
# LOG
#-t
#snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf
# DEBUG: "--mixtures --structural" does not work well!
#pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out
#pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out
5, manually compare the results of snippy and spandx
Variant Types
Type Name Example
snp Single Nucleotide Polymorphism A => T
mnp Multiple Nuclotide Polymorphism GC => AT
ins Insertion ATT => AGTT
del Deletion ACGG => ACG
complex Combination of snp/mnp ATTC => GTTA
#in snippy/A6_WT_HQ/A6_WT_HQ.csv
CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
CP059040,136447,snp,T,A,A:338 T:0
CP059040,152318,snp,T,A,A:337 T:0,CDS,-,106/213,36/70,missense_variant c.106A>T p.Thr36Ser,H0N29_00700,,DUF1737 domain-containing protein
CP059040,171965,snp,T,A,A:312 T:0,CDS,+,167/798,56/265,missense_variant c.167T>A p.Val56Asp,H0N29_00780,,MBL fold metallo-hydrolase
CP059040,194020,ins,C,CT,CT:317 C:0,CDS,-,3260/3497,1087/1164,frameshift_variant c.3260dupA p.Arg1088fs,H0N29_00885,,PAS-domain containing protein
CP059040,375297,snp,A,T,T:268 A:0
CP059040,468939,complex,ATAATATATTCTTTATAGATGAATACATTTTTTCTTTA,TATAATATATTCTTTATAGATGAATACATTTTTTCTTTAT,TATAATATATTCTTTATAGATGAATACATTTTTTCTTTAT:111 ATAATATATTCTTTATAGATGAATACATTTTTTCTTTA:0
CP059040,609979,snp,A,T,T:301 A:0,CDS,+,1009/1212,337/403,stop_gained c.1009A>T p.Lys337*,H0N29_02925,,SAM-dependent methyltransferase
CP059040,1036730,ins,T,TA,TA:206 T:1
CP059040,1059847,snp,C,T,T:245 C:0,CDS,-,119/642,40/213,missense_variant c.119G>A p.Gly40Asp,H0N29_04880,,DUF2238 domain-containing protein
CP059040,1272639,snp,T,G,G:195 T:0
CP059040,1300706,snp,A,G,G:202 A:0,CDS,+,351/1428,117/475,synonymous_variant c.351A>G p.Ser117Ser,H0N29_06100,,M48 family metalloprotease
CP059040,1970200,snp,T,C,C:163 T:0,CDS,+,281/2136,94/711,missense_variant c.281T>C p.Val94Ala,H0N29_09260,,TonB-dependent siderophore receptor
CP059040,2383727,ins,A,AT,AT:129 A:0
CP059040,2477628,ins,T,TA,TA:90 T:0
CP059040,2525852,ins,A,AT,AT:189 A:0,CDS,-,529/836,177/277,frameshift_variant c.529dupA p.Ile177fs,H0N29_11845,,hypothetical protein
CP059040,3016359,ins,A,AT,AT:178 A:0,CDS,-,2136/3236,712/1077,frameshift_variant c.2136dupA p.Phe713fs,H0N29_14350,cas3f,type I-F CRISPR-associated helicase Cas3
CP059040,3111299,snp,A,G,G:263 A:0,CDS,-,957/1185,319/394,synonymous_variant c.957T>C p.Ile319Ile,H0N29_14775,,hypothetical protein
CP059040,3310021,ins,C,CT,CT:197 C:0
CP059040,3542741,del,GC,G,G:307 GC:0,CDS,+,1746/2231,582/742,frameshift_variant c.1746delC p.Arg583fs,H0N29_16865,,primosomal protein N'
CP059040,3542934,del,CT,C,C:308 CT:0,CDS,+,1938/2231,646/742,frameshift_variant c.1938delT p.Met647fs,H0N29_16865,,primosomal protein N'
CP059040,3570717,del,AC,A,A:292 AC:0,CDS,+,187/1315,63/437,frameshift_variant c.187delC p.Gln63fs,H0N29_16960,,FAD-dependent monooxygenase
CP059040,3629616,ins,C,CT,CT:212 C:0
CP059040,3873573,snp,A,G,G:363 A:0,CDS,+,1705/2187,569/728,missense_variant c.1705A>G p.Asn569Asp,H0N29_18380,,polysaccharide biosynthesis tyrosine autokinase
#in snippy/A10_CraA_HQ/A10_CraA_HQ.csv
CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
CP059040,136447,snp,T,A,A:201 T:0
CP059040,152318,snp,T,A,A:181 T:0,CDS,-,106/213,36/70,missense_variant c.106A>T p.Thr36Ser,H0N29_00700,,DUF1737 domain-containing protein
CP059040,171965,snp,T,A,A:183 T:0,CDS,+,167/798,56/265,missense_variant c.167T>A p.Val56Asp,H0N29_00780,,MBL fold metallo-hydrolase
CP059040,194020,ins,C,CT,CT:160 C:0,CDS,-,3260/3497,1087/1164,frameshift_variant c.3260dupA p.Arg1088fs,H0N29_00885,,PAS-domain containing protein
CP059040,375297,snp,A,T,T:127 A:0
CP059040,468931,ins,A,AT,AT:65 A:1
CP059040,468976,ins,A,AT,AT:72 A:0
CP059040,609979,snp,A,T,T:139 A:0,CDS,+,1009/1212,337/403,stop_gained c.1009A>T p.Lys337*,H0N29_02925,,SAM-dependent methyltransferase
CP059040,1036730,ins,T,TA,TA:101 T:0
CP059040,1059847,snp,C,T,T:109 C:0,CDS,-,119/642,40/213,missense_variant c.119G>A p.Gly40Asp,H0N29_04880,,DUF2238 domain-containing protein
CP059040,1272639,snp,T,G,G:94 T:0
CP059040,1300706,snp,A,G,G:136 A:0,CDS,+,351/1428,117/475,synonymous_variant c.351A>G p.Ser117Ser,H0N29_06100,,M48 family metalloprotease
CP059040,1527276,del,TTGAACC,T,T:29 TTGAACC:1,CDS,+,1327/2115,443/704,conservative_inframe_deletion c.1327_1332delGAACCT p.Glu443_Pro444del,H0N29_07175,,DNA polymerase III subunit gamma/tau
CP059040,1970200,snp,T,C,C:92 T:0,CDS,+,281/2136,94/711,missense_variant c.281T>C p.Val94Ala,H0N29_09260,,TonB-dependent siderophore receptor
CP059040,2383727,ins,A,AT,AT:56 A:0
CP059040,2477628,ins,T,TA,TA:27 T:1
CP059040,2525852,ins,A,AT,AT:124 A:0,CDS,-,529/836,177/277,frameshift_variant c.529dupA p.Ile177fs,H0N29_11845,,hypothetical protein
CP059040,3016359,ins,A,AT,AT:112 A:0,CDS,-,2136/3236,712/1077,frameshift_variant c.2136dupA p.Phe713fs,H0N29_14350,cas3f,type I-F CRISPR-associated helicase Cas3
CP059040,3111299,snp,A,G,G:152 A:0,CDS,-,957/1185,319/394,synonymous_variant c.957T>C p.Ile319Ile,H0N29_14775,,hypothetical protein
CP059040,3310021,ins,C,CT,CT:94 C:0
CP059040,3542741,del,GC,G,G:152 GC:0,CDS,+,1746/2231,582/742,frameshift_variant c.1746delC p.Arg583fs,H0N29_16865,,primosomal protein N'
CP059040,3542934,del,CT,C,C:165 CT:0,CDS,+,1938/2231,646/742,frameshift_variant c.1938delT p.Met647fs,H0N29_16865,,primosomal protein N'
CP059040,3570717,del,AC,A,A:177 AC:0,CDS,+,187/1315,63/437,frameshift_variant c.187delC p.Gln63fs,H0N29_16960,,FAD-dependent monooxygenase
CP059040,3629616,ins,C,CT,CT:125 C:0
CP059040,3873573,snp,A,G,G:190 A:0,CDS,+,1705/2187,569/728,missense_variant c.1705A>G p.Asn569Asp,H0N29_18380,,polysaccharide biosynthesis tyrosine autokinase
#in snippy/A12_AYE_HQ/A12_AYE_HQ.csv
CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
CU459141,169174,snp,G,A,A:296 G:0,CDS,-,74/441,25/146,missense_variant c.74C>T p.Thr25Met,ABAYE0155,,conserved hypothetical protein
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A12_AYE_HQ_trimmed
CU459141 169174 . G A 11566.04 PASS AC=1;AF=1.00;AN=1;DP=307;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=31.04;SOR=0.763;ANN=A|missense_variant|MODERATE|ABAYE0155|ABAYE0155|transcript|ABAYE0155|protein_coding|1/1|c.74C>T|p.Thr25Met|74/441|74/441|25/146|| GT:AD:DP:GQ:PL 1:0,294:294:99:11576,0
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A12_AYE_HQ_trimmed
CU459141 519645 . TA T 175.01 PASS AC=1;AF=1.00;AN=1;DP=15;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=31.88;QD=29.17;SOR=3.912 GT:AD:DP:GQ:PL 1:0,6:6:99:185,0
CU459141 1213004 . C CACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA 3656.01 PASS AC=1;AF=1.00;AN=1;BaseQRankSum=-0.754;DP=160;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=59.15;MQRankSum=-0.555;QD=29.30;ReadPosRankSum=-1.188;SOR=0.846;ANN=CACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA|intragenic_variant|MODIFIER|ABAYEtRNA60|ABAYEtRNA60|gene_variant|ABAYEtRNA60|||n.1213004_1213005insACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA|||||| GT:AD:DP:GQ:PL 1:2,93:95:99:3666,0
#in snippy/A19_17978_HQ/A19_17978_HQ.csv
CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
CP079931,697999,snp,A,C,C:319 A:0,CDS,-,370/723,124/240,missense_variant c.370T>G p.Cys124Gly,KZA74_03385,,folate-binding Fe/S cluster repair protein
CP079931,1594519,snp,A,C,C:232 A:0,CDS,-,558/1122,186/373,missense_variant c.558T>G p.Asn186Lys,KZA74_07445,,ATP-binding protein
CP079931,2140669,snp,A,C,C:180 A:0,CDS,-,1412/1791,471/596,missense_variant c.1412T>G p.Val471Gly,KZA74_10170,,ABC transporter substrate-binding protein
CP079931,2415049,del,CA,C,C:190 CA:0
CP079931,3583600,snp,T,C,C:402 T:0
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A19_17978_HQ_trimmed
CP079931 499908 . G A 3372.04 PASS AC=1;AF=1.00;AN=1;DP=95;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=35.85;QD=28.47;SOR=9.087 GT:AD:DP:GQ:PL 1:0,93:93:99:3382,0
CP079931 499966 . C T 5744.04 PASS AC=1;AF=1.00;AN=1;DP=150;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=37.54;QD=28.10;SOR=10.008 GT:AD:DP:GQ:PL 1:0,148:148:99:5754,0
CP079931 697999 . A C 12568.04 PASS AC=1;AF=1.00;AN=1;DP=336;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=26.58;SOR=0.712;ANN=C|missense_variant|MODERATE|KZA74_03385|KZA74_03385|transcript|KZA74_03385|protein_coding|1/1|c.370T>G|p.Cys124Gly|370/723|370/723|124/240|| GT:AD:DP:GQ:PL 1:0,323:323:99:12578,0
CP079931 1594519 . A C 9115.04 PASS AC=1;AF=1.00;AN=1;DP=239;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=28.16;SOR=0.782;ANN=C|missense_variant|MODERATE|KZA74_07445|KZA74_07445|transcript|KZA74_07445|protein_coding|1/1|c.558T>G|p.Asn186Lys|558/1122|558/1122|186/373|| GT:AD:DP:GQ:PL 1:0,232:232:99:9125,0
CP079931 2140669 . A C 7145.04 PASS AC=1;AF=1.00;AN=1;DP=184;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=25.70;SOR=0.965;ANN=C|missense_variant|MODERATE|KZA74_10170|KZA74_10170|transcript|KZA74_10170|protein_coding|1/1|c.1412T>G|p.Val471Gly|1412/1791|1412/1791|471/596||WARNING_TRANSCRIPT_NO_START_CODON GT:AD:DP:GQ:PL 1:0,180:180:99:7155,0
CP079931 3583600 . T C 16124.04 PASS AC=1;AF=1.00;AN=1;DP=426;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=28.42;SOR=0.779;ANN=C|upstream_gene_variant|MODIFIER|rplL|KZA74_16990|transcript|KZA74_16990|protein_coding||c.-23A>G|||||23| GT:AD:DP:GQ:PL 1:0,409:409:99:16134,0
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A19_17978_HQ_trimmed
CP079931 499924 . AT A 4174.01 PASS AC=1;AF=1.00;AN=1;DP=118;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=36.44;QD=30.23;SOR=9.455 GT:AD:DP:GQ:PL 1:0,112:112:99:4184,0
CP079931 2415049 . CA C 6579.01 PASS AC=1;AF=1.00;AN=1;DP=213;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=32.57;SOR=1.085 GT:AD:DP:GQ:PL 1:0,202:202:99:6589,0
CP079931 3695212 . AT A 342.01 PASS AC=1;AF=1.00;AN=1;DP=10;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=35.05;QD=34.20;SOR=4.804 GT:AD:DP:GQ:PL 1:0,10:10:99:352,0
./Outputs/bams/A10_CraA_HQ_trimmed.dedup.bam
./Outputs/bams/A6_WT_HQ_trimmed.dedup.bam
./snippy/A10_CraA_HQ/A10_CraA_HQ.bam
./snippy/A6_WT_HQ/A6_WT_HQ.bam
TODO after meeting: import the two bam files and annotation of genes into IGV, go to the position craA gene and screenshot show if the gene regions are completely removed? Send the table and screenshot to Tam, say the annotation of genom of NCBI is still pending!
7 (optional), filtering process of SPANDx results
#Input file is Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt (61203)
#- Step1: filtering $6!=$7 (330)
awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
#- Step2: restricted the first 7 columns (240)
cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
#- Step3: filtering examples as "G/C" (1450)
grep -v "/" f1_7 > f1_7_
#- Step4: filtering examples as ". A" (170)
grep -v "\." f1_7_ > f1_7__
#- Step5: filtering examples as "T *" (162), since we want to have only SNPs differing between BK20399 and GE3138.
grep -v "*" f1_7__ > f1_7___
#- Step6: only SNPs (50).
grep -v "INDEL" f1_7___ > f1_7____
#- Step7: get the complete rows with the positions
cut -d$'\t' -f2-2 f1_7____ > f2
replace "\n" with " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
grep "46936" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
....
#- Step8: Missense mutations ()
grep "missense_variant" All_SNPs_annotated.txt
8 (optional), merge results from two variant calling methods!
#-- post-processing --
awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
grep -v "/" f1_7 > f1_7_
grep -v "\." f1_7_ > f1_7__
grep -v "*" f1_7__ > f1_7___
grep -v "INDEL" f1_7___ > f1_7_SNPs
cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
\n --> " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
#grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
#grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
#grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
grep -v "SNP" f1_7___ > f1_7_indels
cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
\n --> " All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt\ngrep "
#grep "CHROM" All_SNPs_indels_annotated_.txt > All_indels_annotated.txt
#grep "49848" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
#grep "53872" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
9 (optional), PubMLST, ResFinder or RGI calling https://cge.cbs.dtu.dk/services/ https://cge.cbs.dtu.dk/services/MLST/ https://pubmlst.org/bigsdb?db=pubmlst_pacnes_seqdef&page=batchProfiles&scheme_id=3 https://cge.food.dtu.dk/services/ResFinder/ https://card.mcmaster.ca/home https://www.pseudomonas.com/ https://www.pseudomonas.com/amr/list https://card.mcmaster.ca/analyze/rgi The antimicrobial resistance genes were predicted by the tool RGI 6.0.0 which utilizes the database CARD 3.2.5 (PMID: 31665441). !!!!#check the difference of SNPs in the two files, and then look for the SNPs in my tables comparing the two isolates!!!!
cut -d$'\t' -f9-21 BK20399_contigs.fa.txt > f9_21
cut -d$'\t' -f2-6 BK20399_contigs.fa.txt > f2_6
#Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
paste f9_21 f2_6 > BK20399_.txt
#grep "CARD_Protein_Sequence" BK20399_.txt > header
grep -v "CARD_Protein_Sequence" BK20399_.txt > BK20399__.txt
sort BK20399__.txt > BK20399_sorted.txt
cut -d$'\t' -f9-21 GE3138_contigs.fa.txt > f9_21
cut -d$'\t' -f2-6 GE3138_contigs.fa.txt > f2_6
paste f9_21 f2_6 > GE3138_.txt
grep -v "CARD_Protein_Sequence" GE3138_.txt > GE3138__.txt
sort GE3138__.txt > GE3138_sorted.txt
cut -d$'\t' -f1-6 BK20399_sorted.txt > BK20399_f1_6.txt
cut -d$'\t' -f1-6 GE3138_sorted.txt > GE3138_f1_6.txt
diff BK20399_f1_6.txt GE3138_f1_6.txt
#-->< APH(3'')-Ib 99.63 3002639 protein homolog model n/a n/a
cat header BK20399_sorted.txt > BK20399.txt
cat header GE3138_sorted.txt > GE3138.txt
~/Tools/csv2xls-0.4/csv_to_xls.py BK20399.txt GE3138.txt -d$'\t' -o ARG_calling.xls
#---- another example ----
cut -d$'\t' -f9-21 AW27149_before.txt > f9_21
cut -d$'\t' -f2-6 AW27149_before.txt > f2_6
#Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
paste f9_21 f2_6 > AW27149_before_.txt
grep "CARD_Protein_Sequence" AW27149_before_.txt > header
grep -v "CARD_Protein_Sequence" AW27149_before_.txt > AW27149_before__.txt
sort AW27149_before__.txt > AW27149_before_sorted.txt
cut -d$'\t' -f9-21 AW48641_after.txt > f9_21
cut -d$'\t' -f2-6 AW48641_after.txt > f2_6
paste f9_21 f2_6 > AW48641_after_.txt
grep -v "CARD_Protein_Sequence" AW48641_after_.txt > AW48641_after__.txt
sort AW48641_after__.txt > AW48641_after_sorted.txt
cut -d$'\t' -f1-6 AW27149_before_sorted.txt > AW27149_before_f1_6.txt
cut -d$'\t' -f1-6 AW48641_after_sorted.txt > AW48641_after_f1_6.txt
diff AW27149_before_f1_6.txt AW48641_after_f1_6.txt
#48c48
#< Pseudomonas aeruginosa CpxR 100.0 3004054 protein homolog model n/a n/a
#---
#> Pseudomonas aeruginosa CpxR 99.56 3004054 protein homolog model n/a n/a
cat header AW27149_before_sorted.txt > ../RGI_calling_AW27149_before.txt
cat header AW48641_after_sorted.txt > ../RGI_calling_AW48641_after.txt
#mlst add header "isolate species ST acsA aroE guaA mutL nuoD ppsA trpE"
mv
~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_annotated.csv RGI_calling_AW27149_before.txt RGI_calling_AW48641_after.txt mlst.txt -d$'\t' -o SNP_ARG_calling.xls
#Find the common SNP between SNP calling and RGI calling, since the SNP calling has bad annotation, marked yellow.
点赞本文的读者
还没有人对此文章表态
没有评论
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
Typing of 81 S. epidermidis samples (Luise)
Co-Authorship Network Generator using scraped data from Google Scholar via SerpAPI
© 2023 XGenes.com Impressum