-
download viral database
cp download_db.py down_db.py.backup sed -i -e 's/39733/1396/g' download_db.py #NOTE the download_db.py cannot run alone, rather than via "vrap.py -u" (damian) jhuang@hamburg:~/DATA/Data_Tam_Acinetobacter_baumannii$ ./vrap/vrap.py -u mv vrap/database/viral_db vrap/database/viral_db_Bacillus_cereus
-
run vrap.py
#--host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/ATCC19606.fasta -n /media/jhuang/Titisee/GAMOLA2/Blast_db/nt -a /media/jhuang/Titisee/GAMOLA2/Blast_db/nr -t 4 #vrap/vrap.py -1 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R1_001.fastq.gz -2 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R2_001.fastq.gz -o BovHepV_Bulgaria_19_on_Hepacivirus --host=/home/jhuang/REFs/Bos_taurus.ARS-UCD1.2.dna.toplevel.fa -t 3 #IMPORTANT, it doesn't work, since the update will always downlod the repeated sequences causing ERROR! It needs to delete it as follows!! #In summary: --(optional) host --> bowtie2/host (bowtie_database for host-substraction) + blast/db/host (in nt_dbs, custom blastn database before downloaded_db!) # --(optional) virus --> blast/db/virus (in nt_dbs, custom blastn database before downloaded_db!) # --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (nt_dbs) # --> database/viral_db/viral_protein (prot_db) # --(optional) nt+nr #Therefore, the results substracted host should not need the blast with host, however it it in nt_dbs! #Under (vrap) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii$ #DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter #DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py #A6: --host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta vrap/vrap.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55 #A10 #START checking blast db: no host and with virus --> mapping to 5 dbs: blast/db/virus, viral_nucleotide, viral_protein, nt, nr vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55 #A12 vrap/vrap.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55 #A19 vrap/vrap.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55 #START VrAP #START checking blast db #START lighter #START estimate k-mer size #k-mer size: 5964136 #/home/jhuang/miniconda3/envs/vrap/bin/lighter -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -K 20 5964136 -od /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter -t 55 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log #START flash #/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_1.cor.fq.gz /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_2.cor.fq.gz -o flash -d /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/flash -M 1000 >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log #START spades
-
vrap.py (code)
#!/usr/bin/env python3 from orf import ORF from optparse import OptionParser import os import glob from Bio import SeqIO from pathlib import Path # see blast.py # .ncbirc --> BLASTDB=/media/jhuang/Titisee/GAMOLA2/Blast_db/ --> /mnt/Samsung_T5/Blast_db/ from blast import Blast from hmmer import HMMER from bowtie import Bowtie # download_db.py only for txid2697049 [Severe acute respiratory syndrome coronavirus 2] from download_db import Database import glob import logging from math import exp from scipy.optimize import brentq #old = "_2010" old = "" class VrAP: """ VrAP """ def __init__ (self): """ Class initialiser """ self.version = "1.0.5" option = self.option_parser() self.path = os.path.dirname(os.path.realpath(__file__)) self.paired_end = True self.p1 = option.p1 self.p2 = option.p2 self.single = "" self.cor1 = self.p1 self.cor2 = self.p2 self.cpu = option.cpu self.contigs = "" self.contig_length = option.clength self.host = option.host self.virus = option.virus self.nt = option.nt self.nr = option.nr self.bt2idx = option.bt2idx self.update = option.update self.gbt2 = option.gbowtie self.pre = option.pre self.assembly = option.assembly self.noblast = option.noblast if option.out!=None: self.out = os.path.abspath(option.out) else: self.out = "./" if self.update: self.update_database() if self.cpu==None: self.cpu = 1 if self.p2==None: self.single = self.p1 self.paired_end = False self.create_output_dir() open("{}/vrap.log".format(self.out),"w").close() self.log("START VrAP") with open("{}/vrap.log".format(self.out),"a") as log_opt: for opt, value in option.__dict__.items(): log_opt.write("{}:\t{}\n".format(opt,value)) if self.pre: self.log("START only read correction and assembly!") self.check_input() def check_input (self): """ check input """ bt2idx = self.bt2idx if self.bt2idx!=None: bt2idx = self.bt2idx+".1.bt2" input_files = [self.p1,self.p2,self.host,self.virus,bt2idx] for f in input_files: #print(f) if f != None: if not os.path.exists(f): print("{} don't exists\nPlease check input files\nVrAP quit\n".format(f)) quit() if self.p1 == None: self.parser.print_help() print("-1/--p1 is mandatory\nPlease check input file\nVrAP quit\n") quit() #EXPLANATION: in total, mapping to 4 dbs: viral_nucleotide, viral_protein, nt, nr dbs = ["{path}/database/viral_db/viral_nucleotide".format(path=self.path),"{path}/database/viral_db/viral_protein".format(path=self.path),self.nr,self.nt] status = [] self.log("START checking blast db") for db in dbs: x = os.system("{path}/external_tools/blast/blastdbcmd -db {db} -info >> {out}/vrap.log 2>> {out}/vrap.log".format(path=self.path,db=db,out=self.out)) status.append(x) #COMMENTED if status[0] != 0 or status[1] != 0: self.update_database() if self.nr != None and status[2] != 0: print("No alias or index file found for {}".format(self.nr)) quit() if self.nt != None and status[3] != 0: print("No alias or index file found for {}".format(self.nt)) quit() def log (self,message,err=False): """ log bowtie run """ print(message) LOG_FILENAME = "{}/vrap.log".format(self.out) FORMAT = '\n[VrAP] %(asctime)-15s %(message)s\n' logging.basicConfig(filename=LOG_FILENAME,format=FORMAT,level=logging.DEBUG) if err: logging.exception(message) else: logging.info(message) #logging.exception() def option_parser(self): """ Read input options """ usage = "usage: %prog -1 [FILE] [options]" parser = OptionParser(usage=usage,version="%prog {}".format(self.version)) parser.add_option("-1","--p1", dest="p1",help="first paired end file" , metavar="FILE") parser.add_option("-2","--p2", dest="p2",help="second paired end file", metavar="FILE") parser.add_option("-t","--threads", dest="cpu",help="number of threads to use", metavar="INT",type="int",default=1) parser.add_option("-r","--host", dest="host",help="reference host sequences", metavar="FASTA") parser.add_option("-v","--virus", dest="virus",help="reference viral sequences", metavar="FASTA") parser.add_option("-n","--nt", dest="nt",help="nucleotide blast database (e.g. nt)", metavar="DATABASE") parser.add_option("-a","--nr", dest="nr",help="protein blast database (e.g. nr)", metavar="DATABASE") parser.add_option("-o","--outdir", dest="out",help="output directory", metavar="DIR") parser.add_option("-b","--bt2idx", dest="bt2idx",help="bowtie2 index", metavar="FILE") parser.add_option("-l","--clength", dest="clength",help="minimum contig length", metavar="INT",default=500) parser.add_option("-u","--update", dest="update",help="update viral database", default=False, action="store_true") parser.add_option("-g","--gbt2", dest="gbowtie",help="use global PATH of bowtie2", default=False, action="store_true") parser.add_option("-p","--pre", dest="pre",help="read correction only", default=False, action="store_true") parser.add_option("-q","--assembly", dest="assembly",help="assembly only", default=False, action="store_true") parser.add_option("-k","--noblast", dest="noblast",help="skip blast search", default=False, action="store_true") self.parser = parser (options, args) = parser.parse_args() return options def create_output_dir(self): """ Create output directory """ if not os.path.exists(self.out): os.makedirs(self.out) def EMfit2(self,F0, f1, F1, k): epstol = 1e-8 x = F1 / F0 e = 0 e0 = 1 x0 = 0 maxit = 1000 it = 0 while abs(x - x0) / x + abs(e - e0) > epstol and it < maxit: it = it + 1 #print x, e, abs(x-x0)/x x0 = x e0 = e x1 = -100 while abs(x - x1) / x > epstol / 2: x1 = x x = F1 / F0 * (3 * k * (1 - exp(-x * e / (3 * k))) + 1 - exp(-x * (1 - e))) def func(t): return f1 / F1 - (t * exp(-x * t / (3 * k)) + (1 - t) * exp(-x * (1 - t))) print(func(0)) print(func(1.2)) e = brentq(func, 0, 1.1) return x, e def genome_size (self): """ estimate genome size """ self.log("START estimate k-mer size") lighter_path = "{}/lighter".format(self.out) if not os.path.exists(lighter_path): os.makedirs(lighter_path) if self.paired_end: input_data = "{} {}".format(os.path.abspath(self.p1),os.path.abspath(self.p2),self.out) else: input_data = "{}".format(os.path.abspath(self.p1)) command = "{path}/external_tools/KmerStream-master/KmerStream --tsv -k 20 -t {cpu} {input_data} -o {lighter}/kmer 2>> {out}/vrap.log".format(path=self.path,lighter=lighter_path,cpu=self.cpu,out=self.out,input_data=input_data) os.system(command) size = 30000 try: kmer = "{lighter}/kmer".format(lighter=lighter_path) if os.path.exists(kmer): with open(kmer) as f: head = next(f) value = next(f).split() value = [int(i) for i in value] #list comprehension size = value[2]-value[3] except Exception as e: vrap.log(e,True) self.log("k-mer size: {}".format(size)) return size def lighter(self): """ Run ligther for read correction """ self.log("START lighter") if self.paired_end: input_data = "-r {} -r {}".format(self.p1,self.p2) else: input_data = "-r {}".format(self.p1) size = self.genome_size() if size<30000: size = 30000 #MODIFIED, since lighter in external_tool doesn't work. #{path}/external_tools/Lighter-master/lighter command = "/home/jhuang/anaconda3/bin/lighter {input} -K 20 {size} -od {out}/lighter -t {cpu} 2>> {out}/vrap.log".format(path=self.path,input=input_data,out=self.out,cpu=self.cpu,size=size) print(command) os.system(command) if self.paired_end: b1 = os.path.basename(self.p1).split(".")[0] self.cor1 = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0] b2 = os.path.basename(self.p2).split(".")[0] self.cor2 = glob.glob("{}/lighter/{}*cor*".format(self.out,b2))[0] else: b1 = os.path.basename(self.single).split(".")[0] self.single = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0] def flash (self): """ Run flash for paired end reads """ self.log("START flash") if self.paired_end: command = "{path}/external_tools/FLASH-1.2.11/flash {cor1} {cor2} -o flash -d {out}/flash -M 1000 >> {out}/vrap.log".format(path=self.path,cor1=self.cor1,cor2=self.cor2,out=self.out) print(command) os.system(command) self.cor1 = "{}/flash/flash.notCombined_1.fastq".format(self.out) self.cor2 = "{}/flash/flash.notCombined_2.fastq".format(self.out) self.single = "{}/flash/flash.extendedFrags.fastq".format(self.out) def bowtie (self): """ Map reads to reference sequence """ bowtie2 = Bowtie(self.gbt2,self.out,self.cpu) bowtie_path = "{}/bowtie".format(self.out) if not os.path.exists(bowtie_path): os.makedirs(bowtie_path) if self.bt2idx==None and self.host!=None: self.log("START bowtie2-build") self.bt2idx = "{}/bowtie/host".format(self.out) bowtie2.build(self.host) if self.bt2idx != None: if self.paired_end: self.log("START bowtie2 pe") bowtie2.pe(self.cor1,self.cor2,self.single,self.bt2idx) self.cor1 = "{}/bowtie/bowtie.un.1.fastq".format(self.out) self.cor2 = "{}/bowtie/bowtie.un.2.fastq".format(self.out) self.single = "{}/bowtie/bowtie.un.fastq".format(self.out) else: self.log("START bowtie2 se") bowtie2.se(self.single,self.bt2idx) self.single = "{}/bowtie/bowtie.un.fastq".format(self.out) def update_database (self): """ Update viral DB""" #self.log("START update viral DB") #EXPLANATION: maybe it needs to DELETE the directory viral_db. #typs = ["nucleotide"] #typs = ["protein"] typs = ["nucleotide","protein"] #path = os.path.dirname(os.path.realpath(__file__)) db_path = "{}/database/viral_db/".format(self.path) if not os.path.exists(db_path): os.makedirs(db_path) for typ in typs: self.log("START update viral {} DB".format(typ)) database = Database(typ,db_path,self.cpu) database.download_id() database.start_download() blast = Blast(self.cpu, self.out) db_type = "nucl" if "protein" in typ: db_type = "prot" self.log("\nSTART {} makeblastdb".format(typ)) blast.makeblastdb(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type) #print(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type) #/home/jhuang/Tools/vrap/database/viral_db/nucleotide.fa /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide nucl #MANUAL_RUNNING #V5 didn't suitable #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in nuleotide.fa -dbtype nucl #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in protein.fa -dbtype prot #USING_V4 #https://www.biostars.org/p/200742/ #vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide #-taxid_map nucl_gb.accession2taxid #vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein #-taxid_map nucl_gb.accession2taxid #search_term="txid2697049[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date) #virus_search_term="txid10239[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date) #txid2=Bacteria, txid2759=Eukaryota, txid2157=Archaea, txid10239=Viruses, txid2697049=Severe acute respiratory syndrome coronavirus 2 def spades (self): """ Run spades """ self.log("START spades") paired = "" if self.paired_end: paired = "-1 {} -2 {}".format(self.cor1,self.cor2) k = "-k 33,55,77,99,127" options = "--cov-cutoff off --only-assembler --careful" #options = "--cov-cutoff off --only-assembler" single = self.single if len(self.single)>0: single = "--s1 {}".format(self.single) cpu = "-t {}".format(self.cpu) output = "-o {}/spades".format(self.out) os.system("{path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py {paired} {single} {k} {options} {cpu} {output} >> {out}/vrap.log".format(path=self.path, paired=paired, single=single, k=k, options=options, cpu=cpu, output=output,out=self.out)) #for assembly only self.contigs = "{}/spades/contigs.fasta".format(self.out) self.exclude = set() self.scaffold = {} def cap3 (self): """ Runs cap3 """ self.log("START cap3") self.contigs = "{}/spades/contigs.fasta".format(self.out) cap3_out = "{}/cap3".format(self.out) #print(cap3_out) if not os.path.exists(cap3_out): os.makedirs(cap3_out) os.system("{}/external_tools/cap3 '{}' -y 100 >{}/cap3/cap3.out".format(self.path,self.contigs,self.out)) b = False i = 0 consensi = {} consensus = "" group = {} name = "" exclude = set() with open("{}/cap3/cap3.out".format(self.out)) as f: for line in f: if not b: if "*******************" in line: name = line.split("* ")[1].split(" *")[0] #print(name) group[name] = set() if "NODE" in line: ary = line.split() for n in ary: if "NODE" in n: tmp = n.replace("+","").replace("-","") group[name].add(tmp) exclude.add(tmp) if "DETAILED" in line: b = True if b: if "*******************" in line: name = line.split("* ")[1].split(" *")[0] #print(name) ary = line.split() #print(ary) if len(ary) > 0: if "consensus" in line: seq = ary[1] tmp = consensi.get(name,"") tmp+=seq.replace("-","N") consensi[name] = tmp self.scaffold = consensi self.exclude = exclude def clean_output (self): """ Clean spades output and include scaffolds """ fasta_sequences = SeqIO.parse(open(self.contigs),'fasta') self.result = "{}/vrap_contig.fasta".format(self.out) output = open(self.result,"w") for fasta in fasta_sequences: if fasta.name not in self.exclude and len(fasta.seq)>=self.contig_length: output.write(">{}\n{}\n".format(fasta.name,fasta.seq)) i = 1 for name,seq in self.scaffold.items(): if len(seq)>=self.contig_length: output.write(">CAP_{}_length_{}\n{}\n".format(i,len(seq),seq)) i+=1 output.close() def calculate_orf_density (self): """ calculate orf density """ self.log("START calculating orf density") self.result = "{}/vrap_contig.fasta".format(self.out) orf = ORF(self.result,self.out) self.orf_density = orf.orf_density() def label_as_virus (self): """ Add virus tag to name """ out_name = "{}/blast/custom_viral_seq.fa".format(self.out) output = open(out_name,"w") with open(self.virus, "r") as handle: for record in SeqIO.parse(handle, "fasta"): output.write(">{}\n{}\n".format(record.name+" [virus_user_db]",record.seq)) output.close() return out_name def blast (self): """ blast """ self.log("START blast") self.result = "{}/vrap_contig.fasta".format(self.out) blast_path = "{}/blast".format(self.out) blast = Blast(self.cpu, self.out) blast.read_fasta(self.result) #EXPLANATION: all virus and host-databases are saved in nt_dbs # blast performed only after assembly(self.result=vrap_contig.fasta) against host+virus --> # since the host-reads are not deleted before the assembly, # vrap_contigs.fasta should contain contigs of host and virus #sort blast/blastn.csv and blast/blastn.fa nt_dbs = "{}/database/viral_db{}/viral_nucleotide".format(self.path,old) e_val = "1e-4" max_target = 1 if not os.path.exists(blast_path): os.makedirs(blast_path) if not os.path.exists(blast_path+"/db"): os.makedirs(blast_path+"/db") #EXPLANATION #if the option --virus is set, make the given fasta blast-format and add it to vrap_out/blast/db/virus #NO record added! nt_dbs should keep only 1 record! if self.virus != None: custom_viral_seq = self.label_as_virus() blast.makeblastdb(custom_viral_seq,"{}/blast/db/virus".format(self.out),"nucl") nt_dbs += " {}/blast/db/virus".format(self.out) #MODIFIED, add the self.host causing error. COMMENTED them! #EXPLANATION: if the option --host is set, make the given fasta blast-format and add it to vrap_out/blast/db/host #There are in total 3 nt_dbs, 1 prot_db, +[nt][nr], and 1 bowtie database: # nt_dbs /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide, ./blast/db/virus, ./blast/db/host # prot_db /home/jhuang/Tools/vrap/database/viral_db/viral_protein # bowtie_database: ./bowtie/host for substraction host-reads #DEBUG: /home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 4 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/db/host" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/blastn.csv #--> it works! #Warning: [blastn] Examining 5 or more matches is recommended #grep -v "scrofa" blastn.csv > blastn_.csv; #DEBUG: BLAST Database creation error: Error: Duplicate seq_ids are found: GB|MF170920.1 #if self.host != None: # blast.makeblastdb(self.host,"{}/blast/db/host".format(self.out),"nucl") # nt_dbs += " {}/blast/db/host".format(self.out) #EXPLANATION: blastn.csv is empty means blastn finds nonthing! ##nucleotide db search blast_out = "{}/blast/blastn.csv".format(self.out) blast.blastn(self.result, nt_dbs, e_val, max_target, blast_out) no_match_fa = "{}/blast/blastn.fa".format(self.out) no_results_found = blast.no_match(blast_out,no_match_fa) #EXPLANATION: blastx -num_threads 14 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1399_1400_Serum_vrap_out/blast/blastn.fa -db /home/jhuang/Tools/vrap/database/viral_db/viral_protein -evalue 1e-6 -outfmt 6 qseqid qstart qend sstart send evalue le+ ##protein db search if no_results_found > 0: e_val = "1e-6" blast_out = "{}/blast/blastx.csv".format(self.out) prot_db = "{}/database/viral_db{}/viral_protein".format(self.path,old) blast.blastx(no_match_fa, prot_db, e_val, max_target, blast_out) no_match_fa = "{}/blast/blastx.fa".format(self.out) no_results_found = blast.no_match(blast_out,no_match_fa) #EXPLANATION: since self.nt and self.nr are not given, the following code won't run! if self.nt != None and no_results_found > 0: e_val = "1e-4" blast_out = "{}/blast/blastn_nt.csv".format(self.out) blast.blastn(no_match_fa, self.nt, e_val, max_target, blast_out) no_match_fa = "{}/blast/blastn_nt.fa".format(self.out) no_results_found = blast.no_match(blast_out,no_match_fa) if self.nr != None and no_results_found > 0: e_val = "1e-6" blast_out = "{}/blast/blastx_nr.csv".format(self.out) blast.blastx(no_match_fa, self.nr, e_val, max_target, blast_out) no_match_fa = "{}/blast/blastx_nr.fa".format(self.out) no_results_found = blast.no_match(blast_out,no_match_fa) def hmmer (self): """ hidden markov modell search """ self.log("START HMMER") hmmer = HMMER(self.cpu, self.out) hmmer.run_hmmer() def summary (self): """ summarize output """ if len(self.orf_density)>0: blast = Blast(self.cpu, self.out) self.blast_results = blast.summary(self.out) hmmer = HMMER(self.cpu, self.out) hmmer_output = hmmer.read_hmmer_output() with open("{}/vrap_summary{}.csv".format(self.out,old),"w") as f: f.write("name;qleng;orf_d;hmmer_eval;hmm_model;virus?;ident;qcov;tcov;tlength;tid;tname;mean_eval\n") for name,orf_d in self.orf_density.items(): results = {"qlength":0,"tid":"-","tname":"-","tlength":0,"qcovarage":0,"ident":0,"tcovarage":0,"eval":1} if name in self.blast_results: results = self.blast_results[name] hmmer = 1 hmm_model = "" if name in hmmer_output: hmmer,hmm_model = max(hmmer_output[name],key=lambda x:x[0]) qleng = int(name.split("_")[3]) virus = "host" if "virus" in results["tname"] or "phage" in results["tname"]: virus = "virus" else: if (len(results["tname"])<2): if ((orf_d>0.9 and qleng <=1000) or (orf_d>0.7 and qleng >1000)): virus = "potential virus" else: virus = "-" if hmmer<=1e-5: virus = "virus" tmp ="{0};{1:d};{2:.2f};{3:.2e};{4};{5};{6:.2f};{7:.2f};{8:.2f};{9:d};{10};{11};{12:.2e}\n".format(name,qleng,orf_d,hmmer,hmm_model,virus,results["ident"],results["qcovarage"],results["tcovarage"],results["tlength"],results["tid"],results["tname"].replace(";",","),results["eval"]) f.write(tmp) vrap = VrAP() try: if not vrap.assembly: vrap.lighter() vrap.flash() if not vrap.pre: if not vrap.assembly: vrap.bowtie() vrap.spades() if not vrap.assembly: vrap.cap3() vrap.clean_output() vrap.calculate_orf_density() if len(vrap.orf_density)>0: vrap.hmmer() if not vrap.noblast: vrap.blast() vrap.summary() vrap.log("VrAP pipeline finished.\nThank you for using VrAP!") except Exception as e: vrap.log(e,True)
How to run the pipeline vrap?
Leave a reply