How to run the pipeline vrap?

  1. download viral database

     cp download_db.py down_db.py.backup
     sed -i -e 's/39733/1396/g' download_db.py
     #NOTE the download_db.py cannot run alone, rather than via "vrap.py -u"
     (damian) jhuang@hamburg:~/DATA/Data_Tam_Acinetobacter_baumannii$ ./vrap/vrap.py -u
     mv vrap/database/viral_db vrap/database/viral_db_Bacillus_cereus
  2. run vrap.py

     #--host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa 
     vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/ATCC19606.fasta -n /media/jhuang/Titisee/GAMOLA2/Blast_db/nt -a /media/jhuang/Titisee/GAMOLA2/Blast_db/nr -t 4
     #vrap/vrap.py -1 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R1_001.fastq.gz -2 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R2_001.fastq.gz -o BovHepV_Bulgaria_19_on_Hepacivirus --host=/home/jhuang/REFs/Bos_taurus.ARS-UCD1.2.dna.toplevel.fa  -t 3
     #IMPORTANT, it doesn't work, since the update will always downlod the repeated sequences causing ERROR! It needs to delete it as follows!!
    
     #In summary: --(optional) host --> bowtie2/host (bowtie_database for host-substraction) + blast/db/host (in nt_dbs, custom blastn database before downloaded_db!)
     #            --(optional) virus --> blast/db/virus (in nt_dbs, custom blastn database before downloaded_db!)
     #            --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (nt_dbs)
     #                                                                                        --> database/viral_db/viral_protein (prot_db)
     #            --(optional) nt+nr
     #Therefore, the results substracted host should not need the blast with host, however it it in nt_dbs!
    
     #Under (vrap) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii$ 
     #DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
     #DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py
    
     #A6: --host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa  --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta
     vrap/vrap.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
     #A10 
     #START checking blast db: no host and with virus --> mapping to 5 dbs: blast/db/virus, viral_nucleotide, viral_protein, nt, nr
     vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
     #A12
     vrap/vrap.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
     #A19 
     vrap/vrap.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
     #START VrAP
     #START checking blast db
     #START lighter
     #START estimate k-mer size
     #k-mer size: 5964136
     #/home/jhuang/miniconda3/envs/vrap/bin/lighter -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -K 20 5964136 -od /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter -t 55 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
     #START flash
     #/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_1.cor.fq.gz /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_2.cor.fq.gz -o flash -d /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/flash -M 1000 >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
     #START spades
  3. vrap.py (code)

     #!/usr/bin/env python3
    
     from orf import ORF
     from optparse import OptionParser
     import os
     import glob
     from Bio import SeqIO
     from pathlib import Path
     # see blast.py
     # .ncbirc --> BLASTDB=/media/jhuang/Titisee/GAMOLA2/Blast_db/ --> /mnt/Samsung_T5/Blast_db/
     from blast import Blast
     from hmmer import HMMER
     from bowtie import Bowtie
     # download_db.py only for txid2697049 [Severe acute respiratory syndrome coronavirus 2]
     from download_db import Database
     import glob
     import logging
     from math import exp
     from scipy.optimize import brentq
    
     #old = "_2010"
     old = ""
    
     class VrAP:
         """ VrAP """
    
         def __init__ (self):
             """ Class initialiser """
             self.version = "1.0.5"
             option = self.option_parser()
             self.path = os.path.dirname(os.path.realpath(__file__))
    
             self.paired_end = True
             self.p1  = option.p1
             self.p2  = option.p2
             self.single = ""
             self.cor1 = self.p1
             self.cor2 = self.p2
             self.cpu = option.cpu
             self.contigs = ""
             self.contig_length = option.clength
             self.host = option.host
             self.virus = option.virus
             self.nt = option.nt
             self.nr = option.nr
             self.bt2idx = option.bt2idx
             self.update = option.update
             self.gbt2 = option.gbowtie
             self.pre = option.pre
             self.assembly = option.assembly
             self.noblast = option.noblast
    
             if option.out!=None:
                 self.out = os.path.abspath(option.out)
             else:
                 self.out = "./"
    
             if self.update:
                 self.update_database()
    
             if self.cpu==None:
                 self.cpu = 1
    
             if self.p2==None:
                 self.single = self.p1
                 self.paired_end = False
    
             self.create_output_dir()
    
             open("{}/vrap.log".format(self.out),"w").close()
             self.log("START VrAP")
    
             with open("{}/vrap.log".format(self.out),"a") as log_opt:
                 for opt, value in option.__dict__.items():
                     log_opt.write("{}:\t{}\n".format(opt,value))
    
             if self.pre:
                 self.log("START only read correction and assembly!")
    
             self.check_input()
    
         def check_input (self):
             """ check input """
    
             bt2idx = self.bt2idx
             if self.bt2idx!=None:
                 bt2idx = self.bt2idx+".1.bt2"
    
             input_files = [self.p1,self.p2,self.host,self.virus,bt2idx]
    
             for f in input_files:
                 #print(f)
                 if f != None:
                     if not os.path.exists(f):
    
                         print("{} don't exists\nPlease check input files\nVrAP quit\n".format(f))
                         quit()
    
             if self.p1 == None:
                 self.parser.print_help()
                 print("-1/--p1 is mandatory\nPlease check input file\nVrAP quit\n")
                 quit()
    
             #EXPLANATION: in total, mapping to 4 dbs: viral_nucleotide, viral_protein, nt, nr
             dbs = ["{path}/database/viral_db/viral_nucleotide".format(path=self.path),"{path}/database/viral_db/viral_protein".format(path=self.path),self.nr,self.nt]
             status = []
             self.log("START checking blast db")
    
             for db in dbs:
                 x = os.system("{path}/external_tools/blast/blastdbcmd -db {db} -info >> {out}/vrap.log 2>> {out}/vrap.log".format(path=self.path,db=db,out=self.out))
                 status.append(x)
    
             #COMMENTED
             if status[0] != 0 or status[1] != 0:
                 self.update_database()
    
             if self.nr != None and status[2] != 0:
                 print("No alias or index file found for {}".format(self.nr))
                 quit()
    
             if self.nt != None and status[3] != 0:
                 print("No alias or index file found for {}".format(self.nt))
                 quit()
    
         def log (self,message,err=False):
             """ log bowtie run """
             print(message)
             LOG_FILENAME = "{}/vrap.log".format(self.out)
             FORMAT = '\n[VrAP] %(asctime)-15s %(message)s\n'
    
             logging.basicConfig(filename=LOG_FILENAME,format=FORMAT,level=logging.DEBUG)
             if err:
                 logging.exception(message)
             else:
                 logging.info(message)
             #logging.exception()
    
         def option_parser(self):
             """ Read input options """
             usage = "usage: %prog -1 [FILE] [options]"
             parser = OptionParser(usage=usage,version="%prog {}".format(self.version))
    
             parser.add_option("-1","--p1", dest="p1",help="first paired end file" , metavar="FILE")
             parser.add_option("-2","--p2", dest="p2",help="second paired end file", metavar="FILE")
             parser.add_option("-t","--threads", dest="cpu",help="number of threads to use", metavar="INT",type="int",default=1)
             parser.add_option("-r","--host", dest="host",help="reference host sequences", metavar="FASTA")
             parser.add_option("-v","--virus", dest="virus",help="reference viral sequences", metavar="FASTA")
             parser.add_option("-n","--nt", dest="nt",help="nucleotide blast database (e.g. nt)", metavar="DATABASE")
             parser.add_option("-a","--nr", dest="nr",help="protein blast database (e.g. nr)", metavar="DATABASE")
             parser.add_option("-o","--outdir", dest="out",help="output directory", metavar="DIR")
             parser.add_option("-b","--bt2idx", dest="bt2idx",help="bowtie2 index", metavar="FILE")
             parser.add_option("-l","--clength", dest="clength",help="minimum contig length", metavar="INT",default=500)
             parser.add_option("-u","--update", dest="update",help="update viral database",  default=False, action="store_true")
             parser.add_option("-g","--gbt2", dest="gbowtie",help="use global PATH of bowtie2",  default=False, action="store_true")
             parser.add_option("-p","--pre", dest="pre",help="read correction only",  default=False, action="store_true")
             parser.add_option("-q","--assembly", dest="assembly",help="assembly only",  default=False, action="store_true")
             parser.add_option("-k","--noblast", dest="noblast",help="skip blast search",  default=False, action="store_true")
             self.parser = parser
             (options, args) = parser.parse_args()
    
             return options
    
         def create_output_dir(self):
             """ Create output directory """
             if not os.path.exists(self.out):
                 os.makedirs(self.out)
    
         def EMfit2(self,F0, f1, F1, k):
             epstol = 1e-8
             x = F1 / F0
             e = 0
             e0 = 1
             x0 = 0
             maxit = 1000
             it = 0
             while abs(x - x0) / x + abs(e - e0) > epstol and it < maxit:
                 it = it + 1
                 #print x, e, abs(x-x0)/x
                 x0 = x
                 e0 = e
                 x1 = -100
                 while abs(x - x1) / x > epstol / 2:
                     x1 = x
                     x  = F1 / F0 * (3 * k * (1 - exp(-x * e / (3 * k))) + 1 - exp(-x * (1 - e)))
                 def func(t):
                     return f1 / F1 - (t * exp(-x * t / (3 * k)) + (1 - t) * exp(-x * (1 - t)))
                 print(func(0))
                 print(func(1.2))
                 e = brentq(func, 0, 1.1)
             return x, e
    
         def genome_size (self):
             """ estimate genome size """
             self.log("START estimate k-mer size")
    
             lighter_path = "{}/lighter".format(self.out)
             if not os.path.exists(lighter_path):
                 os.makedirs(lighter_path)
    
             if self.paired_end:
                 input_data = "{} {}".format(os.path.abspath(self.p1),os.path.abspath(self.p2),self.out)
             else:
                 input_data = "{}".format(os.path.abspath(self.p1))
    
             command = "{path}/external_tools/KmerStream-master/KmerStream --tsv -k 20 -t {cpu} {input_data} -o {lighter}/kmer  2>> {out}/vrap.log".format(path=self.path,lighter=lighter_path,cpu=self.cpu,out=self.out,input_data=input_data)
    
             os.system(command)
             size = 30000
    
             try:
                 kmer = "{lighter}/kmer".format(lighter=lighter_path)
                 if os.path.exists(kmer):
                     with open(kmer) as f:
                         head = next(f)
                         value = next(f).split()
    
                     value = [int(i) for i in value] #list comprehension
                     size = value[2]-value[3]
             except Exception as e:
                 vrap.log(e,True)
    
             self.log("k-mer size: {}".format(size))
    
             return size
    
         def lighter(self):
             """ Run ligther for read correction """
             self.log("START lighter")
    
             if self.paired_end:
                 input_data = "-r {} -r {}".format(self.p1,self.p2)
             else:
                 input_data = "-r {}".format(self.p1)
    
             size = self.genome_size()
             if size<30000:
                 size = 30000
             #MODIFIED, since lighter in external_tool doesn't work.
             #{path}/external_tools/Lighter-master/lighter
             command = "/home/jhuang/anaconda3/bin/lighter {input} -K 20 {size} -od {out}/lighter -t {cpu} 2>> {out}/vrap.log".format(path=self.path,input=input_data,out=self.out,cpu=self.cpu,size=size)
             print(command)
             os.system(command)
    
             if self.paired_end:
                 b1 = os.path.basename(self.p1).split(".")[0]
                 self.cor1 = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
                 b2 = os.path.basename(self.p2).split(".")[0]
                 self.cor2 = glob.glob("{}/lighter/{}*cor*".format(self.out,b2))[0]
             else:
                 b1 = os.path.basename(self.single).split(".")[0]
                 self.single = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
    
         def flash (self):
             """ Run flash for paired end reads """
             self.log("START flash")
    
             if self.paired_end:
                 command = "{path}/external_tools/FLASH-1.2.11/flash {cor1} {cor2} -o flash -d {out}/flash -M 1000 >> {out}/vrap.log".format(path=self.path,cor1=self.cor1,cor2=self.cor2,out=self.out)
                 print(command)
                 os.system(command)
    
                 self.cor1   = "{}/flash/flash.notCombined_1.fastq".format(self.out)
                 self.cor2   = "{}/flash/flash.notCombined_2.fastq".format(self.out)
                 self.single = "{}/flash/flash.extendedFrags.fastq".format(self.out)
    
         def bowtie (self):
             """ Map reads to reference sequence """
             bowtie2 = Bowtie(self.gbt2,self.out,self.cpu)
    
             bowtie_path = "{}/bowtie".format(self.out)
             if not os.path.exists(bowtie_path):
                 os.makedirs(bowtie_path)
    
             if self.bt2idx==None and self.host!=None:
                 self.log("START bowtie2-build")
                 self.bt2idx = "{}/bowtie/host".format(self.out)
                 bowtie2.build(self.host)
    
             if self.bt2idx != None:
                 if self.paired_end:
                     self.log("START bowtie2 pe")
                     bowtie2.pe(self.cor1,self.cor2,self.single,self.bt2idx)
                     self.cor1 = "{}/bowtie/bowtie.un.1.fastq".format(self.out)
                     self.cor2 = "{}/bowtie/bowtie.un.2.fastq".format(self.out)
                     self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
                 else:
                     self.log("START bowtie2 se")
                     bowtie2.se(self.single,self.bt2idx)
                     self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
    
         def update_database (self):
             """ Update viral DB"""
             #self.log("START update viral DB")
             #EXPLANATION: maybe it needs to DELETE the directory viral_db.
             #typs = ["nucleotide"]
             #typs = ["protein"]
             typs = ["nucleotide","protein"]
             #path = os.path.dirname(os.path.realpath(__file__))
             db_path = "{}/database/viral_db/".format(self.path)
    
             if not os.path.exists(db_path):
                     os.makedirs(db_path)
    
             for typ in typs:
                 self.log("START update viral {} DB".format(typ))
                 database = Database(typ,db_path,self.cpu)
    
                 database.download_id()
                 database.start_download()
    
                 blast = Blast(self.cpu, self.out)
                 db_type = "nucl"
                 if "protein" in typ:
                     db_type = "prot"
    
                 self.log("\nSTART {} makeblastdb".format(typ))
                 blast.makeblastdb(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
                 #print(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
                 #/home/jhuang/Tools/vrap/database/viral_db/nucleotide.fa /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide nucl
                 #MANUAL_RUNNING
                 #V5 didn't suitable
                 #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in nuleotide.fa -dbtype nucl
                 #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in protein.fa -dbtype prot
                 #USING_V4
                 #https://www.biostars.org/p/200742/
                 #vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide  #-taxid_map nucl_gb.accession2taxid 
                 #vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein     #-taxid_map nucl_gb.accession2taxid
                 #search_term="txid2697049[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
                 #virus_search_term="txid10239[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
                 #txid2=Bacteria, txid2759=Eukaryota, txid2157=Archaea, txid10239=Viruses, txid2697049=Severe acute respiratory syndrome coronavirus 2
    
         def spades (self):
             """ Run spades """
             self.log("START spades")
             paired = ""
    
             if self.paired_end:
                 paired  = "-1 {} -2 {}".format(self.cor1,self.cor2)
    
             k       = "-k 33,55,77,99,127"
             options = "--cov-cutoff off --only-assembler --careful"
             #options = "--cov-cutoff off --only-assembler"
    
             single  = self.single
             if len(self.single)>0:
                 single  = "--s1 {}".format(self.single)
    
             cpu     = "-t {}".format(self.cpu)
             output  = "-o {}/spades".format(self.out)
             os.system("{path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py {paired} {single} {k} {options} {cpu} {output} >> {out}/vrap.log".format(path=self.path, paired=paired, single=single, k=k, options=options, cpu=cpu, output=output,out=self.out))
    
             #for assembly only
             self.contigs = "{}/spades/contigs.fasta".format(self.out)
             self.exclude = set()
             self.scaffold = {}
    
         def cap3 (self):
             """ Runs cap3 """
             self.log("START cap3")
             self.contigs = "{}/spades/contigs.fasta".format(self.out)
             cap3_out = "{}/cap3".format(self.out)
             #print(cap3_out)
             if not os.path.exists(cap3_out):
                 os.makedirs(cap3_out)
    
             os.system("{}/external_tools/cap3 '{}' -y 100 >{}/cap3/cap3.out".format(self.path,self.contigs,self.out))
    
             b = False
             i = 0
             consensi = {}
             consensus = ""
             group = {}
             name = ""
             exclude = set()
    
             with open("{}/cap3/cap3.out".format(self.out)) as f:
                 for line in f:
    
                     if not b:
    
                         if "*******************" in line:
                             name = line.split("* ")[1].split(" *")[0]
                             #print(name)
                             group[name] = set()
    
                         if "NODE" in line:
                             ary = line.split()
                             for n in ary:
                                 if "NODE" in n:
                                     tmp = n.replace("+","").replace("-","")
                                     group[name].add(tmp)
                                     exclude.add(tmp)
    
                     if "DETAILED" in line:
                         b = True
    
                     if b:
    
                         if "*******************" in line:
                             name = line.split("* ")[1].split(" *")[0]
                             #print(name)
    
                         ary = line.split()
                         #print(ary)
                         if len(ary) > 0:
    
                             if "consensus" in line:
                                 seq = ary[1]
                                 tmp = consensi.get(name,"")
                                 tmp+=seq.replace("-","N")
                                 consensi[name] = tmp
    
             self.scaffold = consensi
             self.exclude = exclude
    
         def clean_output (self):
             """ Clean spades output and include scaffolds """
    
             fasta_sequences = SeqIO.parse(open(self.contigs),'fasta')
             self.result = "{}/vrap_contig.fasta".format(self.out)
             output = open(self.result,"w")
    
             for fasta in fasta_sequences:
                 if fasta.name not in self.exclude and len(fasta.seq)>=self.contig_length:
                     output.write(">{}\n{}\n".format(fasta.name,fasta.seq))
             i = 1
             for name,seq in self.scaffold.items():
                 if len(seq)>=self.contig_length:
                     output.write(">CAP_{}_length_{}\n{}\n".format(i,len(seq),seq))
                     i+=1
             output.close()
    
         def calculate_orf_density (self):
             """ calculate orf density """
             self.log("START calculating orf density")
             self.result = "{}/vrap_contig.fasta".format(self.out)
             orf = ORF(self.result,self.out)
             self.orf_density = orf.orf_density()
    
         def label_as_virus (self):
             """ Add virus tag to name """
             out_name = "{}/blast/custom_viral_seq.fa".format(self.out)
             output = open(out_name,"w")
    
             with open(self.virus, "r") as handle:
                 for record in SeqIO.parse(handle, "fasta"):
                     output.write(">{}\n{}\n".format(record.name+" [virus_user_db]",record.seq))
    
             output.close()
    
             return out_name
    
         def blast (self):
             """ blast """
             self.log("START blast")
             self.result = "{}/vrap_contig.fasta".format(self.out)
    
             blast_path = "{}/blast".format(self.out)
             blast = Blast(self.cpu, self.out)
    
             blast.read_fasta(self.result)
    
             #EXPLANATION: all virus and host-databases are saved in nt_dbs
             #             blast performed only after assembly(self.result=vrap_contig.fasta) against host+virus --> 
             #             since the host-reads are not deleted before the assembly, 
             #             vrap_contigs.fasta should contain contigs of host and virus
             #sort blast/blastn.csv and blast/blastn.fa
             nt_dbs = "{}/database/viral_db{}/viral_nucleotide".format(self.path,old)
             e_val = "1e-4"
             max_target = 1
    
             if not os.path.exists(blast_path):
                 os.makedirs(blast_path)
    
             if not os.path.exists(blast_path+"/db"):
                 os.makedirs(blast_path+"/db")
    
             #EXPLANATION
             #if the option --virus is set, make the given fasta blast-format and add it to vrap_out/blast/db/virus 
             #NO record added! nt_dbs should keep only 1 record!
             if self.virus != None:
                 custom_viral_seq = self.label_as_virus()
                 blast.makeblastdb(custom_viral_seq,"{}/blast/db/virus".format(self.out),"nucl")
                 nt_dbs += " {}/blast/db/virus".format(self.out)
             #MODIFIED, add the self.host causing error. COMMENTED them!
             #EXPLANATION: if the option --host is set, make the given fasta blast-format and add it to vrap_out/blast/db/host
             #There are in total 3 nt_dbs, 1 prot_db, +[nt][nr], and 1 bowtie database:
             # nt_dbs /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide, ./blast/db/virus, ./blast/db/host
             # prot_db /home/jhuang/Tools/vrap/database/viral_db/viral_protein
             # bowtie_database: ./bowtie/host for substraction host-reads
             #DEBUG: /home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 4 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/db/host"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/blastn.csv
             #--> it works!
             #Warning: [blastn] Examining 5 or more matches is recommended
             #grep -v "scrofa" blastn.csv > blastn_.csv;
             #DEBUG: BLAST Database creation error: Error: Duplicate seq_ids are found: GB|MF170920.1
             #if self.host != None:
             #   blast.makeblastdb(self.host,"{}/blast/db/host".format(self.out),"nucl")
             #   nt_dbs += " {}/blast/db/host".format(self.out)
    
             #EXPLANATION: blastn.csv is empty means blastn finds nonthing!
             ##nucleotide db search
             blast_out = "{}/blast/blastn.csv".format(self.out)
             blast.blastn(self.result, nt_dbs, e_val, max_target, blast_out)
             no_match_fa = "{}/blast/blastn.fa".format(self.out)
             no_results_found = blast.no_match(blast_out,no_match_fa)
    
             #EXPLANATION: blastx -num_threads 14 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1399_1400_Serum_vrap_out/blast/blastn.fa -db /home/jhuang/Tools/vrap/database/viral_db/viral_protein -evalue 1e-6 -outfmt 6 qseqid qstart qend sstart send evalue le+ 
             ##protein db search
             if no_results_found > 0:
                 e_val = "1e-6"
                 blast_out = "{}/blast/blastx.csv".format(self.out)
                 prot_db = "{}/database/viral_db{}/viral_protein".format(self.path,old)
                 blast.blastx(no_match_fa, prot_db, e_val, max_target, blast_out)
                 no_match_fa = "{}/blast/blastx.fa".format(self.out)
                 no_results_found = blast.no_match(blast_out,no_match_fa)
    
             #EXPLANATION: since self.nt and self.nr are not given, the following code won't run!
             if self.nt != None and no_results_found > 0:
                 e_val = "1e-4"
                 blast_out = "{}/blast/blastn_nt.csv".format(self.out)
                 blast.blastn(no_match_fa, self.nt, e_val, max_target, blast_out)
                 no_match_fa = "{}/blast/blastn_nt.fa".format(self.out)
                 no_results_found = blast.no_match(blast_out,no_match_fa)
    
             if self.nr != None and no_results_found > 0:
                 e_val = "1e-6"
                 blast_out = "{}/blast/blastx_nr.csv".format(self.out)
                 blast.blastx(no_match_fa, self.nr, e_val, max_target, blast_out)
                 no_match_fa = "{}/blast/blastx_nr.fa".format(self.out)
                 no_results_found = blast.no_match(blast_out,no_match_fa)
    
         def hmmer (self):
             """ hidden markov modell search """
             self.log("START HMMER")
             hmmer = HMMER(self.cpu, self.out)
             hmmer.run_hmmer()
    
         def summary (self):
             """ summarize output """
    
             if len(self.orf_density)>0:
                 blast = Blast(self.cpu, self.out)
                 self.blast_results = blast.summary(self.out)
    
                 hmmer = HMMER(self.cpu, self.out)
                 hmmer_output = hmmer.read_hmmer_output()
    
             with open("{}/vrap_summary{}.csv".format(self.out,old),"w") as f:
                 f.write("name;qleng;orf_d;hmmer_eval;hmm_model;virus?;ident;qcov;tcov;tlength;tid;tname;mean_eval\n")
                 for name,orf_d in self.orf_density.items():
                     results = {"qlength":0,"tid":"-","tname":"-","tlength":0,"qcovarage":0,"ident":0,"tcovarage":0,"eval":1}
    
                     if name in self.blast_results:
                         results = self.blast_results[name]
    
                     hmmer = 1
                     hmm_model = ""
    
                     if name in hmmer_output:
                         hmmer,hmm_model = max(hmmer_output[name],key=lambda x:x[0])
    
                     qleng = int(name.split("_")[3])
                     virus = "host"
    
                     if "virus" in results["tname"] or "phage" in results["tname"]:
                         virus = "virus"
                     else:
                         if (len(results["tname"])<2):
                             if ((orf_d>0.9 and qleng <=1000) or (orf_d>0.7 and qleng >1000)):
                                 virus = "potential virus"
                             else:
                                 virus = "-"
    
                         if hmmer<=1e-5:
                             virus = "virus"
    
                     tmp ="{0};{1:d};{2:.2f};{3:.2e};{4};{5};{6:.2f};{7:.2f};{8:.2f};{9:d};{10};{11};{12:.2e}\n".format(name,qleng,orf_d,hmmer,hmm_model,virus,results["ident"],results["qcovarage"],results["tcovarage"],results["tlength"],results["tid"],results["tname"].replace(";",","),results["eval"])
                     f.write(tmp)
    
     vrap = VrAP()
    
     try:
         if not vrap.assembly:
             vrap.lighter()
             vrap.flash()
    
         if not vrap.pre:
             if not vrap.assembly:
                 vrap.bowtie()
    
             vrap.spades()
    
             if not vrap.assembly:
                 vrap.cap3()
    
             vrap.clean_output()
    
             vrap.calculate_orf_density()
    
             if len(vrap.orf_density)>0:
                 vrap.hmmer()
                 if not vrap.noblast:
                     vrap.blast()
    
             vrap.summary()
    
         vrap.log("VrAP pipeline finished.\nThank you for using VrAP!")
     except Exception as e:
         vrap.log(e,True)

Leave a Reply

Your email address will not be published. Required fields are marked *