Author Archives: gene_x

RNAseq running with umi_tools

  1. install conda environment

    # Optional: keep conda from auto-activating the base environment in new shells.
    #conda config --set auto_activate_base false
    
    # Create a dedicated environment named "rnaseq", pinned to Python 3.7.
    conda create --name rnaseq python=3.7
    
    #NOTE: mamba is indeed much faster; use mamba from now on!
    #install packages
    conda activate rnaseq
    # QC/reporting tools installed through pip inside the activated environment.
    pip3 install deeptools
    pip3 install multiqc
    conda install -c bioconda stringtie subread gffread
    conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
    conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
    conda install nextflow=23.04
    
    # FASTQ, UMI and quantification tools.
    conda install fq
    conda install -c bioconda umi_tools
    conda install -c bioconda rsem
    conda install -c bioconda salmon
    
    #conda install some tools
    #install R-packages, 
    conda install -c bioconda ucsc-bedclip
    conda install -c bioconda ucsc-bedgraphtobigwig
    conda install -c bioconda bioconductor-matrixgenerics
    #conda install -c bioconda bioconductor-deseq2
    conda install -c bioconda r-pheatmap
    conda install -c anaconda gawk
    
    # Install mamba into the base environment and register conda-forge as a channel;
    # the remaining installs mostly go through mamba.
    conda install mamba -n base -c conda-forge
    conda config --add channels conda-forge
    mamba install -c bioconda salmon=1.10
    #salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
    
    conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
    mamba install -c bioconda samtools=1.9  
    mamba install -c conda-forge r-optparse r-vctrs=0.5.0
    # (repeated: nextflow 23.04 was already requested above)
    conda install nextflow=23.04
    mamba install -c bioconda qualimap
    mamba install -c bioconda rseqc
    mamba install -c conda-forge openssl
    # (repeated: ucsc-bedclip was already requested above)
    conda install -c bioconda ucsc-bedclip
    conda install -c bioconda bedtools
    # The two "conda update" calls below were added while debugging the libssl error quoted here.
    conda update -c bioconda ucsc-bedclip
    #for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
    conda update -c bioconda ucsc-bedgraphtobigwig
    # samtools should be >= 1.9 as only those have the option @
    # Example of the sort call that uses -@ (kept commented out for reference):
    #samtools sort \
    #      -@ 6 \
    #      -o HSV.d2_r1.sorted.bam \
    #      -T HSV.d2_r1.sorted \
    #      HSV.d2_r1.Aligned.out.bam
  2. Run UMI-tools without --umitools_dedup_stats; otherwise the run cannot finish on hamm.

    • Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (--edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.

    • Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.

      #https://github.com/nf-core/rnaseq/issues/827
      #INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
      #INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
      #https://github.com/CGATOxford/UMI-tools/issues/173
      # excessive dedup memory usage with output-stats #409 
      #https://github.com/CGATOxford/UMI-tools/issues/409
      #umi_tools 1.0.1
      #I am aware of previously closed issues:
      #excessive dedup memory usage #173
      #speed up stats #184
      #Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
      #Using --method=unique
      #There still seems to be a memory problem with --output-stats
      #Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
      #Running without output-stats, job completes in about 3 minutes, with no problems.
      
      #TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 
      #/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100
      
      #umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
      #I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.
      
      #--Human--
      #hamm
      #NOTE: the named group inside (?P<...>) was stripped when this page was rendered as HTML; umi_tools expects a umi_-prefixed group name, so umi_1 is used here -- confirm against the original command.
      /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" -profile docker -resume --max_cpus 54 --max_memory 120.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'star_salmon' --pseudo_aligner 'salmon' --umitools_grouping_method 'unique'
      #sage
      nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'star_salmon' --pseudo_aligner 'salmon'
      #--Virus--
      /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_virus --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf" --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq -profile test_full -resume --max_cpus 55 --max_memory 120.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'hisat2' --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon' --umitools_grouping_method 'unique'
  3. R-code for evaluation of nextflow outputs

    # Import Salmon quantifications (nf-core/rnaseq star_salmon output) and
    # build the DESeq2 objects used by the later steps.
    #
    # Outputs:
    #   transcript_counts.csv  raw transcript-level counts
    #   gene_counts.csv        raw gene-level counts
    # Objects left in the session: files, condition, replicate, colData,
    # tx2gene, txi, dds (gene-level, DESeq() already run), rld (rlog of dds).

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)

    library(tximport)
    library(DESeq2)

    # All relative paths below are resolved against this directory.
    setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")

    # Sample names; file paths and sample annotation are all derived from
    # this single vector so they cannot get out of sync.
    samples <- c("control_r1", "control_r2",
                 "HSV.d2_r1", "HSV.d2_r2",
                 "HSV.d4_r1", "HSV.d4_r2",
                 "HSV.d6_r1", "HSV.d6_r2",
                 "HSV.d8_r1", "HSV.d8_r2")

    # Named vector of Salmon quant.sf paths, e.g. "./control_r1/quant.sf"
    files <- setNames(file.path(".", samples, "quant.sf"), samples)

    # Replicate is the suffix after the last "_", condition is everything before it
    replicate <- factor(sub("^.*_", "", samples))
    condition <- factor(sub("_r[0-9]+$", "", samples))

    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))

    # -- transcript-level count data --
    # Only the raw counts are exported here, so DESeq() and the rlog
    # transformation are not run at transcript level: neither changes
    # counts(), and both results would be overwritten by the gene-level
    # objects created below.
    txi_tx <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    dds_tx <- DESeqDataSetFromTximport(txi_tx, colData=colData, design=~condition)
    write.csv(counts(dds_tx), file="transcript_counts.csv")

    # -- gene-level count data --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)

    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")

    # tximport only needs the transcript_id -> gene_id mapping
    tx2gene <- tx2gene[,1:2]

    # Import and summarize the Salmon data to gene level with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)

    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)

    # Optional pre-filter: keep genes with a count > 3 in more than 2 samples.
    # It is not required for correctness: DESeq2 applies independent filtering
    # of low-count genes when results() is called. Pre-filtering mainly
    # reduces object size and run time.
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543

    # Run DESeq2 and compute the rlog-transformed values
    dds <- DESeq(dds)
    rld <- rlogTransformation(dds)

    # Output raw gene-level count data to a CSV file
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")

    #TODO: why a lot of reads were removed due to the too_short?
    #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

    # Quick sanity check of the gene-level count matrix
    dim(counts(dds))
    head(counts(dds), 10)

X-ray holographic microscopy

X-ray holographic microscopy is a technique used to produce high-resolution, three-dimensional images of microscopic objects. The technique is based on the principles of holography, where the phase and amplitude of a wave are recorded to produce an image. In X-ray holography, this wave is an X-ray beam.

Traditional optical microscopy uses visible light to image an object, and the resolution of the image is limited by the wavelength of the light. X-rays have much shorter wavelengths than visible light, so X-ray microscopy can theoretically produce images with much higher resolution.

In X-ray holography, a coherent X-ray beam is split into two paths: one path interacts with the object being imaged, and the other path is used as a reference. The object wave and the reference wave are then combined to form a hologram. This hologram can be reconstructed to produce a 3D image of the object.

One major advantage of X-ray holographic microscopy is that it can be used to image thick samples and materials that are not transparent to visible light. However, the technique requires sophisticated equipment and careful sample preparation, and it can be difficult to interpret the resulting images.

X射线全息显微镜术是一种用来生成微观物体的高分辨率三维图像的技术。这种技术基于全息术的原理,全息术记录波的相位和振幅以产生图像。在X射线全息术中,这种波是X射线束。

传统的光学显微镜使用可见光来成像物体,图像的分辨率受到光的波长的限制。X射线的波长比可见光短得多,因此理论上X射线显微镜可以产生分辨率更高的图像。

在X射线全息术中,一个相干的X射线束被分割成两条路径:一条路径与被成像的物体相互作用,另一条路径作为参考使用。然后将物体波和参考波结合形成一个全息图。这个全息图可以被重建成物体的3D图像。

X射线全息显微镜的一个主要优点是它可以用来成像厚样本和对可见光不透明的材料。然而,这种技术需要复杂的设备和精心的样品制备,并且解析结果图像可能会有困难。

人体类器官(Organoids)

类器官(Organoids)是模拟真实器官或组织的结构和功能的三维(3D)细胞培养。它们来源于干细胞,这种细胞具有自我更新和分化为各种细胞类型的能力。在实验室中,可以使用专门的技术和生长条件培养类器官,促使干细胞发育成特定器官的细胞并形成类似目标器官的复杂微型结构。

类器官在研究中具有重要作用,因为与传统的二维(2D)细胞培养相比,它们更准确地代表了人体器官。它们在各个领域具有广泛的应用,包括:

  1. 发育生物学:类器官可以帮助研究人员研究器官发育和组织形成的过程。

  2. 疾病建模:类器官可以从患者来源的干细胞中产生,使研究人员能够创建特定于疾病的模型,以研究各种疾病和病状的基本机制。

  3. 药物开发和测试:类器官为测试新药物和治疗方法提供了更具生理相关性的模型,有可能减少对动物模型的依赖,并提高药物开发的效率。

  4. 再生医学:类器官可用于开发新的组织修复和再生策略,可能为各种疾病和损伤提供新的治疗方法。

尽管类器官具有诸多优点,但它们也存在局限性,如缺乏血管、免疫细胞和其他真实器官中存在的成分。然而,正在进行的研究旨在改进类器官技术并克服这些局限性,进一步扩大其在生物医学研究中的潜在应用。

Organoids are three-dimensional (3D) cell cultures that mimic the structure and function of real organs or tissues. They are derived from stem cells, which have the ability to self-renew and differentiate into various cell types. Organoids can be grown in the lab using specialized techniques and growth conditions that encourage the stem cells to develop into organ-specific cells and form complex, miniature structures resembling the target organ.

Organoids have become an essential tool in research because they provide a more accurate representation of human organs compared to traditional two-dimensional (2D) cell cultures. They have numerous applications in various fields, including:

  1. Developmental biology: Organoids can help researchers study the processes involved in organ development and tissue organization.

  2. Disease modeling: Organoids can be generated from patient-derived stem cells, allowing researchers to create disease-specific models to study the underlying mechanisms of various diseases and conditions.

  3. Drug development and testing: Organoids provide a more physiologically relevant model for testing new drugs and therapies, potentially reducing the reliance on animal models and increasing the efficiency of drug development.

  4. Regenerative medicine: Organoids can be used to develop new strategies for tissue repair and regeneration, possibly leading to new treatments for various diseases and injuries.

Despite their advantages, organoids also have limitations, such as the lack of blood vessels, immune cells, and other components present in real organs. However, ongoing research aims to refine organoid technology and overcome these limitations, further expanding their potential applications in biomedical research.

RNAseq processing for organoids

  1. install conda environment

    # Optional: keep conda from auto-activating the base environment in new shells.
    #conda config --set auto_activate_base false
    
    # Create a dedicated environment named "rnaseq", pinned to Python 3.7.
    conda create --name rnaseq python=3.7
    
    #NOTE: mamba is indeed much faster; use mamba from now on!
    #install packages
    conda activate rnaseq
    # QC/reporting tools installed through pip inside the activated environment.
    pip3 install deeptools
    pip3 install multiqc
    conda install -c bioconda stringtie subread gffread
    conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
    conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
    conda install nextflow=23.04
    
    # FASTQ, UMI and quantification tools.
    conda install fq
    conda install -c bioconda umi_tools
    conda install -c bioconda rsem
    conda install -c bioconda salmon
    
    #conda install some tools
    #install R-packages, 
    conda install -c bioconda ucsc-bedclip
    conda install -c bioconda ucsc-bedgraphtobigwig
    conda install -c bioconda bioconductor-matrixgenerics
    #conda install -c bioconda bioconductor-deseq2
    conda install -c bioconda r-pheatmap
    conda install -c anaconda gawk
    
    # Install mamba into the base environment and register conda-forge as a channel;
    # the remaining installs mostly go through mamba.
    conda install mamba -n base -c conda-forge
    conda config --add channels conda-forge
    mamba install -c bioconda salmon=1.10
    #salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
    
    conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
    mamba install -c bioconda samtools=1.9  
    mamba install -c conda-forge r-optparse r-vctrs=0.5.0
    # (repeated: nextflow 23.04 was already requested above)
    conda install nextflow=23.04
    mamba install -c bioconda qualimap
    mamba install -c bioconda rseqc
    mamba install -c conda-forge openssl
    # (repeated: ucsc-bedclip was already requested above)
    conda install -c bioconda ucsc-bedclip
    conda install -c bioconda bedtools
    # The two "conda update" calls below were added while debugging the libssl error quoted here.
    conda update -c bioconda ucsc-bedclip
    #for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
    conda update -c bioconda ucsc-bedgraphtobigwig
    # samtools should be >= 1.9 as only those have the option @
    # Example of the sort call that uses -@ (kept commented out for reference):
    #samtools sort \
    #      -@ 6 \
    #      -o HSV.d2_r1.sorted.bam \
    #      -T HSV.d2_r1.sorted \
    #      HSV.d2_r1.Aligned.out.bam
  2. Run UMI-tools without --umitools_dedup_stats; otherwise the run cannot finish on hamm.

    • Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (--edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.

    • Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.

      #https://github.com/nf-core/rnaseq/issues/827
      #INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
      #INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
      #https://github.com/CGATOxford/UMI-tools/issues/173
      # excessive dedup memory usage with output-stats #409
      #https://github.com/CGATOxford/UMI-tools/issues/409
      #umi_tools 1.0.1
      #I am aware of previously closed issues:
      #excessive dedup memory usage #173
      #speed up stats #184
      #Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
      #Using --method=unique
      #There still seems to be a memory problem with --output-stats
      #Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
      #Running without output-stats, job completes in about 3 minutes, with no problems.

        #TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 
        #/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100

      #umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
      #I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.

        #--Human--
        #hamm
        #NOTE: the named group inside (?P<...>) was stripped when this page was rendered as HTML; umi_tools expects a umi_-prefixed group name, so umi_1 is used here -- confirm against the original command.
        /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" -profile docker -resume --max_cpus 54 --max_memory 120.GB --max_time 2400.h --aligner 'star_salmon' --pseudo_aligner 'salmon' --umitools_grouping_method 'unique' #--save_align_intermeds --save_unaligned --save_reference
        #sage
        nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" -profile test_full -resume --max_memory 256.GB --max_time 2400.h --aligner 'star_salmon' --pseudo_aligner 'salmon' #--save_align_intermeds --save_unaligned --save_reference
        #--Virus--
        /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_virus --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf" --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq -profile test_full -resume --max_cpus 55 --max_memory 120.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'hisat2' --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon' --umitools_grouping_method 'unique'
  3. R-code for evaluation of nextflow outputs

    # Import Salmon quantifications (nf-core/rnaseq star_salmon output) and
    # build the DESeq2 objects used by the later steps.
    #
    # Outputs:
    #   transcript_counts.csv  raw transcript-level counts
    #   gene_counts.csv        raw gene-level counts
    # Objects left in the session: files, condition, replicate, colData,
    # tx2gene, txi, dds (gene-level, DESeq() already run), rld (rlog of dds).

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)

    library(tximport)
    library(DESeq2)

    # All relative paths below are resolved against this directory.
    setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")

    # Sample names; file paths and sample annotation are all derived from
    # this single vector so they cannot get out of sync.
    samples <- c("control_r1", "control_r2",
                 "HSV.d2_r1", "HSV.d2_r2",
                 "HSV.d4_r1", "HSV.d4_r2",
                 "HSV.d6_r1", "HSV.d6_r2",
                 "HSV.d8_r1", "HSV.d8_r2")

    # Named vector of Salmon quant.sf paths, e.g. "./control_r1/quant.sf"
    files <- setNames(file.path(".", samples, "quant.sf"), samples)

    # Replicate is the suffix after the last "_", condition is everything before it
    replicate <- factor(sub("^.*_", "", samples))
    condition <- factor(sub("_r[0-9]+$", "", samples))

    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))

    # -- transcript-level count data --
    # Only the raw counts are exported here, so DESeq() and the rlog
    # transformation are not run at transcript level: neither changes
    # counts(), and both results would be overwritten by the gene-level
    # objects created below.
    txi_tx <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    dds_tx <- DESeqDataSetFromTximport(txi_tx, colData=colData, design=~condition)
    write.csv(counts(dds_tx), file="transcript_counts.csv")

    # -- gene-level count data --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)

    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")

    # tximport only needs the transcript_id -> gene_id mapping
    tx2gene <- tx2gene[,1:2]

    # Import and summarize the Salmon data to gene level with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)

    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)

    # Optional pre-filter: keep genes with a count > 3 in more than 2 samples.
    # It is not required for correctness: DESeq2 applies independent filtering
    # of low-count genes when results() is called. Pre-filtering mainly
    # reduces object size and run time.
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543

    # Run DESeq2 and compute the rlog-transformed values
    dds <- DESeq(dds)
    rld <- rlogTransformation(dds)

    # Output raw gene-level count data to a CSV file
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")

    #TODO: why a lot of reads were removed due to the too_short?
    #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

    # Quick sanity check of the gene-level count matrix
    dim(counts(dds))
    head(counts(dds), 10)
  4. draw 3D PCA plots.

    # PCA export (all PCs), sample-distance heatmaps, and removal of the
    # replicate batch effect from the rlog matrix.
    # Expects `rld` (rlog-transformed DESeqDataSet) whose colData has the
    # columns `condition` and `replicate`.
    library(gplots)
    library("RColorBrewer")
    library(ggplot2)
    library(genefilter)

    # PC1/PC2 plus sample annotation as returned by plotPCA(returnData=TRUE)
    data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
    write.csv(data, file="plotPCA_data.csv")

    # plotPCA only returns PC1 and PC2, so recompute the PCA on the `ntop`
    # most variable rows to obtain PC3 and the remaining components.
    ntop <- 500
    rv <- rowVars(assay(rld))
    select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    mat <- t( assay(rld)[select, ] )
    pc <- prcomp(mat)
    pc$x[,1:3]
    #df_pc <- data.frame(pc$x[,1:3])
    df_pc <- data.frame(pc$x)
    # Both tables must list the samples in the same order
    identical(rownames(data), rownames(df_pc)) #-->TRUE

    # Replace plotPCA's PC1/PC2 with the full set of components from prcomp
    data$PC1 <- NULL
    data$PC2 <- NULL
    merged_df <- merge(data, df_pc, by = "row.names")
    #merged_df <- merged_df[, -1]
    row.names(merged_df) <- merged_df$Row.names
    merged_df$Row.names <- NULL  # remove the "name" column
    merged_df$name <- NULL
    # Use whatever components prcomp produced instead of a hard-coded
    # PC1..PC10 list, which is only valid for exactly 10 samples.
    merged_df <- merged_df[, c(colnames(pc$x), "group", "condition", "replicate")]
    write.csv(merged_df, file="merged_df_10PCs.csv")
    summary(pc)
    #0.5333  0.2125 0.06852

    # The 3D plot is drawn outside R with the external script draw_3D.py
    # (presumably from merged_df_10PCs.csv -- confirm); the bare script name
    # was not valid R and is therefore kept as a comment.
    #draw_3D.py

    # -- before pca --
    png("pca_before_removeBatch2.png", 1200, 800)
    plotPCA(rld, intgroup=c("condition"))
    dev.off()

    # -- before heatmap --
    png("heatmap_before_removeBatch2.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()

    # Remove the replicate (r1/r2) batch effect while protecting the
    # condition effect. colData(rld) only has `condition` and `replicate`;
    # the previous `~replicates` / `rld$batch` referred to columns that do
    # not exist in this data set.
    mat <- assay(rld)
    mm <- model.matrix(~condition, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$replicate, design=mm)
    assay(rld) <- mat

    # -- after pca --
    png("pca_after_removeBatch.png", 1200, 800)
    #svg("pca_after_removeBatch.svg")
    plotPCA(rld, intgroup=c("condition"))
    dev.off()

    # -- after heatmap --
    png("heatmap_after_removeBatch.png", 1200, 800)
    #svg("heatmap_after_removeBatch.svg")
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
  5. (optional) estimate size factors

    > head(dds)
    class: DESeqDataSet 
    dim: 6 10 
    metadata(1): version
    assays(6): counts avgTxLength ... H cooks
    rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
      ENSG00000000938
    rowData names(34): baseMean baseVar ... deviance maxCooks
    colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
    colData names(2): condition replicate
    
    #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
    sizeFactors(dds)
    #NULL
    dds <- estimateSizeFactors(dds)
    > sizeFactors(dds)
    
    normalized_counts <- counts(dds, normalized=TRUE)
    #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    
    # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
    # The avgTxLength assay holds the per-gene, per-sample average transcript
    # length from tximport. estimateSizeFactorsForMatrix() has no `normMatrix`
    # argument (only estimateSizeFactors() does), so the counts are divided by
    # the length matrix before the size factors are estimated.
    nm <- assays(dds)[["avgTxLength"]]
    sf <- estimateSizeFactorsForMatrix(counts(dds) / nm)
    
    assays(dds)$counts  # for count data
    assays(dds)$avgTxLength  # for average transcript length, etc.
    assays(dds)$normalizationFactors
    
    In normal circumstances, the size factors should be stored in the DESeqDataSet object itself and not in the assays, so they are typically not retrievable via the assays() function. However, due to the issues you're experiencing, you might be able to manually compute the size factors and assign them back to the DESeqDataSet.
    
    To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
    > assays(dds)
    List of length 6
    names(6): counts avgTxLength normalizationFactors mu H cooks
    
    To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
    
    # Median-of-ratios size factors computed by hand: one value per sample.
    # geoMeans is the per-gene geometric mean over the non-zero counts
    # (0 for genes that are zero everywhere).
    count_mat <- assays(dds)$counts
    geoMeans <- apply(count_mat, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
    # Take the median ratio per column (sample). The previous median() over
    # the whole matrix collapsed everything to a single number, which is not
    # a valid per-sample size-factor vector. Genes with a zero geometric mean
    # or a zero count in the sample are left out of the median.
    sizeFactors(dds) <- apply(count_mat, 2, function(cnts) {
      keep <- geoMeans > 0 & cnts > 0
      median(cnts[keep] / geoMeans[keep])
    })
    
    # ---- DEBUG END ----
    
    #unter konsole
    #  control_r1  ...
    # 1/0.9978755  ... 
    
    > sizeFactors(dds)
                        HeLa_TO_r1                      HeLa_TO_r2 
                          0.9978755                       1.1092227 
    
    1/0.9978755=1.002129023
    1/1.1092227=0.901532217
    
    #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
    bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
    bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220
  6. differential expressions

    #A central method for exploring differences between groups of segments or samples is to perform differential gene expression analysis. 
    
    # Each stanza below sets a different reference level for `condition`,
    # refits the model, prints the available coefficient names and stores the
    # comparisons of interest in `clist`.
    # NOTE: every stanza overwrites `clist` (and `dds`), so when the stanzas
    # are run top to bottom only the last one is still in effect. Run one
    # stanza, then the per-contrast loop that iterates over `clist`, and
    # repeat for the next stanza.
    # The entries of `clist` are coefficient names without the "condition_"
    # prefix; the loop prepends it before calling results().

    # Reference: control
    dds$condition <- relevel(dds$condition, "control")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d2_vs_control","HSV.d4_vs_control","HSV.d6_vs_control","HSV.d8_vs_control")
    
    # Reference: HSV.d2
    dds$condition <- relevel(dds$condition, "HSV.d2")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d4_vs_HSV.d2","HSV.d6_vs_HSV.d2","HSV.d8_vs_HSV.d2")
    
    # Reference: HSV.d4
    dds$condition <- relevel(dds$condition, "HSV.d4")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d6_vs_HSV.d4","HSV.d8_vs_HSV.d4")
    
    # Reference: HSV.d6
    dds$condition <- relevel(dds$condition, "HSV.d6")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d8_vs_HSV.d6")
    
    # Connect to the Ensembl BioMart release that matches the annotation used
    # by the RNA-seq pipeline (release 104, human). The resulting `ensembl`
    # mart object is used for the gene annotation lookups further down.

    ##https://bioconductor.statistik.tu-dortmund.de/packages/3.7/data/annotation/
    #BiocManager::install("EnsDb.Mmusculus.v79")
    #library(EnsDb.Mmusculus.v79)
    #edb <- EnsDb.Mmusculus.v79

    #https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
    library(biomaRt)
    # List the available Ensembl marts (interactive inspection only)
    listEnsembl()
    listMarts()
    #ensembl <- useEnsembl(biomart = "genes", mirror="asia")  # default is Mouse strains 104
    #ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", mirror = "www")
    #ensembl = useMart("ensembl_mart_44", dataset="hsapiens_gene_ensembl",archive=TRUE, mysql=TRUE)
    #ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", version="104")
    #ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="86")
    #--> total 69, 27  GRCh38.p7 and 39  GRCm38.p4; we should take 104, since rnaseq-pipeline is also using annotation of 104!
    ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
    datasets <- listDatasets(ensembl)
    #--> total 202   80                         GRCh38.p13         107                            GRCm39
    #80           hsapiens_gene_ensembl                                      Human genes (GRCh38.p13)                         GRCh38.p13
    #107         mmusculus_gene_ensembl                                        Mouse genes (GRCm39)                            GRCm39

    # Available archive releases. The table below is console output that was
    # pasted into the script; it is kept as a comment so the script parses.
    listEnsemblArchives()
    #            name     date                                 url version
    #1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37
    #2     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109
    #3     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108
    #4     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107
    #5     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106
    #6     Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105
    #7     Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104

    # Attributes that can be requested from the selected dataset
    attributes <- listAttributes(ensembl)
    attributes[1:25,]

    #https://www.ncbi.nlm.nih.gov/grc/human
    #BiocManager::install("org.Mmu.eg.db")
    #library("org.Mmu.eg.db")
    #edb <- org.Mmu.eg.db
    #
    #https://bioconductor.statistik.tu-dortmund.de/packages/3.6/data/annotation/
    #EnsDb.Mmusculus.v79
    #> query(hub, c("EnsDb", "apiens", "98"))
    #columns(edb)

    #searchAttributes(mart = ensembl, pattern = "symbol")
    
    ##https://www.geeksforgeeks.org/remove-duplicate-rows-based-on-multiple-columns-using-dplyr-in-r/
    library(dplyr)
    library(tidyverse)
    # Commented-out toy example for distinct(): with .keep_all = TRUE the first
    # row of every `lang` value is kept. Every line of the example must carry a
    # leading '#', otherwise the continuation lines are parsed as live R code.
    #df <- data.frame (lang =c ('Java','C','Python','GO','RUST','Javascript',
    #                      'Cpp','Java','Julia','Typescript','Python','GO'),
    #                      value = c (21,21,3,5,180,9,12,20,6,0,3,6),
    #                      usage =c(21,21,0,99,44,48,53,16,6,8,0,6))
    #distinct(df, lang, .keep_all= TRUE)
    
    # For every contrast in `clist`: take the DESeq2 results, attach Ensembl gene
    # annotation fetched with biomaRt, and write three CSV tables per contrast:
    #   <contrast>-all.txt   all annotated genes, ordered by p-value
    #   <contrast>-up.txt    padj <= 0.05 and log2FoldChange >=  2
    #   <contrast>-down.txt  padj <= 0.05 and log2FoldChange <= -2
    # Uses `dds` (DESeqDataSet) and `ensembl` (biomaRt mart) from the session.
    for (i in clist) {
    #"HSV.d2_vs_control","HSV.d4_vs_control","HSV.d6_vs_control","HSV.d8_vs_control"
    #i<-clist[1]
      contrast <- paste("condition", i, sep = "_")
      res <- results(dds, name = contrast)
      # Drop genes for which no fold change is available.
      res <- res[!is.na(res$log2FoldChange), ]
      #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
      #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
      # In the ENSEMBL-database, GENEID is ENSEMBL-ID.
      #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE"))  #  "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
      #geness <- geness[!duplicated(geness$GENEID), ]

      # getBM replaces AnnotationDbi::select; the 'ensembl_gene_id' filter
      # restricts the query to the gene ids present in `res`.
      geness <- getBM(
        attributes = c("ensembl_gene_id", "external_gene_name", "gene_biotype",
                       "entrezgene_id", "chromosome_name", "start_position",
                       "end_position", "strand", "description"),
        filters = "ensembl_gene_id",
        values = rownames(res),
        mart = ensembl
      )
      # Keep only the first annotation row per Ensembl gene id.
      geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all = TRUE)

      # merge() below is an inner join on the Ensembl id, so genes without a
      # biomaRt record do not reach any output table. Expressions such as
      # identical(...) or dim(...) do not print inside a for loop, so report
      # the number of dropped genes explicitly.
      n_unannotated <- sum(!(rownames(res) %in% geness_uniq$ensembl_gene_id))
      if (n_unannotated > 0) {
        message(i, ": ", n_unannotated, " of ", nrow(res),
                " genes have no biomaRt annotation and are left out of the output tables")
      }

      # Merge annotation and statistics on the common Ensembl gene id.
      res$ENSEMBL <- rownames(res)
      res_df <- as.data.frame(res)
      geness_res <- merge(geness_uniq, res_df, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
      rownames(geness_res) <- geness_res$ensembl_gene_id
      geness_res$ensembl_gene_id <- NULL

      write.csv(as.data.frame(geness_res[order(geness_res$pvalue), ]),
                file = paste(i, "all.txt", sep = "-"))

      # subset() drops rows whose condition evaluates to NA (e.g. padj is NA).
      up <- subset(geness_res, padj <= 0.05 & log2FoldChange >= 2)
      down <- subset(geness_res, padj <= 0.05 & log2FoldChange <= -2)
      # Both tables are ordered with the strongest change first.
      write.csv(as.data.frame(up[order(up$log2FoldChange, decreasing = TRUE), ]),
                file = paste(i, "up.txt", sep = "-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange), decreasing = TRUE), ]),
                file = paste(i, "down.txt", sep = "-"))
    }
    
    #-- show methods of class DESeq2 --
    #x=capture.output(showMethods(class="DESeq2"))
    #unlist(lapply(strsplit(x[grep("Function: ",x,)]," "),function(x) x[2]))
  7. volcano plots with automatically finding top_g

    #A canonical visualization for interpreting differential gene expression results is the volcano plot.
    library(ggrepel) 
    
    # Shell loop that prints (does not execute) the R code drawing one volcano
    # plot per contrast. For each contrast the generated R code reads
    # <contrast>-all.txt, assigns a significance category to every gene, writes
    # <contrast>_with_Category.csv, selects the genes to label (top_g) and
    # plots to <contrast>.png.
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
    #HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control
    #for i in K3R_24hdox_vs_K3R_3hdox21hchase WT_3hdox21hchase_vs_K3R_3hdox21hchase; do
    #for i in WT_24hdox_vs_K3R_24hdox; do
    #for i in WT_24hdox_vs_WT_3hdox21hchase; do
      # read files to geness_res
      echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"

      # category per gene; the last assignment resets genes with |log2FC| < 2.0
      echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
      echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
      echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
      echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
      echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P < 0.05\", \"P-adj < 0.05\"))"
      echo "write.csv(geness_res, \"${i}_with_Category.csv\")"

      # pick top genes for either side of volcano to label
      # order genes for convenience:
      echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
      echo "top_g <- c()"
      echo "top_g <- c(top_g, \
                geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
                geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
      echo "top_g <- unique(top_g)"
      # preview of the genes that will be labelled; emitted only after top_g
      # exists for the current contrast
      echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
      echo "geness_res <- geness_res[, -1*ncol(geness_res)]"  # remove invert_P from matrix

      # Graph results
      echo "png(\"${i}.png\",width=1200, height=2000)"
      echo "ggplot(geness_res, \
          aes(x = log2FoldChange, y = -log10(pvalue), \
              color = Color, label = external_gene_name)) + \
          geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
          geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
          geom_point() + \
          labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
          scale_color_manual(values = c(\"P-adj < 0.05\"=\"darkblue\",\"P < 0.05\"=\"lightblue\",\"NS or log2FC < 2.0\"=\"darkgray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
          geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
          theme_bw(base_size = 16) + \
          theme(legend.position = \"bottom\")"
      echo "dev.off()"
    done
    
    sed -i -e 's/Color/Category/g' *_Category.csv
    
    # Print (not execute) one csv_to_xls.py command line per contrast; each
    # command is given the -all/-up/-down tables of that contrast and
    # "-o <contrast>.xls". The printed lines have to be run in a shell afterwards.
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
      echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
    done
  8. clustering the genes and draw heatmap

    install.packages("gplots")
    library("gplots")
    
    # Print (not execute) the cut commands that copy the first comma-separated
    # column of every <contrast>-up.txt / <contrast>-down.txt into a matching
    # .id file. The printed lines have to be run in a shell afterwards.
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
      echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
      echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
    done
    
        5 HSV.d2_vs_control-down.id
        14 HSV.d2_vs_control-up.id
        77 HSV.d4_vs_control-down.id
      460 HSV.d4_vs_control-up.id
    
      977 HSV.d6_vs_control-down.id
      1863 HSV.d6_vs_control-up.id
      1361 HSV.d8_vs_control-down.id
      1215 HSV.d8_vs_control-up.id
    
        35 HSV.d4_vs_HSV.d2-down.id
      205 HSV.d4_vs_HSV.d2-up.id
    
      832 HSV.d6_vs_HSV.d2-down.id
      1550 HSV.d6_vs_HSV.d2-up.id
      386 HSV.d6_vs_HSV.d4-down.id
      103 HSV.d6_vs_HSV.d4-up.id
    
      1136 HSV.d8_vs_HSV.d2-down.id
      1050 HSV.d8_vs_HSV.d2-up.id
      598 HSV.d8_vs_HSV.d4-down.id
      292 HSV.d8_vs_HSV.d4-up.id
      305 HSV.d8_vs_HSV.d6-down.id
      133 HSV.d8_vs_HSV.d6-up.id
    12597 total
    
    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    # Genes of interest: column Gene_Id of the file "ids".
    GOI <- read.csv("ids")$Gene_Id
    RNASeq.NoCellLine <- assay(rld)

    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    # Expression matrix restricted to the genes of interest, also saved to disk.
    datamat <- RNASeq.NoCellLine[GOI, ]
    write.csv(as.data.frame(datamat), file = "significant_gene_expressions.txt")

    # Gene tree from 1 - Pearson correlation, sample tree from 1 - Spearman
    # correlation; both use complete linkage.
    gene_dist <- as.dist(1 - cor(t(datamat), method = "pearson"))
    hr <- hclust(gene_dist, method = "complete")
    sample_dist <- as.dist(1 - cor(datamat, method = "spearman"))
    hc <- hclust(sample_dist, method = "complete")

    # Cut the gene tree just below its maximum height to obtain gene clusters.
    mycl <- cutree(hr, h = max(hr$height) / 1.05)

    # Colour for cluster k is the k-th entry of the palette.
    cluster_palette <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN")
    mycol <- cluster_palette[as.vector(mycl)]

    # One colour per sample column; the two replicates of a condition share it.
    sample_labels <- c("control r1","control r2","day2 r1","day2 r2","day4 r1","day4 r2", "day6 r1","day6 r2", "day8 r1","day8 r2")
    sampleCols <- rep('GREY', ncol(datamat))
    names(sampleCols) <- sample_labels
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
    sampleCols[sample_labels] <- rep(
      c('DARKBLUE', 'DARKRED', 'DARKORANGE', 'DARKGREEN', 'DARKCYAN'),
      each = 2
    )

    # Row-scaled heatmap with the gene dendrogram only.
    png("DEGs_heatmap.png", width=1000, height=1200)
    heatmap.2(
      as.matrix(datamat),
      Rowv = as.dendrogram(hr),
      Colv = NA,
      dendrogram = 'row',
      scale = 'row',
      trace = 'none',
      col = bluered(75),
      RowSideColors = mycol,
      ColSideColors = sampleCols,
      labRow = "",
      margins = c(22, 10),
      cexRow = 8,
      cexCol = 2,
      srtCol = 20,
      lwid = c(1, 7),
      lhei = c(1, 8)
    )
    #legend("top", title = "",legend=c("WaGa_RNA","MKL1_RNA","WaGa_EV_RNA","MKL1_EV_RNA"), fill=c("DARKBLUE","DARKRED","DARKORANGE","DARKGREEN"), cex=0.8, box.lty=0)
    dev.off()

    # Write the member genes of clusters 1-5, one file per cluster; the file
    # name carries the cluster's palette colour.
    cluster_files <- c(
      'cluster1_YELLOW.txt',
      'cluster2_DARKBLUE.txt',
      'cluster3_DARKORANGE.txt',
      'cluster4_DARKMAGENTA.txt',
      'cluster5_DARKCYAN.txt'
    )
    for (k in seq_along(cluster_files)) {
      write.csv(names(mycl)[mycl == k], file = cluster_files[k])
    }
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    significant_gene_expressions.txt \
    -d',' -o DEGs_heatmap_expression_data.xls;

Prepare virus GTF for nextflow run

  1. Prepare GTF for non-model virus

    • The gffread command you’re using is designed to convert GFF format files to GTF format, but it doesn’t necessarily preserve all the attribute information. The -T option enforces creation of gene_id and transcript_id attributes, which are mandatory in GTF format, and gffread takes these from the ID and Parent fields of the input GFF file, respectively.

    • The GTF format is simpler than GFF and doesn’t accommodate all the possible attributes of a GFF file. That’s why you’re seeing a reduction in information in your converted file.

    • If you need to retain all information from the GFF file, you may need to do some post-processing to add the extra attributes back into the GTF file. However, keep in mind that downstream tools which expect GTF format may not correctly handle extra attributes.

      # -- Deprecated processing for virus gtf --
      #NOT_USED, since it changed a lot!
      #gffread X14112.1.gff -T -o X14112.1.gtf
      cp X14112.1.gff3 X14112.1.gff3_backup
      grep "^##" X14112.1.gff3 > X14112.1_gene.gff3
      grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff
      #!!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'! 
      #sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene_.gff # since default is --featurecounts_feature_type 'exon'
      
      # -- New processing for virus gtf --
      gffread X14112.1_orig.gff -T -o X14112.1_v2.gtf
      
      python3 add_gene_id.py  # X14112.1_v2.gtf --> X14112.1_v3.gtf
      #------------------------------------
      def add_missing_gene_id(input_gtf, output_gtf):
          """Copy a GTF file, adding a gene_id attribute to records that lack one.

          For every record whose attribute column (column 9) does not contain
          ``gene_id``, the record's ``transcript_id`` attribute is duplicated as
          ``gene_id`` and prepended to the attribute column. Records that already
          have a ``gene_id`` or have no ``transcript_id`` are written unchanged.
          Comment lines (starting with ``#``) and lines with fewer than nine
          tab-separated columns (e.g. blank lines) are copied as they are.

          Args:
              input_gtf: Path of the GTF file to read.
              output_gtf: Path of the GTF file to write (overwritten if present).
          """
          with open(input_gtf, 'r') as in_gtf, open(output_gtf, 'w') as out_gtf:
              for raw_line in in_gtf:
                  # Remove the line terminator up front so that every branch
                  # writes exactly one newline per input line.
                  line = raw_line.rstrip('\r\n')
                  elements = line.strip().split('\t')
                  if line.startswith('#') or len(elements) < 9:
                      out_gtf.write(line + '\n')
                      continue
                  attributes = elements[8]
                  if 'gene_id' not in attributes:
                      # Look for the attribute whose key is exactly transcript_id
                      # (not e.g. orig_transcript_id).
                      transcript_id = ''
                      for attr in attributes.split(';'):
                          attr = attr.strip()
                          if attr.startswith('transcript_id'):
                              transcript_id = attr
                              break
                      # Prepend transcript_id as gene_id if not empty; only the
                      # attribute key is renamed, the value stays untouched.
                      if transcript_id != '':
                          gene_attr = transcript_id.replace("transcript_id", "gene_id", 1)
                          attributes = f'{gene_attr}; {attributes}'
                  elements[8] = attributes
                  out_gtf.write('\t'.join(elements) + '\n')
      # Use the function
      input_gtf = 'X14112.1_v2.gtf'  # Path to your input GTF
      output_gtf = 'X14112.1_v3.gtf'  # Path to the output GTF
      add_missing_gene_id(input_gtf, output_gtf)
    • Human herpesvirus 1, also known as Herpes Simplex Virus type 1 (HSV-1), is a virus with a complex genome encoding around 70-80 genes. The number of genes can vary slightly depending on the specific strain of HSV-1, as well as the methodologies used to identify and annotate the genes.

    • IE175, also known as ICP4 (Infected Cell Polypeptide 4), is a protein encoded by the Human herpesvirus 1 (HSV-1). The gene for this protein is also referred to as the IE (immediate early) gene 3, and the protein it encodes is a major regulatory protein.

    • In the lifecycle of HSV-1, immediate early genes are the first set of genes to be transcribed following infection. The proteins produced from these genes then regulate the expression of early and late genes that are involved in viral DNA replication and the production of viral structural proteins.

    • ICP4, in particular, is essential for the onset of viral replication. It acts as a trans-activator, promoting transcription of other viral genes. It can also interact with host cell proteins and influence host gene expression. As a result of these functions, ICP4 plays a key role in the pathogenesis of HSV-1 infection.

    • Please note that the naming convention for viral genes and proteins can sometimes be inconsistent, with multiple names referring to the same gene or protein. IE175, ICP4, and IE gene 3 all refer to the same gene in HSV-1.

      # Delete the records if they are intron or manually add gene_name to the records without gene_name. 
      
      cp X14112.1_v3.gtf X14112.1_v4.gtf
      #Find all records without "gene_name"
      grep -v "gene_name" X14112.1_v4.gtf
      
      #-->Delete intron records: grep "intron" X14112.1.gff3_orig
      DEL X14112.1        EMBL    transcript      4953    6907    .       -       .       transcript_id "id-X14112.1:4953..6907"; gene_id "id-X14112.1:4953..6907"
      DEL X14112.1        EMBL    exon    4953    6907    .       -       .       gene_id "id-X14112.1:4953..6907"; transcript_id "id-X14112.1:4953..6907";
      DEL X14112.1        EMBL    transcript      132374  132539  .       +       .       transcript_id "id-X14112.1:132374..132539"; gene_id "id-X14112.1:132374..132539"
      DEL X14112.1        EMBL    exon    132374  132539  .       +       .       gene_id "id-X14112.1:132374..132539"; transcript_id "id-X14112.1:132374..132539";
      DEL X14112.1        EMBL    transcript      145649  145860  .       -       .       transcript_id "id-X14112.1:145649..145860"; gene_id "id-X14112.1:145649..145860"
      DEL X14112.1        EMBL    exon    145649  145860  .       -       .       gene_id "id-X14112.1:145649..145860"; transcript_id "id-X14112.1:145649..145860";
      
      # or update: grep "146805" X14112.1_orig.gff
      UPDATE X14112.1        EMBL    transcript      146805  151063  .       +       .       transcript_id "rna-X14112.1:146805..151063"; gene_id "rna-X14112.1:146805..151063"
      UPDATE X14112.1        EMBL    exon    146805  151063  .       +       .       gene_id "rna-X14112.1:146805..151063"; transcript_id "rna-X14112.1:146805..151063";
                                                                                        --> transcript_id "rna-IE175"; gene_id "gene-IE175"; gene_name "IE175";                                                  --> transcript_id "rna-IE175"; gene_id "gene-IE175"; gene_name "IE175";
      
      # or update: grep "133941" X14112.1_orig.gff
      UPDATE X14112.1        EMBL    transcript      133941  146107  .       -       .       transcript_id "rna-X14112.1:133941..146107"; gene_id "rna-X14112.1:133941..146107"
      UPDATE X14112.1        EMBL    exon    133941  145648  .       -       .       gene_id "rna-X14112.1:133941..146107"; transcript_id "rna-X14112.1:133941..146107";
      UPDATE X14112.1        EMBL    exon    145861  146107  .       -       .       gene_id "rna-X14112.1:133941..146107"; transcript_id "rna-X14112.1:133941..146107";
                                                                                        --> transcript_id "rna-IE68"; gene_id "rna-IE68"; gene_name "IE68";
                                                                                        --> gene_id "rna-IE68"; transcript_id "rna-IE68"; gene_name "IE68";
                                                                                        --> gene_id "rna-IE68"; transcript_id "rna-IE68"; gene_name "IE68";
    • (Optional) Consider giving every exon and CDS record a distinct name, for example exon-RL2-1, exon-RL2-2, cds-RL2-1. This may not be necessary, since the output contains only transcript-type records.

  2. Run nextflow for virus

    docker pull nfcore/rnaseq
    /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_virus    --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf"   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq -profile docker -resume --max_cpus 55 --max_memory 120.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'hisat2' --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon' --umitools_grouping_method 'unique'
  3. Run nextflow for human using GRCh38 genome

    docker pull nfcore/rnaseq
    /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq -profile docker -resume --max_cpus 55 --max_memory 128.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'star_salmon' --pseudo_aligner 'salmon' --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'exon' --umitools_grouping_method 'unique'

3.1. BUG_1 for running d8_r1 due to memory

  # in modules/nf-core/umitools/dedup/main.nf
  process UMITOOLS_DEDUP {
      tag "$meta.id"
      //REMOVED  label "process_medium"
      //ADDED
      label 'high_memory' // this needs to be defined in your config file
      cpus 55 // adjust as per your system's capabilities

  ERROR ~ Module compilation error
  - file : /mnt/h1/jhuang/DATA/Data_Manja_RNAseq_Organoids/rnaseq/./workflows/../subworkflows/nf-core/bam_dedup_stats_samtools_umitools/../../../modules/nf-core/umitools/dedup/main.nf
  - cause: Unexpected character: '#' @ line 3, column 5.
        #label "process_medium"
      ^

3.2. BUG_2 for running d8_r1 due to memory

  # in conf/test_full.config
  process {
    //ADDED
    withLabel: 'high_memory' {
      memory = '120 GB' // adjust as per your system's capabilities
    }
    withName: 'UMITOOLS_DEDUP' {
      time = '160.h' // Adjust the time limit to your needs
    }
  }

  ERROR ~ Error executing process > 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP (control_r2)'
  Caused by:
    Process requirement exceeds available memory -- req: 128 GB; avail: 125.8 GB
  Command executed:
    PYTHONHASHSEED=0 umi_tools \
        dedup \
        -I control_r2.transcriptome.sorted.bam \
        -S control_r2.umi_dedup.transcriptome.sorted.bam \
        --output-stats control_r2.umi_dedup.transcriptome.sorted \
        --method='unique' --random-seed=100
    cat <<-END_VERSIONS > versions.yml
    "NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP":
        umitools: $(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *$//')
    END_VERSIONS
  Command exit status:
  1. R-code for evaluation of nextflow outputs

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    
    library(tximport)
    library(DESeq2)
    
    setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique_9samples/star_salmon")
    
    # Salmon quantification file of every sample: ./<sample>/quant.sf, named by
    # the sample id (this run has nine samples; HSV.d8 has only replicate r2).
    sample_ids <- c("control_r1", "control_r2",
                    "HSV.d2_r1", "HSV.d2_r2",
                    "HSV.d4_r1", "HSV.d4_r2",
                    "HSV.d6_r1", "HSV.d6_r2",
                    "HSV.d8_r2")
    files <- setNames(paste0("./", sample_ids, "/quant.sf"), sample_ids)
    
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    
    # Design factors, one entry per sample in the order of `files`: four
    # conditions with replicates r1/r2, followed by the single HSV.d8 sample (r2).
    replicate <- factor(c(rep(c("r1", "r2"), times = 4), "r2"))
    condition <- factor(c(
      rep(c("control", "HSV.d2", "HSV.d4", "HSV.d6"), each = 2),
      "HSV.d8"
    ))
    
    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    
    # Create DESeqDataSet object
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    
    # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
    # Filter data to retain only genes with more than 2 counts > 3 across all samples
    # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    
    # Run DESeq2
    dds <- DESeq(dds)
    
    # Perform rlog transformation
    rld <- rlogTransformation(dds)
    
    # Output raw count data to a CSV file
    write.csv(counts(dds), file="transcript_counts.csv")
    
    # -- gene-level count data --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    
    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    
    # Remove the gene_name column if not needed
    tx2gene <- tx2gene[,1:2]
    
    # Import and summarize the Salmon data with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    
    # Continue with the DESeq2 workflow as before...
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    dds <- DESeq(dds)
    rld <- rlogTransformation(dds)
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    
    #TODO: why a lot of reads were removed due to the too_short?
    STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    
    dim(counts(dds))
    head(counts(dds), 10)  

WHY: there are too many “Unmapped: too short” reads in STAR when it is run through nextflow. I read the STAR manual: the default values are 0, which would mean that reads are never discarded for being too short. ASSUMPTION: the reads removed by umi_tools dedup are counted as “Unmapped: too short”. Check the intermediate BAM files.

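Indeed, you’re right that according to the STAR manual, the default settings for read length filtering parameters (--outFilterScoreMinOverLread, --outFilterMatchNminOverLread, and --outFilterMatchNmin) are set to 0, which means reads will not be discarded for being too short. Note, however, that the STAR manual lists 0 as the default only for --outFilterMatchNmin; --outFilterScoreMinOverLread and --outFilterMatchNminOverLread default to 0.66, so an alignment that covers less than 66% of the read length is reported as “Unmapped: too short”.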

The “Unmapped: too short” statistic in the STAR output usually refers to reads that didn’t map well enough to the reference genome. In other words, either the read didn’t match sufficiently to any location in the genome, or the best matches were not significantly better than other matches to be unique.

If you’re seeing a high proportion of “Unmapped: too short” reads, it could be due to several reasons:

  • The quality of the reads might be poor, leading to low mapping efficiency.
  • The reference genome might not be the correct one for your data.
  • If your data is single-end reads, it might contain a high proportion of reads shorter than the default minimum allowed length.

For troubleshooting, you could:

  • Check the quality of your reads using a tool like FastQC.

  • Ensure you’re using the correct reference genome.
  • If you’re working with single-end data, consider checking the length distribution of your reads. If many reads are shorter than the minimum length threshold, consider adjusting this threshold if appropriate.
  • If you’re working with paired-end data, make sure both of the reads in a pair pass the quality controls. STAR is sensitive to inconsistencies between paired reads.

Finally, always make sure your read preprocessing steps (like trimming for quality and adapter sequences) are performed correctly as these can greatly affect the downstream analysis.

Yersinia outer proteins (Yops) analysis

  1. This step uses rsync to download data from the NCBI server to a local directory, save all gff-files in the directory prokka.

    rsync --copy-links --recursive --times --verbose rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/696/305/GCF_001696305.1_UCN72.1 Yersinia_pestis_1045
    
    GCF_001656035.1_ASM165603v1_genomic.fna.gz
    070 status=suppressed
    jhuang@hamburg:~/DATA/Data_Gunnar_Yersiniomics$ cp data/Yersinia_pseudotuberculosis_PB1+/GCF_000020085.1_ASM2008v1/GCF_000020085.1_ASM2008v1_genomic.fna.gz assembly/Yersinia_pseudotuberculosis_PB1+.fna.gz
    cp: cannot stat 'data/Yersinia_pseudotuberculosis_PB1+/GCF_000020085.1_ASM2008v1/GCF_000020085.1_ASM2008v1_genomic.fna.gz': No such file or directory
    088
    jhuang@hamburg:~/DATA/Data_Gunnar_Yersiniomics$ cp data/Yersinia_pseudotuberculosis_YPIII/GCF_000019465.1_ASM1946v1/GCF_000019465.1_ASM1946v1_genomic.fna.gz assembly/Yersinia_pseudotuberculosis_YPIII.fna.gz
    cp: cannot stat 'data/Yersinia_pseudotuberculosis_YPIII/GCF_000019465.1_ASM1946v1/GCF_000019465.1_ASM1946v1_genomic.fna.gz': No such file or directory
    
    #status=latest
    
    for sample in Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 Yersinia_pestis_2944 Yersinia_pestis_KIM10+ Yersinia_pestis_M-1482 Yersinia_pestis_KIM5 Yersinia_pestis_C-781 Yersinia_pestis_14D Yersinia_pestis_KM_567 Yersinia_pestis_M-1770 Yersinia_pestis_C-792 Yersinia_pestis_M2086 Yersinia_pestis_Harbin_35 Yersinia_pestis_Nicholisk_41 Yersinia_pestis_Harbin_35_bis Yersinia_pestis_SCPM-O-B-5935_I-1996 Yersinia_pestis_I-1252 Yersinia_pestis_FDAARGOS_603 Yersinia_pestis_195P Yersinia_pestis_Nepal516 Yersinia_pestis_S19960127 Yersinia_pestis_SCPM-O-B-6530 Yersinia_pestis_C-783 Yersinia_pestis_A1122 Yersinia_pestis_Cadman Yersinia_pestis_A1122_bis Yersinia_pestis_CO92_pgm-_pPCP1- Yersinia_pestis_CO92 Yersinia_pestis_Shasta Yersinia_pestis_Dodson Yersinia_pestis_El_Dorado Yersinia_pestis_EV76-CN Yersinia_pestis_EV_NIIEG Yersinia_pestis_Java9 Yersinia_pestis_PBM19 Yersinia_pestis_20 Yersinia_pestis_D182038 Yersinia_pestis_D106004 Yersinia_pestis_Z176003 Yersinia_pestis_Antiqua_bis Yersinia_pestis_FDAARGOS_601 Yersinia_pestis_Antiqua Yersinia_pestis_Nairobi Yersinia_pestis_M2085 Yersinia_pestis_SCPM-O-B-5942_I-2638 Yersinia_pestis_M2029 Yersinia_pestis_SCPM-O-DNA-18_I-3113 Yersinia_pestis_94 Yersinia_pestis_R Yersinia_pestis_790 Yersinia_pestis_SCPM-O-B-6899_231 Yersinia_pestis_FDAARGOS_602 Yersinia_pestis_Pestoides_B Yersinia_pestis_M-1974 Yersinia_pestis_91001 Yersinia_pestis_Angola Yersinia_pestis_Angola_bis Yersinia_pestis_3770 Yersinia_pestis_1412 Yersinia_pestis_1413 Yersinia_pestis_8787 Yersinia_pestis_3067 Yersinia_pestis_Pestoides_G Yersinia_pestis_Pestoides_F Yersinia_pestis_Pestoides_F_bis Yersinia_pestis_1522 Yersinia_pseudotuberculosis_FDAARGOS_582 Yersinia_pseudotuberculosis_NZYP4713 Yersinia_pseudotuberculosis_NCTC8480  Yersinia_pseudotuberculosis_PB1+_bis Yersinia_pseudotuberculosis_MD67 Yersinia_pseudotuberculosis_NCTC10217 Yersinia_pseudotuberculosis_NCTC10275 Yersinia_pseudotuberculosis_1 Yersinia_pseudotuberculosis_IP32953 
Yersinia_pseudotuberculosis_IP32953_bis Yersinia_pseudotuberculosis_FDAARGOS_583 Yersinia_pseudotuberculosis_FDAARGOS_581 Yersinia_pseudotuberculosis_ATCC_6904 Yersinia_pseudotuberculosis_EP2+ Yersinia_pseudotuberculosis_IP31758 Yersinia_pseudotuberculosis_598 Yersinia_pseudotuberculosis_PA3606 Yersinia_pseudotuberculosis_FDAARGOS_665 Yersinia_pseudotuberculosis_FDAARGOS_584 Yersinia_pseudotuberculosis_YPIII_bis  Yersinia_pseudotuberculosis_FDAARGOS_579 Yersinia_pseudotuberculosis_IP2666pIB1 Yersinia_pseudotuberculosis_FDAARGOS_342 Yersinia_pseudotuberculosis_FDAARGOS_580 Yersinia_pseudotuberculosis_NCTC3571 Yersinia_similis_228 Yersinia_enterocolitica_NCTC13629 Yersinia_enterocolitica_MGYG-HGUT-02335 Yersinia_enterocolitica_Y1 Yersinia_enterocolitica_Y11 Yersinia_enterocolitica_NCTC13769 Yersinia_enterocolitica_FDAARGOS_1082 Yersinia_enterocolitica_2516-87 Yersinia_enterocolitica_KNG22703 Yersinia_enterocolitica_1055Rr Yersinia_enterocolitica_FDAARGOS_1090 Yersinia_enterocolitica_YE1 Yersinia_enterocolitica_YE3 Yersinia_enterocolitica_YE6 Yersinia_enterocolitica_YE7 Yersinia_enterocolitica_YE5 Yersinia_enterocolitica_YE165 Yersinia_enterocolitica_8081 Yersinia_enterocolitica_8081_bis Yersinia_enterocolitica_NCTC12982 Yersinia_enterocolitica_WA Yersinia_enterocolitica_NW57 Yersinia_enterocolitica_NW117 Yersinia_enterocolitica_NW51 Yersinia_enterocolitica_NW56 Yersinia_enterocolitica_NW115 Yersinia_enterocolitica_NW67 Yersinia_enterocolitica_FORC_002 Yersinia_enterocolitica_FORC_002_bis Yersinia_enterocolitica_NW66 Yersinia_enterocolitica_MP98 Yersinia_enterocolitica_Gp259 Yersinia_enterocolitica_FORC066 Yersinia_enterocolitica_Gp2 Yersinia_enterocolitica_str_YE5303 Yersinia_enterocolitica_Gp200 Yersinia_enterocolitica_NW116 Yersinia_enterocolitica_Gp169 Yersinia_enterocolitica_NW1 Yersinia_enterocolitica_FORC065 Yersinia_frederiksenii_Y225 Yersinia_kristensenii_Y231 Yersinia_rochesterensis_ATCC_33639 Yersinia_rochesterensis_ATCC_BAA-2637 
Yersinia_intermedia_SCPM-O-B-9106_C-191 Yersinia_kristensenii_2012N-4030 Yersinia_hibernica_CFS1934 Yersinia_hibernica_LC20 Yersinia_canariae_NCTC_14382 Yersinia_frederiksenii_FDAARGOS_418 Yersinia_alsatica_SCPM-O-B-7604 Yersinia_rohdei_YRA Yersinia_massiliensis_GTA Yersinia_massiliensis_2011N-4075 Yersinia_frederiksenii_FDAARGOS_417 Yersinia_intermedia_SCPM-O-B-8026_C-146 Yersinia_sp_KBS0713 Yersinia_bercovieri_ATCC_43970 Yersinia_aleksiciae_159 Yersinia_mollaretii_ATCC_43969 Yersinia_intermedia_FDAARGOS_729 Yersinia_intermedia_FDAARGOS_730 Yersinia_intermedia_NCTC11469 Yersinia_intermedia_FDAARGOS_358 Yersinia_sp_FDAARGOS_228 Yersinia_intermedia_Y228 Yersinia_intermedia_N6293 Yersinia_intermedia_SCPM-O-B-10209_333 Yersinia_aldovae_670-83 Yersinia_ruckeri_NHV_3758 Yersinia_ruckeri_NVI-10705 Yersinia_ruckeri_NVI-1292 Yersinia_ruckeri_NVI-4570 Yersinia_ruckeri_NVI-6614 Yersinia_ruckeri_NVI-11267 Yersinia_ruckeri_NVI-11294 Yersinia_ruckeri_NVI-10571 Yersinia_ruckeri_NVI-8524 Yersinia_ruckeri_NVI-1176 Yersinia_ruckeri_NVI-701 Yersinia_ruckeri_17Y0412 Yersinia_ruckeri_17Y0414 Yersinia_ruckeri_NVI-492 Yersinia_ruckeri_NVI-9681 Yersinia_ruckeri_SC09 Yersinia_ruckeri_17Y0157 Yersinia_ruckeri_17Y0189 Yersinia_ruckeri_17Y0153 Yersinia_ruckeri_17Y0155 Yersinia_ruckeri_KMM821 Yersinia_ruckeri_16Y0180 Yersinia_ruckeri_NVI-11050 Yersinia_ruckeri_NVI-11076 Yersinia_ruckeri_QMA0440 Yersinia_ruckeri_Big_Creek_74 Yersinia_ruckeri_NVI-5089 Yersinia_ruckeri_NVI-10587 Yersinia_ruckeri_NVI-4840 Yersinia_ruckeri_NVI-4479 Yersinia_ruckeri_17Y0161 Yersinia_ruckeri_17Y0163 Yersinia_ruckeri_NVI-11073 Yersinia_ruckeri_NVI-11065 Yersinia_ruckeri_17Y0159 Yersinia_ruckeri_NVI-8270 Yersinia_ruckeri_YRB Yersinia_entomophaga_MH96; do
    mlst ${sample}.fna >> ../mlst/all.txt;
    done
    
    #gene-M486_RS20950
    #M486_RS20950
    
    #extract CDS with locus_tag from genbank file

        #cut -d' ' -f1 ../assembly/${sample}.fna > ../assembly/${sample}.fasta; #cat ${sample}.gff ../assembly/${sample}.fasta > ../prokka_plus/$(echo $sample | cut -d'_' -f3- | tr " " "_").gff; #sed -i 's/###/##FASTA/g' ../prokka_plus/$(echo $sample | cut -d'_' -f3- | tr " " "_").gff;

  2. (Important: only with this modification can we track the gene IDs.) This step processes the GFF files containing the gene annotations for the samples in the directory prokka. The primary goal is to modify the GFF files and to save the modified copies in the directory prokka_plus. The script operates on each sample one by one, and for each sample it performs the following steps:

    * Replace all occurrences of \tCDS\t with _CDS_ in the original GFF file.
    * Extract all lines containing _CDS_ and save them in a new file with the suffix _CDS.gff.
    * Replace all occurrences of ID= with ID_old= in the new _CDS.gff file.
    * Cut the second field (delimited by ;) from the _CDS.gff file and save it in a new file with the suffix _CDS_f2.
    * Replace all occurrences of Parent=gene- with ID= in the _CDS_f2 file.
    * Paste the contents of the _CDS.gff and _CDS_f2 files side by side, with a ; delimiter, and save the result in a new file with the suffix _CDS_.gff.
    * Run the enum.py script (listed below) on the _CDS_.gff file to append the line number, as a _N suffix, to the end of every line so that each new ID is unique, and save the result in a new file with the suffix _CDS__.gff.
        # enum.py -- append "_<line number>" (1-based) to every line of a file.
        # Usage: python enum.py INPUT > OUTPUT
        #
        # The numbered lines go to stdout. Diagnostics go to stderr and failures
        # exit with status 1, because the caller redirects stdout into the new
        # GFF file and an error message must never end up in that file.
        #
        # NOTE(review): the file name enum.py can shadow Python's standard-library
        # "enum" module for other scripts located in the same directory;
        # consider renaming it.
        import sys

        if len(sys.argv) < 2:
            print("Please provide a filename as an argument.", file=sys.stderr)
            sys.exit(1)

        filename = sys.argv[1]

        try:
            with open(filename) as f:
                # start=1 yields 1-based line numbers.
                for i, line in enumerate(f, start=1):
                    # strip() removes the newline (and any surrounding whitespace)
                    # before the suffix is attached.
                    print(f"{line.strip()}_{i}")
        except FileNotFoundError:
            print(f"File {filename} not found.", file=sys.stderr)
            sys.exit(1)
    * Extract all lines from the original GFF file that do not contain _CDS_ and save them in a new file with the suffix _nonCDS.gff.
    * Remove all lines containing ### from the _nonCDS.gff file and save the result in a new file with the suffix _nonCDS_.gff.
    * Concatenate the contents of the _nonCDS_.gff and _CDS__.gff files and save the result in a new file with the suffix _nonCDS_CDS.gff.
    * Replace all occurrences of _CDS_ with \tCDS\t in the _nonCDS_CDS.gff file.
    * Append the string ##FASTA to the end of the _nonCDS_CDS.gff file.
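    * Create a cleaned copy of the sample's FASTA file (../assembly/${sample}.fasta) by keeping only the first space-delimited field of every line of ../assembly/${sample}.fna; this cuts each FASTA header at the first space (lines without a space are kept unchanged).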
    * Concatenate the modified GFF file (_nonCDS_CDS.gff) and the cleaned FASTA file, and save the result in the ../prokka_plus/ directory under the sample name without its first two underscore-separated fields (genus and species), e.g. Yersinia_pestis_1045 becomes 1045.gff.
    * After processing all samples, the script removes intermediate files generated during the process.
    
    # ERROR: Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: /mnt/Samsung_T5/Data_Gunnar_Yersiniomics/prokka_plus/1045.gff
    # To fix the error above, process the data as follows.
    
    # Rewrite every sample's GFF so that each CDS gets a unique ID derived from its
    # parent gene, then append the assembly FASTA and store the result in ../prokka_plus/.
    # Run from the directory that holds the ${sample}.gff files and enum.py.
    mkdir -p ../prokka_plus

    # Full sample set. The word list is continued with backslashes so that the shell
    # reads it as ONE `for` header; exactly one header may be active for the single `done`.
    for sample in Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 Yersinia_pestis_2944 Yersinia_pestis_KIM10+ Yersinia_pestis_M-1482 Yersinia_pestis_KIM5 Yersinia_pestis_C-781 Yersinia_pestis_14D Yersinia_pestis_KM_567 Yersinia_pestis_M-1770 Yersinia_pestis_C-792 Yersinia_pestis_M2086 Yersinia_pestis_Harbin_35 Yersinia_pestis_Nicholisk_41 Yersinia_pestis_Harbin_35_bis Yersinia_pestis_SCPM-O-B-5935_I-1996 Yersinia_pestis_I-1252 Yersinia_pestis_FDAARGOS_603 Yersinia_pestis_195P Yersinia_pestis_Nepal516 Yersinia_pestis_S19960127 Yersinia_pestis_SCPM-O-B-6530 Yersinia_pestis_C-783 Yersinia_pestis_A1122 Yersinia_pestis_Cadman Yersinia_pestis_A1122_bis Yersinia_pestis_CO92_pgm-_pPCP1- Yersinia_pestis_CO92 Yersinia_pestis_Shasta Yersinia_pestis_Dodson Yersinia_pestis_El_Dorado Yersinia_pestis_EV76-CN Yersinia_pestis_EV_NIIEG Yersinia_pestis_Java9 Yersinia_pestis_PBM19 Yersinia_pestis_20 Yersinia_pestis_D182038 Yersinia_pestis_D106004 Yersinia_pestis_Z176003 Yersinia_pestis_Antiqua_bis Yersinia_pestis_FDAARGOS_601 Yersinia_pestis_Antiqua Yersinia_pestis_Nairobi Yersinia_pestis_M2085 Yersinia_pestis_SCPM-O-B-5942_I-2638 Yersinia_pestis_M2029 Yersinia_pestis_SCPM-O-DNA-18_I-3113 Yersinia_pestis_94 Yersinia_pestis_R Yersinia_pestis_790 Yersinia_pestis_SCPM-O-B-6899_231 Yersinia_pestis_FDAARGOS_602 Yersinia_pestis_Pestoides_B Yersinia_pestis_M-1974 Yersinia_pestis_91001 Yersinia_pestis_Angola Yersinia_pestis_Angola_bis Yersinia_pestis_3770 Yersinia_pestis_1412 Yersinia_pestis_1413 Yersinia_pestis_8787 Yersinia_pestis_3067 Yersinia_pestis_Pestoides_G Yersinia_pestis_Pestoides_F Yersinia_pestis_Pestoides_F_bis Yersinia_pestis_1522 Yersinia_pseudotuberculosis_FDAARGOS_582 Yersinia_pseudotuberculosis_NZYP4713 Yersinia_pseudotuberculosis_NCTC8480  Yersinia_pseudotuberculosis_PB1+_bis Yersinia_pseudotuberculosis_MD67 Yersinia_pseudotuberculosis_NCTC10217 Yersinia_pseudotuberculosis_NCTC10275 Yersinia_pseudotuberculosis_1 Yersinia_pseudotuberculosis_IP32953 \
        Yersinia_pseudotuberculosis_IP32953_bis Yersinia_pseudotuberculosis_FDAARGOS_583 Yersinia_pseudotuberculosis_FDAARGOS_581 Yersinia_pseudotuberculosis_ATCC_6904 Yersinia_pseudotuberculosis_EP2+ Yersinia_pseudotuberculosis_IP31758 Yersinia_pseudotuberculosis_598 Yersinia_pseudotuberculosis_PA3606 Yersinia_pseudotuberculosis_FDAARGOS_665 Yersinia_pseudotuberculosis_FDAARGOS_584 Yersinia_pseudotuberculosis_YPIII_bis  Yersinia_pseudotuberculosis_FDAARGOS_579 Yersinia_pseudotuberculosis_IP2666pIB1 Yersinia_pseudotuberculosis_FDAARGOS_342 Yersinia_pseudotuberculosis_FDAARGOS_580 Yersinia_pseudotuberculosis_NCTC3571 Yersinia_similis_228 Yersinia_enterocolitica_NCTC13629 Yersinia_enterocolitica_MGYG-HGUT-02335 Yersinia_enterocolitica_Y1 Yersinia_enterocolitica_Y11 Yersinia_enterocolitica_NCTC13769 Yersinia_enterocolitica_FDAARGOS_1082 Yersinia_enterocolitica_2516-87 Yersinia_enterocolitica_KNG22703 Yersinia_enterocolitica_1055Rr Yersinia_enterocolitica_FDAARGOS_1090 Yersinia_enterocolitica_YE1 Yersinia_enterocolitica_YE3 Yersinia_enterocolitica_YE6 Yersinia_enterocolitica_YE7 Yersinia_enterocolitica_YE5 Yersinia_enterocolitica_YE165 Yersinia_enterocolitica_8081 Yersinia_enterocolitica_8081_bis Yersinia_enterocolitica_NCTC12982 Yersinia_enterocolitica_WA Yersinia_enterocolitica_NW57 Yersinia_enterocolitica_NW117 Yersinia_enterocolitica_NW51 Yersinia_enterocolitica_NW56 Yersinia_enterocolitica_NW115 Yersinia_enterocolitica_NW67 Yersinia_enterocolitica_FORC_002 Yersinia_enterocolitica_FORC_002_bis Yersinia_enterocolitica_NW66 Yersinia_enterocolitica_MP98 Yersinia_enterocolitica_Gp259 Yersinia_enterocolitica_FORC066 Yersinia_enterocolitica_Gp2 Yersinia_enterocolitica_str_YE5303 Yersinia_enterocolitica_Gp200 Yersinia_enterocolitica_NW116 Yersinia_enterocolitica_Gp169 Yersinia_enterocolitica_NW1 Yersinia_enterocolitica_FORC065 Yersinia_frederiksenii_Y225 Yersinia_kristensenii_Y231 Yersinia_rochesterensis_ATCC_33639 Yersinia_rochesterensis_ATCC_BAA-2637 \
        Yersinia_intermedia_SCPM-O-B-9106_C-191 Yersinia_kristensenii_2012N-4030 Yersinia_hibernica_CFS1934 Yersinia_hibernica_LC20 Yersinia_canariae_NCTC_14382 Yersinia_frederiksenii_FDAARGOS_418 Yersinia_alsatica_SCPM-O-B-7604 Yersinia_rohdei_YRA Yersinia_massiliensis_GTA Yersinia_massiliensis_2011N-4075 Yersinia_frederiksenii_FDAARGOS_417 Yersinia_intermedia_SCPM-O-B-8026_C-146 Yersinia_sp_KBS0713 Yersinia_bercovieri_ATCC_43970 Yersinia_aleksiciae_159 Yersinia_mollaretii_ATCC_43969 Yersinia_intermedia_FDAARGOS_729 Yersinia_intermedia_FDAARGOS_730 Yersinia_intermedia_NCTC11469 Yersinia_intermedia_FDAARGOS_358 Yersinia_sp_FDAARGOS_228 Yersinia_intermedia_Y228 Yersinia_intermedia_N6293 Yersinia_intermedia_SCPM-O-B-10209_333 Yersinia_aldovae_670-83 Yersinia_ruckeri_NHV_3758 Yersinia_ruckeri_NVI-10705 Yersinia_ruckeri_NVI-1292 Yersinia_ruckeri_NVI-4570 Yersinia_ruckeri_NVI-6614 Yersinia_ruckeri_NVI-11267 Yersinia_ruckeri_NVI-11294 Yersinia_ruckeri_NVI-10571 Yersinia_ruckeri_NVI-8524 Yersinia_ruckeri_NVI-1176 Yersinia_ruckeri_NVI-701 Yersinia_ruckeri_17Y0412 Yersinia_ruckeri_17Y0414 Yersinia_ruckeri_NVI-492 Yersinia_ruckeri_NVI-9681 Yersinia_ruckeri_SC09 Yersinia_ruckeri_17Y0157 Yersinia_ruckeri_17Y0189 Yersinia_ruckeri_17Y0153 Yersinia_ruckeri_17Y0155 Yersinia_ruckeri_KMM821 Yersinia_ruckeri_16Y0180 Yersinia_ruckeri_NVI-11050 Yersinia_ruckeri_NVI-11076 Yersinia_ruckeri_QMA0440 Yersinia_ruckeri_Big_Creek_74 Yersinia_ruckeri_NVI-5089 Yersinia_ruckeri_NVI-10587 Yersinia_ruckeri_NVI-4840 Yersinia_ruckeri_NVI-4479 Yersinia_ruckeri_17Y0161 Yersinia_ruckeri_17Y0163 Yersinia_ruckeri_NVI-11073 Yersinia_ruckeri_NVI-11065 Yersinia_ruckeri_17Y0159 Yersinia_ruckeri_NVI-8270 Yersinia_ruckeri_YRB Yersinia_entomophaga_MH96; do
    # Quick test on a small subset: use this header INSTEAD of the full one above.
    #for sample in Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 Yersinia_pestis_2944 Yersinia_pestis_KIM10+ Yersinia_pestis_M-1482; do
      # Mask the CDS feature type so that CDS rows can be selected with a plain grep.
      # NOTE: sed -i rewrites ${sample}.gff in place; the _CDS_ marker stays in that file.
      sed -i 's/\tCDS\t/_CDS_/g' "${sample}.gff"
      grep "_CDS_" "${sample}.gff" > "${sample}_CDS.gff"
      # Keep the original ID as ID_old; the new ID is built from the second
      # ';'-separated field (assumed to be the Parent=gene-... attribute).
      sed -i 's/ID=/ID_old=/g' "${sample}_CDS.gff"
      cut -d';' -f2 "${sample}_CDS.gff" > "${sample}_CDS_f2"
      sed -i 's/Parent=gene-/ID=/g' "${sample}_CDS_f2"
      paste -d';' "${sample}_CDS.gff" "${sample}_CDS_f2" > "${sample}_CDS_.gff"
      # Append "_<line number>" to every line so that no two CDS rows share the same ID.
      python enum.py "${sample}_CDS_.gff" > "${sample}_CDS__.gff"

      # All remaining rows, without the '###' separator lines.
      grep -v "_CDS_" "${sample}.gff" > "${sample}_nonCDS.gff"
      grep -v "###" "${sample}_nonCDS.gff" > "${sample}_nonCDS_.gff"

      # Merge, restore the real CDS feature type and open the FASTA section.
      cat "${sample}_nonCDS_.gff" "${sample}_CDS__.gff" > "${sample}_nonCDS_CDS.gff"
      sed -i 's/_CDS_/\tCDS\t/g' "${sample}_nonCDS_CDS.gff"
      echo "##FASTA" >> "${sample}_nonCDS_CDS.gff"

      # Keep only the first space-delimited field of each FASTA line (cuts the headers).
      cut -d' ' -f1 "../assembly/${sample}.fna" > "../assembly/${sample}.fasta"
      # Output name = sample name without its first two '_'-separated fields
      # (e.g. Yersinia_pestis_1045 -> 1045). Sample names contain no spaces, so the
      # former `tr " " "_"` step was a no-op and has been dropped.
      out_name=$(echo "${sample}" | cut -d'_' -f3-)
      cat "${sample}_nonCDS_CDS.gff" "../assembly/${sample}.fasta" > "../prokka_plus/${out_name}.gff"
    done

    # Remove the intermediate files (*_CDS.gff also matches the *_nonCDS_CDS.gff files).
    rm -f *_CDS.gff *_CDS_f2 *_CDS_.gff *_CDS__.gff *_nonCDS.gff *_nonCDS_.gff
    
    #for sample in Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 ...; do
    #echo $sample | cut -d'_' -f3- | tr " " "_" >> temp
    #done
  3. After the standard run of the bacto-pipeline, we run Roary in this step. Roary is a tool for pan-genome analysis: it takes annotated bacterial genomes in GFF3 format as input and clusters the genes based on sequence similarity.

    # Roary options used below:
    #   -p 4      number of threads
    #   -f DIR    output directory
    #   -i N      minimum percentage identity for blastp
    #   -cd 99    percentage of isolates a gene must be in to be called core
    #   -s        do not split paralogs
    #   -e -n     create a core-gene alignment (-n: fast alignment with MAFFT)
    #   -v        verbose output
    # NOTE(review): both runs pass -f ./roary; confirm how Roary handles an output
    # directory that already exists, so that the next step reads the results of the full run.

    # Quick test run on four genomes.
    roary -p 4 -f ./roary -i 95 -cd 99 -s -e -n -v  prokka_plus/1045.gff prokka_plus/SCPM-O-B-6291_C-25.gff prokka_plus/2944.gff prokka_plus/KIM10+.gff

    # Full run on all genomes. The argument list is continued with backslashes so
    # that the shell passes it to a single roary call.
    roary -p 4 -f ./roary -i 50 -cd 99 -s -e -n -v  prokka_plus/1045.gff prokka_plus/SCPM-O-B-6291_C-25.gff prokka_plus/2944.gff prokka_plus/KIM10+.gff prokka_plus/M-1482.gff prokka_plus/KIM5.gff prokka_plus/C-781.gff prokka_plus/14D.gff prokka_plus/KM_567.gff prokka_plus/M-1770.gff prokka_plus/C-792.gff prokka_plus/M2086.gff prokka_plus/Harbin_35.gff prokka_plus/Nicholisk_41.gff prokka_plus/Harbin_35_bis.gff prokka_plus/SCPM-O-B-5935_I-1996.gff prokka_plus/I-1252.gff prokka_plus/FDAARGOS_603.gff prokka_plus/195P.gff prokka_plus/Nepal516.gff prokka_plus/S19960127.gff prokka_plus/SCPM-O-B-6530.gff prokka_plus/C-783.gff prokka_plus/A1122.gff prokka_plus/Cadman.gff prokka_plus/A1122_bis.gff prokka_plus/CO92_pgm-_pPCP1-.gff prokka_plus/CO92.gff prokka_plus/Shasta.gff prokka_plus/Dodson.gff prokka_plus/El_Dorado.gff prokka_plus/EV76-CN.gff prokka_plus/EV_NIIEG.gff prokka_plus/Java9.gff prokka_plus/PBM19.gff prokka_plus/20.gff prokka_plus/D182038.gff prokka_plus/D106004.gff prokka_plus/Z176003.gff prokka_plus/Antiqua_bis.gff prokka_plus/FDAARGOS_601.gff prokka_plus/Antiqua.gff prokka_plus/Nairobi.gff prokka_plus/M2085.gff prokka_plus/SCPM-O-B-5942_I-2638.gff prokka_plus/M2029.gff prokka_plus/SCPM-O-DNA-18_I-3113.gff prokka_plus/94.gff prokka_plus/R.gff prokka_plus/790.gff prokka_plus/SCPM-O-B-6899_231.gff prokka_plus/FDAARGOS_602.gff prokka_plus/Pestoides_B.gff prokka_plus/M-1974.gff prokka_plus/91001.gff prokka_plus/Angola.gff prokka_plus/Angola_bis.gff prokka_plus/3770.gff prokka_plus/1412.gff prokka_plus/1413.gff prokka_plus/8787.gff prokka_plus/3067.gff prokka_plus/Pestoides_G.gff prokka_plus/Pestoides_F.gff prokka_plus/Pestoides_F_bis.gff prokka_plus/1522.gff prokka_plus/FDAARGOS_582.gff prokka_plus/NZYP4713.gff prokka_plus/NCTC8480.gff prokka_plus/PB1+_bis.gff prokka_plus/MD67.gff prokka_plus/NCTC10217.gff prokka_plus/NCTC10275.gff prokka_plus/1.gff prokka_plus/IP32953.gff prokka_plus/IP32953_bis.gff prokka_plus/FDAARGOS_583.gff prokka_plus/FDAARGOS_581.gff \
      prokka_plus/ATCC_6904.gff prokka_plus/EP2+.gff prokka_plus/IP31758.gff prokka_plus/598.gff prokka_plus/PA3606.gff prokka_plus/FDAARGOS_665.gff prokka_plus/FDAARGOS_584.gff prokka_plus/YPIII_bis.gff prokka_plus/FDAARGOS_579.gff prokka_plus/IP2666pIB1.gff prokka_plus/FDAARGOS_342.gff prokka_plus/FDAARGOS_580.gff prokka_plus/NCTC3571.gff prokka_plus/228.gff prokka_plus/NCTC13629.gff prokka_plus/MGYG-HGUT-02335.gff prokka_plus/Y1.gff prokka_plus/Y11.gff prokka_plus/NCTC13769.gff prokka_plus/FDAARGOS_1082.gff prokka_plus/2516-87.gff prokka_plus/KNG22703.gff prokka_plus/1055Rr.gff prokka_plus/FDAARGOS_1090.gff prokka_plus/YE1.gff prokka_plus/YE3.gff prokka_plus/YE6.gff prokka_plus/YE7.gff prokka_plus/YE5.gff prokka_plus/YE165.gff prokka_plus/8081.gff prokka_plus/8081_bis.gff prokka_plus/NCTC12982.gff prokka_plus/WA.gff prokka_plus/NW57.gff prokka_plus/NW117.gff prokka_plus/NW51.gff prokka_plus/NW56.gff prokka_plus/NW115.gff prokka_plus/NW67.gff prokka_plus/FORC_002.gff prokka_plus/FORC_002_bis.gff prokka_plus/NW66.gff prokka_plus/MP98.gff prokka_plus/Gp259.gff prokka_plus/FORC066.gff prokka_plus/Gp2.gff prokka_plus/str_YE5303.gff prokka_plus/Gp200.gff prokka_plus/NW116.gff prokka_plus/Gp169.gff prokka_plus/NW1.gff prokka_plus/FORC065.gff prokka_plus/Y225.gff prokka_plus/Y231.gff prokka_plus/ATCC_33639.gff prokka_plus/ATCC_BAA-2637.gff prokka_plus/SCPM-O-B-9106_C-191.gff prokka_plus/2012N-4030.gff prokka_plus/CFS1934.gff prokka_plus/LC20.gff prokka_plus/NCTC_14382.gff prokka_plus/FDAARGOS_418.gff prokka_plus/SCPM-O-B-7604.gff prokka_plus/YRA.gff prokka_plus/GTA.gff prokka_plus/2011N-4075.gff prokka_plus/FDAARGOS_417.gff prokka_plus/SCPM-O-B-8026_C-146.gff prokka_plus/KBS0713.gff prokka_plus/ATCC_43970.gff prokka_plus/159.gff prokka_plus/ATCC_43969.gff prokka_plus/FDAARGOS_729.gff prokka_plus/FDAARGOS_730.gff prokka_plus/NCTC11469.gff prokka_plus/FDAARGOS_358.gff prokka_plus/FDAARGOS_228.gff prokka_plus/Y228.gff prokka_plus/N6293.gff prokka_plus/SCPM-O-B-10209_333.gff \
      prokka_plus/670-83.gff prokka_plus/NHV_3758.gff prokka_plus/NVI-10705.gff prokka_plus/NVI-1292.gff prokka_plus/NVI-4570.gff prokka_plus/NVI-6614.gff prokka_plus/NVI-11267.gff prokka_plus/NVI-11294.gff prokka_plus/NVI-10571.gff prokka_plus/NVI-8524.gff prokka_plus/NVI-1176.gff prokka_plus/NVI-701.gff prokka_plus/17Y0412.gff prokka_plus/17Y0414.gff prokka_plus/NVI-492.gff prokka_plus/NVI-9681.gff prokka_plus/SC09.gff prokka_plus/17Y0157.gff prokka_plus/17Y0189.gff prokka_plus/17Y0153.gff prokka_plus/17Y0155.gff prokka_plus/KMM821.gff prokka_plus/16Y0180.gff prokka_plus/NVI-11050.gff prokka_plus/NVI-11076.gff prokka_plus/QMA0440.gff prokka_plus/Big_Creek_74.gff prokka_plus/NVI-5089.gff prokka_plus/NVI-10587.gff prokka_plus/NVI-4840.gff prokka_plus/NVI-4479.gff prokka_plus/17Y0161.gff prokka_plus/17Y0163.gff prokka_plus/NVI-11073.gff prokka_plus/NVI-11065.gff prokka_plus/17Y0159.gff prokka_plus/NVI-8270.gff prokka_plus/YRB.gff prokka_plus/MH96.gff
    
    #DEL makeblastdb -in fna -dbtype 'nucl' -out fna.db 
    #DELblastn -db fna.db -query yopK.fasta -out yopK_on_fna.blastn -evalue 10000  -num_threads 15 -outfmt 6 
  4. Generate yop*_seq.txt from the Roary results: this step extracts the coding sequences (CDS) of specific genes from multiple genome files and saves them to an output file. Input files: roary/pan_genome_reference.fa and roary/gene_presence_absence.csv. The example below is for yopM.

    grep "yopM" roary/gene_presence_absence.csv
    #6+19+45=70 --> 71
    "yopM","","type III secretion system effector YopM","45","45","1","","","","","","1229","1229","1229","","M486_RS20920_3990","","M479_RS01070_4055","M480_RS01170_4076","","M481_RS01115_4071","","","","","","","","","","","","","LDH65_RS21345_4177","","","","","M478_RS01000_4055","M482_RS01070_4063","M483_RS00915_4013","","","M477_RS21610_4128","","","M484_RS01125_4011","","LDH63_RS21760_4259","","","","","","","","","","YPA_RS22550_4200","CH58_RS00945_4248","","","","","","YPO_RS00170_4130","AK38_RS00930_4114","BAY22_RS21640_4174","YPD4_RS21505_4104","YPD8_RS21525_4060","CH61_RS00195_4143","BZ20_RS00435_4174","M0M60_RS21870_4286","","CH46_RS00070_4122","","","","","","","","EGX53_RS00030_4033","EGX52_RS00260_4348","","","","","EGX46_RS00245_4205","","EGX74_RS00040_4070","","","","","","","","","","","","","YPC_RS21075_4024","CH55_RS00770_4123","","DN756_RS21785_4075","","","","CH62_RS00690_4176","","","CH44_RS00795_4078","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","CH63_RS00700_4106","","","CH59_RS00970_4231","","YPDSF_RS21140_4036","BZ18_RS00325_4042","CH43_RS00040_3994","","LDH64_RS21810_4270","S96127_RS00100_4096","","","GCK71_RS22420_4113","GD372_RS22475_4112","","DJY80_RS22415_4098","GCK69_RS22480_4113","","","","GCK70_RS22160_4053","BZ15_RS00325_4183","","","","","","","","","","","","","","","","YPZ3_RS21220_4056",""
    "group_5673","yopM","type III secretion system effector YopM","19","19","1","","","","","","1103","1103","1103","","","YE105_RS20595_4018","","","","","","","","","","","","","","","","","","","","","CH48_RS00390_4060","","","","YP598_RS21115_4110","","","YE_RS21175_4135","CH49_RS00235_4177","","YP_RS21285_4111","","","","","","","","","YPANGOLA_RS22070_4036","CH56_RS22160_4084","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","XM56_RS20545_4037","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","BZ19_RS21445_4113","","","CH60_RS01070_4100","","","","","","","","","","","","","","","","","","","","YEY1_RS21430_4040","Y11_RS21100_4128","","","","BFS78_RS21580_4258","BB936_RS22285_4398","BED35_RS00500_4353","BED32_RS00030_4182","BED33_RS21910_4325","BED34_RS22270_4407","","","","",""
    "group_23005","yopM","type III secretion system effector YopM","6","6","1","","","","","","1589","1589","1589","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","EGX47_RS00105_4453","EGX44_RS00020_4153","EGX39_RS00330_3982","","","","","","","","","","","","","","","","","","","","","","YPTB_RS21675_4159","BZ17_RS00175_4115","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","BN7064_RS22100_4159","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","",""
    
    > yopM_seq.txt
    for gene_id in M486_RS20920_3990 YE105_RS20595_4018 M479_RS01070_4055 M480_RS01170_4076  M481_RS01115_4071             LDH65_RS21345_4177    CH48_RS00390_4060 M478_RS01000_4055 M482_RS01070_4063 M483_RS00915_4013 YP598_RS21115_4110  M477_RS21610_4128 YE_RS21175_4135 CH49_RS00235_4177 M484_RS01125_4011 YP_RS21285_4111 LDH63_RS21760_4259        YPANGOLA_RS22070_4036 CH56_RS22160_4084 YPA_RS22550_4200 CH58_RS00945_4248      YPO_RS00170_4130 AK38_RS00930_4114 BAY22_RS21640_4174 YPD4_RS21505_4104 YPD8_RS21525_4060 CH61_RS00195_4143 BZ20_RS00435_4174 M0M60_RS21870_4286  CH46_RS00070_4122        EGX53_RS00030_4033 EGX52_RS00260_4348 EGX47_RS00105_4453 EGX44_RS00020_4153 EGX39_RS00330_3982  EGX46_RS00245_4205  EGX74_RS00040_4070             YPC_RS21075_4024 CH55_RS00770_4123  DN756_RS21785_4075  YPTB_RS21675_4159 BZ17_RS00175_4115 CH62_RS00690_4176   CH44_RS00795_4078   XM56_RS20545_4037                                                     BN7064_RS22100_4159   CH63_RS00700_4106 BZ19_RS21445_4113  CH59_RS00970_4231 CH60_RS01070_4100 YPDSF_RS21140_4036 BZ18_RS00325_4042 CH43_RS00040_3994  LDH64_RS21810_4270 S96127_RS00100_4096   GCK71_RS22420_4113 GD372_RS22475_4112  DJY80_RS22415_4098 GCK69_RS22480_4113    GCK70_RS22160_4053 BZ15_RS00325_4183 CH47_RS00140_4080 YEY1_RS21430_4040 Y11_RS21100_4128    BFS78_RS21580_4258 BB936_RS22285_4398 BED35_RS00500_4353 BED32_RS00030_4182 BED33_RS21910_4325 BED34_RS22270_4407    YPZ3_RS21220_4056; do
    for gbff in  Yersinia_massiliensis_2011N-4075/GCF_013282765.1_ASM1328276v1/GCF_013282765.1_ASM1328276v1_genomic.gbff.gz Yersinia_pestis_EV_NIIEG/GCF_000590535.2_ASM59053v2/GCF_000590535.2_ASM59053v2_genomic.gbff.gz Yersinia_pestis_Shasta/GCF_000834335.1_ASM83433v1/GCF_000834335.1_ASM83433v1_genomic.gbff.gz Yersinia_ruckeri_NVI-492/GCF_023212565.2_ASM2321256v2/GCF_023212565.2_ASM2321256v2_genomic.gbff.gz Yersinia_pestis_Pestoides_G/GCF_000834985.1_ASM83498v1/GCF_000834985.1_ASM83498v1_genomic.gbff.gz Yersinia_pestis_Antiqua_bis/GCF_000834825.1_ASM83482v1/GCF_000834825.1_ASM83482v1_genomic.gbff.gz Yersinia_pestis_91001/GCF_000007885.1_ASM788v1/GCF_000007885.1_ASM788v1_genomic.gbff.gz Yersinia_intermedia_Y228/GCF_000834515.1_ASM83451v1/GCF_000834515.1_ASM83451v1_genomic.gbff.gz Yersinia_pestis_Java9/GCF_000834905.1_ASM83490v1/GCF_000834905.1_ASM83490v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP32953_bis/GCF_000834295.1_ASM83429v1/GCF_000834295.1_ASM83429v1_genomic.gbff.gz Yersinia_pseudotuberculosis_YPIII_bis/GCF_000834375.1_ASM83437v1/GCF_000834375.1_ASM83437v1_genomic.gbff.gz Yersinia_enterocolitica_8081_bis/GCF_000834795.1_ASM83479v1/GCF_000834795.1_ASM83479v1_genomic.gbff.gz Yersinia_sp_FDAARGOS_228/GCF_002073315.2_ASM207331v2/GCF_002073315.2_ASM207331v2_genomic.gbff.gz Yersinia_enterocolitica_Gp169/GCF_025758435.1_ASM2575843v1/GCF_025758435.1_ASM2575843v1_genomic.gbff.gz Yersinia_pestis_195P/GCF_002005285.1_ASM200528v1/GCF_002005285.1_ASM200528v1_genomic.gbff.gz Yersinia_frederiksenii_FDAARGOS_418/GCF_002591195.1_ASM259119v1/GCF_002591195.1_ASM259119v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC3571/GCF_900636705.1_43908_A02/GCF_900636705.1_43908_A02_genomic.gbff.gz Yersinia_enterocolitica_FORC_002/GCF_000987925.1_ASM98792v1/GCF_000987925.1_ASM98792v1_genomic.gbff.gz Yersinia_ruckeri_NVI-1292/GCF_026435275.1_ASM2643527v1/GCF_026435275.1_ASM2643527v1_genomic.gbff.gz 
Yersinia_pestis_3067/GCF_001188795.1_ASM118879v1/GCF_001188795.1_ASM118879v1_genomic.gbff.gz Yersinia_pestis_M2086/GCF_015336695.1_ASM1533669v1/GCF_015336695.1_ASM1533669v1_genomic.gbff.gz Yersinia_ruckeri_16Y0180/GCF_021399215.1_ASM2139921v1/GCF_021399215.1_ASM2139921v1_genomic.gbff.gz Yersinia_pestis_2944/GCF_001188815.1_ASM118881v1/GCF_001188815.1_ASM118881v1_genomic.gbff.gz Yersinia_rochesterensis_ATCC_BAA-2637/GCF_003600645.1_ASM360064v1/GCF_003600645.1_ASM360064v1_genomic.gbff.gz Yersinia_pestis_Z176003/GCF_000022845.1_ASM2284v1/GCF_000022845.1_ASM2284v1_genomic.gbff.gz Yersinia_intermedia_SCPM-O-B-8026_C-146/GCF_026183385.1_ASM2618338v1/GCF_026183385.1_ASM2618338v1_genomic.gbff.gz Yersinia_enterocolitica_YE5/GCF_001708615.1_ASM170861v1/GCF_001708615.1_ASM170861v1_genomic.gbff.gz Yersinia_enterocolitica_YE6/GCF_001708595.1_ASM170859v1/GCF_001708595.1_ASM170859v1_genomic.gbff.gz Yersinia_pestis_CO92_pgm-_pPCP1-/GCF_001293415.1_ASM129341v1/GCF_001293415.1_ASM129341v1_genomic.gbff.gz Yersinia_pestis_1412/GCF_001188695.1_ASM118869v1/GCF_001188695.1_ASM118869v1_genomic.gbff.gz Yersinia_pestis_El_Dorado/GCF_000834495.1_ASM83449v1/GCF_000834495.1_ASM83449v1_genomic.gbff.gz Yersinia_enterocolitica_KNG22703/GCF_001305635.1_ASM130563v1/GCF_001305635.1_ASM130563v1_genomic.gbff.gz Yersinia_pestis_M-1770/GCF_015337825.2_ASM1533782v2/GCF_015337825.2_ASM1533782v2_genomic.gbff.gz Yersinia_enterocolitica_MP98/GCF_025758515.1_ASM2575851v1/GCF_025758515.1_ASM2575851v1_genomic.gbff.gz Yersinia_enterocolitica_NCTC13629/GCF_900635745.1_32868_F02/GCF_900635745.1_32868_F02_genomic.gbff.gz Yersinia_pestis_94/GCF_024498395.1_ASM2449839v1/GCF_024498395.1_ASM2449839v1_genomic.gbff.gz Yersinia_kristensenii_Y231/GCF_000834865.1_ASM83486v1/GCF_000834865.1_ASM83486v1_genomic.gbff.gz Yersinia_pestis_C-783/GCF_015337285.1_ASM1533728v1/GCF_015337285.1_ASM1533728v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC8480/GCF_900635715.1_32473_H02/GCF_900635715.1_32473_H02_genomic.gbff.gz 
Yersinia_enterocolitica_NW57/GCF_025758475.1_ASM2575847v1/GCF_025758475.1_ASM2575847v1_genomic.gbff.gz Yersinia_enterocolitica_YE1/GCF_001708635.1_ASM170863v1/GCF_001708635.1_ASM170863v1_genomic.gbff.gz Yersinia_pestis_790/GCF_001188675.1_ASM118867v1/GCF_001188675.1_ASM118867v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11065/GCF_026435655.1_ASM2643565v1/GCF_026435655.1_ASM2643565v1_genomic.gbff.gz Yersinia_pestis_14D/GCF_015159615.2_ASM1515961v2/GCF_015159615.2_ASM1515961v2_genomic.gbff.gz Yersinia_enterocolitica_NW115/GCF_025758655.1_ASM2575865v1/GCF_025758655.1_ASM2575865v1_genomic.gbff.gz Yersinia_enterocolitica_Gp259/GCF_025758265.1_ASM2575826v1/GCF_025758265.1_ASM2575826v1_genomic.gbff.gz Yersinia_enterocolitica_FORC066/GCF_025340245.1_ASM2534024v1/GCF_025340245.1_ASM2534024v1_genomic.gbff.gz Yersinia_pestis_20/GCF_024498415.1_ASM2449841v1/GCF_024498415.1_ASM2449841v1_genomic.gbff.gz Yersinia_pestis_FDAARGOS_602/GCF_003798345.1_ASM379834v1/GCF_003798345.1_ASM379834v1_genomic.gbff.gz Yersinia_aleksiciae_159/GCF_001047675.1_ASM104767v1/GCF_001047675.1_ASM104767v1_genomic.gbff.gz Yersinia_enterocolitica_Gp2/GCF_025758285.1_ASM2575828v1/GCF_025758285.1_ASM2575828v1_genomic.gbff.gz Yersinia_pseudotuberculosis_1/GCF_000834435.1_ASM83443v1/GCF_000834435.1_ASM83443v1_genomic.gbff.gz Yersinia_pestis_3770/GCF_001188775.1_ASM118877v1/GCF_001188775.1_ASM118877v1_genomic.gbff.gz Yersinia_intermedia_FDAARGOS_729/GCF_009730075.1_ASM973007v1/GCF_009730075.1_ASM973007v1_genomic.gbff.gz Yersinia_enterocolitica_NW67/GCF_025758535.1_ASM2575853v1/GCF_025758535.1_ASM2575853v1_genomic.gbff.gz Yersinia_intermedia_SCPM-O-B-10209_333/GCF_026183345.1_ASM2618334v1/GCF_026183345.1_ASM2618334v1_genomic.gbff.gz Yersinia_ruckeri_17Y0414/GCF_021399075.1_ASM2139907v1/GCF_021399075.1_ASM2139907v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-6530/GCF_009295985.1_ASM929598v1/GCF_009295985.1_ASM929598v1_genomic.gbff.gz 
Yersinia_pseudotuberculosis_EP2+/GCF_000834415.1_ASM83441v1/GCF_000834415.1_ASM83441v1_genomic.gbff.gz Yersinia_pestis_KM_567/GCF_015337445.1_ASM1533744v1/GCF_015337445.1_ASM1533744v1_genomic.gbff.gz Yersinia_ruckeri_Big_Creek_74/GCF_000964565.1_ASM96456v1/GCF_000964565.1_ASM96456v1_genomic.gbff.gz Yersinia_intermedia_FDAARGOS_358/GCF_002983625.1_ASM298362v1/GCF_002983625.1_ASM298362v1_genomic.gbff.gz Yersinia_ruckeri_NVI-9681/GCF_023212445.2_ASM2321244v2/GCF_023212445.2_ASM2321244v2_genomic.gbff.gz Yersinia_kristensenii_2012N-4030/GCF_013282785.1_ASM1328278v1/GCF_013282785.1_ASM1328278v1_genomic.gbff.gz Yersinia_ruckeri_17Y0157/GCF_021399195.1_ASM2139919v1/GCF_021399195.1_ASM2139919v1_genomic.gbff.gz Yersinia_ruckeri_NVI-8270/GCF_026435135.1_ASM2643513v1/GCF_026435135.1_ASM2643513v1_genomic.gbff.gz Yersinia_ruckeri_17Y0189/GCF_021399095.1_ASM2139909v1/GCF_021399095.1_ASM2139909v1_genomic.gbff.gz Yersinia_ruckeri_NVI-8524/GCF_026435115.1_ASM2643511v1/GCF_026435115.1_ASM2643511v1_genomic.gbff.gz Yersinia_pestis_M-1482/GCF_015337645.1_ASM1533764v1/GCF_015337645.1_ASM1533764v1_genomic.gbff.gz Yersinia_pestis_Harbin_35_bis/GCF_000834275.1_ASM83427v1/GCF_000834275.1_ASM83427v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC10217/GCF_900635755.1_33467_B01/GCF_900635755.1_33467_B01_genomic.gbff.gz Yersinia_pseudotuberculosis_598/GCF_020889805.1_ASM2088980v1/GCF_020889805.1_ASM2088980v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11267/GCF_026435335.1_ASM2643533v1/GCF_026435335.1_ASM2643533v1_genomic.gbff.gz Yersinia_enterocolitica_NW56/GCF_025758635.1_ASM2575863v1/GCF_025758635.1_ASM2575863v1_genomic.gbff.gz Yersinia_pestis_Angola/GCF_000018805.1_ASM1880v1/GCF_000018805.1_ASM1880v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-DNA-18_I-3113/GCF_009295945.1_ASM929594v1/GCF_009295945.1_ASM929594v1_genomic.gbff.gz Yersinia_enterocolitica_Y11/GCF_000253175.1_ASM25317v1/GCF_000253175.1_ASM25317v1_genomic.gbff.gz 
Yersinia_pestis_Dodson/GCF_000834775.1_ASM83477v1/GCF_000834775.1_ASM83477v1_genomic.gbff.gz Yersinia_pestis_Cadman/GCF_001693595.1_ASM169359v1/GCF_001693595.1_ASM169359v1_genomic.gbff.gz Yersinia_pestis_KIM5/GCF_000970105.1_ASM97010v1/GCF_000970105.1_ASM97010v1_genomic.gbff.gz Yersinia_ruckeri_NVI-10705/GCF_023212585.2_ASM2321258v2/GCF_023212585.2_ASM2321258v2_genomic.gbff.gz Yersinia_pestis_EV76-CN/GCF_024758685.1_ASM2475868v1/GCF_024758685.1_ASM2475868v1_genomic.gbff.gz Yersinia_intermedia_FDAARGOS_730/GCF_009730055.1_ASM973005v1/GCF_009730055.1_ASM973005v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11073/GCF_026435495.1_ASM2643549v1/GCF_026435495.1_ASM2643549v1_genomic.gbff.gz Yersinia_ruckeri_17Y0161/GCF_021399155.1_ASM2139915v1/GCF_021399155.1_ASM2139915v1_genomic.gbff.gz Yersinia_sp_KBS0713/GCF_005937895.2_ASM593789v2/GCF_005937895.2_ASM593789v2_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-6899_231/GCF_009295925.1_ASM929592v1/GCF_009295925.1_ASM929592v1_genomic.gbff.gz Yersinia_ruckeri_NVI-5089/GCF_026435195.1_ASM2643519v1/GCF_026435195.1_ASM2643519v1_genomic.gbff.gz Yersinia_pestis_Nicholisk_41/GCF_000834885.1_ASM83488v1/GCF_000834885.1_ASM83488v1_genomic.gbff.gz Yersinia_enterocolitica_YE7/GCF_001708555.1_ASM170855v1/GCF_001708555.1_ASM170855v1_genomic.gbff.gz Yersinia_intermedia_SCPM-O-B-9106_C-191/GCF_026183365.1_ASM2618336v1/GCF_026183365.1_ASM2618336v1_genomic.gbff.gz Yersinia_canariae_NCTC_14382/GCF_009831415.1_ASM983141v1/GCF_009831415.1_ASM983141v1_genomic.gbff.gz Yersinia_enterocolitica_YE3/GCF_001708655.1_ASM170865v1/GCF_001708655.1_ASM170865v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC10275/GCF_900637475.1_51108_B01/GCF_900637475.1_51108_B01_genomic.gbff.gz Yersinia_enterocolitica_8081/GCF_000009345.1_ASM934v1/GCF_000009345.1_ASM934v1_genomic.gbff.gz Yersinia_ruckeri_NVI-10571/GCF_026435835.1_ASM2643583v1/GCF_026435835.1_ASM2643583v1_genomic.gbff.gz Yersinia_enterocolitica_2516-87/GCF_000834735.1_ASM83473v1/GCF_000834735.1_ASM83473v1_genomic.gbff.gz 
Yersinia_frederiksenii_FDAARGOS_417/GCF_002591095.1_ASM259109v1/GCF_002591095.1_ASM259109v1_genomic.gbff.gz Yersinia_pestis_I-1252/GCF_015336465.1_ASM1533646v1/GCF_015336465.1_ASM1533646v1_genomic.gbff.gz Yersinia_ruckeri_17Y0155/GCF_021399235.1_ASM2139923v1/GCF_021399235.1_ASM2139923v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_665/GCF_008693365.1_ASM869336v1/GCF_008693365.1_ASM869336v1_genomic.gbff.gz Yersinia_alsatica_SCPM-O-B-7604/GCF_025133195.1_ASM2513319v1/GCF_025133195.1_ASM2513319v1_genomic.gbff.gz Yersinia_pseudotuberculosis_PA3606/GCF_000834945.1_ASM83494v1/GCF_000834945.1_ASM83494v1_genomic.gbff.gz Yersinia_pestis_KIM10+/GCF_000006645.1_ASM664v1/GCF_000006645.1_ASM664v1_genomic.gbff.gz Yersinia_ruckeri_NVI-701/GCF_026435155.1_ASM2643515v1/GCF_026435155.1_ASM2643515v1_genomic.gbff.gz Yersinia_enterocolitica_NW117/GCF_025758455.1_ASM2575845v1/GCF_025758455.1_ASM2575845v1_genomic.gbff.gz Yersinia_enterocolitica_FORC065/GCA_025340225.1_ASM2534022v1/GCA_025340225.1_ASM2534022v1_genomic.gbff.gz Yersinia_enterocolitica_NW1/GCF_025758495.1_ASM2575849v1/GCF_025758495.1_ASM2575849v1_genomic.gbff.gz Yersinia_ruckeri_QMA0440/GCF_002192595.1_ASM219259v1/GCF_002192595.1_ASM219259v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_579/GCF_003798305.1_ASM379830v1/GCF_003798305.1_ASM379830v1_genomic.gbff.gz Yersinia_enterocolitica_1055Rr/GCF_000192105.1_ASM19210v1/GCF_000192105.1_ASM19210v1_genomic.gbff.gz Yersinia_hibernica_CFS1934/GCF_004124235.1_ASM412423v1/GCF_004124235.1_ASM412423v1_genomic.gbff.gz Yersinia_pestis_D106004/GCF_000022805.1_ASM2280v1/GCF_000022805.1_ASM2280v1_genomic.gbff.gz Yersinia_enterocolitica_Y1/GCF_004368055.1_ASM436805v1/GCF_004368055.1_ASM436805v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP31758/GCF_000016945.1_ASM1694v1/GCF_000016945.1_ASM1694v1_genomic.gbff.gz Yersinia_pestis_Pestoides_F_bis/GCF_000834315.1_ASM83431v1/GCF_000834315.1_ASM83431v1_genomic.gbff.gz 
Yersinia_pestis_M-1974/GCF_015336865.1_ASM1533686v1/GCF_015336865.1_ASM1533686v1_genomic.gbff.gz Yersinia_ruckeri_NHV_3758/GCF_002442495.2_ASM244249v2/GCF_002442495.2_ASM244249v2_genomic.gbff.gz Yersinia_ruckeri_17Y0163/GCF_021399115.1_ASM2139911v1/GCF_021399115.1_ASM2139911v1_genomic.gbff.gz Yersinia_pseudotuberculosis_MD67/GCF_000834355.1_ASM83435v1/GCF_000834355.1_ASM83435v1_genomic.gbff.gz Yersinia_pestis_D182038/GCF_000022825.1_ASM2282v1/GCF_000022825.1_ASM2282v1_genomic.gbff.gz Yersinia_enterocolitica_FDAARGOS_1090/GCF_016727905.1_ASM1672790v1/GCF_016727905.1_ASM1672790v1_genomic.gbff.gz Yersinia_bercovieri_ATCC_43970/GCF_013282745.1_ASM1328274v1/GCF_013282745.1_ASM1328274v1_genomic.gbff.gz Yersinia_enterocolitica_WA/GCF_000834195.1_ASM83419v1/GCF_000834195.1_ASM83419v1_genomic.gbff.gz Yersinia_ruckeri_NVI-10587/GCF_023212425.2_ASM2321242v2/GCF_023212425.2_ASM2321242v2_genomic.gbff.gz Yersinia_pestis_R/GCF_024498375.1_ASM2449837v1/GCF_024498375.1_ASM2449837v1_genomic.gbff.gz Yersinia_intermedia_N6293/GCF_022637335.1_ASM2263733v1/GCF_022637335.1_ASM2263733v1_genomic.gbff.gz Yersinia_ruckeri_NVI-6614/GCF_026435175.1_ASM2643517v1/GCF_026435175.1_ASM2643517v1_genomic.gbff.gz Yersinia_hibernica_LC20/GCF_000597945.1_ASM59794v2/GCF_000597945.1_ASM59794v2_genomic.gbff.gz Yersinia_ruckeri_17Y0153/GCF_021399175.1_ASM2139917v1/GCF_021399175.1_ASM2139917v1_genomic.gbff.gz Yersinia_aldovae_670-83/GCF_000834395.1_ASM83439v1/GCF_000834395.1_ASM83439v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-5935_I-1996/GCF_009295965.1_ASM929596v1/GCF_009295965.1_ASM929596v1_genomic.gbff.gz Yersinia_ruckeri_YRB/GCF_000834255.1_ASM83425v1/GCF_000834255.1_ASM83425v1_genomic.gbff.gz Yersinia_enterocolitica_FORC_002_bis/GCF_001304755.1_ASM130475v1/GCF_001304755.1_ASM130475v1_genomic.gbff.gz Yersinia_pestis_Antiqua/GCF_000013825.1_ASM1382v1/GCF_000013825.1_ASM1382v1_genomic.gbff.gz Yersinia_pestis_Pestoides_B/GCF_000834925.1_ASM83492v1/GCF_000834925.1_ASM83492v1_genomic.gbff.gz 
Yersinia_pestis_M2085/GCF_015338045.2_ASM1533804v2/GCF_015338045.2_ASM1533804v2_genomic.gbff.gz Yersinia_pestis_CO92/GCF_000009065.1_ASM906v1/GCF_000009065.1_ASM906v1_genomic.gbff.gz Yersinia_ruckeri_17Y0159/GCF_021399135.1_ASM2139913v1/GCF_021399135.1_ASM2139913v1_genomic.gbff.gz Yersinia_enterocolitica_NCTC12982/GCF_901472495.1_32868_C01/GCF_901472495.1_32868_C01_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-5942_I-2638/GCF_009363195.1_ASM936319v1/GCF_009363195.1_ASM936319v1_genomic.gbff.gz Yersinia_pestis_Nepal516/GCF_000013805.1_ASM1380v1/GCF_000013805.1_ASM1380v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_342/GCF_003546905.1_ASM354690v1/GCF_003546905.1_ASM354690v1_genomic.gbff.gz Yersinia_ruckeri_SC09/GCF_000775355.2_ASM77535v2/GCF_000775355.2_ASM77535v2_genomic.gbff.gz Yersinia_mollaretii_ATCC_43969/GCF_013282725.1_ASM1328272v1/GCF_013282725.1_ASM1328272v1_genomic.gbff.gz Yersinia_pestis_Pestoides_F/GCF_000016445.1_ASM1644v1/GCF_000016445.1_ASM1644v1_genomic.gbff.gz Yersinia_pestis_Angola_bis/GCF_000834845.1_ASM83484v1/GCF_000834845.1_ASM83484v1_genomic.gbff.gz Yersinia_ruckeri_17Y0412/GCF_021399055.1_ASM2139905v1/GCF_021399055.1_ASM2139905v1_genomic.gbff.gz Yersinia_pestis_1522/GCF_001188715.1_ASM118871v1/GCF_001188715.1_ASM118871v1_genomic.gbff.gz Yersinia_enterocolitica_MGYG-HGUT-02335/GCF_902385945.1_UHGG_MGYG-HGUT-02335/GCF_902385945.1_UHGG_MGYG-HGUT-02335_genomic.gbff.gz Yersinia_pestis_C-792/GCF_015337085.2_ASM1533708v2/GCF_015337085.2_ASM1533708v2_genomic.gbff.gz Yersinia_ruckeri_NVI-11050/GCF_023212385.2_ASM2321238v2/GCF_023212385.2_ASM2321238v2_genomic.gbff.gz Yersinia_intermedia_NCTC11469/GCF_900635455.1_28307_A01/GCF_900635455.1_28307_A01_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_583/GCF_003798285.1_ASM379828v1/GCF_003798285.1_ASM379828v1_genomic.gbff.gz Yersinia_pestis_M2029/GCF_015336265.1_ASM1533626v1/GCF_015336265.1_ASM1533626v1_genomic.gbff.gz 
Yersinia_enterocolitica_Gp200/GCF_025758555.1_ASM2575855v1/GCF_025758555.1_ASM2575855v1_genomic.gbff.gz Yersinia_massiliensis_GTA/GCF_003048255.1_ASM304825v1/GCF_003048255.1_ASM304825v1_genomic.gbff.gz Yersinia_pestis_A1122_bis/GCF_000834755.1_ASM83475v1/GCF_000834755.1_ASM83475v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NZYP4713/GCF_900092345.1_YP4713/GCF_900092345.1_YP4713_genomic.gbff.gz Yersinia_pestis_PBM19/GCF_000834235.1_ASM83423v1/GCF_000834235.1_ASM83423v1_genomic.gbff.gz Yersinia_enterocolitica_NW116/GCF_025758575.1_ASM2575857v1/GCF_025758575.1_ASM2575857v1_genomic.gbff.gz Yersinia_ruckeri_KMM821/GCF_017498685.1_ASM1749868v1/GCF_017498685.1_ASM1749868v1_genomic.gbff.gz Yersinia_ruckeri_NVI-4840/GCF_026435215.1_ASM2643521v1/GCF_026435215.1_ASM2643521v1_genomic.gbff.gz Yersinia_enterocolitica_FDAARGOS_1082/GCF_016727765.1_ASM1672776v1/GCF_016727765.1_ASM1672776v1_genomic.gbff.gz Yersinia_enterocolitica_NW51/GCF_025758615.1_ASM2575861v1/GCF_025758615.1_ASM2575861v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11076/GCF_023212325.2_ASM2321232v2/GCF_023212325.2_ASM2321232v2_genomic.gbff.gz Yersinia_rohdei_YRA/GCF_000834455.1_ASM83445v1/GCF_000834455.1_ASM83445v1_genomic.gbff.gz Yersinia_pestis_C-781/GCF_015336085.1_ASM1533608v1/GCF_015336085.1_ASM1533608v1_genomic.gbff.gz Yersinia_pestis_Harbin_35/GCF_000186725.1_ASM18672v1/GCF_000186725.1_ASM18672v1_genomic.gbff.gz Yersinia_pseudotuberculosis_ATCC_6904/GCF_000750315.1_ASM75031v1/GCF_000750315.1_ASM75031v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_580/GCF_003798445.1_ASM379844v1/GCF_003798445.1_ASM379844v1_genomic.gbff.gz Yersinia_enterocolitica_str_YE5303/GCF_000968115.1_ASM96811v1/GCF_000968115.1_ASM96811v1_genomic.gbff.gz Yersinia_pestis_FDAARGOS_601/GCF_003798225.1_ASM379822v1/GCF_003798225.1_ASM379822v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-6291_C-25/GCF_009296005.1_ASM929600v1/GCF_009296005.1_ASM929600v1_genomic.gbff.gz 
Yersinia_pestis_Nairobi/GCF_000835005.1_ASM83500v1/GCF_000835005.1_ASM83500v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_584/GCF_003798385.1_ASM379838v1/GCF_003798385.1_ASM379838v1_genomic.gbff.gz Yersinia_similis_228/GCF_000582515.1_ASM58251v1/GCF_000582515.1_ASM58251v1_genomic.gbff.gz Yersinia_pestis_1413/GCF_001188935.1_ASM118893v1/GCF_001188935.1_ASM118893v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_581/GCF_003798425.1_ASM379842v1/GCF_003798425.1_ASM379842v1_genomic.gbff.gz Yersinia_entomophaga_MH96/GCF_001656035.1_ASM165603v1/GCF_001656035.1_ASM165603v1_genomic.gbff.gz Yersinia_ruckeri_NVI-1176/GCF_026435295.1_ASM2643529v1/GCF_026435295.1_ASM2643529v1_genomic.gbff.gz Yersinia_pestis_S19960127/GCF_015190655.1_ASM1519065v1/GCF_015190655.1_ASM1519065v1_genomic.gbff.gz Yersinia_ruckeri_NVI-4479/GCF_026435255.1_ASM2643525v1/GCF_026435255.1_ASM2643525v1_genomic.gbff.gz Yersinia_frederiksenii_Y225/GCF_000834215.1_ASM83421v1/GCF_000834215.1_ASM83421v1_genomic.gbff.gz Yersinia_ruckeri_NVI-4570/GCF_026435235.1_ASM2643523v1/GCF_026435235.1_ASM2643523v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP2666pIB1/GCF_003814345.1_ASM381434v1/GCF_003814345.1_ASM381434v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_582/GCF_003798405.1_ASM379840v1/GCF_003798405.1_ASM379840v1_genomic.gbff.gz Yersinia_enterocolitica_NCTC13769/GCF_900637005.1_46582_C01/GCF_900637005.1_46582_C01_genomic.gbff.gz Yersinia_pestis_A1122/GCF_000222975.1_ASM22297v1/GCF_000222975.1_ASM22297v1_genomic.gbff.gz Yersinia_enterocolitica_YE165/GCF_001708575.1_ASM170857v1/GCF_001708575.1_ASM170857v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP32953/GCF_000047365.1_ASM4736v1/GCF_000047365.1_ASM4736v1_genomic.gbff.gz Yersinia_pestis_8787/GCF_001188755.1_ASM118875v1/GCF_001188755.1_ASM118875v1_genomic.gbff.gz Yersinia_rochesterensis_ATCC_33639/GCF_000750355.1_ASM75035v1/GCF_000750355.1_ASM75035v1_genomic.gbff.gz 
Yersinia_pestis_FDAARGOS_603/GCF_003798205.1_ASM379820v1/GCF_003798205.1_ASM379820v1_genomic.gbff.gz Yersinia_pseudotuberculosis_PB1+_bis/GCF_000834475.1_ASM83447v1/GCF_000834475.1_ASM83447v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11294/GCF_026435315.1_ASM2643531v1/GCF_026435315.1_ASM2643531v1_genomic.gbff.gz Yersinia_enterocolitica_NW66/GCF_025758595.1_ASM2575859v1/GCF_025758595.1_ASM2575859v1_genomic.gbff.gz Yersinia_pestis_1045/GCF_001188735.1_ASM118873v1/GCF_001188735.1_ASM118873v1_genomic.gbff.gz; do
        output=$(python3 extract_CDS_of_a_locus_tag.py ${gbff} $(echo "${gene_id}" | cut -d '_' -f 1-2))
        if [[ ! -z "${output}" ]]; then
            gbff_short=$(echo "${gbff}" | cut -d '/' -f 1)
            printf "%s\t%s\n" "${gbff_short}" "${output}" >> yopM_seq.txt
        fi
      done
    done
  5. extract the sequences according to NCBI annotations

    #------------------------------- yopJ (+6) -------------------------------
    #grep "yopJ" selected_gtf_files/Yersinia_enterocolitica_2516-87.gtf
    NZ_CP009837.1   RefSeq  gene    69041   69701   .       -       .       gene_id "CH48_RS00445"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "protein_coding"; locus_tag "CH48_RS00445"; old_locus_tag "CH48_4238"; part "2"; 
    NZ_CP009837.1   RefSeq  gene    1       206     .       -       .       gene_id "CH48_RS00445"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "protein_coding"; locus_tag "CH48_RS00445"; old_locus_tag "CH48_4238"; part "1"; 
    
    #grep "yopJ" selected_gtf_files/Yersinia_pestis_790.gtf (NZ_CP006807.1)
    
    #grep "yopJ" selected_gtf_files/Yersinia_pestis_Antiqua_bis.gtf
    NZ_CP009905.1   RefSeq  gene    16737   17602   .       -       .       gene_id "CH58_RS00725"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "CH58_RS00725"; old_locus_tag "CH58_4444"; pseudo "true"; 
    
    #grep "yopJ" selected_gtf_files/Yersinia_pestis_FDAARGOS_602.gtf
    NZ_CP033695.1   RefSeq  gene    36152   37017   .       +       .       gene_id "EGX42_RS00935"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "EGX42_RS00935"; old_locus_tag "EGX42_00930"; pseudo "true"; 
    
    #grep "yopJ" selected_gtf_files/Yersinia_pestis_Pestoides_B.gtf 
    NZ_CP010022.1   RefSeq  gene    23121   23986   .       -       .       gene_id "CH60_RS00825"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "CH60_RS00825"; old_locus_tag "CH60_4301"; pseudo "true"; 
    
    #grep "yopJ" selected_gtf_files/Yersinia_pseudotuberculosis_EP2+.gtf
    NZ_CP009758.1   RefSeq  gene    33302   34168   .       +       .       gene_id "BZ20_RS00215"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "BZ20_RS00215"; old_locus_tag "BZ20_4189"; pseudo "true"; 
    
    #under selected_fna_files
    #Extract the yopJ regions listed in the GTF records and flatten each result to a single line.
    #NOTE: every group below writes to temp.fna (and the minus-strand groups to temp_.fna) with '>',
    #so each group overwrites the previous result; save the sequence before running the next group.

    #Yersinia_enterocolitica_2516-87 (- strand, gene annotated in two parts):
    #'>' writes the 69041-69701 region, '>>' appends the 1-206 region to the same file.
    samtools faidx Yersinia_enterocolitica_2516-87.fna NZ_CP009837.1:69041-69701 > temp.fna
    samtools faidx Yersinia_enterocolitica_2516-87.fna NZ_CP009837.1:1-206 >> temp.fna
    #Invoked without arguments; presumably run interactively on temp.fna to write the reverse complement -- TODO confirm the exact invocation.
    revseq
    #Join all lines of the .rev file (presumably the revseq output -- confirm) into one line and
    #replace every ':' with a tab; because every newline is removed, the header is fused with the sequence.
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' yersinia_enterocolitica_2516-87.rev > temp_.fna
    
    #Yersinia_pestis_Antiqua_bis (- strand): extract, reverse-complement, flatten.
    samtools faidx Yersinia_pestis_Antiqua_bis.fna NZ_CP009905.1:16737-17602 > temp.fna
    revseq
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 16737-17602.rev > temp_.fna
    
    #Yersinia_pestis_FDAARGOS_602 (+ strand): no revseq step; temp.fna is flattened in place with sed -i.
    samtools faidx Yersinia_pestis_FDAARGOS_602.fna NZ_CP033695.1:36152-37017 > temp.fna
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    #Yersinia_pestis_Pestoides_B (- strand): extract, reverse-complement, flatten.
    samtools faidx Yersinia_pestis_Pestoides_B.fna NZ_CP010022.1:23121-23986 > temp.fna
    revseq
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 23121-23986.rev > temp_.fna
    
    #Yersinia_pseudotuberculosis_EP2+ (+ strand): no revseq step; temp.fna is flattened in place with sed -i.
    samtools faidx Yersinia_pseudotuberculosis_EP2+.fna NZ_CP009758.1:33302-34168 > temp.fna
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_enterocolitica_2516-87 ATGATTGGGCCAATATCACAAATAAACAGCTTCGGTGGCTTATCAGAAAAAGAGACCCGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAATACAGTTGGAAACTGATATAGCGGATGGATCCTGGTTCCATAAAAATTATTCACGCCTGGATATAGAAGTCATGCCCGCATTAGTAATTCAGGCGAACAATAAATATCCGGAAATGAATCTTAATTTTGTTACATCTCCCCAGGACCTTTCGATAGAAATAAAAAATGTCATAGAAAATGGAGTTGGATCTTCCCGCTTCATAATTAACATGGGGGAGGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTATTTGAACCAGTAAACTTTAATAGTATGGGGCCAGCGATACTGGCAATAAGTACAAAAACGGCCATTGAACGTTATCAATTACCTGATTGCCATTTTTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTGGCACTGGCAAAAAAACTTTACACCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATAGTGAAAATCCTTTACCCCACAATAAGTTGGATCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCTTTAATAGGTTTGATAACAATAAATCCATTATAGATGGAAAGGAATTATCAGTTTCGGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA
    Yersinia_pestis_Antiqua_bis ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCCAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTAAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA
    >Yersinia_pestis_FDAARGOS_602   ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCCAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTGAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA
    >Yersinia_pestis_Pestoides_B    ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCCAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTGAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA
    Yersinia_pseudotuberculosis_EP2+    ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCTAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTGAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA
    
    #------------------------------- yopB (+4) -------------------------------
    
    #-- yopB records in the Yersinia_enterocolitica_YE1 annotation; the GTF lines after the command are its pasted output
    
    grep "yopB" Yersinia_enterocolitica_YE1.gtf
    NZ_CP016946.1   RefSeq  gene    73029   73029   .       +       .       gene_id "BFS78_RS21560"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BFS78_RS21560"; old_locus_tag "BFS78_21560"; part "1"; 
    NZ_CP016946.1   RefSeq  gene    1       1205    .       +       .       gene_id "BFS78_RS21560"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BFS78_RS21560"; old_locus_tag "BFS78_21560"; part "2";
    
    #-- grep "yopB" Yersinia_enterocolitica_YE3.gtf
    
    NZ_CP016943.1   RefSeq  gene    72880   73026   .       +       .       gene_id "BED35_RS00480"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "pseudogene"; locus_tag "BED35_RS00480"; old_locus_tag "BED35_00480"; part "1"; pseudo "true"; 
    NZ_CP016943.1   RefSeq  gene    1       1058    .       +       .       gene_id "BED35_RS00480"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "pseudogene"; locus_tag "BED35_RS00480"; old_locus_tag "BED35_00480"; part "2"; pseudo "true";
    
    #-- yopB records in the Yersinia_enterocolitica_YE5 annotation; the GTF lines after the command are its pasted output
    grep "yopB" Yersinia_enterocolitica_YE5.gtf
    NZ_CP016939.1   RefSeq  gene    73034   73034   .       +       .       gene_id "BED32_RS00010"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BED32_RS00010"; old_locus_tag "BED32_00010"; part "1"; 
    NZ_CP016939.1   RefSeq  gene    1       1205    .       +       .       gene_id "BED32_RS00010"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BED32_RS00010"; old_locus_tag "BED32_00010"; part "2"; 
    
    #-- grep "yopB" Yersinia_pestis_Harbin_35_bis.gtf
    NZ_CP009703.1   RefSeq  gene    18869   20075   .       +       .       gene_id "CH55_RS00745"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "pseudogene"; locus_tag "CH55_RS00745"; old_locus_tag "CH55_4304"; pseudo "true"; 
    
    #under selected_fna_files
    #Extract the yopB regions listed in the GTF records, one output file per strain.
    #A plain '>' truncates its target, so writing every strain to the same temp.fna
    #would leave only the last extraction on disk when the commands are run in one go.
    #yopB is on the + strand in all four annotations, so no reverse complement is needed.
    #YE1, YE3 and YE5: the gene is annotated in two parts (part 1 at high coordinates,
    #part 2 starting at position 1 -- presumably it spans the origin of a circular replicon; confirm).
    #'>' writes part 1 and '>>' appends part 2, so each file holds two FASTA records
    #whose sequences have to be concatenated in that order.
    samtools faidx Yersinia_enterocolitica_YE1.fna NZ_CP016946.1:73029-73029 > yopB_YE1.fna
    samtools faidx Yersinia_enterocolitica_YE1.fna NZ_CP016946.1:1-1205 >> yopB_YE1.fna
    samtools faidx Yersinia_enterocolitica_YE3.fna NZ_CP016943.1:72880-73026 > yopB_YE3.fna
    samtools faidx Yersinia_enterocolitica_YE3.fna NZ_CP016943.1:1-1058 >> yopB_YE3.fna
    samtools faidx Yersinia_enterocolitica_YE5.fna NZ_CP016939.1:73034-73034 > yopB_YE5.fna
    samtools faidx Yersinia_enterocolitica_YE5.fna NZ_CP016939.1:1-1205 >> yopB_YE5.fna
    #Harbin_35_bis: single contiguous region.
    samtools faidx Yersinia_pestis_Harbin_35_bis.fna NZ_CP009703.1:18869-20075 > yopB_Harbin_35_bis.fna
    
    Yersinia_enterocolitica_YE1 ATGAGTGCGTTGATAACCCATGATCGCTCAACGCCAGTAACTGGAAGTCTAGTTCCCTACATCGAGACACCAGCGCCCGCCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTGCAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTCACTGAAGGACAACAGCAAGAAATCACTAAATTATTGGAGTCGGTCACCCGCGGCACGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAATTTTACGCTCGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATGGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGGTTATCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGATTGCCTCAGGCGTAATTGGGATGGCGAATATGGCTGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGCAAGTATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTGATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGCTGCTAAAGGAGCCGAGTTTTCAGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGGTAGTTTAACAATGAGCCATGTAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAACAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA
    Yersinia_enterocolitica_YE3 ATGAGTGCGTTGATAACCCATGATCGCTCAACGCCAGTAACTGGAAGTCTAGTTCCCTACATCGAGACACCAGCGCCCGCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTGCAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTCACTGAAGGACAACAGCAAGAAATCACTAAATTATTGGAGTCGGTCACCCGCGGCACGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAATTTTACGCTCGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATGGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGGTTATCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGATTGCCTCAGGCGTAATTGGGATGGCGAATATGGCTGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGCAAGTATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTGATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGCTGCTAAAGGAGCCGAGTTTTCAGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGGTAGTTTAACAATGAGCCATGTAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAACAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA
    Yersinia_enterocolitica_YE5 ATGAGTGCGTTGATAACCCATGATCGCTCAACGCCAGTAACTGGAAGTCTAGTTCCCTACATCGAGACACCAGCGCCCGCCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTGCAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTCACTGAAGGACAACAGCAAGAAATCACTAAATTATTGGAGTCGGTCACCCGCGGCACGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAATTTTACGCTCGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATGGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGGTTATCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGATTGCCTCAGGCGTAATTGGGATGGCGAATATGGCTGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGCAAGTATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTGATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGCTGCTAAAGGAGCCGAGTTTTCAGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGGTAGTTTAACAATGAGCCATGTAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAACAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA
    Yersinia_pestis_Harbin_35_bis   ATGAGTGCGTTGATAACCCATGACCGCTCAACGCCAGTAACTGGAAGTCTACTTCCCTACGTCGAGACACCAGCGCCCGCCCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTACAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTTACTGAAGGACAACAGCAAGAAGTCACTAAATTATTGGAGTCGGTCACCCGCGGCGCGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAAGTTTACGCTTGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATCGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGATTGTCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGGTTGCCTCAGGCGTAATTGGGATGGCGAATATGGCAGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGAAAATATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTAATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGTGGCTAAAGGAGCCGAGTTTTCGGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGCTGGTTTAACAATGAGCCATGCAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAGCAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA
    
    #------------------------------- yopT (+9) -------------------------------
    #grep "yopT" selected_gtf_files/Yersinia_pestis_1412.gtf
    NZ_CP006780.1   RefSeq  gene    43360   44327   .       +       .       gene_id "M479_RS22185"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M479_RS22185"; old_locus_tag "M479_4302"; pseudo "true";
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_1413.gtf
    NZ_CP006761.1   RefSeq  gene    60310   61277   .       +       .       gene_id "M480_RS22170"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M480_RS22170"; old_locus_tag "M480_4319"; pseudo "true"; 
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_1522.gtf
    NZ_CP006757.1   RefSeq  gene    61673   62640   .       -       .       gene_id "M481_RS22190"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M481_RS22190"; old_locus_tag "M481_4325"; pseudo "true";
    
    #Extract the yopT region from the Yersinia_pestis_1412 GTF record (+ strand) into temp.fna; '>' overwrites any earlier temp.fna.
    samtools faidx Yersinia_pestis_1412.fna NZ_CP006780.1:43360-44327 > temp.fna
    #Rewrite temp.fna in place: join all lines into one (the header is fused with the sequence) and replace every ':' with a tab.
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_1412    ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #Extract the yopT region from the Yersinia_pestis_1413 GTF record (+ strand) into temp.fna; '>' overwrites any earlier temp.fna.
    samtools faidx Yersinia_pestis_1413.fna NZ_CP006761.1:60310-61277 > temp.fna
    #Rewrite temp.fna in place: join all lines into one (the header is fused with the sequence) and replace every ':' with a tab.
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_1413    ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #Extract the yopT region from the Yersinia_pestis_1522 GTF record (- strand) into temp.fna; '>' overwrites any earlier temp.fna.
    samtools faidx Yersinia_pestis_1522.fna NZ_CP006757.1:61673-62640 > temp.fna
    #Invoked without arguments; presumably run interactively on temp.fna to write the reverse complement -- TODO confirm the exact invocation.
    revseq
    #Join all lines of 61673-62640.rev (presumably the revseq output -- confirm) into one line and replace every ':' with a tab; the result goes to temp_.fna.
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 61673-62640.rev > temp_.fna
    
    Yersinia_pestis_1522    ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_3067.gtf
    NZ_CP006753.1   RefSeq  gene    43515   44482   .       +       .       gene_id "M482_RS22205"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M482_RS22205"; old_locus_tag "M482_4297"; pseudo "true"; 
    
    #Extract the yopT region from the Yersinia_pestis_3067 GTF record (+ strand) into temp.fna; '>' overwrites any earlier temp.fna.
    samtools faidx Yersinia_pestis_3067.fna NZ_CP006753.1:43515-44482 > temp.fna
    #Rewrite temp.fna in place: join all lines into one (the header is fused with the sequence) and replace every ':' with a tab.
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_3067    ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_3770.gtf
    NZ_CP006750.1   RefSeq  gene    18136   19103   .       +       .       gene_id "M483_RS22135"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M483_RS22135"; old_locus_tag "M483_4264"; pseudo "true"; 
    
    #Extract the yopT region from the Yersinia_pestis_3770 GTF record (+ strand) into temp.fna; '>' overwrites any earlier temp.fna.
    samtools faidx Yersinia_pestis_3770.fna NZ_CP006750.1:18136-19103 > temp.fna
    #Rewrite temp.fna in place: join all lines into one (the header is fused with the sequence) and replace every ':' with a tab.
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_3770    ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_8787.gtf 
    NZ_CP006747.1   RefSeq  gene    55293   56260   .       +       .       gene_id "M484_RS21915"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M484_RS21915"; old_locus_tag "M484_4255"; pseudo "true"; 
    
    #Extract the yopT region from the Yersinia_pestis_8787 GTF record (+ strand) into temp.fna; '>' overwrites any earlier temp.fna.
    samtools faidx Yersinia_pestis_8787.fna NZ_CP006747.1:55293-56260 > temp.fna
    #Rewrite temp.fna in place: join all lines into one (the header is fused with the sequence) and replace every ':' with a tab.
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_8787    ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_Pestoides_F.gtf
    NC_009377.1     RefSeq  gene    48563   49530   .       -       .       gene_id "YPDSF_RS23435"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "YPDSF_RS23435"; old_locus_tag "YPDSF_4001"; pseudo "true"; 
    
    samtools faidx Yersinia_pestis_Pestoides_F.fna NC_009377.1:48563-49530 > temp.fna
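    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 48563-49530.rev (reverse complement, since the gene is on the '-' strand)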
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 48563-49530.rev > temp_.fna
    
    Yersinia_pestis_Pestoides_F ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_Pestoides_F_bis.gtf
    NZ_CP009713.1   RefSeq  gene    53246   54213   .       -       .       gene_id "BZ18_RS22165"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "BZ18_RS22165"; old_locus_tag "BZ18_4298"; pseudo "true";
    
    samtools faidx Yersinia_pestis_Pestoides_F_bis.fna NZ_CP009713.1:53246-54213 > temp.fna
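    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 53246-54213.rev (reverse complement, since the gene is on the '-' strand)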
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 53246-54213.rev > temp_.fna
    
    Yersinia_pestis_Pestoides_F_bis ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #grep "yopT" selected_gtf_files/Yersinia_pestis_Pestoides_G.gtf
    NZ_CP010246.1   RefSeq  gene    1551    2518    .       +       .       gene_id "CH43_RS22165"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "CH43_RS22165"; old_locus_tag "CH43_4244"; pseudo "true"; 
    
    samtools faidx Yersinia_pestis_Pestoides_G.fna NZ_CP010246.1:1551-2518 > temp.fna
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_Pestoides_G ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA
    
    #------------------------------- yopE (+3) -------------------------------
    #grep "yopE" selected_gtf_files/Yersinia_pestis_1522.gtf 
    NZ_CP006757.1   RefSeq  gene    70902   71507   .       -       .       gene_id "M481_RS24690"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "pseudogene"; locus_tag "M481_RS24690"; old_locus_tag "M481_4336"; part "2"; pseudo "true"; 
    NZ_CP006757.1   RefSeq  gene    1       53      .       -       .       gene_id "M481_RS24690"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "pseudogene"; locus_tag "M481_RS24690"; old_locus_tag "M481_4336"; part "1"; pseudo "true"; 
    
    samtools faidx Yersinia_pestis_1522.fna NZ_CP006757.1:70902-71507 > temp.fna
    samtools faidx Yersinia_pestis_1522.fna NZ_CP006757.1:1-53 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
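    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 70902-71507.rev (reverse complement, since the gene is on the '-' strand)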
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 70902-71507.rev > temp_.fna
    
    Yersinia_pestis_1522    ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCAGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGA
    
    #grep "yopE" selected_gtf_files/Yersinia_pestis_Nicholisk_41.gtf
    NZ_CP009990.1   RefSeq  gene    67916   68552   .       +       .       gene_id "CH63_RS00620"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "CH63_RS00620"; part "1"; 
    NZ_CP009990.1   RefSeq  gene    1       23      .       +       .       gene_id "CH63_RS00620"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "CH63_RS00620"; part "2"; 
    
    samtools faidx Yersinia_pestis_Nicholisk_41.fna NZ_CP009990.1:67916-68552 > temp.fna
    samtools faidx Yersinia_pestis_Nicholisk_41.fna NZ_CP009990.1:1-23 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_Nicholisk_41    ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCAGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCGGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGA
    
    #grep "yopE" selected_gtf_files/Yersinia_pseudotuberculosis_FDAARGOS_581.gtf
    NZ_CP033712.1   RefSeq  gene    69663   70035   .       +       .       gene_id "EGX47_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "EGX47_RS00005"; old_locus_tag "EGX47_00005"; part "1"; 
    NZ_CP033712.1   RefSeq  gene    1       287     .       +       .       gene_id "EGX47_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "EGX47_RS00005"; old_locus_tag "EGX47_00005"; part "2"; 
    
    samtools faidx Yersinia_pseudotuberculosis_FDAARGOS_581.fna NZ_CP033712.1:69663-70035 > temp.fna
    samtools faidx Yersinia_pseudotuberculosis_FDAARGOS_581.fna NZ_CP033712.1:1-287 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pseudotuberculosis_FDAARGOS_581    ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCAGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCAGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGA
    
    #------------------------------- yopD (+2) -------------------------------
    #grep "yopD" selected_gtf_files/Yersinia_enterocolitica_YE165.gtf
    NZ_CP016933.1   RefSeq  gene    74497   74497   .       +       .       gene_id "BB936_RS22270"; transcript_id ""; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BB936_RS22270"; old_locus_tag "BB936_22265"; part "1"; 
    NZ_CP016933.1   RefSeq  gene    1       920     .       +       .       gene_id "BB936_RS22270"; transcript_id ""; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BB936_RS22270"; old_locus_tag "BB936_22265"; part "2"; 
    
    samtools faidx Yersinia_enterocolitica_YE165.fna NZ_CP016933.1:74497-74497 > temp.fna
    samtools faidx Yersinia_enterocolitica_YE165.fna NZ_CP016933.1:1-920 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_enterocolitica_YE165   ATGACAATAAATATCAAGACAGACAGCCCAATTATCACGACCGGTTCACAGCTTGATGCCATCACTACAGAGACAGTCGGGCAAAGCGGTGAGGTTAAAAAAACAGAAGACACCCGTCATGAAGCACAAGCAATAAAGAGTAGCGAGGCAAGCTTATCTCGGTCACAGGTGCCTGAATTGATCAAACCGAGTCAGGGAATCAATGTTGCATTACTGAGTAAAAGCCAGGGAGATCTTAATGGTACTTTAAGTATCTTGTTGTTGCTGTTGGAACTGGCACGTAAAGCGCGAGAAATGGGTTTGCAACAAAGGGATATAGAAAATAAAGCTACTATTTCTGCCCAAAAGGAGCAGGTAGCGGAGATGGTCAGCGGTGCAAAACTGATGATCGCCATGGCGGTGGTGTCTGGCATCATGGCTGCTACTTCTACGGTTGCTAGTGCTTTTTCTATAGCGAAAGAGGTGAAAATAGTTAAACAGGAACAAATTCTAAACAGTAACATTGCCGGCCGTGATCAACTTATTGATACAAAAATGCAGCAAATGAGTAACGCTGGTGATAAAGCGGTAAGCAGAGAGGATATCGGGAGAATATGGAAACCAGAGCAGGTAGCGGATCAAAATAAGCTGGCATTATTGGATAAAGAATTCAGAATGACCGACTCAAAAGCCAATGCGTTTAATGCCGCAACGCAGCCGTTAGGACAAATGGCAAACAGTGCGATTCAAGTTCATCAAGGGTATTCTCAAGCCGAGGTCAAAGAAAAAGAAGTCAATGCAAGTATTGCTGCCAACGAGAAGCAAAAAGCCGAAGAGGCGATGAACTATAATGATAACTTTATGAAAGATGTCCTGCGCTTGATTGAACAATATGTTAGCAGTCATACTCACGCCATGAAAGCCGCTTTTGGTGTTGTCTGA
    
    #grep "yopD" selected_gtf_files/Yersinia_pseudotuberculosis_IP32953_bis.gtf
    NZ_CP009711.1   RefSeq  gene    68202   68525   .       +       .       gene_id "BZ17_RS00160"; transcript_id ""; db_xref "GeneID:66841050"; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BZ17_RS00160"; part "1"; 
    NZ_CP009711.1   RefSeq  gene    1       597     .       +       .       gene_id "BZ17_RS00160"; transcript_id ""; db_xref "GeneID:66841050"; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BZ17_RS00160"; part "2"; 
    
    samtools faidx Yersinia_pseudotuberculosis_IP32953_bis.fna NZ_CP009711.1:68202-68525 > temp.fna
    samtools faidx Yersinia_pseudotuberculosis_IP32953_bis.fna NZ_CP009711.1:1-597 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pseudotuberculosis_IP32953_bis ATGACAATAAATATCAAGACAGACAGCCCAATTATCACGACCGGTTCACAGCTTGATGCCATCACTACAGAGACAGTCAAGCAAAGCGGTGAGATTAAAAAAACAGAAGACACCCGTCATGAAGCACAAGCAATAAAGAGTAGCGAGGCAAGCTTATCTCGGTCACAGGTGCCAGAATTGATCAAACCGAGCCAGGGAATCAATGTTGCATTACTGAGTAAAAGCCAGGGTGATCTTAATGGTACTTTAAGTATCTTGTTGTTGCTGTTGGAACTGGCACGTAAAGCGCGAGAAATGGGTTTGCAACAAAGGGATATAGAAAATAAAGCTACTATTACTGCCCAAAAGGAGCAGGTAGCGGAGATGGTCAGCGGTGCAAAACTGATGATCGCCATGGCGGTGGTGTCTGGCATCATGGCTGCTACTTCTACGGTTGCTAGTGCTTTTTCTATAGCGAAAGAGGTGAAAATAGTTAAACAGGAACAAATTCTAAACAGTAATATTGCTGGCCGCGAACAACTTATTGATACAAAAATGCAGCAAATGAGTAACATTGGTGATAAAGCGGTAAGCAGAGAGGATATCGGGAGAATATGGAAACCAGAGCAGGTAGCGGATCAAAATAAGCTGGCATTATTGGATAAAGAATTCAGAATGACCGACTCAAAAGCCAATGCGTTTAATGCCGCAACGCAGCCGTTAGGACAAATGGCAAACAGTGCGATTCAAGTTCATCAAGGGTATTCTCAAGCCGAGGTCAAAGAGAAAGAAGTCAATGCAAGTATTGCTGCCAACGAGAAGCAAAAAGCCGAAGAGGCGATGAACTATAATGATAACTTTATGAAAGATGTCCTGCGCTTGATTGAACAATATGTTAGCAGTCATACTCACGCCATGAAAGCCGCTTTTGGTGTTGTCTGA
    
    #------------------------------- yopM (+2) -------------------------------
    #grep "yopM" selected_gtf_files/Yersinia_pestis_FDAARGOS_602.gtf
    NZ_CP033695.1   RefSeq  gene    69663   70174   .       +       .       gene_id "EGX42_RS00660"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "EGX42_RS00660"; old_locus_tag "EGX42_00655"; part "1"; 
    NZ_CP033695.1   RefSeq  gene    1       592     .       +       .       gene_id "EGX42_RS00660"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "EGX42_RS00660"; old_locus_tag "EGX42_00655"; part "2"; 
    
    samtools faidx Yersinia_pestis_FDAARGOS_602.fna NZ_CP033695.1:69663-70174 > temp.fna
    samtools faidx Yersinia_pestis_FDAARGOS_602.fna NZ_CP033695.1:1-592 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_FDAARGOS_602    ATGTTCATAAATCCAAGAAATGTATCTAATACTTTTTTGCAAGAACCATTACGTCATTCTTCTAATTTAACTGAGATGCCGGTTGAGGCAGAAAATGTTAAATCTAAGACTGAATATTATAATGCATGGTCGGAATGGGAACGAAATGCCCCTCCGGGGAATGGTGAACAGAGGGAAATGGCGGTTTCAAGGTTACGAGATTGCCTGGACCGACAAGCCCATGAGCTAGAACTAAATAATCTGGGGCTGAGTTCTTTGCCGGAATTACCTCCGCATTTAGAGAGTTTAGTGGCGTCATGTAATTCTCTTACAGAATTACCGGAATTACCGCAGAGCCTGAAATCACTTCTAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCACCTTTACTGGAATATTTAGGTGTCTCTAATAATCAGCTGGAAAAATTGCCAGAGTTGCAAAACTCGTCCTTCTTGAAAATTATTGATGTTGATAACAATTCACTGAAAAAACTACCTGATTTACCTCCTTCACTGGAGTTTATTGCTGCTGGTAATAATCAGCTGGAAGAATTGCCAGAGTTGCAAAACTTGCCCTTCTTGACTACGATTTATGCTGATAACAATTTACTGAAAACATTACCCGATTTACCCCCTTCCCTGGAAGCACTTAATGTCAGAGATAATTATTTAACTGATCTGCCAGAATTACCGCAGAGTTTAACCTTCTTAGATGTTTCTGAAAATATTTTTTCTGGATTATCGGAATTGCCACCAAACTTGTATTATCTCAATGCATCCAGCAATGAAATAAGATCCTTATGCGATTTACCCCCTTCACTGGAAGAACTTAATGTCAGTAATAATAAGTTGATCGAACTGCCAGCGTTACCTCCACGCTTAGAACGTTTAATCGCTTCATTTAATCATCTTGCTGAAGTACCTGAATTGCCGCAAAACCTGAAACAGCTCCACGTAGAGTACAACCCTCTGAGAGAGTTTCCCGATATACCTGAGTCAGTGGAAGATCTTCGGATGAACTCTGAACGTGTAGTTGATCCATATGAATTTGCTCATGAGACTACAGACAAACTTGAAGATGATGTATTTGAGTAG
    
    #grep "yopM" selected_gtf_files/Yersinia_pseudotuberculosis_PB1+_bis.gtf
    NZ_CP009779.1   RefSeq  gene    69708   69812   .       +       .       gene_id "BZ16_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "BZ16_RS00005"; old_locus_tag "BZ16_4135"; part "1"; 
    NZ_CP009779.1   RefSeq  gene    1       1485    .       +       .       gene_id "BZ16_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "BZ16_RS00005"; old_locus_tag "BZ16_4135"; part "2"; 
    
    samtools faidx Yersinia_pseudotuberculosis_PB1+_bis.fna NZ_CP009779.1:69708-69812 > temp.fna
    samtools faidx Yersinia_pseudotuberculosis_PB1+_bis.fna NZ_CP009779.1:1-1485 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pseudotuberculosis_PB1+_bis    ATGTTCATAAATCCAAGAAATGTATCTAATACTTTTTTGCAAGAACCATTACGTCATTCTTCTAATTTAACTGAGATGCCGGTTGAGGCAGAAAATGTTAAATCTAAGACTGAATATTATAATGCATGGTCGGAATGGGAACGAAATGCCCCTCCGGGGAATGGTGAACAGAGGGAAATGGCGGTTTCAAGGTTACGAGATTGCCTGGACCGACAAGCCCATGAGCTAGAACTAAATAATCTGGGGCTGAGTTCTTTGCCGGAATTACCTCCGCATTTAGAGAGTTTAGTGGCGTCATGTAATTCTCTTACAGAATTACCGGAATTGCCGCAGAGCCTGAAATCACTTCAAGTTGAAAATAACAATCTGAAGGCATTACCCGATTTACCCCCTTCCCTGAAAAAACTTCATGTCAGAGAAAATGATTTAACTGATCTGCCAGAATTACCGCAGAGCCTGGAATCACTTCGAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCTCCTTCACTGGAATATCTTACTGCTAGTAGTAATAAGCTGGAAGAATTGCCAGAGTTGCAAAACTTGCCCTTCTTGGCTGCGATTTATGCTGATAACAATTTACTGGAAACATTACCCGATTTACCCCCTTCCCTGAAAAAACTTCATGTCAGAGAAAATGATTTAACTGATCTGCCAGAATTACCGCAGAGCCTGGAATCACTTCAAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCTCCTTCACTGGAATATCTTACTGCTAGTAGTAATAAGCTGGAAGAATTGCCAGAGTTGCAAAACTTGCCCTTCTTGGCTGCGATTTATGCTGATAACAATTTACTGGAAACATTACCCGATTTACCCCCACATTTAGAGATTTTAGTGGCGTCATATAATTCTCTTACTGAATTACCGGAATTGCCGCAGAGCCTGAAATCACTTCGAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCTCCTTCACTGGAATATCTTACTGCTAGTAGTAATAAGCTGGAAGAATTACCAGAGTTGCAAAACTTGCCCTTCTTGGCTGCGATTTATGCTGATAACAATTTACTGGAAACATTACCCGATTTACCCCCTTCCCTGAAAAAACTTCATGTCAGAGAAAATGATTTAACTGATCTGCCAGAATTACCGCAGAGTTTAACCTTCTTAGATGTTTCTGATAATAATATTTCTGGATTATCGGAATTGCCACCAAACTTGTATTATCTCGATGCATCCAGCAATGAAATAAGATCCTTATGCGATTTACCTCCTTCACTGGTAGACCTTAATGTCAAAAGTAATCAGTTGAGCGAACTGCCAGCGTTACCTCCACACTTAGAACGTTTAATCGCTTCATTTAATTATCTTGCTGAAGTACCTGAATTGCCGCAAAACCTGAAACAGCTCCACGTAGAGCAAAACGCTCTGAGAGAGTTTCCCGATATACCTGAGTCATTGGAAGAGCTTGAGATGGACTCTGAACGTGTAGTTGATCCATATGAATTTGCTCATGAGACTACAGACAAACTTGAAGATGATGTATTTGAGTAG
    
    #------------------------------- yopO (+9) -------------------------------
    #grep "yopO" selected_gtf_files/Yersinia_enterocolitica_YE165.gtf
    NZ_CP016933.1   RefSeq  gene    11705   13893   .       -       .       gene_id "BB936_RS22335"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "BB936_RS22335"; old_locus_tag "BB936_22330"; pseudo "true"; 
    
    samtools faidx Yersinia_enterocolitica_YE165.fna NZ_CP016933.1:11705-13893 > temp.fna
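    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 11705-13893.rev (reverse complement, since the gene is on the '-' strand)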
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 11705-13893.rev > temp_.fna
    
    Yersinia_enterocolitica_YE165   ATGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCTAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGCTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGAGACACTCCATGCAGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGTTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGAAGCCTTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAATCAGCGGTTGGTGGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTCCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAAGCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGATAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTCATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTTTCAACCCTTCTACATGGTATCGAAGGTTTTGAGAAAGATCCGGAGATAAAACCTAATCAAGGACTGAGATCCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTACCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATTGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACTCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTCTCTGATTTGCTTAGGACGCATTTGAGTAGTGCAGCAACTAAGCAATTGGATATGGGGGTGGTTTTGTCGGATCTTGATACCATGTTGGTGACACTCGACAAGGCCGAACGCGAGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGCGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAGTTCCAGTGCGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAGCCTTCACTGCAGAGGATCCAAAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACCGCAAGGGCAGCCCGTGTCCTCTGAAACCTACAGCTTCCTGAATCGATTAGCTGAGGCTAAGGTCACCTTGTCGCAGCAATTGGATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAACTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACC
GCAATTCACCGTCAGATGATGGCGGCCCATGCAGCCATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGCGGACTCTATTCCACTACTGATTCGACTTGGACGAAGCAGTTTAATAGATGAGCATTTGGTTGAACAGAGAGAGAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_enterocolitica_YE3.gtf
    NZ_CP016943.1   RefSeq  gene    12782   14970   .       -       .       gene_id "BED35_RS00550"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "BED35_RS00550"; old_locus_tag "BED35_00550"; pseudo "true"; 
    
    samtools faidx Yersinia_enterocolitica_YE3.fna NZ_CP016943.1:12782-14970 > temp.fna
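    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 12782-14970.rev (reverse complement, since the gene is on the '-' strand)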
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 12782-14970.rev > temp_.fna
    
    Yersinia_enterocolitica_YE3 ATGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCTAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGCTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGAGACACTCCATGCAGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGTTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGAAGCCTTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAATCAGCGGTTGGTGGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTCCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAAGCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGATAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTCATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTTTCAACCCTTCTACATGGTATCGAAGGTTTTGAGAAAGATCCGGAGATAAAACCTAATCAAGGACTGAGATCCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTACCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATTGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACTCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTCTCTGATTTGCTTAGGACGCATTTGAGTAGTGCAGCAACTAAGCAATTGGATATGGGGGTGGTTTTGTCGGATCTTGATACCATGTTGGTGACACTCGACAAGGCCGAACGCGAGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGCGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAGTTCCAGTGCGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAGCCTTCACTGCAGAGGATCCAAAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACCGCAAGGGCAGCCCGTGTCCTCTGAAACCTACAGCTTCCTGAATCGATTAGCTGAGGCTAAGGTCACCTTGTCGCAGCAATTGGATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAACTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAA
TTCACCGTCAGATGATGGCGGCCCATGCAGCCATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGCGGACTCTATTCCACTACTGATTCGACTTGGACGAAGCAGTTTAATAGATGAGCATTTGGTTGAACAGAGAGAGAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_enterocolitica_YE6.gtf
    NZ_CP016937.1   RefSeq  gene    4748707 4750895 .       -       .       gene_id "BED33_RS21960"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "BED33_RS21960"; old_locus_tag "BED33_21960"; pseudo "true"; 
    
    samtools faidx Yersinia_enterocolitica_YE6.fna NZ_CP016937.1:4748707-4750895 > temp.fna
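    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 4748707-4750895.rev (reverse complement, since the gene is on the '-' strand)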
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 4748707-4750895.rev > temp_.fna
    
    Yersinia_enterocolitica_YE6 ATGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCTAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGCTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGAGACACTCCATGCAGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGTTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGAAGCCTTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAATCAGCGGTTGGTGGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTCCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAAGCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGATAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTCATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTTTCAACCCTTCTACATGGTATCGAAGGTTTTGAGAAAGATCCGGAGATAAAACCTAATCAAGGACTGAGATCCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTACCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATTGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACTCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTCTCTGATTTGCTTAGGACGCATTTGAGTAGTGCAGCAACTAAGCAATTGGATATGGGGGTGGTTTTGTCGGATCTTGATACCATGTTGGTGACACTCGACAAGGCCGAACGCGAGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGCGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAGTTCCAGTGCGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAGCCTTCACTGCAGAGGATCCAAAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACCGCAAGGGCAGCCCGTGTCCTCTGAAACCTACAGCTTCCTGAATCGATTAGCTGAGGCTAAGGTCACCTTGTCGCAGCAATTGGATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAACTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAA
TTCACCGTCAGATGATGGCGGCCCATGCAGCCATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGCGGACTCTATTCCACTACTGATTCGACTTGGACGAAGCAGTTTAATAGATGAGCATTTGGTTGAACAGAGAGAGAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_pestis_790.gtf
    
    #grep "yopO" selected_gtf_files/Yersinia_pestis_FDAARGOS_601.gtf
    NZ_CP033697.1   RefSeq  gene    68815   70300   .       +       .       gene_id "EGX46_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "EGX46_RS00005"; old_locus_tag "EGX46_00005"; part "1"; 
    NZ_CP033697.1   RefSeq  gene    1       713     .       +       .       gene_id "EGX46_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "EGX46_RS00005"; old_locus_tag "EGX46_00005"; part "2";
    
    samtools faidx Yersinia_pestis_FDAARGOS_601.fna NZ_CP033697.1:68815-70300 > temp.fna
    samtools faidx Yersinia_pestis_FDAARGOS_601.fna NZ_CP033697.1:1-713 >> temp.fna
    #manually delete the second FASTA header line (">...") in temp.fna so that both parts form a single record
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_FDAARGOS_601    ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGA
GCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_pestis_Harbin_35.gtf
    NC_017263.1     RefSeq  gene    49729   51926   .       -       .       gene_id "YPC_RS21300"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "YPC_RS21300"; pseudo "true"; 
    
    samtools faidx Yersinia_pestis_Harbin_35.fna NC_017263.1:49729-51926 > temp.fna
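    revseq  #EMBOSS revseq, run interactively on temp.fna; writes 49729-51926.rev (reverse complement, since the gene is on the '-' strand)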
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 49729-51926.rev > temp_.fna
    
    Yersinia_pestis_Harbin_35   ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGT
ATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_pestis_Harbin_35_bis.gtf
    NZ_CP009703.1   RefSeq  gene    55189   57386   .       +       .       gene_id "CH55_RS00985"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "CH55_RS00985"; old_locus_tag "CH55_4357"; pseudo "true"; 
    
    samtools faidx Yersinia_pestis_Harbin_35_bis.fna NZ_CP009703.1:55189-57386 > temp.fna
    sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna
    
    Yersinia_pestis_Harbin_35_bis   ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAG
CAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_pestis_Java9.gtf
    NZ_CP009995.1   RefSeq  gene    76131   77073   .       -       .       gene_id "CH62_RS22640"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "CH62_RS22640"; part "2";
    NZ_CP009995.1   RefSeq  gene    1       1256    .       -       .       gene_id "CH62_RS22640"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "CH62_RS22640"; part "1"; 
    
    samtools faidx Yersinia_pestis_Java9.fna NZ_CP009995.1:76131-77073 > temp.fna
    samtools faidx Yersinia_pestis_Java9.fna NZ_CP009995.1:1-1256 >> temp.fna
    #delete the second ">****"
    revseq
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 76131-77073.rev > temp_.fna
    
    Yersinia_pestis_Java9   ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATA
CCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
    
    #grep "yopO" selected_gtf_files/Yersinia_pestis_Nicholisk_41.gtf
    NZ_CP009990.1   RefSeq  gene    47448   49645   .       -       .       gene_id "CH63_RS00925"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "CH63_RS00925"; old_locus_tag "CH63_4306"; pseudo "true"; 
    
    samtools faidx Yersinia_pestis_Nicholisk_41.fna NZ_CP009990.1:47448-49645 > temp.fna
    revseq
    sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 47448-49645.rev > temp_.fna
    
    Yersinia_pestis_Nicholisk_41    ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAG
CAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
  6. manually correct point nucleotide errors in the sequences according to ${yop}_seq_additional.aln, then add the corrected sequences to ${yop}_seq.txt (time-consuming)

    # For every Yop gene: collect the lines of ${yop}_seq.txt that mention
    # Yersinia_enterocolitica_WA, then align that subset with MAFFT and write
    # the result in Clustal format.
    for yop in yopJ yopB yopT yopE yopD yopM yopK yopO yopH; do
        additional="${yop}_seq_additional"
        grep "Yersinia_enterocolitica_WA" "${yop}_seq.txt" > "${additional}.fasta"
        mafft --adjustdirection --clustalout "${additional}.fasta" > "${additional}.aln"
    done
  7. from ${yop}_seq.txt –> ${yop}_protein.fasta –> ${yop}_aligned_protein.fasta

    cd data/yop_files
    # Per Yop gene: convert ${yop}_seq.txt into ${yop}_protein.fasta with
    # txt_to_protein.py, then align it with protein_alignment.py (mafft option).
    for yop in yopJ yopB yopT yopE yopD yopM yopK yopO yopH; do
        python3 txt_to_protein.py "${yop}_seq.txt" "${yop}_protein.fasta"
        #NOTE: the alignment sometimes does not work well because manually added sequences are missing bases!
        python3 protein_alignment.py "${yop}_protein.fasta" "${yop}_aligned_protein.fasta" mafft
        #awk -F '_' '/^>/ { printf(">%s", $3); for (i = 4; i <= NF; ++i) printf("_%s", $i); printf("\n"); next } { print }' ${yop}_aligned_protein.fasta > ${yop}_aligned_protein_.fasta
    done
    
    # Install mamba from conda-forge into the currently active environment
    # (the "-n base" variant is left commented out), then create the environment
    # described by environment.yml.
    conda install mamba -c conda-forge  #-n base
    mamba env create -f environment.yml
    
    grep ">" yopB_seq.txt | wc -l
    67 --> 73
    grep ">" yopJ_seq.txt | wc -l  #*
    67 --> 72
    grep ">" yopT_seq.txt | wc -l
    64 --> 73
    grep ">" yopE_seq.txt | wc -l
    70 --> 73
    grep ">" yopD_seq.txt | wc -l
    71 --> 73
    grep ">" yopM_seq.txt | wc -l
    70 --> 71 --> 73
    grep ">"  yopK_seq.txt  | wc -l
    73
    grep ">" yopO_seq.txt | wc -l  #*
    64 --> 72
    grep ">" yopH_seq.txt | wc -l
    73
  8. cluster all sequences in each ${yop}_aligned_protein.fasta (e.g. yopM_aligned_protein.fasta): all 100% identical sequences are grouped into one cluster. For each cluster, output one record as the representative, and produce a table listing all members of each cluster.

    for yop in yopJ yopB yopT yopE yopD yopM yopK yopO yopH; do
      # Cluster at 100% identity: centroids go to *_clustered.fasta, the
      # cluster assignments to *_clusters.uc.
      usearch -cluster_fast "${yop}_aligned_protein.fasta" -id 1.0 -centroids "${yop}_clustered.fasta" -uc "${yop}_clusters.uc"
      # Parse the usearch output to give a list of members for each cluster.
      python3 ~/Scripts/yop_analysis/parse_uc_file.py "${yop}_clusters.uc" > "${yop}_clusters.txt"
      # Strip the list decoration (Members: ['a', 'b']) down to plain
      # comma-separated names; the expressions run in this order on every line.
      sed -i \
          -e "s/Members: \['//g" \
          -e "s/'\]//g" \
          -e "s/', '/, /g" \
          -e "s/, /,/g" \
          "${yop}_clusters.txt"
      # Drop the first comma-separated field and sort the remaining rows.
      cut -d',' -f2- "${yop}_clusters.txt" | sort > "${yop}_clusters_.txt"
    done
    
    # Collect the per-gene member tables into a single spreadsheet.
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
        yopJ_clusters_.txt yopB_clusters_.txt yopT_clusters_.txt \
        yopE_clusters_.txt yopD_clusters_.txt yopM_clusters_.txt \
        yopK_clusters_.txt yopO_clusters_.txt yopH_clusters_.txt \
        -o yop_clusters.xls
    
    # Align the cluster centroids, then run sort_fasta2.py to write the
    # *_sorted_selected_aligned_protein.fasta files.
    for yop in yopJ yopB yopT yopE yopD yopM yopK yopO yopH; do
      python3 protein_alignment.py "${yop}_clustered.fasta" "${yop}_clustered_aligned_protein.fasta" mafft
      python3 sort_fasta2.py "${yop}_clustered_aligned_protein.fasta" "${yop}_sorted_selected_aligned_protein.fasta"
    done
  9. draw alignments

    library(ggmsa)
    library(ggplot2)
    library(ggtree)
    #library(gggenes)
    library(ape)
    library(Biostrings)
    library(ggnewscale)
    library(dplyr)
    library(ggtreeExtra)
    library(phangorn)
    library(RColorBrewer)
    library(patchwork)
    library(ggplotify)
    library(aplot)
    library(magick)
    library(treeio)
    
    # Draw one faceted multiple-sequence-alignment figure per Yop protein.
    #
    # yop    : gene name; the input is "<yop>_sorted_selected_aligned_protein.fasta"
    #          and the output is "alignment_<yop>.png".
    # height : PNG height passed to png().
    # field  : number of alignment columns per facet row (passed to facet_msa()).
    # width  : PNG width passed to png().
    # Returns the ggplot object invisibly.
    plot_yop_alignment <- function(yop, height, field = 50, width = 1100) {
      fasta <- paste0(yop, "_sorted_selected_aligned_protein.fasta")
      tidymsa <- tidy_msa(fasta)
      msa_plot <- ggplot() +
        geom_msa(data = tidymsa, char_width = 0.5, seq_name = TRUE,
                 show.legend = TRUE) +
        theme_msa() +
        facet_msa(field)
      png(paste0("alignment_", yop, ".png"), width = width, height = height)
      # Close the device even when rendering fails, so no device stays open.
      on.exit(dev.off(), add = TRUE)
      # Explicit print(): auto-printing only happens at top level, so without it
      # the PNG would be empty when this code is source()d or called in a function.
      print(msa_plot)
      invisible(msa_plot)
    }
    
    # The "#<a> --> <b>" notes are kept from the original records; presumably
    # alignment length --> number of facet rows (TODO confirm).
    
    #219 --> 5
    plot_yop_alignment("yopE", height = 800 * 1.2)
    
    #288 --> 6
    plot_yop_alignment("yopJ", height = 192 * 6)
    
    #306 --> 7
    plot_yop_alignment("yopD", height = 192 * 6)
    
    #529 --> 11
    plot_yop_alignment("yopM", height = 192 * 12)
    
    #182 --> 4
    plot_yop_alignment("yopK", height = 192 * 4)
    
    #732 --> 15
    plot_yop_alignment("yopO", height = 192 * 15)
    
    # -- RERUN due to the one-letter-in-last-line Bug 
    #401 --> 9 --> 8
    # 51 columns per row instead of 50 (see the RERUN note above).
    plot_yop_alignment("yopB", height = 192 * 8, field = 51)
    
    # -- RERUN due to Error in tidy_msa(data) : Sequences must have unique names --
    #322 --> 7 --> delete the repeated Yersinia_pestis_D182038 --> merge the two partial CDS into one
    plot_yop_alignment("yopT", height = 192 * 8)
    
    #468 --> 10 --> delete the repeated Yersinia_enterocolitica_YE6
    plot_yop_alignment("yopH", height = 192 * 10)
  10. blast search and mauve analysis (mauve should be opened under bengal3_ac3)

    # Build a nucleotide BLAST database from the 790 genome, then search it
    # with the WA yopJ and yopO sequences.
    makeblastdb -in Yersinia_pestis_790.fna -dbtype nucl
    for yop in yopJ yopO; do
        blastn -query "${yop}_WA.fasta" -db Yersinia_pestis_790.fna -out "${yop}_WA_on_790.txt"
    done

prepare virus X14112 gtf for nextflow running

  1. install conda environment rnaseq_2021

    # Create the rnaseq_2021 environment (Python 3.6.7) and install the pipeline
    # tools with pinned versions. Every install uses the same channel order:
    # conda-forge, bioconda, defaults.
    conda create -n rnaseq_2021 python=3.6.7
    conda activate rnaseq_2021
    conda install -c conda-forge -c bioconda -c defaults  nextflow=21.04 fastqc=0.11.8 trim-galore=0.5.0 star=2.6.1d hisat2=2.1.0
    conda install -c conda-forge -c bioconda -c defaults  picard=2.18.27 csvtk=0.17.0 preseq=2.0.3 
    conda install -c conda-forge -c bioconda -c defaults  samtools=1.9
    conda install -c conda-forge -c bioconda -c defaults  gffread=0.9.12
    conda install -c conda-forge -c bioconda -c defaults  subread=1.6.4
    conda install -c conda-forge -c bioconda -c defaults  deeptools=3.2.0
    conda install -c conda-forge -c bioconda -c defaults  multiqc=1.7    #*
    conda install -c conda-forge -c bioconda -c defaults  conda-forge::r-data.table=1.12.0 conda-forge::r-gplots=3.0.1.1 bioconductor-dupradar=1.12.1 bioconductor-edger=3.24.1 
    conda install -c conda-forge -c bioconda -c defaults  stringtie=1.3.5
    conda install -c conda-forge -c bioconda -c defaults  rseqc=3.0.0
  2. prepare the virus gtf-file

    # -- processing for virus gtf --
    # #gffread X14112.1_gene.gff -T -o X14112.1_gene.gtf
    # cp X14112.1.gff X14112.1.gff3
    # #gffread -E -F --bed X14112.1.gff3 -o X14112.1.bed (change the name errors in 1 intron and 2 genes)
    # grep "^##" X14112.1.gff3 > X14112.1_gene.gff3
    # 
    # # --try to filter the file with genes --> failed --
    # grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff
    # cp X14112.1.gff3 X14112.1.gff
    
    # -- generating *_gene.gtf file containing only gene records --
    python3 gff2gtf.py
    # -- check if gene_id is unique --
    # Keep field 9 onwards (tab-delimited) of every record.
    cut -f9- -d$'\t' X14112.1_gene.gtf > temp
    # Keep only the first ';'-separated entry of that text.
    cut -f1 -d';' temp > temp_  #111
    # Compare the sorted list with its de-duplicated version: every line that
    # diff reports is an entry occurring more than once.
    sort temp_ > temp_1
    sort temp_ -u > temp_2
    diff temp_1 temp_2
    #39d38
    #< ID=gene-UL29
    #59d57
    #< ID=gene-UL43
    #--> delete short ones of the repeated records --> 109 records
    
    python3 extends.py #generating the file X14112.1_gene_extended.gtf
    #Then replace 'transcript_id "exon' --> 'transcript_id "rna' in X14112.1_gene_extended.gtf
    
    gffread -E -F --bed X14112.1_gene_extended.gtf -o X14112.1_gene_extended.bed
    #-->bed contains 109 transcript-name for example "rna-gene-RS1-2"
    
    ##!!!!OPTIONAL!!!!: don't need to change type '\tgene\t' to '\texon\t', since X14112.1_gene_extended.gtf contains exon-records.
    # The --fasta/--gtf/--bed12 names must match the X14112.1 files produced in this step
    # (X14112.1_gene_extended.gtf and X14112.1_gene_extended.bed).
    nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "X14112.1.fasta" --gtf "X14112.1_gene_extended.gtf" --bed12 "X14112.1_gene_extended.bed"  --singleEnd -profile standard --aligner star --fcGroupFeaturesType gene_biotype --skip_genebody_coverage -resume --saveReference
    
    # -- correct some special records (optional, as the processing above didn't generate the records) --
    # delete the lines starting with "#"
    # replace "X14112.1:146805..151063" to IE175
    # replace "X14112.1:133941..146107" to IE68
    # add ;gene=IE68 ;gene=IE175 to the corresponding lines
    # -- python code for convert gff to gtf --
    # open the input file for reading and the output file for writing
    
    # -- scripts choose gene or exon or mRNA --
    # python3 gff2gtf.py #X14112.1.gff-->X14112.1.gtf
    # replace '; transcript_id "gene' to '; transcript_id "tx'
    # !!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'! 
    # sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene.gff # since default is --featurecounts_feature_type 'exon'. 
  3. nextflow command: the input should be *.umi_extract.fastq.gz.

    #SUCCESSFUL
    (rnaseq) jhuang@hamburg:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ /home/jhuang/anaconda3/bin/nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner hisat2 --fcGroupFeaturesType gene_biotype --skip_genebody_coverage --skip_preseq -resume --saveReference
    
    #NOT_TESTED
    (rnaseq_2021) jhuang@hamm:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner star  -resume --saveReference
  4. snippet of the human hg38 gtf served as a pattern

    1       ensembl_havana  gene    685679  686673  .       -       .       gene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
    
    1       ensembl_havana  transcript      685679  686673  .       -       .       gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
    
    1       ensembl_havana  exon    685679  686673  .       -       .       gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; exon_id "ENSE00002324228"; exon_version "3"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
    
    1       ensembl_havana  CDS     685719  686654  .       -       0       gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; protein_id "ENSP00000329982"; protein_version "2"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
    
    1       ensembl_havana  gene    1211340 1214153 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
    
    1       havana  transcript      1211340 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";
    
    1       havana  exon    1213983 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2";
    1       havana  exon    1212992 1213785 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1212638 1212704 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1211942 1212138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2";
    1       ensembl_havana  exon    1211704 1211832 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001333051"; exon_version "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  CDS     1211704 1211832 .       -       2       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  exon    1211340 1211625 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001915458"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  CDS     1211558 1211625 .       -       2       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  stop_codon      1211555 1211557 .       -       0       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  five_prime_utr  1214128 1214153 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  three_prime_utr 1211340 1211554 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    
    1       havana  transcript      1211340 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";
    1       havana  exon    1213983 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2";
    1       havana  exon    1212992 1213785 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1212638 1212704 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1211942 1212138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1211340 1211832 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001923078"; exon_version "1"; transcript_support_level "2";
    
    1       havana  transcript      1212019 1213498 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "3";
    1       havana  exon    1213395 1213498 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001680308"; exon_version "1"; transcript_support_level "3";
    1       havana  exon    1212992 1213093 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003495433"; exon_version "1"; transcript_support_level "3";
    1       havana  exon    1212638 1212704 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "3";
    1       havana  exon    1212019 1212138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001723250"; exon_version "1"; transcript_support_level "3";
  5. generate wig files

    # Build the chrom.sizes table (sequence name <TAB> length) from the FASTA
    # index; bedGraphToBigWig needs it to lay out the bigWig file.
    samtools faidx X14112.1.fasta
    cut -f1,2 X14112.1.fasta.fai > results/markDuplicates/chrom.sizes
    cd results/markDuplicates/
    for sample in control_r1 control_r2 HSV.d2_r1 HSV.d2_r2 HSV.d4_r1 HSV.d4_r2 HSV.d6_r1 HSV.d6_r2 HSV.d8_r1 HSV.d8_r2; do
        # Alternative (deepTools) that writes the bigWig directly:
        #bamCoverage -b ${sample}.umi_extract.sorted.markDups.bam -o ${sample}.bw

        # Create the coverage bedGraph when it is missing or empty, so this step
        # also works on a fresh run; an existing bedGraph is reused untouched.
        # The output is sorted by chromosome and start because bedGraphToBigWig
        # rejects unsorted input.
        if [ ! -s "${sample}.bedgraph" ]; then
            bedtools genomecov -ibam "${sample}.umi_extract.sorted.markDups.bam" -bg \
                | LC_ALL=C sort -k1,1 -k2,2n > "${sample}.bedgraph"
        fi
        bedGraphToBigWig "${sample}.bedgraph" chrom.sizes "${sample}.bw"
        bigWigToWig "${sample}.bw" "${sample}.wig"
    done
  6. input and clean data using R

    #BiocManager::install(c("DESeq2"))
    requiredPackages1 <-c("AnnotationDbi","clusterProfiler","ReactomePA","org.Hs.eg.db","DESeq2", "gplots", "RColorBrewer")
    # ipak: install whichever of the requested packages are missing, then
    # attach all of them.
    #
    # pkg: character vector of package names (CRAN and/or Bioconductor).
    # Returns a named logical vector, TRUE for each package that was attached.
    #
    # Missing packages are installed through BiocManager::install(), which
    # resolves both CRAN and Bioconductor repositories; plain install.packages()
    # cannot find Bioconductor packages such as DESeq2 or clusterProfiler.
    ipak <- function(pkg) {
      missing_pkgs <- pkg[!(pkg %in% installed.packages()[, "Package"])]
      if (length(missing_pkgs) > 0) {
        if (!requireNamespace("BiocManager", quietly = TRUE)) {
          install.packages("BiocManager")
        }
        BiocManager::install(missing_pkgs, dependencies = TRUE,
                             ask = FALSE, update = FALSE)
      }
      # vapply (not sapply) so the result is always a logical vector.
      vapply(pkg, require, logical(1), character.only = TRUE)
    }
    ipak(requiredPackages1)
    #requiredPackages2 <- c("tidyverse")
    #ipak(requiredPackages2)
    
    #cut -f1-1 merged_gene_counts.txt > col1.txt
    #paste -d$'\t' col1.txt merged_gene_counts2.txt > merged_gene_counts3.txt
    #sed -i 's/gene-//g' merged_gene_counts3.txt
    
    #replace "X14112.1" to "X14112"; delete "rna-gene-"; get X14112.1_gene_extended2.gtf; using it in IGV
    
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts")
    
    #---- load the merged featureCounts table (10 samples) ----
    d.raw <- read.delim2("merged_gene_counts3.txt", sep = "\t", header = TRUE, row.names = 1)
    # The columns arrive in arbitrary order, each named
    # "<sample>.umi_extract.sorted.bam", e.g.
    #   HSV.d4_r2.umi_extract.sorted.bam, HSV.d6_r1.umi_extract.sorted.bam, ...

    # Samples in the order used for all downstream tables and plots.
    sample_ids <- c("control_r1", "control_r2",
                    "HSV.d2_r1", "HSV.d2_r2",
                    "HSV.d4_r1", "HSV.d4_r2",
                    "HSV.d6_r1", "HSV.d6_r2",
                    "HSV.d8_r1", "HSV.d8_r2")

    # Reorder the count columns, then strip the BAM suffix from their names.
    col.order <- paste0(sample_ids, ".umi_extract.sorted.bam")
    d <- d.raw[, col.order]
    #d <- reordered.raw[rowSums(reordered.raw>3)>2,]
    colnames(d) <- sample_ids

    # Per-sample annotation: sample id, replicate number and time point.
    ids <- factor(sample_ids)
    replicate <- factor(rep(c("r1", "r2"), times = 5))
    condition <- factor(rep(c("control", "HSV.d2", "HSV.d4", "HSV.d6", "HSV.d8"), each = 2))

    # Build the DESeqDataSet with condition as the only design factor.
    cData <- data.frame(row.names = colnames(d), replicate = replicate, condition = condition, ids = ids)
    dds <- DESeqDataSetFromMatrix(countData = d, colData = cData, design = ~condition)

    # Fit the model before transforming.
    dds <- DESeq(dds)

    # Regularised-log transform, used for the PCA and heatmaps.
    rld <- rlogTransformation(dds)
    #rld <- vst(dds)  #vsd

    #-- save the reordered raw counts --
    write.csv(d, file = "d.csv")
    #~/Tools/csv2xls-0.4/csv_to_xls.py d.csv -d$',' -o d.xls
  7. plotting pca and heatmap and remove batchEffect

    # -- pca --
    png("pca.png", 1200, 800)
    # plotPCA() returns a ggplot object. It is only drawn when printed, so
    # print() it explicitly; otherwise pca.png stays empty whenever this code is
    # run via source() or inside a function/loop.
    print(plotPCA(rld, intgroup = c("condition")))
    #plotPCA(rld, intgroup = c("condition", "batch"))
    #plotPCA(rld, intgroup = c("condition", "ids"))
    #plotPCA(rld, "batch")
    dev.off()

    # -- heatmap --
    ## pairwise Euclidean distances between samples on the rlog-transformed counts
    library(gplots)
    library("RColorBrewer")
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    #paste( rld$dex, rld$cell, sep="-" )
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,batch, sep=":"))
    rownames(mat) <- colnames(mat) <- with(colData(dds), paste(condition, ids, sep = ":"))
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
    # Open the device only once everything to be drawn has been computed, so an
    # error above cannot leave a dangling PNG device behind.
    png("heatmap.png", 1200, 800)
    heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none", col = rev(hmcol), margins = c(13, 13))
    dev.off()
  8. select the differentially expressed genes

    #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
    #https://www.biostars.org/p/282295/
    #https://www.biostars.org/p/335751/
    
    #> condition
    # [1] control control HSV.d2  HSV.d2  HSV.d4  HSV.d4  HSV.d6  HSV.d6  HSV.d8  HSV.d8 
    #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    
    #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes
    setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes")
    #---- pairwise comparisons: refit once per reference level, export every contrast ----
    # Maps each reference level to the contrasts ("<level>_vs_<reference>") that
    # are read from the model fitted with that reference. Together these are the
    # ten pairwise comparisons between the five conditions.
    comparisons <- list(
      "control" = c("HSV.d2_vs_control", "HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control"),
      "HSV.d2"  = c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2"),
      "HSV.d4"  = c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4"),
      "HSV.d6"  = c("HSV.d8_vs_HSV.d6")
    )

    # The export runs inside the relevel loop so that every contrast is written.
    # With four consecutive `clist <- ...` assignments followed by a single
    # export loop, only the last list (HSV.d8_vs_HSV.d6) would be exported.
    for (ref in names(comparisons)) {
      dds$condition <- relevel(dds$condition, ref)
      dds <- DESeq(dds, betaPrior = FALSE)
      print(resultsNames(dds))  # explicit print: no auto-printing inside a loop
      clist <- comparisons[[ref]]

      for (i in clist) {
        # Coefficient name as reported by resultsNames(), e.g. "condition_HSV.d2_vs_control".
        contrast <- paste("condition", i, sep = "_")
        res <- results(dds, name = contrast)
        res <- res[!is.na(res$log2FoldChange), ]
        res_df <- as.data.frame(res)

        # All genes, most significant first.
        write.csv(res_df[order(res_df$pvalue), ], file = paste(i, "all.txt", sep = "-"))

        # Significant genes: padj <= 0.05 and |log2FC| >= 1.2. subset() drops
        # rows whose padj is NA.
        up <- subset(res_df, padj <= 0.05 & log2FoldChange >= 1.2)
        down <- subset(res_df, padj <= 0.05 & log2FoldChange <= -1.2)
        write.csv(up[order(up$log2FoldChange, decreasing = TRUE), ], file = paste(i, "up.txt", sep = "-"))
        write.csv(down[order(abs(down$log2FoldChange), decreasing = TRUE), ], file = paste(i, "down.txt", sep = "-"))
      }
    }
    # After the loop dds is fitted with reference level "HSV.d6" and clist holds
    # "HSV.d8_vs_HSV.d6".
    
    ##https://github.com/kevinblighe/EnhancedVolcano
    #BiocManager::install("EnhancedVolcano")
    library("EnhancedVolcano")
    # Shell (bash) loop, not R: for each contrast it prints the R statements that
    # draw one EnhancedVolcano plot into "<contrast>.png".
    # NOTE(review): presumably the printed text is then pasted into the R session
    # that holds `dds` - confirm.
    # The three disabled loop headers below list the other contrast groups.
    # NOTE(review): results(dds, name=...) presumably only resolves for contrasts
    # of the reference level dds was last fitted with - confirm before switching.
    #for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control; do
    #for i in HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2; do
    #for i in HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4; do
    for i in HSV.d8_vs_HSV.d6; do
      echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
      echo "res = results(dds, name=contrast)"
      # Disabled variant: inside a double-quoted echo the shell itself would
      # expand $log2FoldChange, so na.omit() is emitted instead.
      #echo "res <- res[!is.na(res$log2FoldChange),]"
      echo "res <- na.omit(res)"
      echo "res_df <- as.data.frame(res)"
      #selectLab = selectLab_italics,
      echo "png(\"${i}.png\",width=1200, height=1000)"
      #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
      # The subtitle is built from the contrast name split on "_":
      # field 1 and field 3, e.g. "HSV.d8 versus HSV.d6".
      echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
      echo "dev.off()"
    done
    
    # Run in a shell (not in R) from inside the degenes directory.
    # The loop only prints one csv_to_xls.py command per contrast; each command
    # bundles the -all, -up and -down tables of that contrast into "<contrast>.xls".
    # NOTE(review): presumably the printed commands are then executed by hand - confirm.
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
      echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
    done

9 (optional). clustering the genes and draw heatmap

    install.packages("gplots")
    library("gplots")

    # Shell: print the cut commands that copy column 1 of every up/down table
    # into "<contrast>-up.id" / "<contrast>-down.id".
    # The loop only echoes the commands, so the .id files do not exist until the
    # printed commands have been run.
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
      echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
      echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
    done

    # Merge all .id files into one de-duplicated list.
    cat *.id | sort -u > ids
    # Manual step: edit `ids` so that its first line is the header Gene_Id and
    # remove the "" entry (presumably the empty header cell of the row-name
    # column - confirm).
    #add Gene_Id in the first line, delete the ""
    # Genes of interest: every gene that is up or down in at least one contrast
    # (file `ids`, one ID per line under the header Gene_Id).
    GOI <- read.csv("ids")$Gene_Id
    RNASeq.NoCellLine <- assay(rld)

    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    datamat <- RNASeq.NoCellLine[GOI, ]
    write.csv(as.data.frame(datamat), file = "significant_gene_expressions.txt")

    # hr clusters genes (rows) on 1 - Pearson correlation; hc clusters samples
    # (columns) on 1 - Spearman correlation. hc is not drawn below (Colv = NA)
    # and is kept only for interactive use.
    hr <- hclust(as.dist(1 - cor(t(datamat), method = "pearson")), method = "complete")
    hc <- hclust(as.dist(1 - cor(datamat, method = "spearman")), method = "complete")

    # Cut the gene tree at 2/3 of its height to obtain the gene clusters.
    mycl <- cutree(hr, h = max(hr$height) / 1.5)

    # One colour per cluster number. Every entry must be a valid R colour name:
    # "LIGHTRED" is not one and a repeated "MAGENTA" would give two clusters the
    # same colour, so those slots hold "TOMATO" and "PURPLE".
    cluster_palette <- c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN",
                         "MAROON", "LIGHTBLUE", "PINK", "PURPLE", "LIGHTCYAN", "TOMATO",
                         "LIGHTGREEN")
    # Fail loudly instead of passing NA colours to heatmap.2().
    if (max(mycl) > length(cluster_palette)) {
      stop("cutree() produced ", max(mycl), " clusters but only ",
           length(cluster_palette), " colours are defined", call. = FALSE)
    }
    mycol <- cluster_palette[as.vector(mycl)]

    #png("DEGs_heatmap.png", width=900, height=800)
    #cex.lab=10, labRow="",
    png("DEGs_heatmap.png", width = 900, height = 1000)
    heatmap.2(as.matrix(datamat), Rowv = as.dendrogram(hr), Colv = NA, dendrogram = 'row',
                scale = 'row', trace = 'none', col = bluered(75),
                RowSideColors = mycol, margins = c(10, 20), cexRow = 1.5, srtCol = 45, lhei = c(2, 8))  #rownames(datamat)
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    dev.off()

    #### cluster members #####
    # Gene IDs of the first three clusters, one file per cluster.
    write.csv(names(subset(mycl, mycl == '1')), file = 'cluster1_YELLOW.txt')
    write.csv(names(subset(mycl, mycl == '2')), file = 'cluster2_DARKBLUE.txt')
    write.csv(names(subset(mycl, mycl == '3')), file = 'cluster3_DARKORANGE.txt')
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls

    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    significant_gene_expressions.txt \
    -d',' -o DEGs_heatmap_expression_data.xls;

Peak calling using homer combining sicer and macs2

  1. nextflow processing data

    V_8_1_6_p601_d8_D1_H3K4me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K4me3_D1
    V_8_1_5_p601_d8_D2_H3K4me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K4me3_D2
    V_8_1_6_p604_d8_D1_H3K4me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K4me3_D1
    V_8_1_5_p604_d8_D2_H3K4me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K4me3_D2
    V_8_1_6_p601_d8_D1_H3K27me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K27me3_D1
    V_8_1_5_p601_d8_D2_H3K27me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K27me3_D2
    V_8_1_6_p604_d8_D1_H3K27me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K27me3_D1
    V_8_1_5_p604_d8_D2_H3K27me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K27me3_D2
    V_8_1_7_p601_d8_D1_H3K9me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K9me3_D1
    V_8_1_7_p601_d8_D2_H3K9me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K9me3_D2
    V_8_1_7_p604_d8_D1_H3K9me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K9me3_D1
    V_8_1_7_p604_d8_D2_H3K9me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K9me3_D2
    V_8_1_8_p601_d8_D1_H3K27ac.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K27ac_D1
    V_8_1_8_p601_d8_D2_H3K27ac.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K27ac_D2
    V_8_1_8_p604_d8_D1_H3K27ac.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K27ac_D1
    V_8_1_8_p604_d8_D2_H3K27ac.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K27ac_D2
    
    nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/ChIPseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/Raw_Data_GEO_uploaded/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    
    nextflow run NGI-ChIPseq/main.nf --reads '/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    
    (DEBUG: Control doesn't work well!)
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K4me1_r1.fastq.gz -> ../Raw_Data_orig/SRR568344_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K4me1_r2.fastq.gz -> ../Raw_Data_orig/SRR568345_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K27ac_r1.fastq.gz -> ../Raw_Data_orig/SRR227397_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_H3K27ac_r2.fastq.gz -> ../Raw_Data_orig/SRR227398_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_Control_r1.fastq.gz -> ../Raw_Data_orig/SRR227590_1.fastq.gz
    lrwxrwxrwx 1 jhuang jhuang   37 Mai 19 11:19 NHDF-Ad_Control_r2.fastq.gz -> ../Raw_Data_orig/SRR227591_1.fastq.gz
  2. make homer directories and findPeaks with HOMER under (myperl)

    conda activate myperl
    
    #Why do I need give "-genome hg38" in makeTagDirectory?
    #If you don't provide a genome with the -genome option, HOMER will only count the number of tags in each region without any genomic context or sequence information. #So, it is essential to include this information when creating a tag directory if you plan to perform any genome-based analysis.
    
    makeTagDirectory p601_d8_D1_input ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_input ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_input ../results/picard/V_8_1_6_p604_d8_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_input ../results/picard/V_8_1_5_p604_d8_D2_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D1_H3K4me3 ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_H3K4me3 ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_H3K4me3 ../results/picard/V_8_1_6_p604_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_H3K4me3 ../results/picard/V_8_1_5_p604_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D1_H3K27me3 ../results/picard/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_H3K27me3 ../results/picard/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_H3K27me3 ../results/picard/V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_H3K27me3 ../results/picard/V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D1_H3K27ac ../results/picard/V_8_1_8_p601_d8_D1_H3K27ac.dedup.sorted.bam -genome hg38
    makeTagDirectory p601_d8_D2_H3K27ac ../results/picard/V_8_1_8_p601_d8_D2_H3K27ac.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D1_H3K27ac ../results/picard/V_8_1_8_p604_d8_D1_H3K27ac.dedup.sorted.bam -genome hg38
    makeTagDirectory p604_d8_D2_H3K27ac ../results/picard/V_8_1_8_p604_d8_D2_H3K27ac.dedup.sorted.bam -genome hg38
    
    for sample in p601_d8_D1_input p601_d8_D2_input p604_d8_D1_input p604_d8_D2_input  p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3  p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3  p601_d8_D1_H3K27ac p601_d8_D2_H3K27ac p604_d8_D1_H3K27ac p604_d8_D2_H3K27ac; do
        makeUCSCfile ${sample} -pseudo 1 -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq    -norm 1e7 -normLength 100 -fsize 1
    done
    
    # -- not necessary any more: using MACS2 and SICER instead of using findPeaks --
    # #factor (transcription factor ChIP-Seq, uses -center, output: peaks.txt,  default)
    # #histone (histone modification ChIP-Seq, region based, uses -region -size 500 -L 0, regions.txt)
    # for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    #   #Finding peaks of size 1000, no closer than 2000
    #   findPeaks ${sample}_H3K4me3 -style factor -size 1000  -o auto -i ${sample}_input
    #   #-minDist <#> (minimum distance between peaks, default: peak size x2)
    #   #findPeaks ${sample}_H3K27me3 -style histone -region -size 3000 -minDist 5000 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K27ac -style factor -size 200 -minDist 200 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K4me1 -style histone -region -size 1000 -minDist 2500 -o auto -i ${sample}_input
    # done
    
    ./p601_d8_D1_H3K4me3/peaks.txt
    ./p601_d8_D2_H3K4me3/peaks.txt
    ./p604_d8_D1_H3K4me3/peaks.txt
    ./p604_d8_D2_H3K4me3/peaks.txt
    
    ./p601_d8_D1_H3K27me3/regions.txt
    ./p601_d8_D2_H3K27me3/regions.txt
    ./p604_d8_D1_H3K27me3/regions.txt
    ./p604_d8_D2_H3K27me3/regions.txt
    
    for dir in p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/peaks.txt > ${dir}_peaks.bed
    grep -v "#" ${dir}_peaks.bed | sort -k1,1 -k2,2n > ${dir}_sorted_peaks.bed
    done
    
    for dir in p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/regions.txt > ${dir}_regions.bed
    grep -v "#" ${dir}_regions.bed | sort -k1,1 -k2,2n > ${dir}_sorted_regions.bed
    done
    
    #DEBUG: why the bam files so small?
    makeTagDirectory NHDF-Ad_Control_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_Control_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r2.dedup.sorted.bam -genome hg38
    
    NHDF-Ad_Control_r1 NHDF-Ad_Control_r2 NHDF-Ad_H3K27ac_r1 NHDF-Ad_H3K27ac_r2 NHDF-Ad_H3K4me1_r1 NHDF-Ad_H3K4me1_r2
    
    > (myperl) environments for HOMER, ~/Tools/diffreps/bin/diffReps.pl, MACS2, ~/Tools/SICER1.1/SICER/SICER.sh
  3. combine the diffReps.pl and HOMER annotatePeaks.pl

    #Dynamic regions were defined as MACS (H3K4me3, H3K27ac) or SICER (H3K4me1, H3K27me3) peaks overlapping significantly (≥ 2-fold change, adjusted P-value ≤ 0.05) up- or down-regulated differentially enriched regions from diffReps in the three pairwise comparisons WAC vs mock, WA314 vs mock and WAC vs WA314.
    
    #STEP1
    #--> not given "--gname hg38"
    ## Step4: Annotate differential sites.
    #unless($noanno or $gname eq ''){
    #        `region_analysis.pl -i $report -r -d refseq -g $gname`;
    #}
    ## Step5: Look for hotspots.
    #unless($nohs){
    #        my $hotspot = $report . '.hotspot';
    #        `findHotspots.pl -d $report -o $hotspot`;
    #}
    ~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed  --report output_results  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp
    
    #STEP2
    #replace Chr to '#Chr'
    grep -v "#" output_results | sort -k1,1 -k2,2n > output_results_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' output_results_ > H3K4me3.bed
    #grep -v "#" H3K4me3.bed | sort -k1,1 -k2,2n > H3K4me3_sorted_peaks.bed
    
    #STEP3 (under myperl) peak calling: MACS2 for narrow peaks, SICER for broad peaks!
    #process the output of diffReps.pl to BED file.
    annotatePeaks.pl H3K4me3.bed hg38 > H3K4me3_annotated_peaks.txt
  4. combine macs2 to getDifferentialPeaksReplicates.pl

    replace the initial peak identification by using your MACS2 output.
    
    #http://homer.ucsd.edu/homer/ngs/diffExpression.html
    #getDifferentialPeaksReplicates.pl = findPeaks + annotatePeaks.pl + getDiffExpression.pl 
    #annotatePeaks.pl tss hg38 -raw -d H3K4me3-Mock-rep1/ H3K4me3-Mock-rep2/ H3K4me3-WNT-rep1/ H3K4me3-WNT-rep3/ > countTable.peaks.txt
    
    Here's an outline of how we might be able to replace the initial peak identification by using your MACS2 output.
    
    #TODO: using MACS call peaks of the data H3K27ac.

4.1. MACS2 peak calling

  #macs2 --> bed --> annotatePeaks.pl
  conda activate ngi_chipseq_ac2
  # Call H3K4me3 peaks against the matching input for each line/donor pair.
  # The BAM prefix depends on the donor: D1 files start with V_8_1_6, D2 files
  # with V_8_1_5.
  for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    case "${sample}" in
      *_D1) run=V_8_1_6 ;;
      *_D2) run=V_8_1_5 ;;
    esac
    macs2 callpeak -t ../results/picard/${run}_${sample}_H3K4me3.dedup.sorted.bam -c ../results/picard/${run}_${sample}_input.dedup.sorted.bam -f BAM -g hs -n ${sample} -q 0.05
  done

  # Keep the first five narrowPeak columns (chr, start, end, name, score) as a
  # tab-separated BED file.
  for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5}' ${sample}_peaks.narrowPeak > ${sample}_peaks.bed
  done

  #annotatePeaks.pl p601_d8_D1_peaks.bed hg38 > p601_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p601_d8_D2_peaks.bed hg38 > p601_d8_D2_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D1_peaks.bed hg38 > p604_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D2_peaks.bed hg38 > p604_d8_D2_annotated_peaks.txt

4.2. Convert your MACS2 peaks to HOMER-compatible format. You can do this manually or with a script. For example:

It’s possible to use more information from the MACS2 output file to create a more informative peaks.txt file for HOMER. However, it’s important to note that some information that HOMER needs for its differential peak analysis is not available in the MACS2 output (such as Normalized Tag Count, Control Tags, and others). But we can certainly map more of the available MACS2 columns to the corresponding HOMER columns.

  #The following awk command can be used to convert more MACS2 information into the HOMER format:

  cd macs2
  awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"Peak_"NR,$5,$6,$7,$8,$9,$10}' macs2_peaks.bed > macs2_peaks.txt
  # MACS2 *_peaks.xls columns:
  #   $1 chr  $2 start  $3 end  $4 length  $5 abs_summit  $6 pileup
  #   $7 -log10(pvalue)  $8 fold_enrichment  $9 -log10(qvalue)  $10 name
  # The file starts with many "#" comment lines, a blank line and one
  # column-header line; all of them must be skipped, not only line 1.
  for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    awk 'BEGIN{OFS="\t"} !/^#/ && NF >= 10 && $1 != "chr" {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, 10^(-$7), $9, "0", "0"}' ${sample}_peaks.xls > ${sample}_macs2_peaks.txt
  done

  This command will:

      * Skip the "#" comment lines, the blank line and the column-header line (!/^#/ && NF >= 10 && $1 != "chr")
      * Map the MACS2 peak name ($10) to the HOMER PeakID
      * Map the MACS2 chromosome, start, and end ($1, $2, $3) to the HOMER chr, start, end
      * Use a placeholder "+" for the HOMER strand
      * Use a placeholder "0" for the HOMER Normalized Tag Count and Focus Ratio
      * Map the MACS2 pileup ($6) to the HOMER findPeaks Score and Total Tags
      * Use a placeholder "0" for the HOMER Control Tags
      * Map the MACS2 fold_enrichment ($8) to the HOMER Fold Change vs Control
      * Map the MACS2 -log10(pvalue) ($7), converted back to a plain p-value with 10^(-$7), to the HOMER p-value vs Control
      * Map the MACS2 -log10(qvalue) ($9) to the HOMER Fold Change vs Local
      * Use a placeholder "0" for the HOMER p-value vs Local and Clonal Fold Change

  This script is limited by the differences in the information provided by MACS2 and HOMER. While it makes use of as much information as possible from the MACS2 output, some columns in the HOMER format still have to be filled with placeholder values.

- The following awk command can be used to convert more SICER information into the HOMER format (TODO), or the peaks can be called directly with HOMER's findPeaks.

- The following awk command can be used to convert more diffReps.pl information into the HOMER format (TODO).

4.3. Associate the converted peak files with their respective tag directories. In HOMER, peak files can be associated with a tag directory by placing them in the tag directory with the filename “peaks.txt”.

  mv homer/p601_d8_D1_H3K4me3/peaks.txt homer/p601_d8_D1_H3K4me3/peaks_raw.txt
  mv homer/p601_d8_D2_H3K4me3/peaks.txt homer/p601_d8_D2_H3K4me3/peaks_raw.txt
  mv homer/p604_d8_D1_H3K4me3/peaks.txt homer/p604_d8_D1_H3K4me3/peaks_raw.txt
  mv homer/p604_d8_D2_H3K4me3/peaks.txt homer/p604_d8_D2_H3K4me3/peaks_raw.txt
  cp macs2/p601_d8_D1_macs2_peaks.txt homer/p601_d8_D1_H3K4me3/peaks.txt
  cp macs2/p601_d8_D2_macs2_peaks.txt homer/p601_d8_D2_H3K4me3/peaks.txt
  cp macs2/p604_d8_D1_macs2_peaks.txt homer/p604_d8_D1_H3K4me3/peaks.txt
  cp macs2/p604_d8_D2_macs2_peaks.txt homer/p604_d8_D2_H3K4me3/peaks.txt

  #Repeat this for each of your tag directories.

4.4. The program getDifferentialPeaksReplicates.pl essentially performs three steps; here step 2 (merging the peaks) was modified.

First, it will pool the target tag directories and input directories separately into pooled experiments and perform an initial peak identification (using findPeaks). Pooling the experiments is generally more sensitive than trying to merge the individual peak files coming from each experiment (although this can be done using the “-use ” option if each directory already has a peak file associated with it). Next, it will quantify the reads at the initial putative peaks across each of the target and input tag directories using annotatePeaks.pl. Finally, it calls getDiffExpression.pl and ultimately passes these values to the R/Bioconductor package DESeq2 to calculate enrichment values for each peak, returning only those peaks that pass a given fold enrichment (default: 2-fold) and FDR cutoff (default 5%). We can run getDifferentialPeaksReplicates.pl with the -use option to specify that the provided peaks should be used instead of calling findPeaks:

    #-- Successful modification of the script getDifferentialPeaksReplicates.pl --
    #The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
    #change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
    #mergePeaks -d 30000 temp_sorted | sort

    #conda list homer  #4.11
    mergePeaks p601_d8_D1_H3K27me3/peaks.txt p601_d8_D2_H3K27me3/peaks.txt >  mergePeaks_res.txt
    python3 update_header.py
    cat p601_d8_D1_H3K27me3/peaks.txt p601_d8_D2_H3K27me3/peaks.txt > temp
    awk '{print $2 "\t" $3 "\t" $4 "\t" $1}' temp | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt
    python3 adjust_mergePeaks_res.py

    #check if the results are correct
    cut -d$'\t' -f2-4 filtered_mergePeaks_res.txt > control1
    diff control1 bedtools_res.txt

  #(myperl) jhuang@hamburg:~/DATA/Data_Denise_LT_DNA_Bindung/results_chipseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/homer$
  #getDifferentialPeaksReplicates.pl -use <tagdir1>/peaks.txt,<tagdir2>/peaks.txt ...
  #TOO TIME_CONSUMING, using original version
  getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks.txt
  getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks.txt
  #Remember to replace <tagdir1>/peaks.txt,<tagdir2>/peaks.txt ... with the path to your own tag directories and peak files.

4.5. Draw plots

  import matplotlib.pyplot as plt
  from matplotlib_venn import venn2

  venn2(subsets=(2476, 3567, 22719), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p601 d8 H3K4me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p601_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')
  #2476+3567+22719=28762

  venn2(subsets=(2681, 3410, 19044), set_labels=('Donor 1', 'Donor 2'))
  plt.title('Peaks between p604 d8 H3K4me3 Donors')
  plt.xlabel('Number of Elements')
  plt.ylabel('')
  plt.savefig('Peak_Consistency_Between_p604_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight')
  #2681+3410+19044=25135

  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p601_d8_H3K4me3_macs2_peaks.txt > p601_d8_H3K4me3_macs2_peaks.bed
  awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p604_d8_H3K4me3_macs2_peaks.txt > p604_d8_H3K4me3_macs2_peaks.bed
  ~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p601_d8_H3K4me3_macs2_peaks.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K4me3_macs2_peaks.txt -d$'\t' -o p604_d8_H3K4me3_macs2_peaks.xls

5.1. SICER peak calling (under env myperl)

  #SICER --> bed --> annotatePeaks.pl
  mkdir sicer; cd sicer;
  ln -s ../results/picard/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_6_p604_d8_D1_input.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted.bed .
  ln -s ../results/picard/V_8_1_5_p604_d8_D2_input.dedup.sorted.bed .

  #chr10:49,003,170-51,222,175
  #chr10:48,528,307-50,747,312
  #chr10:58,422,744-60,641,749

  mkdir p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2
  #/home/jhuang/Tools/SICER1.1/SICER/SICER.sh [InputDir] [bed file] [control file] [OutputDir] [Species] [redundancy threshold] [window size (bp)] [fragment size] [effective genome fraction] [gap size (bp)] [FDR]
  # 10000 is window size, 30000 is the gap size, 160 is the fragment size
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted.bed V_8_1_6_p601_d8_D1_input.dedup.sorted.bed p601_d8_D1 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted.bed V_8_1_5_p601_d8_D2_input.dedup.sorted.bed p601_d8_D2 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted.bed V_8_1_6_p604_d8_D1_input.dedup.sorted.bed p604_d8_D1 hg38 1 10000 160 0.74 30000 0.01;
  ~/Tools/SICER1.1/SICER/SICER.sh . V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted.bed V_8_1_5_p604_d8_D2_input.dedup.sorted.bed p604_d8_D2 hg38 1 10000 160 0.74 30000 0.01;

  #TODO: - check if the peak calling works well in IGV!
  #      - call peaks for H3K27me1, adjust the SICER-parameters!
  #      - transform them to the peaks.txt format of HOMER, and call
  getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_peaks.txt
  getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_peaks.txt
  #      - write a mail to Denise, sending the bigWig files of H3K4me3 and H3K27me3 and the called peaks.
  #        By the way, ask if she needs the gene members of the red colors in the PCA plot!

  #Note that histone using 'cat file1.bed file2.bed | sort -k1,1 -k2,2n | bedtools merge > merged.bed'
  #Note that factor using HOMER getDifferentialPeaksReplicates from beginning: "mergePeaks.pl --> DESeq recheck"!
  mergePeaks sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed -d 10000 > mergedPeaks1.txt
  cat sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed | sort -k1,1 -k2,2n | bedtools merge > merged_p601_d8.bed
  #4957 (4388, 4822)
  awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged_p601_d8.bed > homer_peaks.bed

  #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!!!!!!: The bedtools merge command does not generate output in the HOMER peak format directly. However, you can use a combination of Unix command line tools (e.g., awk, sort) to convert bedtools output into HOMER format.
  #bedtools merge -i input.bed > merged.bed
  #awk 'BEGIN{OFS="\t"; print "PeakID\tchr\tstart\tend\tstrand"} {print "Peak-"NR, $1, $2, $3, "."}' merged.bed > homer_peaks.bed

5.2-3. Convert your SICER peaks to HOMER-compatible format, and replace them in homer/${sample}/peaks.txt. You can do this manually or with a script.
– The following awk command can be used to convert more MACS2 information into the HOMER format: mv homer/p601_d8_D1_H3K27me3/regions.txt homer/p601_d8_D1_H3K27me3/regions_raw.txt mv homer/p601_d8_D2_H3K27me3/regions.txt homer/p601_d8_D2_H3K27me3/regions_raw.txt mv homer/p604_d8_D1_H3K27me3/regions.txt homer/p604_d8_D1_H3K27me3/regions_raw.txt mv homer/p604_d8_D2_H3K27me3/regions.txt homer/p604_d8_D2_H3K27me3/regions_raw.txt #NOTE that we should name the file as peaks.txt, since the Input-samples contain only peaks.txt. awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p601_d8_D1_H3K27me3/peaks.txt awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p601_d8_D2_H3K27me3/peaks.txt awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p604_d8_D1/V_8_1_6_p604_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p604_d8_D1_H3K27me3/peaks.txt awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p604_d8_D2/V_8_1_5_p604_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p604_d8_D2_H3K27me3/peaks.txt #The format is similar to the following. #chr1-83000-89999 chr1 83000 89999 + 21 0 0 21 27 0 5.623763510806874e-10 0 0 0 #chr1-859000-865999 chr1 859000 865999 + 23 0 0 23 28 0 3.501056563888787e-11 0 0 0 This command will: * PeakID: Format it as “chr-start-end”. * chr: Same as SICER. * start: Same as SICER. * end: Same as SICER. * strand: SICER doesn’t provide strand information, we could fill with a default value ‘+’. * Normalized Tag Count: Same as the normalized read count column in SICER. * focus ratio: SICER doesn’t provide this, we could fill with a default value ‘0’. 
* findPeaks Score: Corresponds to Fold Change or FDR column in SICER. * Total Tags: SICER doesn’t provide this, we could fill with a default value ‘0’. * Control Tags (normalized to IP Experiment): Same as Clonal reads or local background column in SICER. * Fold Change vs Control: SICER doesn’t provide this, we could fill with a default value ‘0’. * p-value vs Control: Same as the p-value column in SICER. * Fold Change vs Local: SICER doesn’t provide this, we could fill with a default value ‘0’. * p-value vs Local: SICER doesn’t provide this, we could fill with a default value ‘0’. * Clonal Fold Change: SICER doesn’t provide this, we could fill with a default value ‘0’. 5.4. The program getDifferentialPeaksReplicates will essentially perform 3 steps, in the step 2 was modified. conda activate myperl cd homer #diff /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl /home/jhuang/homer/bin/getDifferentialPeaksReplicates.pl #vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl #max_distance_to_merge at line 203 to 133000, “-d <#>” ./getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_regions.txt ./getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_regions.txt 5.5. 
Draw plots

    # Count the merged regions per donor. Plain grep does not treat "|" as
    # alternation, so the first pattern is matched literally and counts the
    # regions annotated with BOTH donors' peak files (the shared set).
    grep "p601_d8_D1_H3K27me3/peaks.txt|p601_d8_D2_H3K27me3/peaks.txt" p601_d8_H3K27me3_sicer_regions.txt | wc -l
    # 799
    grep "p601_d8_D1_H3K27me3/peaks.txt" p601_d8_H3K27me3_sicer_regions.txt | wc -l
    # 867 - 799 = 68
    grep "p601_d8_D2_H3K27me3/peaks.txt" p601_d8_H3K27me3_sicer_regions.txt | wc -l
    # 3019 - 799 = 2220
    # 68 + 799 + 2220 = 3087

    grep "p604_d8_D1_H3K27me3/peaks.txt|p604_d8_D2_H3K27me3/peaks.txt" p604_d8_H3K27me3_sicer_regions.txt | wc -l
    # 677
    grep "p604_d8_D1_H3K27me3/peaks.txt" p604_d8_H3K27me3_sicer_regions.txt | wc -l
    # 740 - 677 = 63
    grep "p604_d8_D2_H3K27me3/peaks.txt" p604_d8_H3K27me3_sicer_regions.txt | wc -l
    # 2393 - 677 = 1716
    # 63 + 677 + 1716 = 2456

    import matplotlib.pyplot as plt
    from matplotlib_venn import venn2

    # venn2 expects subsets = (Donor 1 only, Donor 2 only, shared).
    # The values are the H3K27me3 counts derived from the grep commands above
    # (not the H3K4me3 counts of section 4.5).
    plt.figure()  # start a fresh figure so the two diagrams are not overlaid
    venn2(subsets=(68, 2220, 799), set_labels=('Donor 1', 'Donor 2'))
    plt.title('Peaks between p601 d8 H3K27me3 Donors')
    plt.xlabel('Number of Elements')
    plt.ylabel('')
    plt.savefig('Peak_Consistency_Between_p601_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')
    plt.close()
    #68+2220+799=3087

    plt.figure()
    venn2(subsets=(63, 1716, 677), set_labels=('Donor 1', 'Donor 2'))
    plt.title('Peaks between p604 d8 H3K27me3 Donors')
    plt.xlabel('Number of Elements')
    plt.ylabel('')
    plt.savefig('Peak_Consistency_Between_p604_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight')
    plt.close()
    #63+1716+677=2456

    # Export the regions as BED (chr, start, end, id), skipping the header line.
    awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p601_d8_H3K27me3_sicer_regions.txt > p601_d8_H3K27me3_sicer_regions.bed
    awk 'NR>1 {print $2 "\t" $3 "\t" $4 "\t" $1}' p604_d8_H3K27me3_sicer_regions.txt > p604_d8_H3K27me3_sicer_regions.bed

    # Convert the tab-separated result tables to Excel.
    ~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K27me3_sicer_regions.txt -d$'\t' -o p601_d8_H3K27me3_sicer_regions.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K27me3_sicer_regions.txt -d$'\t' -o p604_d8_H3K27me3_sicer_regions.xls

6.
getDifferentialPeaksReplicates.pl #!/usr/bin/env perl use warnings; use lib “/home/jhuang/homer/.//bin”; my $homeDir = “/home/jhuang/homer/./”; my $foldThresh = 2; my $fdrThresh = 0.05; my $peakFoldInput = 2; my $peakFdrInput = 0.001; my $style = “”; sub printCMD { print STDERR “\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t [IP tagdir2] …\n”; print STDERR “\t -b [background tagdir2] …\n”; print STDERR “\t -i [Input tagdir2] …\n”; print STDERR “\t\tNote: if input is provided, peaks will be called.\n”; print STDERR “\n\tOptions:\n”; #print STDERR “\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n”; #print STDERR “\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n”; print STDERR “\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n”; print STDERR “\t\t-q <#> (FDR over bg, default: $fdrThresh)\n”; print STDERR “\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n”; print STDERR “\t\t-genome (genome version to use for annotation)\n”; print STDERR “\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n”; print STDERR “\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n”; print STDERR “\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n”; print STDERR “\t\t-all (report all peaks, not just differentially regulated)\n”; print STDERR “\n\tPeak finding directives:\n”; print STDERR “\t\t-style (findPeaks style to use for finding peaks)\n”; print STDERR “\t\t-use (use existing peaks in tag directories)\n”; print STDERR “\t\t-p (use specific peak file instead of tagDir/peaks.txt or finding new one)\n”; print STDERR “\t\tOther options will be passed to findPeaks\n”; print STDERR “\n”; exit; } my @targets = (); my @background = (); my @inputs = (); my $findPeaksOpts = “”; my $use = “”; my $givenPeakFile = ”; my $norm2total = “-norm2total”; my $diffAlg = “-DESeq2”; my $genome = ‘none’; my $annOptions = “”; my $fragLength = ”; my $allFlag = 0; my $ogCmd = 
“getDifferentialPeaksReplicates.pl”; for (my $i=0;$i<@ARGV;$i++) { $ogCmd .= " " . $ARGV[$i]; } for (my $i=0;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-t') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@targets, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-i') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@inputs, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-b') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@background, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-f') { $foldThresh = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-genome') { $genome = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-q') { $fdrThresh = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-use') { $use = $ARGV[++$i]; if ($use eq 'tss.txt') { $annOptions .= " -fragLength 1 -strand +"; $fragLength = " -fragLength 1" if ($fragLength eq ''); } } elsif ($ARGV[$i] eq '-p') { $givenPeakFile = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-style') { $style = $ARGV[++$i]; if ($style eq 'tss') { $annOptions .= " -fragLength 1 -strand +"; $fragLength = " -fragLength 1" if ($fragLength eq ''); } } elsif ($ARGV[$i] eq '-edgeR') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-fragLength') { $fragLength = " -fragLength $ARGV[++$i]"; } elsif ($ARGV[$i] eq '-DESeq2') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-all') { $allFlag = 1; } elsif ($ARGV[$i] eq '-DESeq') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-balanced') { $norm2total = ""; } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') { printCMD(); } else { $findPeaksOpts .= " " . $ARGV[$i]; #print STDERR "!!! \"$ARGV[$i]\" not recognized\n"; #printCMD(); } } my $rand = rand(); my %toDelete = (); if ($diffAlg eq '-edgeR' && $norm2total ne '') { print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n"; exit; } $log2Thresh = log($foldThresh)/log(2); if (@targets < 1) { print STDERR "!!! 
Error, need at least one target directory!!!\n"; printCMD(); } if (scalar(@inputs) + scalar(@background) < 1) { print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n"; exit; } my $targetDirs = ''; my $targetStr = ""; foreach(@targets) { $targetDirs .= " \"$_\""; $targetStr .= " target"; } my $inputDirs = ''; my $inputStr = ""; foreach(@inputs) { $inputDirs .= " \"$_\""; $inputStr .= " input"; } my $bgDirs = ''; my $bgStr = ""; foreach(@background) { $bgDirs .= " \"$_\""; $bgStr .= " bg"; } if ($givenPeakFile eq '') { $peakFile = $rand . ".peaks"; $toDelete{$peakFile}=1; } else { $peakFile = $givenPeakFile; } if ($findPeaksOpts ne '') { print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n"; } print STDERR "\tStep1: Defining Putative Peak Set\n"; if ($use eq '' && $givenPeakFile eq '') { print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n"; my $targetDir = $rand . ".targetTagDir"; `makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`; $toDelete{$targetDir}=1; my $inputDir = $rand . ".inputTagDir"; my $cmd = "findPeaks \"$targetDir\""; if ($style eq '') { $style = 'factor'; print STDERR "\tUsing -style $style...\n"; } $cmd .= " -style $style"; $cmd .= $fragLength . " " . $findPeaksOpts; if (@inputs > 0) { `makeTagDirectory \”$inputDir\” -d $inputDirs $fragLength`; $toDelete{$inputDir}=1; $cmd .= ” -i \”$inputDir\””; } #print STDERR “`$cmd > \”$peakFile\”`\n”; `$cmd > \”$peakFile\”`; } elsif ($givenPeakFile eq ” && $use ne ”) { my $files = “”; my @allDirs = (); push(@allDirs, @targets, @inputs, @background); print STDERR “\t\tUsing existing peak files for features:\n”; foreach(@allDirs) { my $p = $_ . “/” . 
$use; if (-e $p) { print STDERR “\t\t\t$p\n”; $files .= ” \”$p\””; } } #MODIFIED #`mergePeaks $files > \”$peakFile\”`; #print “$files\n”; #print “\”$peakFile\”\n”; `mergePeaks $files -d 1000 > mergePeaks_res.txt`; `python3 update_header.py`; `cat $files > merged_peaks.txt`; ## Split the list of files into an array #my @files = split / /, $files; # Open the output file #open(my $out, ‘>’, ‘merged_peaks.txt’) or die “Cannot open merged_peaks.txt: $!”; #foreach my $file (@files) { # print(“xxxxx$file\n”); # open(my $in, ‘<', $file) or die "Cannot open $file: $!"; # while (<$in>) { # s/\t+$//; # removes trailing tabs # print $out “$_\n”; # } # close $in; #} #close $out; system(“awk ‘{print \$2 \”\\t\” \$3 \”\\t\” \$4 \”\\t\” \$1}’ merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt”); `python3 adjust_mergePeaks_res.py`; `mv filtered_mergePeaks_res.txt \”$peakFile\”`; } $rawFile= $rand . “.raw.txt”; $diffFile= $rand . “.diff.txt”; $upFile = $rand . “.Up_target_vs_bg.txt”; $downFile = $rand . “.Down_target_vs_bg.txt”; if ($bgDirs eq ”) { $bgDirs = $inputDirs; $bgStr = $inputStr; @background = @inputs; $upFile = $rand . “.Up_target_vs_input.txt”; $downFile = $rand . 
“.Down_target_vs_input.txt”; } print STDERR “\n\tStep2: Quantifying reads across target/background/input tag directories\n\n”; #print STDERR “`annotatePeaks.pl \”$peakFile\” none -d $bgDirs $targetDirs -raw > \”$rawFile\”`;\n”; `annotatePeaks.pl \”$peakFile\” $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \”$rawFile\”`; #print STDERR “`getDiffExpression.pl \”$rawFile\” $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n”; # print STDERR “\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n”; `getDiffExpression.pl \”$rawFile\” $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`; $toDelete{$rawFile}=1; $toDelete{$diffFile}=1; $toDelete{$upFile}=1; $toDelete{$downFile}=1; print “#cmd=$ogCmd|”; my $ofile = $upFile; $ofile = $diffFile if ($allFlag); open IN, $ofile; while ( ) { print $_; } close IN; foreach(keys %toDelete) { next if ($_ eq ‘/’); #print STDERR “\trm -r \”$_\”\n”; `rm -r “$_”`; } 7. update_header.py import os # File path file_path = ‘mergePeaks_res.txt’ # Read in the file with open(file_path, ‘r’) as file: lines = file.readlines() # Split the first line into components components = lines[0].split(“\t”) # Change the first component to “name” components[0] = “name” # Join the components back into a single string lines[0] = “\t”.join(components) # Write the file back out with open(file_path, ‘w’) as file: file.writelines(lines) 8. 
# adjust_mergePeaks_res.py
"""Snap HOMER mergePeaks regions onto the bedtools-merged intervals containing them.

Reads ``mergePeaks_res.txt`` (tab-separated, header row with at least the
columns ``chr``, ``start`` and ``end``) and ``bedtools_res.txt`` (headerless
chr/start/end table), and writes ``filtered_mergePeaks_res.txt``.
"""
import numpy as np
import pandas as pd


def snap_peaks_to_merged_intervals(merge_peaks_df, bedtools_df):
    """Replace peak coordinates with those of the enclosing merged interval.

    A peak is kept only when a bedtools interval on the same chromosome fully
    contains it (``peak.start >= interval.start`` and
    ``peak.end <= interval.end``); its ``start``/``end`` are then overwritten
    with the interval's coordinates. Peaks that end up with identical
    chr/start/end are collapsed to the first one after sorting.

    Args:
        merge_peaks_df: DataFrame with ``chr``, ``start``, ``end`` columns
            plus any number of extra columns, which are carried through.
        bedtools_df: DataFrame with ``chr``, ``start``, ``end`` columns.
            NOTE(review): the lookup assumes the intervals of one chromosome
            do not overlap, which is what ``bedtools merge`` emits -- confirm
            if the input ever comes from another source.

    Returns:
        DataFrame with the columns of ``merge_peaks_df``, sorted by
        chr/start/end and de-duplicated on those three columns. Empty (but
        with the same columns) when no peak lies inside any interval.
    """
    snapped_parts = []
    for chrom, peaks in merge_peaks_df.groupby('chr', sort=False):
        intervals = bedtools_df[bedtools_df['chr'] == chrom].sort_values('start')
        if intervals.empty:
            continue
        starts = intervals['start'].to_numpy()
        ends = intervals['end'].to_numpy()

        # For every peak, the only interval that can contain it is the last
        # one starting at or before the peak start (intervals are disjoint).
        candidate = np.searchsorted(starts, peaks['start'].to_numpy(), side='right') - 1
        has_candidate = candidate >= 0
        candidate = np.where(has_candidate, candidate, 0)  # keep indexing valid
        contained = has_candidate & (peaks['end'].to_numpy() <= ends[candidate])

        snapped = peaks[contained].copy()
        snapped['start'] = starts[candidate[contained]]
        snapped['end'] = ends[candidate[contained]]
        snapped_parts.append(snapped)

    if not snapped_parts:
        # Nothing matched: return an empty frame instead of failing on sort.
        return merge_peaks_df.iloc[0:0].copy()

    result = pd.concat(snapped_parts)
    result = result.sort_values(by=['chr', 'start', 'end'])
    return result.drop_duplicates(subset=['chr', 'start', 'end'])


def main():
    """Run the adjustment on the fixed file names used by the pipeline."""
    merge_peaks_df = pd.read_csv('mergePeaks_res.txt', sep='\t', header=0)
    bedtools_df = pd.read_csv('bedtools_res.txt', sep='\t', header=None,
                              names=['chr', 'start', 'end'])
    filtered_df = snap_peaks_to_merged_intervals(merge_peaks_df, bedtools_df)
    filtered_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)


if __name__ == '__main__':
    main()

T细胞与B细胞在免疫反应中的相互作用

T cells and B cells interact in several ways as part of the immune response. Here’s a general overview of their interaction:

Antigen presentation: When a pathogen invades the body, it is engulfed by a type of cell known as an antigen-presenting cell (APC). The APC processes the pathogen and displays fragments of it, known as antigens, on its surface. B cells can also act as antigen-presenting cells.

T cell activation: A type of T cell known as a helper T cell (Th cell) can recognize these antigens. The Th cell's T cell receptor (TCR) binds to the antigen presented on an MHC molecule of the APC, causing the T cell to become activated. This process usually requires additional signals from the APC, provided through other surface molecules.

B cell activation: Once activated, the Th cell can interact with B cells that are displaying the same antigen. The T cell releases signaling molecules known as cytokines, which help to activate the B cell.

Antibody production: Once activated, the B cell begins to proliferate and differentiate into plasma cells. These plasma cells produce antibodies that are specific to the antigen. These antibodies can then neutralize the pathogen or mark it for destruction by other immune cells.

Memory cells: Some of the B cells and T cells will differentiate into memory cells. These cells “remember” the antigen and can mount a rapid response if the same pathogen invades the body again.

So, the interaction between T cells and B cells is crucial for the adaptive immune response. It allows the immune system to mount a targeted response to specific pathogens and to remember those pathogens in case of future invasions.

T细胞和B细胞在免疫反应中有多种相互作用方式。以下是它们相互作用的一般概述:

抗原呈递:当病原体侵入身体时,会被一种称为抗原呈递细胞(APC)的细胞吞噬。该APC处理病原体并将其碎片,也就是抗原,展示在其表面。B细胞也可以作为抗原呈递细胞。

T细胞激活:一种称为辅助T细胞(Th细胞)的T细胞可以识别这些抗原。Th细胞与抗原结合,导致T细胞被激活。这个过程通常需要APC提供的其他表面分子的额外信号。 CD4+ T cells are the T cells commonly referred to as helper T cells.

B细胞激活:一旦激活,Th细胞可以与显示相同抗原的B细胞进行交互。T细胞释放称为细胞因子的信号分子,这些分子有助于激活B细胞。

抗体产生:一旦激活,B细胞开始增殖并分化为浆细胞。这些浆细胞产生针对该抗原的特异性抗体。这些抗体可以中和病原体,或将其标记出来,以便由其他免疫细胞清除。

记忆细胞:一部分B细胞和T细胞会分化为记忆细胞。这些细胞“记住”了抗原,并且如果同一病原体再次侵入身体,它们可以快速反应。

因此,T细胞和B细胞之间的相互作用对适应性免疫反应至关重要。它使免疫系统能够对特定的病原体产生针对性的反应,并记住这些病原体以防未来的侵入。

抗原表位(epitope)在"T细胞激活"这一步骤中发挥作用。在这个阶段,辅助T细胞(Th细胞)可以识别抗原呈递细胞(APC)表面上的抗原。抗原由APC表面的MHC分子展示,并被T细胞受体(TCR)识别和结合,其中抗原中的特定部分——表位,是被TCR识别的部分。因此,在这个过程中,被TCR识别的抗原表位是关键。这种识别过程触发了T细胞的激活,进而影响了免疫反应的其他步骤,如B细胞的激活和抗体的生成。

抗原是存在于抗原呈递细胞(APC)上的。当病原体,比如细菌或病毒,进入身体后,抗原呈递细胞(如巨噬细胞、树突状细胞等)会捕获并处理病原体,把处理后的病原体的一部分(抗原)放在它们的表面上。接着,T细胞通过自身表面的T细胞受体(TCR)识别并与这些抗原结合,这样就触发了免疫反应。

在这个上下文中,"表位(Epitope)"通常指的是抗原(即病原体蛋白质的一个部分)的一个特定区域,这个区域可以被免疫系统(特别是抗体或T细胞受体)识别和结合。当我们说"表位出现在T细胞上"时,实际上是指T细胞受体(TCR)能够识别并结合到抗原的这个特定区域,而不是把表位物质本身放在T细胞上。 在T细胞激活的过程中,T细胞受体(TCR)会识别和绑定到抗原呈递细胞(APC)表面的抗原表位,然后这个信息(即信号)会传递给T细胞,触发免疫反应。因此,我们可以说,表位在抗原与T细胞的相互作用中起着关键作用。

Generation of Heatmap from DEGs Data and Annotation of Identified Gene Clusters

This script is structured to process gene expression data, specifically DEGs (Differentially Expressed Genes) and create a heatmap visualizing the patterns of the data. The steps involved are as follows:

  1. Package Installation and Library Loading: The script first ensures that essential packages are installed and then loads them. Some of the key packages include “gplots” for generating heatmaps, “readxl” and “writexl” for reading and writing Excel data, and “biomaRt” for fetching gene annotation data from Ensembl.

  2. Data Input: It reads in the gene expression data from an Excel file named “DEGs_heatmap_data.xls”.

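  3. Hierarchical Clustering: The script performs complete-linkage hierarchical clustering twice: on the genes (rows) using 1 − Pearson correlation as the distance, and on the columns (samples) using 1 − Spearman correlation. The gene tree is then cut into clusters; only the gene dendrogram is drawn in the heatmap.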

  4. Heatmap Generation: A heatmap is generated to visualize the clustered data, and this visualization is saved as an image file named “DEGs_heatmap.png”.

  5. Annotation and Data Segregation: The genes are further grouped into clusters, and for each cluster, annotation details such as gene ID, gene name, chromosome name, start and end positions, and more are fetched from Ensembl. This annotated data for each cluster is stored with the expression data in distinct data frames.

  6. Output: All the processed clusters are then compiled and written to an Excel file named “gene_clusters.xlsx”, with each cluster having its designated sheet.

This script aids in the identification and exploration of gene expression patterns and further provides essential annotations for identified gene clusters.

# Install only the dependencies that are missing, so that re-running the
# script does not reinstall (and possibly upgrade) packages on every run.
for (pkg in c("gplots", "readxl", "writexl", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}

library(gplots)   # heatmap.2(), bluered()
library(readxl)   # read_excel()
library(writexl)  # write_xlsx()
library(dplyr)    # distinct()
library(biomaRt)  # useEnsembl(), listDatasets(), getBM()

# Informational listings only; their results are not stored.
listEnsembl()
listMarts()

# Connect to the human gene dataset of Ensembl release 104. The release is
# pinned explicitly rather than left at the current default.
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
datasets <- listDatasets(ensembl)

# Load the first worksheet of the workbook as a plain data.frame
# (read_excel() returns a tibble, which does not keep row names).
datamat <- as.data.frame(
  read_excel(path = "DEGs_heatmap_data.xls", sheet = 1, col_names = TRUE)
)
# Promote the first column to row names, then drop it from the data columns.
rownames(datamat) <- datamat[[1]]
datamat <- datamat[, -1]

# Row tree: complete-linkage clustering on 1 - Pearson correlation between rows.
hr <- hclust(
  as.dist(1 - cor(t(datamat), method = "pearson")),
  method = "complete"
)
# Column tree: same procedure on 1 - Spearman correlation between columns.
# It is computed here but not drawn below (Colv = NA).
hc <- hclust(
  as.dist(1 - cor(datamat, method = "spearman")),
  method = "complete"
)

# Cut the row tree at (maximum merge height / 1.2) to obtain cluster ids.
mycl <- cutree(hr, h = max(hr$height) / 1.2)

# Colour palette indexed by cluster id, then expanded to one colour per row.
mycol <- c(
  "YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",
  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN",
  "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN"
)
mycol <- mycol[as.vector(mycl)]

# Render the row-scaled heatmap with the cluster colour bar into a PNG file.
png("DEGs_heatmap.png", width = 900, height = 1010)
heatmap.2(
  as.matrix(datamat),
  Rowv = as.dendrogram(hr),
  Colv = NA,
  dendrogram = 'row',
  scale = 'row',
  trace = 'none',
  col = bluered(75),
  RowSideColors = mycol,
  labRow = "",
  srtCol = 30,
  keysize = 0.72,
  cexRow = 2,
  cexCol = 1.4
)
dev.off()

#### cluster members #####
# Ensembl attributes retrieved for every gene of a cluster.
cluster_attributes <- c('ensembl_gene_id', 'external_gene_name', 'gene_biotype',
                        'entrezgene_id', 'chromosome_name', 'start_position',
                        'end_position', 'strand', 'description')

# Build the annotated expression table of one cluster.
#
# cluster_id: integer id of the cluster, as found in `clusters`.
# clusters:   named integer vector of cluster ids; names are Ensembl gene ids.
# expr:       data.frame of expression values with gene ids as row names.
# mart:       biomaRt Mart object to query.
#
# Returns a data.frame with one row per annotated gene: the Ensembl
# attributes followed by the expression columns. Genes without an Ensembl
# record are dropped by the inner join (a warning reports how many).
# Stops with an error when the cluster has no members.
annotate_cluster <- function(cluster_id, clusters, expr, mart) {
  members <- names(clusters)[clusters == cluster_id]
  if (length(members) == 0) {
    stop("Cluster ", cluster_id, " has no members; the tree cut produced ",
         "fewer clusters than expected.", call. = FALSE)
  }

  annotation <- getBM(attributes = cluster_attributes,
                      filters = 'ensembl_gene_id',
                      values = members,
                      mart = mart)
  # A gene can be returned several times (e.g. multiple Entrez ids);
  # keep the first record per gene.
  annotation <- distinct(annotation, ensembl_gene_id, .keep_all = TRUE)

  expression <- expr[members, , drop = FALSE]
  expression$ENSEMBL <- rownames(expression)

  annotated <- merge(annotation, expression,
                     by.x = "ensembl_gene_id", by.y = "ENSEMBL")

  n_dropped <- length(members) - nrow(annotated)
  if (n_dropped > 0) {
    warning(n_dropped, " gene(s) of cluster ", cluster_id,
            " have no Ensembl annotation and were dropped.", call. = FALSE)
  }
  annotated
}

# Only six clusters are exported below; make it visible when the tree cut
# produced more than that.
if (max(mycl) > 6) {
  warning("The tree cut produced ", max(mycl),
          " clusters, but only clusters 1-6 are exported.", call. = FALSE)
}

# The colour suffix of each name is the palette colour used for that cluster
# id in the heatmap side bar.
cluster1_YELLOW      <- annotate_cluster(1L, mycl, datamat, ensembl)
cluster2_DARKBLUE    <- annotate_cluster(2L, mycl, datamat, ensembl)
cluster3_DARKORANGE  <- annotate_cluster(3L, mycl, datamat, ensembl)
cluster4_DARKMAGENTA <- annotate_cluster(4L, mycl, datamat, ensembl)
cluster5_DARKCYAN    <- annotate_cluster(5L, mycl, datamat, ensembl)
cluster6_DARKRED     <- annotate_cluster(6L, mycl, datamat, ensembl)
# Optional per-cluster text export, e.g.:
#write.csv(cluster1_YELLOW,file='cluster1_YELLOW.txt')

# One worksheet per cluster.
write_xlsx(list(
  "Cluster 1 YELLOW" = cluster1_YELLOW,
  "Cluster 2 DARKBLUE" = cluster2_DARKBLUE,
  "Cluster 3 DARKORANGE" = cluster3_DARKORANGE,
  "Cluster 4 DARKMAGENTA" = cluster4_DARKMAGENTA,
  "Cluster 5 DARKCYAN" = cluster5_DARKCYAN,
  "Cluster 6 DARKRED" = cluster6_DARKRED
), "gene_clusters.xlsx")