-
install conda environment
# Set up the "rnaseq" conda environment used by the nf-core/rnaseq runs.
# The commands are kept in the order they were originally run, including the
# repeated installs, so the log stays a faithful record of the environment.

#conda config --set auto_activate_base false
conda create --name rnaseq python=3.7
#NOTE: mamba really is much faster; use mamba from now on!

#install packages
conda activate rnaseq
pip3 install deeptools
pip3 install multiqc
conda install -c bioconda stringtie subread gffread
conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
conda install nextflow=23.04
conda install fq
conda install -c bioconda umi_tools
conda install -c bioconda rsem
conda install -c bioconda salmon

#conda install some tools
#install R-packages,
conda install -c bioconda ucsc-bedclip
conda install -c bioconda ucsc-bedgraphtobigwig
conda install -c bioconda bioconductor-matrixgenerics
#conda install -c bioconda bioconductor-deseq2
conda install -c bioconda r-pheatmap
conda install -c anaconda gawk

conda install mamba -n base -c conda-forge
conda config --add channels conda-forge
#salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
mamba install -c bioconda salmon=1.10
conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
mamba install -c bioconda samtools=1.9
mamba install -c conda-forge r-optparse r-vctrs=0.5.0
conda install nextflow=23.04
mamba install -c bioconda qualimap
mamba install -c bioconda rseqc
mamba install -c conda-forge openssl
conda install -c bioconda ucsc-bedclip
conda install -c bioconda bedtools
#for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
conda update -c bioconda ucsc-bedclip
conda update -c bioconda ucsc-bedgraphtobigwig

# samtools should be >= 1.9 as only those have the option @
#samtools sort \
#    -@ 6 \
#    -o HSV.d2_r1.sorted.bam \
#    -T HSV.d2_r1.sorted \
#    HSV.d2_r1.Aligned.out.bam
-
Run UMI-tools without `--umitools_dedup_stats`; otherwise the deduplication step cannot finish on hamm.
-
Optimize UMI-tools parameters: Some parameters influence the memory usage of UMI-tools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (`--edit-distance-threshold`). This makes the deduplication process less memory-intensive but might also change the results.
-
Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.
# Notes on the umi_tools dedup memory problem, followed by the nextflow
# invocations used for the human and the virus references.

#https://github.com/nf-core/rnaseq/issues/827
#INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
#INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
#https://github.com/CGATOxford/UMI-tools/issues/173

# excessive dedup memory usage with output-stats #409
#https://github.com/CGATOxford/UMI-tools/issues/409
#umi_tools 1.0.1
#I am aware of previously closed issues:
#excessive dedup memory usage #173
#speed up stats #184
#Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
#Using --method=unique
#There still seems to be a memory problem with --output-stats
#Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
#Running without output-stats, job completes in about 3 minutes, with no problems.

#TRY STANDALONE RUNNING:
/usr/local/bin/python /usr/local/bin/umi_tools dedup \
    -I HSV.d8_r1.transcriptome.sorted.bam \
    -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam \
    --method=unique --random-seed=100
#/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100

#umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
#I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.

# NOTE(review): the capture-group name in --umitools_bc_pattern was lost when
# this page was exported (the text read "(?P .{12})", which is not a valid
# regex). "umi_1" is restored on the assumption that the first 12 nt of the
# read are the UMI -- confirm against the library prep protocol.

#--Human--
#hamm
/usr/local/bin/nextflow run rnaseq/main.nf \
    --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 \
    --with_umi --umitools_extract_method "regex" \
    --umitools_bc_pattern "^(?P<umi_1>.{12}).*" \
    -profile docker -resume \
    --max_cpus 54 --max_memory 120.GB --max_time 2400.h \
    --save_align_intermeds --save_unaligned --save_reference \
    --aligner 'star_salmon' --pseudo_aligner 'salmon' \
    --umitools_grouping_method 'unique'

#sage
nextflow run rnaseq/main.nf \
    --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 \
    --with_umi --umitools_extract_method "regex" \
    --umitools_bc_pattern "^(?P<umi_1>.{12}).*" \
    -profile test_full -resume \
    --max_memory 256.GB --max_time 2400.h \
    --save_align_intermeds --save_unaligned --save_reference \
    --aligner 'star_salmon' --pseudo_aligner 'salmon'

#--Virus--
/usr/local/bin/nextflow run rnaseq/main.nf \
    --input samplesheet.csv --outdir results_virus \
    --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" \
    --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf" \
    --with_umi --umitools_extract_method "regex" \
    --umitools_bc_pattern "^(?P<umi_1>.{12}).*" \
    --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq \
    -profile test_full -resume \
    --max_cpus 55 --max_memory 120.GB --max_time 2400.h \
    --save_align_intermeds --save_unaligned --save_reference \
    --aligner 'hisat2' \
    --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' \
    --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon' \
    --umitools_grouping_method 'unique'
-
-
R-code for evaluation of nextflow outputs
# Import the Salmon quantifications produced by nf-core/rnaseq (star_salmon)
# and build the DESeq2 objects used by all later steps.
# Outputs: transcript_counts.csv and gene_counts.csv in the working directory.
# Leaves in the session: txi, colData, dds (gene level, fitted), rld.

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)

# All relative paths below are resolved against this directory.
setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")

# Sample names; every path and factor below is derived from this one vector
# so the three can never get out of sync.
samples <- c("control_r1", "control_r2",
             "HSV.d2_r1", "HSV.d2_r2",
             "HSV.d4_r1", "HSV.d4_r2",
             "HSV.d6_r1", "HSV.d6_r2",
             "HSV.d8_r1", "HSV.d8_r2")

# Paths to the Salmon quantification files ("./<sample>/quant.sf")
files <- setNames(file.path(".", samples, "quant.sf"), samples)

# Replicate ("r1"/"r2") and condition ("control", "HSV.d2", ...) per sample
replicate <- factor(sub("^.*_", "", samples))
condition <- factor(sub("_r[0-9]+$", "", samples))

# colData for DESeq2
colData <- data.frame(condition = condition, replicate = replicate,
                      row.names = names(files))

# -- transcript-level count data --
# Only the raw counts are exported here, so no DESeq()/rlog() fit is run on
# the transcript-level object: the fitted objects were overwritten by the
# gene-level ones below without ever being used.
txi_tx <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
dds_tx <- DESeqDataSetFromTximport(txi_tx, colData = colData,
                                   design = ~condition)
write.csv(counts(dds_tx), file = "transcript_counts.csv")

# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header = FALSE,
                      stringsAsFactors = FALSE)
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# tximport only needs transcript_id -> gene_id; drop gene_name
tx2gene <- tx2gene[, 1:2]

# Import and summarize the Salmon data to gene level
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
dds <- DESeqDataSetFromTximport(txi, colData = colData, design = ~condition)

# Optional pre-filter (retain genes with a count > 3 in more than 2 samples).
# Not required: DESeq2 applies its own independent filtering in results().
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543

dds <- DESeq(dds)
# rlog() is the same function as the older alias rlogTransformation()
rld <- rlog(dds)
write.csv(counts(dds, normalized = FALSE), file = "gene_counts.csv")

#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

dim(counts(dds))
head(counts(dds), 10)
Author Archives: gene_x
X-ray holographic microscopy
X-ray holographic microscopy is a technique used to produce high-resolution, three-dimensional images of microscopic objects. The technique is based on the principles of holography, where the phase and amplitude of a wave are recorded to produce an image. In X-ray holography, this wave is an X-ray beam.
Traditional optical microscopy uses visible light to image an object, and the resolution of the image is limited by the wavelength of the light. X-rays have much shorter wavelengths than visible light, so X-ray microscopy can theoretically produce images with much higher resolution.
In X-ray holography, a coherent X-ray beam is split into two paths: one path interacts with the object being imaged, and the other path is used as a reference. The object wave and the reference wave are then combined to form a hologram. This hologram can be reconstructed to produce a 3D image of the object.
One major advantage of X-ray holographic microscopy is that it can be used to image thick samples and materials that are not transparent to visible light. However, the technique requires sophisticated equipment and careful sample preparation, and it can be difficult to interpret the resulting images.
X射线全息显微镜术是一种用来生成微观物体的高分辨率三维图像的技术。这种技术基于全息术的原理,全息术记录波的相位和振幅以产生图像。在X射线全息术中,这种波是X射线束。
传统的光学显微镜使用可见光来成像物体,图像的分辨率受到光的波长的限制。X射线的波长比可见光短得多,因此理论上X射线显微镜可以产生分辨率更高的图像。
在X射线全息术中,一个相干的X射线束被分割成两条路径:一条路径与被成像的物体相互作用,另一条路径作为参考使用。然后将物体波和参考波结合形成一个全息图。这个全息图可以被重建成物体的3D图像。
X射线全息显微镜的一个主要优点是它可以用来成像厚样本和对可见光不透明的材料。然而,这种技术需要复杂的设备和精心的样品制备,并且解析结果图像可能会有困难。
人体类器官(Organoids)
类器官(Organoids)是模拟真实器官或组织的结构和功能的三维(3D)细胞培养。它们来源于干细胞,这种细胞具有自我更新和分化为各种细胞类型的能力。在实验室中,可以使用专门的技术和生长条件培养类器官,促使干细胞发育成特定器官的细胞并形成类似目标器官的复杂微型结构。
类器官在研究中具有重要作用,因为与传统的二维(2D)细胞培养相比,它们更准确地代表了人体器官。它们在各个领域具有广泛的应用,包括:
-
发育生物学:类器官可以帮助研究人员研究器官发育和组织形成的过程。
-
疾病建模:类器官可以从患者来源的干细胞中产生,使研究人员能够创建特定于疾病的模型,以研究各种疾病和病状的基本机制。
-
药物开发和测试:类器官为测试新药物和治疗方法提供了更具生理相关性的模型,有可能减少对动物模型的依赖,并提高药物开发的效率。
-
再生医学:类器官可用于开发新的组织修复和再生策略,可能为各种疾病和损伤提供新的治疗方法。
尽管类器官具有诸多优点,但它们也存在局限性,如缺乏血管、免疫细胞和其他真实器官中存在的成分。然而,正在进行的研究旨在改进类器官技术并克服这些局限性,进一步扩大其在生物医学研究中的潜在应用。
Organoids are three-dimensional (3D) cell cultures that mimic the structure and function of real organs or tissues. They are derived from stem cells, which have the ability to self-renew and differentiate into various cell types. Organoids can be grown in the lab using specialized techniques and growth conditions that encourage the stem cells to develop into organ-specific cells and form complex, miniature structures resembling the target organ.
Organoids have become an essential tool in research because they provide a more accurate representation of human organs compared to traditional two-dimensional (2D) cell cultures. They have numerous applications in various fields, including:
-
Developmental biology: Organoids can help researchers study the processes involved in organ development and tissue organization.
-
Disease modeling: Organoids can be generated from patient-derived stem cells, allowing researchers to create disease-specific models to study the underlying mechanisms of various diseases and conditions.
-
Drug development and testing: Organoids provide a more physiologically relevant model for testing new drugs and therapies, potentially reducing the reliance on animal models and increasing the efficiency of drug development.
-
Regenerative medicine: Organoids can be used to develop new strategies for tissue repair and regeneration, possibly leading to new treatments for various diseases and injuries.
Despite their advantages, organoids also have limitations, such as the lack of blood vessels, immune cells, and other components present in real organs. However, ongoing research aims to refine organoid technology and overcome these limitations, further expanding their potential applications in biomedical research.
RNAseq processing for organoids
-
install conda environment
# Set up the "rnaseq" conda environment used by the nf-core/rnaseq runs.
# The commands are kept in the order they were originally run, including the
# repeated installs, so the log stays a faithful record of the environment.

#conda config --set auto_activate_base false
conda create --name rnaseq python=3.7
#NOTE: mamba really is much faster; use mamba from now on!

#install packages
conda activate rnaseq
pip3 install deeptools
pip3 install multiqc
conda install -c bioconda stringtie subread gffread
conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
conda install nextflow=23.04
conda install fq
conda install -c bioconda umi_tools
conda install -c bioconda rsem
conda install -c bioconda salmon

#conda install some tools
#install R-packages,
conda install -c bioconda ucsc-bedclip
conda install -c bioconda ucsc-bedgraphtobigwig
conda install -c bioconda bioconductor-matrixgenerics
#conda install -c bioconda bioconductor-deseq2
conda install -c bioconda r-pheatmap
conda install -c anaconda gawk

conda install mamba -n base -c conda-forge
conda config --add channels conda-forge
#salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
mamba install -c bioconda salmon=1.10
conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
mamba install -c bioconda samtools=1.9
mamba install -c conda-forge r-optparse r-vctrs=0.5.0
conda install nextflow=23.04
mamba install -c bioconda qualimap
mamba install -c bioconda rseqc
mamba install -c conda-forge openssl
conda install -c bioconda ucsc-bedclip
conda install -c bioconda bedtools
#for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
conda update -c bioconda ucsc-bedclip
conda update -c bioconda ucsc-bedgraphtobigwig

# samtools should be >= 1.9 as only those have the option @
#samtools sort \
#    -@ 6 \
#    -o HSV.d2_r1.sorted.bam \
#    -T HSV.d2_r1.sorted \
#    HSV.d2_r1.Aligned.out.bam
-
Run UMI-tools without `--umitools_dedup_stats`; otherwise the deduplication step cannot finish on hamm.
-
Optimize UMI-tools parameters: Some parameters influence the memory usage of UMI-tools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (`--edit-distance-threshold`). This makes the deduplication process less memory-intensive but might also change the results.
-
Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.
# Notes on the umi_tools dedup memory problem, followed by the nextflow
# invocations used for the human and the virus references.

#https://github.com/nf-core/rnaseq/issues/827
#INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
#INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
#https://github.com/CGATOxford/UMI-tools/issues/173

# excessive dedup memory usage with output-stats #409
#https://github.com/CGATOxford/UMI-tools/issues/409
#umi_tools 1.0.1
#I am aware of previously closed issues:
#excessive dedup memory usage #173
#speed up stats #184
#Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
#Using --method=unique
#There still seems to be a memory problem with --output-stats
#Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
#Running without output-stats, job completes in about 3 minutes, with no problems.

#TRY STANDALONE RUNNING:
/usr/local/bin/python /usr/local/bin/umi_tools dedup \
    -I HSV.d8_r1.transcriptome.sorted.bam \
    -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam \
    --method=unique --random-seed=100
#/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100

#umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
#I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.

# NOTE(review): the capture-group name in --umitools_bc_pattern was lost when
# this page was exported (the text read "(?P .{12})", which is not a valid
# regex). "umi_1" is restored on the assumption that the first 12 nt of the
# read are the UMI -- confirm against the library prep protocol.

#--Human--
#hamm
/usr/local/bin/nextflow run rnaseq/main.nf \
    --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 \
    --with_umi --umitools_extract_method "regex" \
    --umitools_bc_pattern "^(?P<umi_1>.{12}).*" \
    -profile docker -resume \
    --max_cpus 54 --max_memory 120.GB --max_time 2400.h \
    --aligner 'star_salmon' --pseudo_aligner 'salmon' \
    --umitools_grouping_method 'unique'
    #--save_align_intermeds --save_unaligned --save_reference

#sage
nextflow run rnaseq/main.nf \
    --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 \
    --with_umi --umitools_extract_method "regex" \
    --umitools_bc_pattern "^(?P<umi_1>.{12}).*" \
    -profile test_full -resume \
    --max_memory 256.GB --max_time 2400.h \
    --aligner 'star_salmon' --pseudo_aligner 'salmon'
    #--save_align_intermeds --save_unaligned --save_reference

#--Virus--
/usr/local/bin/nextflow run rnaseq/main.nf \
    --input samplesheet.csv --outdir results_virus \
    --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" \
    --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf" \
    --with_umi --umitools_extract_method "regex" \
    --umitools_bc_pattern "^(?P<umi_1>.{12}).*" \
    --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq \
    -profile test_full -resume \
    --max_cpus 55 --max_memory 120.GB --max_time 2400.h \
    --save_align_intermeds --save_unaligned --save_reference \
    --aligner 'hisat2' \
    --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' \
    --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon' \
    --umitools_grouping_method 'unique'
-
-
R-code for evaluation of nextflow outputs
# Import the Salmon quantifications produced by nf-core/rnaseq (star_salmon)
# and build the DESeq2 objects used by all later steps.
# Outputs: transcript_counts.csv and gene_counts.csv in the working directory.
# Leaves in the session: txi, colData, dds (gene level, fitted), rld.

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)

# All relative paths below are resolved against this directory.
setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")

# Sample names; every path and factor below is derived from this one vector
# so the three can never get out of sync.
samples <- c("control_r1", "control_r2",
             "HSV.d2_r1", "HSV.d2_r2",
             "HSV.d4_r1", "HSV.d4_r2",
             "HSV.d6_r1", "HSV.d6_r2",
             "HSV.d8_r1", "HSV.d8_r2")

# Paths to the Salmon quantification files ("./<sample>/quant.sf")
files <- setNames(file.path(".", samples, "quant.sf"), samples)

# Replicate ("r1"/"r2") and condition ("control", "HSV.d2", ...) per sample
replicate <- factor(sub("^.*_", "", samples))
condition <- factor(sub("_r[0-9]+$", "", samples))

# colData for DESeq2
colData <- data.frame(condition = condition, replicate = replicate,
                      row.names = names(files))

# -- transcript-level count data --
# Only the raw counts are exported here, so no DESeq()/rlog() fit is run on
# the transcript-level object: the fitted objects were overwritten by the
# gene-level ones below without ever being used.
txi_tx <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
dds_tx <- DESeqDataSetFromTximport(txi_tx, colData = colData,
                                   design = ~condition)
write.csv(counts(dds_tx), file = "transcript_counts.csv")

# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header = FALSE,
                      stringsAsFactors = FALSE)
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# tximport only needs transcript_id -> gene_id; drop gene_name
tx2gene <- tx2gene[, 1:2]

# Import and summarize the Salmon data to gene level
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
dds <- DESeqDataSetFromTximport(txi, colData = colData, design = ~condition)

# Optional pre-filter (retain genes with a count > 3 in more than 2 samples).
# Not required: DESeq2 applies its own independent filtering in results().
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543

dds <- DESeq(dds)
# rlog() is the same function as the older alias rlogTransformation()
rld <- rlog(dds)
write.csv(counts(dds, normalized = FALSE), file = "gene_counts.csv")

#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

dim(counts(dds))
head(counts(dds), 10)
-
draw 3D PCA plots.
# PCA table (all PCs) for the 3D plot, plus PCA/heatmap figures before and
# after removing the replicate (batch) effect from the rlog values.
# Expects `rld` from the DESeq2 step; its colData columns are `condition`
# and `replicate`. Overwrites assay(rld) with the batch-corrected matrix.

library(gplots)
library("RColorBrewer")
library(ggplot2)

# PC1/PC2 plus sample annotation as computed by DESeq2
data <- plotPCA(rld, intgroup = c("condition", "replicate"), returnData = TRUE)
write.csv(data, file = "plotPCA_data.csv")

# Calculate all PCs including PC3, on the 500 most variable genes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t(assay(rld)[select, ])
pc <- prcomp(mat)
pc$x[, 1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc))  #-->TRUE

# Replace plotPCA's PC1/PC2 with the full set of PCs from prcomp
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL  # merge() key column, now the row names
merged_df$name <- NULL       # remove the "name" column
# All PC columns first (PC1..PC10 for the 10 samples), then the annotation
merged_df <- merged_df[, c(colnames(df_pc), "group", "condition", "replicate")]
write.csv(merged_df, file = "merged_df_10PCs.csv")
summary(pc)
#0.5333 0.2125 0.06852

# The 3D plot itself is drawn outside R with draw_3D.py
# (presumably from merged_df_10PCs.csv -- confirm against that script).

# -- before pca --
# print() is required so the plot is also drawn when the file is source()d
png("pca_before_removeBatch2.png", 1200, 800)
print(plotPCA(rld, intgroup = c("condition")))
dev.off()

# -- before heatmap --
png("heatmap_before_removeBatch2.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none",
          col = rev(hmcol), margin = c(13, 13))
dev.off()

# -- remove the batch effect --
# The original lines used `~replicates` and `rld$batch`, neither of which
# exists in this colData. Here the replicate is treated as the batch and the
# condition effect is protected via the design matrix.
# NOTE(review): confirm that replicate is the intended batch variable.
mat <- assay(rld)
mm <- model.matrix(~condition, colData(rld))
mat <- limma::removeBatchEffect(mat, batch = rld$replicate, design = mm)
assay(rld) <- mat

# -- after pca --
png("pca_after_removeBatch.png", 1200, 800)
#svg("pca_after_removeBatch.svg")
print(plotPCA(rld, intgroup = c("condition")))
dev.off()

# -- after heatmap --
png("heatmap_after_removeBatch.png", 1200, 800)
#svg("heatmap_after_removeBatch.svg")
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none",
          col = rev(hmcol), margin = c(13, 13))
dev.off()
-
(optional) estimate size factors
# (optional) Per-sample size factors, used to scale bigWig tracks with
# deepTools (bamCoverage --scaleFactor takes the inverse of the size factor).
# Expects the fitted gene-level `dds` built from tximport.

head(dds)
# class: DESeqDataSet
# dim: 6 10
# metadata(1): version
# assays(6): counts avgTxLength ... H cooks
# rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460 ENSG00000000938
# rowData names(34): baseMean baseVar ... deviance maxCooks
# colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
# colData names(2): condition replicate

#convert bam to bigwig using deepTools by feeding inverse of DESeq's size Factor
sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
normalized_counts <- counts(dds, normalized = TRUE)
#write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)

# ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
# For a tximport-derived object the "avgTxLength" assay is present, and the
# normalization is stored as the gene x sample "normalizationFactors" assay
# rather than as one size factor per sample (see names(assays(dds)) below).
names(assays(dds))
# counts avgTxLength normalizationFactors mu H cooks
head(assays(dds)$counts)                # count data
head(assays(dds)$avgTxLength)           # average transcript length
head(assays(dds)$normalizationFactors)  # per-gene, per-sample factors

# Length-corrected per-sample size factors, as suggested in the thread above.
# estimateSizeFactorsForMatrix() has no `normMatrix` argument, so the counts
# are divided by the average transcript lengths before the call.
nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds) / nm)
sf

# Manual median-of-ratios size factors from the raw counts: for each sample,
# the median over genes of count / geometric mean of that gene across samples.
# Genes with a zero count in any sample have a geometric mean of 0 and are
# excluded. The result is one value PER SAMPLE (the earlier version reduced
# the whole matrix to a single scalar).
cts <- counts(dds)
log_geo_means <- rowMeans(log(cts))
sf_manual <- apply(cts, 2, function(cnts) {
  usable <- is.finite(log_geo_means) & cnts > 0
  exp(median((log(cnts) - log_geo_means)[usable]))
})
sizeFactors(dds) <- sf_manual
# ---- DEBUG END ----

# Scale factors for bamCoverage are the inverse of the size factors
sizeFactors(dds)
1 / sizeFactors(dds)

# Worked example (in the console) from an earlier two-sample project:
# > sizeFactors(dds)
# HeLa_TO_r1 HeLa_TO_r2
#  0.9978755  1.1092227
# 1/0.9978755=1.002129023
# 1/1.1092227=0.901532217
#
# Shell commands (deepTools):
#bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor <1/sizeFactor> --effectiveGenomeSize 2864785220
#bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023 --effectiveGenomeSize 2864785220
#bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor 0.901532217 --effectiveGenomeSize 2864785220
-
differential expression analysis
#A central method for exploring differences between groups of segments or samples is to perform differential gene expression analysis.
#
# For every pairwise comparison between conditions, refit DESeq2 with the
# appropriate reference level, annotate the results via Ensembl BioMart and
# write three files per comparison: <cmp>-all.txt, <cmp>-up.txt and
# <cmp>-down.txt (CSV-formatted despite the extension).
# Expects the gene-level `dds` from the import step.

library(biomaRt)
library(dplyr)
library(tidyverse)

# -- annotation source --
##https://bioconductor.statistik.tu-dortmund.de/packages/3.7/data/annotation/
#BiocManager::install("EnsDb.Mmusculus.v79")
#library(EnsDb.Mmusculus.v79)
#edb <- EnsDb.Mmusculus.v79
#https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
listEnsembl()
listMarts()
#ensembl <- useEnsembl(biomart = "genes", mirror="asia")  # default is Mouse strains 104
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", mirror = "www")
#ensembl = useMart("ensembl_mart_44", dataset="hsapiens_gene_ensembl",archive=TRUE, mysql=TRUE)
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", version="104")
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="86")
#--> total 69, 27 GRCh38.p7 and 39 GRCm38.p4; we should take 104, since rnaseq-pipeline is also using annotation of 104!
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version = "104")
datasets <- listDatasets(ensembl)
#--> total 202   80 GRCh38.p13         107 GRCm39
#80   hsapiens_gene_ensembl    Human genes (GRCh38.p13)   GRCh38.p13
#107  mmusculus_gene_ensembl   Mouse genes (GRCm39)       GRCm39
# > listEnsemblArchives()
#              name     date                                 url version
# 1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37
# 2     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109
# 3     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108
# 4     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107
# 5     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106
# 6     Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105
# 7     Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104
attributes <- listAttributes(ensembl)
attributes[1:25, ]
#https://www.ncbi.nlm.nih.gov/grc/human
#BiocManager::install("org.Mmu.eg.db")
#library("org.Mmu.eg.db")
#edb <- org.Mmu.eg.db
#
#https://bioconductor.statistik.tu-dortmund.de/packages/3.6/data/annotation/
#EnsDb.Mmusculus.v79
#> query(hub, c("EnsDb", "apiens", "98"))
#columns(edb)
#searchAttributes(mart = ensembl, pattern = "symbol")
##https://www.geeksforgeeks.org/remove-duplicate-rows-based-on-multiple-columns-using-dplyr-in-r/
#df <- data.frame (lang =c ('Java','C','Python','GO','RUST','Javascript', 'Cpp','Java','Julia','Typescript','Python','GO'), value = c (21,21,3,5,180,9,12,20,6,0,3,6), usage =c(21,21,0,99,44,48,53,16,6,8,0,6))
#distinct(df, lang, .keep_all= TRUE)

# Annotate and export the results of one comparison.
#   dds        fitted DESeqDataSet whose resultsNames() contains
#              "condition_<comparison>"
#   comparison label such as "HSV.d2_vs_control"
#   mart       biomaRt Mart object used for the gene annotation
# Returns the annotated results table invisibly.
export_contrast <- function(dds, comparison, mart) {
  contrast <- paste("condition", comparison, sep = "_")
  res <- results(dds, name = contrast)
  res <- res[!is.na(res$log2FoldChange), ]

  #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
  #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
  # In the ENSEMBL-database, GENEID is ENSEMBL-ID.
  #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE"))  # "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
  #geness <- geness[!duplicated(geness$GENEID), ]

  # using getBM replacing AnnotationDbi::select
  # filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_id.
  geness <- getBM(
    attributes = c("ensembl_gene_id", "external_gene_name", "gene_biotype",
                   "entrezgene_id", "chromosome_name", "start_position",
                   "end_position", "strand", "description"),
    filters = "ensembl_gene_id",
    values = rownames(res),
    mart = mart
  )
  # getBM can return several rows per gene; keep the first one
  geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all = TRUE)

  # Merge annotation and statistics on the Ensembl gene ID; genes without an
  # annotation record are dropped by this (inner) merge.
  res$ENSEMBL <- rownames(res)
  res_df <- as.data.frame(res)
  geness_res <- merge(geness_uniq, res_df,
                      by.x = "ensembl_gene_id", by.y = "ENSEMBL")
  rownames(geness_res) <- geness_res$ensembl_gene_id
  geness_res$ensembl_gene_id <- NULL

  write.csv(as.data.frame(geness_res[order(geness_res$pvalue), ]),
            file = paste(comparison, "all.txt", sep = "-"))
  up <- subset(geness_res, padj <= 0.05 & log2FoldChange >= 2)
  down <- subset(geness_res, padj <= 0.05 & log2FoldChange <= -2)
  write.csv(as.data.frame(up[order(up$log2FoldChange, decreasing = TRUE), ]),
            file = paste(comparison, "up.txt", sep = "-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange), decreasing = TRUE), ]),
            file = paste(comparison, "down.txt", sep = "-"))
  invisible(geness_res)
}

# Comparisons to export, keyed by the reference level they require.
# Previously `clist` was reassigned after each relevel and the export loop ran
# only once at the end, so a non-interactive run exported only the last set.
comparisons <- list(
  control = c("HSV.d2_vs_control", "HSV.d4_vs_control",
              "HSV.d6_vs_control", "HSV.d8_vs_control"),
  HSV.d2 = c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2"),
  HSV.d4 = c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4"),
  HSV.d6 = c("HSV.d8_vs_HSV.d6")
)

for (ref in names(comparisons)) {
  dds$condition <- relevel(dds$condition, ref)
  dds <- DESeq(dds, betaPrior = FALSE)
  print(resultsNames(dds))
  clist <- comparisons[[ref]]
  for (i in clist) {
    export_contrast(dds, i, ensembl)
  }
}

#-- show methods of class DESeq2 --
#x=capture.output(showMethods(class="DESeq2"))
#unlist(lapply(strsplit(x[grep("Function: ",x,)]," "),function(x) x[2]))
-
volcano plots with automatic selection of the top genes to label (top_g)
#A canonical visualization for interpreting differential gene expression results is the volcano plot.
#
# This bash script PRINTS R code (one volcano plot per comparison). Paste or pipe its
# output into the R session / working directory that contains the "<comparison>-all.txt"
# tables. Afterwards it renames the "Color" column of the exported CSVs and prints the
# csv_to_xls.py commands that bundle the all/up/down tables into one .xls per comparison.

comparisons="HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6"

# ggrepel provides geom_text_repel(); emitted once so the generated R code is self-contained.
echo "library(ggrepel)"

for i in $comparisons; do
#HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control
#for i in K3R_24hdox_vs_K3R_3hdox21hchase WT_3hdox21hchase_vs_K3R_3hdox21hchase; do
#for i in WT_24hdox_vs_K3R_24hdox; do
#for i in WT_24hdox_vs_WT_3hdox21hchase; do
  # read files to geness_res
  echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"

  # assign the significance category; later assignments override earlier ones, so
  # |log2FC| < 2.0 always ends up as "NS or log2FC < 2.0"
  echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
  echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
  echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
  echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
  echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P < 0.05\", \"P-adj < 0.05\"))"
  echo "write.csv(geness_res, \"${i}_with_Category.csv\")"

  # pick top genes for either side of volcano to label
  # order genes for convenience:
  echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
  # head(..., 100) instead of [1:100]: no NA indices when a table has fewer than 100 rows
  echo "top_g <- unique(c( \
        geness_res[, 'external_gene_name'][head(order(geness_res[, 'invert_P'], decreasing = TRUE), 100)], \
        geness_res[, 'external_gene_name'][head(order(geness_res[, 'invert_P'], decreasing = FALSE), 100)]))"
  echo "geness_res\$invert_P <- NULL"  # remove invert_P from matrix

  # preview of the genes that will be labelled; must come AFTER top_g has been computed
  echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"

  # Graph results
  echo "png(\"${i}.png\",width=1200, height=2000)"
  echo "p <- ggplot(geness_res, \
        aes(x = log2FoldChange, y = -log10(pvalue), \
            color = Color, label = external_gene_name)) + \
        geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
        geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
        geom_point() + \
        labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
        scale_color_manual(values = c(\"P-adj < 0.05\"=\"darkblue\",\"P < 0.05\"=\"lightblue\",\"NS or log2FC < 2.0\"=\"darkgray\"),guide = guide_legend(override.aes = list(size = 4))) + \
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
        geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
        theme_bw(base_size = 16) + \
        theme(legend.position = \"bottom\")"
  # explicit print(): the plot is also drawn when the generated code is source()d
  echo "print(p)"
  echo "dev.off()"
done

# Run AFTER the generated R code has written the *_with_Category.csv files.
# Only the quoted header cell on line 1 is renamed, so data cells containing "Color" stay untouched.
sed -i -e '1s/"Color"/"Category"/' *_Category.csv

for i in $comparisons; do
  echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done
-
clustering the genes and draw heatmap
# ---- [R] packages -----------------------------------------------------------
# gplots provides heatmap.2() and bluered(); install it only when it is missing.
if (!requireNamespace("gplots", quietly = TRUE)) install.packages("gplots")
library("gplots")

# ---- [bash] collect the ids of all up-/down-regulated genes -----------------
# The loop prints the cut commands that extract the first CSV column (gene ids).
for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
  echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
  echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done
# Pasted line counts of the resulting files (presumably from `wc -l *.id`):
#      5 HSV.d2_vs_control-down.id
#     14 HSV.d2_vs_control-up.id
#     77 HSV.d4_vs_control-down.id
#    460 HSV.d4_vs_control-up.id
#    977 HSV.d6_vs_control-down.id
#   1863 HSV.d6_vs_control-up.id
#   1361 HSV.d8_vs_control-down.id
#   1215 HSV.d8_vs_control-up.id
#     35 HSV.d4_vs_HSV.d2-down.id
#    205 HSV.d4_vs_HSV.d2-up.id
#    832 HSV.d6_vs_HSV.d2-down.id
#   1550 HSV.d6_vs_HSV.d2-up.id
#    386 HSV.d6_vs_HSV.d4-down.id
#    103 HSV.d6_vs_HSV.d4-up.id
#   1136 HSV.d8_vs_HSV.d2-down.id
#   1050 HSV.d8_vs_HSV.d2-up.id
#    598 HSV.d8_vs_HSV.d4-down.id
#    292 HSV.d8_vs_HSV.d4-up.id
#    305 HSV.d8_vs_HSV.d6-down.id
#    133 HSV.d8_vs_HSV.d6-up.id
#  12597 total

# Merge all ids into one file. This replaces the former manual step
# ("add Gene_Id in the first line, delete the \"\""): quotes are stripped, the empty
# header cells of the CSVs are dropped and the column name read by R is prepended.
{ echo "Gene_Id"; cat *.id | tr -d '"' | grep -v '^$' | sort -u; } > ids

# ---- [R] cluster the genes and draw the heatmap ------------------------------
GOI <- read.csv("ids")$Gene_Id
RNASeq.NoCellLine <- assay(rld)

#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#correlation methods: pearson or spearman
datamat <- RNASeq.NoCellLine[GOI, ]
write.csv(as.data.frame(datamat), file = "significant_gene_expressions.txt")

# hr: gene (row) tree; hc: sample (column) tree, kept for interactive use (Colv = NA below)
hr <- hclust(as.dist(1 - cor(t(datamat), method = "pearson")), method = "complete")
hc <- hclust(as.dist(1 - cor(datamat, method = "spearman")), method = "complete")
mycl <- cutree(hr, h = max(hr$height) / 1.05)

# one colour per gene cluster, in cluster-number order
cluster_palette <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN")
if (max(mycl) > length(cluster_palette)) {
  stop("cutree() produced ", max(mycl), " clusters but only ",
       length(cluster_palette), " cluster colours are defined", call. = FALSE)
}
mycol <- cluster_palette[as.vector(mycl)]

# Column side colours, derived from the column names so the code works for any number
# of samples (the former fixed list of 10 names failed for the 9-sample run).
# NOTE(review): assumes columns are named "<condition>_r<replicate>" (e.g. "HSV.d2_r1"),
# as in the tximport `files` vector; unmatched columns fall back to GREY -- confirm.
condition_colors <- c(control = "DARKBLUE", HSV.d2 = "DARKRED", HSV.d4 = "DARKORANGE",
                      HSV.d6 = "DARKGREEN", HSV.d8 = "DARKCYAN")
sample_condition <- sub("[_ .]r[0-9]+$", "", colnames(datamat))
sampleCols <- unname(condition_colors[sample_condition])
sampleCols[is.na(sampleCols)] <- "GREY"
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'

png("DEGs_heatmap.png", width=1000, height=1200)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75),
            RowSideColors = mycol, ColSideColors = sampleCols, labRow="", margins=c(22,10), cexRow=8, cexCol=2, srtCol=20, lwid=c(1,7), lhei = c(1, 8))
#legend("top", title = "",legend=c("WaGa_RNA","MKL1_RNA","WaGa_EV_RNA","MKL1_EV_RNA"), fill=c("DARKBLUE","DARKRED","DARKORANGE","DARKGREEN"), cex=0.8, box.lty=0)
dev.off()

# One member list per cluster that actually exists, named cluster<k>_<COLOUR>.txt
# (for clusters 1..5 these are the same file names as before).
for (k in sort(unique(mycl))) {
  write.csv(names(mycl)[mycl == k],
            file = paste0("cluster", k, "_", cluster_palette[k], ".txt"))
}

# ---- [bash] export to xls -----------------------------------------------------
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_expression_data.xls;
Prepare virus GTF for nextflow run
-
Prepare GTF for non-model virus
-
The gffread command you’re using is designed to convert GFF format files to GTF format, but it doesn’t necessarily preserve all the attribute information. The -T option enforces creation of gene_id and transcript_id attributes, which are mandatory in GTF format, and gffread takes these from the ID and Parent fields of the input GFF file, respectively.
-
The GTF format is simpler than GFF and doesn’t accommodate all the possible attributes of a GFF file. That’s why you’re seeing a reduction in information in your converted file.
-
If you need to retain all information from the GFF file, you may need to do some post-processing to add the extra attributes back into the GTF file. However, keep in mind that downstream tools which expect GTF format may not correctly handle extra attributes.
# -- Deprecated processing for virus gtf --
# NOT_USED, since it changed a lot!
#   #gffread X14112.1.gff -T -o X14112.1.gtf
#   cp X14112.1.gff3 X14112.1.gff3_backup
#   grep "^##" X14112.1.gff3 > X14112.1_gene.gff3
#   grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff
#   #!!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'!
#   #sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene_.gff  # since default is --featurecounts_feature_type 'exon'
#
# -- New processing for virus gtf --
#   gffread X14112.1_orig.gff -T -o X14112.1_v2.gtf
#   python3 add_gene_id.py  # X14112.1_v2.gtf --> X14112.1_v3.gtf
#------------------------------------
# add_gene_id.py


def _attribute_keys(attributes):
    """Return the set of attribute keys found in a GTF attribute column."""
    keys = set()
    for field in attributes.split(';'):
        parts = field.split(None, 1)
        if parts:
            keys.add(parts[0])
    return keys


def _transcript_id_value(attributes):
    """Return the (still quoted) value of the transcript_id attribute, or ''."""
    for field in attributes.split(';'):
        parts = field.split(None, 1)
        # Exact key match: 'orig_transcript_id' and similar keys must not be picked up.
        if len(parts) == 2 and parts[0] == 'transcript_id':
            return parts[1].strip()
    return ''


def add_missing_gene_id(input_gtf, output_gtf):
    """Copy a GTF file, adding a gene_id attribute to records that lack one.

    For every feature line whose attribute column (column 9) has no ``gene_id``
    key but has a ``transcript_id``, ``gene_id <transcript_id value>; `` is
    prepended to the attributes. All other lines are copied unchanged:
    comment/header lines starting with ``#``, blank lines, and lines with
    fewer than nine tab-separated columns.

    Args:
        input_gtf: Path of the GTF file to read.
        output_gtf: Path of the GTF file to write (overwritten if it exists).
    """
    with open(input_gtf, 'r') as in_gtf, open(output_gtf, 'w') as out_gtf:
        for line in in_gtf:
            # Strip only the line terminator so exactly one '\n' is written back.
            record = line.rstrip('\r\n')
            elements = record.split('\t')
            if not record.startswith('#') and len(elements) >= 9:
                attributes = elements[8]
                if 'gene_id' not in _attribute_keys(attributes):
                    transcript_id = _transcript_id_value(attributes)
                    # Prepend transcript_id as gene_id if not empty
                    if transcript_id != '':
                        elements[8] = f'gene_id {transcript_id}; {attributes}'
                        record = '\t'.join(elements)
            out_gtf.write(record + '\n')


# Use the function (only when run as a script, so importing the module has no side effects)
if __name__ == '__main__':
    input_gtf = 'X14112.1_v2.gtf'  # Path to your input GTF
    output_gtf = 'X14112.1_v3.gtf'  # Path to the output GTF
    add_missing_gene_id(input_gtf, output_gtf)
-
Human herpesvirus 1, also known as Herpes Simplex Virus type 1 (HSV-1), is a virus with a complex genome encoding around 70-80 genes. The number of genes can vary slightly depending on the specific strain of HSV-1, as well as the methodologies used to identify and annotate the genes.
-
IE175, also known as ICP4 (Infected Cell Polypeptide 4), is a protein encoded by the Human herpesvirus 1 (HSV-1). The gene for this protein is also referred to as the IE (immediate early) gene 3, and the protein it encodes is a major regulatory protein.
-
In the lifecycle of HSV-1, immediate early genes are the first set of genes to be transcribed following infection. The proteins produced from these genes then regulate the expression of early and late genes that are involved in viral DNA replication and the production of viral structural proteins.
-
ICP4, in particular, is essential for the onset of viral replication. It acts as a trans-activator, promoting transcription of other viral genes. It can also interact with host cell proteins and influence host gene expression. As a result of these functions, ICP4 plays a key role in the pathogenesis of HSV-1 infection.
-
Please note that the naming convention for viral genes and proteins can sometimes be inconsistent, with multiple names referring to the same gene or protein. IE175, ICP4, and IE gene 3 all refer to the same gene in HSV-1.
# Delete the records if they are intron or manually add gene_name to the records without gene_name. cp X14112.1_v3.gtf X14112.1_v4.gtf #Find all recoreds without "gene_name" grep -v "gene_name" X14112.1_v4.gtf #-->Delete intron records: grep "intron" X14112.1.gff3_orig DEL X14112.1 EMBL transcript 4953 6907 . - . transcript_id "id-X14112.1:4953..6907"; gene_id "id-X14112.1:4953..6907" DEL X14112.1 EMBL exon 4953 6907 . - . gene_id "id-X14112.1:4953..6907"; transcript_id "id-X14112.1:4953..6907"; DEL X14112.1 EMBL transcript 132374 132539 . + . transcript_id "id-X14112.1:132374..132539"; gene_id "id-X14112.1:132374..132539" DEL X14112.1 EMBL exon 132374 132539 . + . gene_id "id-X14112.1:132374..132539"; transcript_id "id-X14112.1:132374..132539"; DEL X14112.1 EMBL transcript 145649 145860 . - . transcript_id "id-X14112.1:145649..145860"; gene_id "id-X14112.1:145649..145860" DEL X14112.1 EMBL exon 145649 145860 . - . gene_id "id-X14112.1:145649..145860"; transcript_id "id-X14112.1:145649..145860"; # or update: grep "146805" X14112.1_orig.gff UPDATE X14112.1 EMBL transcript 146805 151063 . + . transcript_id "rna-X14112.1:146805..151063"; gene_id "rna-X14112.1:146805..151063" UPDATE X14112.1 EMBL exon 146805 151063 . + . gene_id "rna-X14112.1:146805..151063"; transcript_id "rna-X14112.1:146805..151063"; --> transcript_id "rna-IE175"; gene_id "gene-IE175"; gene_name "IE175"; --> transcript_id "rna-IE175"; gene_id "gene-IE175"; gene_name "IE175"; # or update: grep "133941" X14112.1_orig.gff UPDATE X14112.1 EMBL transcript 133941 146107 . - . transcript_id "rna-X14112.1:133941..146107"; gene_id "rna-X14112.1:133941..146107" UPDATE X14112.1 EMBL exon 133941 145648 . - . gene_id "rna-X14112.1:133941..146107"; transcript_id "rna-X14112.1:133941..146107"; UPDATE X14112.1 EMBL exon 145861 146107 . - . 
gene_id "rna-X14112.1:133941..146107"; transcript_id "rna-X14112.1:133941..146107"; --> transcript_id "rna-IE68"; gene_id "rna-IE68"; gene_name "IE68"; --> gene_id "rna-IE68"; transcript_id "rna-IE68"; gene_name "IE68"; --> gene_id "rna-IE68"; transcript_id "rna-IE68"; gene_name "IE68";
-
(optional) Consider giving every exon and CDS record a distinct name, for example exon-RL2-1, exon-RL2-2, cds-RL2-1. This may not be necessary, since the output contains only transcript-type records.
-
-
Run nextflow for virus
# Run nf-core/rnaseq against the virus reference (X14112.1) with HISAT2.
# The typographic dashes/quotes of the pasted command were restored to "--" and ASCII quotes.
# NOTE(review): the name of the regex group was lost in the paste; "umi_1" (UMI-tools
# naming for the first UMI group, here the first 12 nt of the read) is assumed -- confirm.
docker pull nfcore/rnaseq
/usr/local/bin/nextflow run rnaseq/main.nf \
    --input samplesheet.csv \
    --outdir results_virus \
    --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" \
    --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf" \
    --with_umi \
    --umitools_extract_method "regex" \
    --umitools_bc_pattern '^(?P<umi_1>.{12}).*' \
    --umitools_dedup_stats \
    --skip_rseqc --skip_dupradar --skip_preseq \
    -profile docker -resume \
    --max_cpus 55 --max_memory 120.GB --max_time 2400.h \
    --save_align_intermeds --save_unaligned --save_reference \
    --aligner 'hisat2' \
    --gtf_extra_attributes 'gene_name' \
    --gtf_group_features 'gene_id' \
    --featurecounts_group_type 'gene_name' \
    --featurecounts_feature_type 'exon' \
    --umitools_grouping_method 'unique'
-
Run nextflow for human using GRCh38 genome
# Run nf-core/rnaseq against the human GRCh38 genome with STAR + Salmon.
# The typographic dashes/quotes of the pasted command were restored to "--" and ASCII quotes.
# NOTE(review): the name of the regex group was lost in the paste; "umi_1" (UMI-tools
# naming for the first UMI group, here the first 12 nt of the read) is assumed -- confirm.
# NOTE(review): the error log under BUG_2 in these notes reports "req: 128 GB; avail: 125.8 GB";
# --max_memory may need to be lowered on that host -- confirm.
docker pull nfcore/rnaseq
/usr/local/bin/nextflow run rnaseq/main.nf \
    --input samplesheet.csv \
    --outdir results_GRCh38 \
    --genome GRCh38 \
    --with_umi \
    --umitools_extract_method "regex" \
    --umitools_bc_pattern '^(?P<umi_1>.{12}).*' \
    --umitools_dedup_stats \
    --skip_rseqc --skip_dupradar --skip_preseq \
    -profile docker -resume \
    --max_cpus 55 --max_memory 128.GB --max_time 2400.h \
    --save_align_intermeds --save_unaligned --save_reference \
    --aligner 'star_salmon' \
    --pseudo_aligner 'salmon' \
    --gtf_extra_attributes 'gene_name' \
    --gtf_group_features 'gene_id' \
    --featurecounts_group_type 'gene_biotype' \
    --featurecounts_feature_type 'exon' \
    --umitools_grouping_method 'unique'
3.1. BUG_1 for running d8_r1 due to memory
# in modules/nf-core/umitools/dedup/main.nf
process UMITOOLS_DEDUP {
tag "$meta.id"
//REMOVED label "process_medium"
//ADDED
label 'high_memory' // this needs to be defined in your config file
cpus 55 // adjust as per your system's capabilities
ERROR ~ Module compilation error
- file : /mnt/h1/jhuang/DATA/Data_Manja_RNAseq_Organoids/rnaseq/./workflows/../subworkflows/nf-core/bam_dedup_stats_samtools_umitools/../../../modules/nf-core/umitools/dedup/main.nf
- cause: Unexpected character: '#' @ line 3, column 5.
#label "process_medium"
^
3.2. BUG_2 for running d8_r1 due to memory
# in conf/test_full.config
// Resource overrides inside the Nextflow `process` configuration scope.
process {
//ADDED
// Applies to every process that carries the label 'high_memory'
// (the label added to UMITOOLS_DEDUP in BUG_1 of these notes).
withLabel: 'high_memory' {
memory = '120 GB' // adjust as per your system's capabilities
}
// Applies to the process selected by the name 'UMITOOLS_DEDUP'.
withName: 'UMITOOLS_DEDUP' {
time = '160.h' // Adjust the time limit to your needs
}
}
ERROR ~ Error executing process > 'NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP (control_r2)'
Caused by:
Process requirement exceeds available memory -- req: 128 GB; avail: 125.8 GB
Command executed:
PYTHONHASHSEED=0 umi_tools \
dedup \
-I control_r2.transcriptome.sorted.bam \
-S control_r2.umi_dedup.transcriptome.sorted.bam \
--output-stats control_r2.umi_dedup.transcriptome.sorted \
--method='unique' --random-seed=100
cat <<-END_VERSIONS > versions.yml
"NFCORE_RNASEQ:RNASEQ:BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP":
umitools: $(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *$//')
END_VERSIONS
Command exit status:
-
R-code for evaluation of nextflow outputs
# Import Salmon quantifications produced by nf-core/rnaseq (star_salmon) and build
# the DESeq2 objects used by the downstream analysis.
# Outputs: transcript_counts.csv (raw transcript-level counts), gene_counts.csv (raw
# gene-level counts); leaves `txi`, `dds` (after DESeq) and `rld` (rlog) at GENE level.

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)

setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique_9samples/star_salmon")

# Define paths to your Salmon output quantification files
# (9 samples: HSV.d8_r1 is not part of this run)
files <- c("control_r1" = "./control_r1/quant.sf",
           "control_r2" = "./control_r2/quant.sf",
           "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
           "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
           "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
           "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
           "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
           "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
           "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")

# Define the replicates and condition of the samples.
# Levels are given explicitly so that "control" is the reference level regardless of
# the locale's sort order (in the C locale "HSV.d2" would sort before "control").
replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r2"))
condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8"),
                    levels = c("control", "HSV.d2", "HSV.d4", "HSV.d6", "HSV.d8"))

# Define the colData for DESeq2 (shared by the transcript- and gene-level objects)
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))

# -- transcript-level count data --
# Import the transcript abundance data with tximport
txi_tx <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
dds_tx <- DESeqDataSetFromTximport(txi_tx, colData=colData, design=~condition)
# Output raw count data to a CSV file. Raw counts do not require DESeq()/rlog, so the
# expensive transcript-level fit (whose results were overwritten below anyway) is skipped.
write.csv(counts(dds_tx), file="transcript_counts.csv")

# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]

# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)

# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)

# In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
# Filter data to retain only genes with more than 2 counts > 3 across all samples
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]  #60605-->26543

# Run DESeq2
dds <- DESeq(dds)
# Perform rlog transformation
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")

#TODO: why a lot of reads were removed due to the too_short?
# (shell command, not R -- kept as a comment so this script can be sourced)
# STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

dim(counts(dds))
head(counts(dds), 10)
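WHY: too many reads are reported as “Unmapped: too short” by STAR in the nextflow run. According to the STAR manual, the absolute thresholds (--outFilterMatchNmin, --outFilterScoreMin) default to 0, but the read-length-relative thresholds (--outFilterMatchNminOverLread, --outFilterScoreMinOverLread) default to 0.66, so an alignment that covers less than 66% of the read length is counted as “too short”. ASSUMPTION: the umi_tools dedup counts as “Unmapped: too short”. Check the intermediate bam files.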
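Note that, according to the STAR manual, only the absolute thresholds (--outFilterMatchNmin and --outFilterScoreMin) default to 0. The read-length-normalized thresholds --outFilterScoreMinOverLread and --outFilterMatchNminOverLread default to 0.66. With the default settings, an alignment is therefore rejected as “too short” when its score or its number of matched bases is below 66% of the read length (the sum of both mates’ lengths for paired-end reads); the read itself does not have to be short.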
The “Unmapped: too short” statistic in the STAR output usually refers to reads that didn’t map well enough to the reference genome. In other words, either the read didn’t match sufficiently to any location in the genome, or the best matches were not significantly better than other matches to be unique.
If you’re seeing a high proportion of “Unmapped: too short” reads, it could be due to several reasons:
- The quality of the reads might be poor, leading to low mapping efficiency.
- The reference genome might not be the correct one for your data.
- If your data is single-end reads, it might contain a high proportion of reads shorter than the default minimum allowed length.
For troubleshooting, you could:
- Check the quality of your reads using a tool like FastQC.
- Ensure you’re using the correct reference genome.
- If you’re working with single-end data, consider checking the length distribution of your reads. If many reads are shorter than the minimum length threshold, consider adjusting this threshold if appropriate.
- If you’re working with paired-end data, make sure both of the reads in a pair pass the quality controls. STAR is sensitive to inconsistencies between paired reads.
Finally, always make sure your read preprocessing steps (like trimming for quality and adapter sequences) are performed correctly as these can greatly affect the downstream analysis.
Yersinia outer proteins (Yops) analysis
-
This step uses rsync to download data from the NCBI server to a local directory, save all gff-files in the directory prokka.
rsync --copy-links --recursive --times --verbose rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/696/305/GCF_001696305.1_UCN72.1 Yersinia_pestis_1045 GCF_001656035.1_ASM165603v1_genomic.fna.gz 070 status=suppressed jhuang@hamburg:~/DATA/Data_Gunnar_Yersiniomics$ cp data/Yersinia_pseudotuberculosis_PB1+/GCF_000020085.1_ASM2008v1/GCF_000020085.1_ASM2008v1_genomic.fna.gz assembly/Yersinia_pseudotuberculosis_PB1+.fna.gz cp: cannot stat 'data/Yersinia_pseudotuberculosis_PB1+/GCF_000020085.1_ASM2008v1/GCF_000020085.1_ASM2008v1_genomic.fna.gz': No such file or directory 088 jhuang@hamburg:~/DATA/Data_Gunnar_Yersiniomics$ cp data/Yersinia_pseudotuberculosis_YPIII/GCF_000019465.1_ASM1946v1/GCF_000019465.1_ASM1946v1_genomic.fna.gz assembly/Yersinia_pseudotuberculosis_YPIII.fna.gz cp: cannot stat 'data/Yersinia_pseudotuberculosis_YPIII/GCF_000019465.1_ASM1946v1/GCF_000019465.1_ASM1946v1_genomic.fna.gz': No such file or directory #status=latest for sample in Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 Yersinia_pestis_2944 Yersinia_pestis_KIM10+ Yersinia_pestis_M-1482 Yersinia_pestis_KIM5 Yersinia_pestis_C-781 Yersinia_pestis_14D Yersinia_pestis_KM_567 Yersinia_pestis_M-1770 Yersinia_pestis_C-792 Yersinia_pestis_M2086 Yersinia_pestis_Harbin_35 Yersinia_pestis_Nicholisk_41 Yersinia_pestis_Harbin_35_bis Yersinia_pestis_SCPM-O-B-5935_I-1996 Yersinia_pestis_I-1252 Yersinia_pestis_FDAARGOS_603 Yersinia_pestis_195P Yersinia_pestis_Nepal516 Yersinia_pestis_S19960127 Yersinia_pestis_SCPM-O-B-6530 Yersinia_pestis_C-783 Yersinia_pestis_A1122 Yersinia_pestis_Cadman Yersinia_pestis_A1122_bis Yersinia_pestis_CO92_pgm-_pPCP1- Yersinia_pestis_CO92 Yersinia_pestis_Shasta Yersinia_pestis_Dodson Yersinia_pestis_El_Dorado Yersinia_pestis_EV76-CN Yersinia_pestis_EV_NIIEG Yersinia_pestis_Java9 Yersinia_pestis_PBM19 Yersinia_pestis_20 Yersinia_pestis_D182038 Yersinia_pestis_D106004 Yersinia_pestis_Z176003 Yersinia_pestis_Antiqua_bis Yersinia_pestis_FDAARGOS_601 Yersinia_pestis_Antiqua 
Yersinia_pestis_Nairobi Yersinia_pestis_M2085 Yersinia_pestis_SCPM-O-B-5942_I-2638 Yersinia_pestis_M2029 Yersinia_pestis_SCPM-O-DNA-18_I-3113 Yersinia_pestis_94 Yersinia_pestis_R Yersinia_pestis_790 Yersinia_pestis_SCPM-O-B-6899_231 Yersinia_pestis_FDAARGOS_602 Yersinia_pestis_Pestoides_B Yersinia_pestis_M-1974 Yersinia_pestis_91001 Yersinia_pestis_Angola Yersinia_pestis_Angola_bis Yersinia_pestis_3770 Yersinia_pestis_1412 Yersinia_pestis_1413 Yersinia_pestis_8787 Yersinia_pestis_3067 Yersinia_pestis_Pestoides_G Yersinia_pestis_Pestoides_F Yersinia_pestis_Pestoides_F_bis Yersinia_pestis_1522 Yersinia_pseudotuberculosis_FDAARGOS_582 Yersinia_pseudotuberculosis_NZYP4713 Yersinia_pseudotuberculosis_NCTC8480 Yersinia_pseudotuberculosis_PB1+_bis Yersinia_pseudotuberculosis_MD67 Yersinia_pseudotuberculosis_NCTC10217 Yersinia_pseudotuberculosis_NCTC10275 Yersinia_pseudotuberculosis_1 Yersinia_pseudotuberculosis_IP32953 Yersinia_pseudotuberculosis_IP32953_bis Yersinia_pseudotuberculosis_FDAARGOS_583 Yersinia_pseudotuberculosis_FDAARGOS_581 Yersinia_pseudotuberculosis_ATCC_6904 Yersinia_pseudotuberculosis_EP2+ Yersinia_pseudotuberculosis_IP31758 Yersinia_pseudotuberculosis_598 Yersinia_pseudotuberculosis_PA3606 Yersinia_pseudotuberculosis_FDAARGOS_665 Yersinia_pseudotuberculosis_FDAARGOS_584 Yersinia_pseudotuberculosis_YPIII_bis Yersinia_pseudotuberculosis_FDAARGOS_579 Yersinia_pseudotuberculosis_IP2666pIB1 Yersinia_pseudotuberculosis_FDAARGOS_342 Yersinia_pseudotuberculosis_FDAARGOS_580 Yersinia_pseudotuberculosis_NCTC3571 Yersinia_similis_228 Yersinia_enterocolitica_NCTC13629 Yersinia_enterocolitica_MGYG-HGUT-02335 Yersinia_enterocolitica_Y1 Yersinia_enterocolitica_Y11 Yersinia_enterocolitica_NCTC13769 Yersinia_enterocolitica_FDAARGOS_1082 Yersinia_enterocolitica_2516-87 Yersinia_enterocolitica_KNG22703 Yersinia_enterocolitica_1055Rr Yersinia_enterocolitica_FDAARGOS_1090 Yersinia_enterocolitica_YE1 Yersinia_enterocolitica_YE3 Yersinia_enterocolitica_YE6 
Yersinia_enterocolitica_YE7 Yersinia_enterocolitica_YE5 Yersinia_enterocolitica_YE165 Yersinia_enterocolitica_8081 Yersinia_enterocolitica_8081_bis Yersinia_enterocolitica_NCTC12982 Yersinia_enterocolitica_WA Yersinia_enterocolitica_NW57 Yersinia_enterocolitica_NW117 Yersinia_enterocolitica_NW51 Yersinia_enterocolitica_NW56 Yersinia_enterocolitica_NW115 Yersinia_enterocolitica_NW67 Yersinia_enterocolitica_FORC_002 Yersinia_enterocolitica_FORC_002_bis Yersinia_enterocolitica_NW66 Yersinia_enterocolitica_MP98 Yersinia_enterocolitica_Gp259 Yersinia_enterocolitica_FORC066 Yersinia_enterocolitica_Gp2 Yersinia_enterocolitica_str_YE5303 Yersinia_enterocolitica_Gp200 Yersinia_enterocolitica_NW116 Yersinia_enterocolitica_Gp169 Yersinia_enterocolitica_NW1 Yersinia_enterocolitica_FORC065 Yersinia_frederiksenii_Y225 Yersinia_kristensenii_Y231 Yersinia_rochesterensis_ATCC_33639 Yersinia_rochesterensis_ATCC_BAA-2637 Yersinia_intermedia_SCPM-O-B-9106_C-191 Yersinia_kristensenii_2012N-4030 Yersinia_hibernica_CFS1934 Yersinia_hibernica_LC20 Yersinia_canariae_NCTC_14382 Yersinia_frederiksenii_FDAARGOS_418 Yersinia_alsatica_SCPM-O-B-7604 Yersinia_rohdei_YRA Yersinia_massiliensis_GTA Yersinia_massiliensis_2011N-4075 Yersinia_frederiksenii_FDAARGOS_417 Yersinia_intermedia_SCPM-O-B-8026_C-146 Yersinia_sp_KBS0713 Yersinia_bercovieri_ATCC_43970 Yersinia_aleksiciae_159 Yersinia_mollaretii_ATCC_43969 Yersinia_intermedia_FDAARGOS_729 Yersinia_intermedia_FDAARGOS_730 Yersinia_intermedia_NCTC11469 Yersinia_intermedia_FDAARGOS_358 Yersinia_sp_FDAARGOS_228 Yersinia_intermedia_Y228 Yersinia_intermedia_N6293 Yersinia_intermedia_SCPM-O-B-10209_333 Yersinia_aldovae_670-83 Yersinia_ruckeri_NHV_3758 Yersinia_ruckeri_NVI-10705 Yersinia_ruckeri_NVI-1292 Yersinia_ruckeri_NVI-4570 Yersinia_ruckeri_NVI-6614 Yersinia_ruckeri_NVI-11267 Yersinia_ruckeri_NVI-11294 Yersinia_ruckeri_NVI-10571 Yersinia_ruckeri_NVI-8524 Yersinia_ruckeri_NVI-1176 Yersinia_ruckeri_NVI-701 Yersinia_ruckeri_17Y0412 
Yersinia_ruckeri_17Y0414 Yersinia_ruckeri_NVI-492 Yersinia_ruckeri_NVI-9681 Yersinia_ruckeri_SC09 Yersinia_ruckeri_17Y0157 Yersinia_ruckeri_17Y0189 Yersinia_ruckeri_17Y0153 Yersinia_ruckeri_17Y0155 Yersinia_ruckeri_KMM821 Yersinia_ruckeri_16Y0180 Yersinia_ruckeri_NVI-11050 Yersinia_ruckeri_NVI-11076 Yersinia_ruckeri_QMA0440 Yersinia_ruckeri_Big_Creek_74 Yersinia_ruckeri_NVI-5089 Yersinia_ruckeri_NVI-10587 Yersinia_ruckeri_NVI-4840 Yersinia_ruckeri_NVI-4479 Yersinia_ruckeri_17Y0161 Yersinia_ruckeri_17Y0163 Yersinia_ruckeri_NVI-11073 Yersinia_ruckeri_NVI-11065 Yersinia_ruckeri_17Y0159 Yersinia_ruckeri_NVI-8270 Yersinia_ruckeri_YRB Yersinia_entomophaga_MH96; do mlst ${sample}.fna >> ../mlst/all.txt; done #gene-M486_RS20950 #M486_RS20950 #extract CDS with locus_tag from genbank file
#cut -d' ' -f1 ../assembly/${sample}.fna > ../assembly/${sample}.fasta; #cat ${sample}.gff ../assembly/${sample}.fasta > ../prokka_plus/$(echo $sample | cut -d'_' -f3- | tr " " "_").gff; #sed -i 's/###/##FASTA/g' ../prokka_plus/$(echo $sample | cut -d'_' -f3- | tr " " "_").gff;
-
(Important: only with this modification can we track the gene IDs.) This step processes the GFF files containing gene annotations for a set of samples in the directory prokka. The primary goal is to modify the GFF files and to save the modified versions in the directory prokka_plus. The script processes the samples one by one and, for each sample, performs the following steps:
* Replace all occurrences of \tCDS\t with _CDS_ in the original GFF file. * Extract all lines containing _CDS_ and save them in a new file with the suffix _CDS.gff. * Replace all occurrences of ID= with ID_old= in the new _CDS.gff file. * Cut the second field (delimited by ;) from the _CDS.gff file and save it in a new file with the suffix _CDS_f2. * Replace all occurrences of Parent=gene- with ID= in the _CDS_f2 file. * Paste the contents of the _CDS.gff and _CDS_f2 files side by side, with a ; delimiter, and save the result in a new file with the suffix _CDS_.gff. * Run the enum.py script on the _CDS_.gff file to append an underscore and the line number to the end of each line (so that every new gene ID is unique), and save the result in a new file with the suffix _CDS__.gff. Contents of enum.py: import sys if len(sys.argv) < 2: print("Please provide a filename as an argument.") sys.exit(1) filename = sys.argv[1] try: with open(filename) as f: for i, line in enumerate(f): print(f"{line.strip()}_{i+1}") except FileNotFoundError: print(f"File {filename} not found.") * Extract all lines from the original GFF file that do not contain _CDS_ and save them in a new file with the suffix _nonCDS.gff. * Remove all lines containing ### from the _nonCDS.gff file and save the result in a new file with the suffix _nonCDS_.gff. * Concatenate the contents of the _nonCDS_.gff and _CDS__.gff files and save the result in a new file with the suffix _nonCDS_CDS.gff. * Replace all occurrences of _CDS_ with \tCDS\t in the _nonCDS_CDS.gff file. * Append the string ##FASTA to the end of the _nonCDS_CDS.gff file. * Simplify the FASTA file associated with the sample by keeping only the first space-delimited field of each line (so that each header is reduced to its sequence ID), and save the result in ../assembly/ with the extension .fasta. * Concatenate the modified GFF file (_nonCDS_CDS.gff) and the simplified FASTA file, and save the result in the ../prokka_plus/ directory under the sample name without its leading genus and species fields (e.g. Yersinia_pestis_1045 becomes 1045.gff). * After processing all samples, the script removes intermediate files generated during the process.
# ERROR: Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: /mnt/Samsung_T5/Data_Gunnar_Yersiniomics/prokka_plus/1045.gff
# To debug the error above, rebuild the GFF files as follows.
#
# Purpose: for every sample, give each CDS feature a unique ID derived from its
# "Parent=gene-<locus_tag>" attribute plus a running line number, then append the
# assembly sequences after a "##FASTA" marker and write the result to
# ../prokka_plus/<name>.gff, where <name> is the sample name without its first two
# underscore-separated fields (e.g. Yersinia_pestis_1045 -> 1045).
#
# Requirements (relative to the current directory, presumably the prokka directory):
#   ${sample}.gff               input annotation (NOTE: modified in place, see below)
#   ../assembly/${sample}.fna   input assembly
#   enum.py                     helper that appends "_<line number>" to every line
#
# For a quick test on a subset, replace the sample list below with:
#   Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 Yersinia_pestis_2944 Yersinia_pestis_KIM10+ Yersinia_pestis_M-1482
#
# NOTE(review): output names drop the genus and species fields, so two samples that
# differ only in those fields would overwrite each other; the current list does not
# appear to contain such a pair, but verify when adding samples.

mkdir -p ../prokka_plus

for sample in \
    Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 Yersinia_pestis_2944 Yersinia_pestis_KIM10+ \
    Yersinia_pestis_M-1482 Yersinia_pestis_KIM5 Yersinia_pestis_C-781 Yersinia_pestis_14D \
    Yersinia_pestis_KM_567 Yersinia_pestis_M-1770 Yersinia_pestis_C-792 Yersinia_pestis_M2086 \
    Yersinia_pestis_Harbin_35 Yersinia_pestis_Nicholisk_41 Yersinia_pestis_Harbin_35_bis \
    Yersinia_pestis_SCPM-O-B-5935_I-1996 Yersinia_pestis_I-1252 Yersinia_pestis_FDAARGOS_603 \
    Yersinia_pestis_195P Yersinia_pestis_Nepal516 Yersinia_pestis_S19960127 Yersinia_pestis_SCPM-O-B-6530 \
    Yersinia_pestis_C-783 Yersinia_pestis_A1122 Yersinia_pestis_Cadman Yersinia_pestis_A1122_bis \
    Yersinia_pestis_CO92_pgm-_pPCP1- Yersinia_pestis_CO92 Yersinia_pestis_Shasta Yersinia_pestis_Dodson \
    Yersinia_pestis_El_Dorado Yersinia_pestis_EV76-CN Yersinia_pestis_EV_NIIEG Yersinia_pestis_Java9 \
    Yersinia_pestis_PBM19 Yersinia_pestis_20 Yersinia_pestis_D182038 Yersinia_pestis_D106004 \
    Yersinia_pestis_Z176003 Yersinia_pestis_Antiqua_bis Yersinia_pestis_FDAARGOS_601 Yersinia_pestis_Antiqua \
    Yersinia_pestis_Nairobi Yersinia_pestis_M2085 Yersinia_pestis_SCPM-O-B-5942_I-2638 Yersinia_pestis_M2029 \
    Yersinia_pestis_SCPM-O-DNA-18_I-3113 Yersinia_pestis_94 Yersinia_pestis_R Yersinia_pestis_790 \
    Yersinia_pestis_SCPM-O-B-6899_231 Yersinia_pestis_FDAARGOS_602 Yersinia_pestis_Pestoides_B \
    Yersinia_pestis_M-1974 Yersinia_pestis_91001 Yersinia_pestis_Angola Yersinia_pestis_Angola_bis \
    Yersinia_pestis_3770 Yersinia_pestis_1412 Yersinia_pestis_1413 Yersinia_pestis_8787 Yersinia_pestis_3067 \
    Yersinia_pestis_Pestoides_G Yersinia_pestis_Pestoides_F Yersinia_pestis_Pestoides_F_bis Yersinia_pestis_1522 \
    Yersinia_pseudotuberculosis_FDAARGOS_582 Yersinia_pseudotuberculosis_NZYP4713 \
    Yersinia_pseudotuberculosis_NCTC8480 Yersinia_pseudotuberculosis_PB1+_bis Yersinia_pseudotuberculosis_MD67 \
    Yersinia_pseudotuberculosis_NCTC10217 Yersinia_pseudotuberculosis_NCTC10275 Yersinia_pseudotuberculosis_1 \
    Yersinia_pseudotuberculosis_IP32953 Yersinia_pseudotuberculosis_IP32953_bis \
    Yersinia_pseudotuberculosis_FDAARGOS_583 Yersinia_pseudotuberculosis_FDAARGOS_581 \
    Yersinia_pseudotuberculosis_ATCC_6904 Yersinia_pseudotuberculosis_EP2+ Yersinia_pseudotuberculosis_IP31758 \
    Yersinia_pseudotuberculosis_598 Yersinia_pseudotuberculosis_PA3606 Yersinia_pseudotuberculosis_FDAARGOS_665 \
    Yersinia_pseudotuberculosis_FDAARGOS_584 Yersinia_pseudotuberculosis_YPIII_bis \
    Yersinia_pseudotuberculosis_FDAARGOS_579 Yersinia_pseudotuberculosis_IP2666pIB1 \
    Yersinia_pseudotuberculosis_FDAARGOS_342 Yersinia_pseudotuberculosis_FDAARGOS_580 \
    Yersinia_pseudotuberculosis_NCTC3571 Yersinia_similis_228 Yersinia_enterocolitica_NCTC13629 \
    Yersinia_enterocolitica_MGYG-HGUT-02335 Yersinia_enterocolitica_Y1 Yersinia_enterocolitica_Y11 \
    Yersinia_enterocolitica_NCTC13769 Yersinia_enterocolitica_FDAARGOS_1082 Yersinia_enterocolitica_2516-87 \
    Yersinia_enterocolitica_KNG22703 Yersinia_enterocolitica_1055Rr Yersinia_enterocolitica_FDAARGOS_1090 \
    Yersinia_enterocolitica_YE1 Yersinia_enterocolitica_YE3 Yersinia_enterocolitica_YE6 \
    Yersinia_enterocolitica_YE7 Yersinia_enterocolitica_YE5 Yersinia_enterocolitica_YE165 \
    Yersinia_enterocolitica_8081 Yersinia_enterocolitica_8081_bis Yersinia_enterocolitica_NCTC12982 \
    Yersinia_enterocolitica_WA Yersinia_enterocolitica_NW57 Yersinia_enterocolitica_NW117 \
    Yersinia_enterocolitica_NW51 Yersinia_enterocolitica_NW56 Yersinia_enterocolitica_NW115 \
    Yersinia_enterocolitica_NW67 Yersinia_enterocolitica_FORC_002 Yersinia_enterocolitica_FORC_002_bis \
    Yersinia_enterocolitica_NW66 Yersinia_enterocolitica_MP98 Yersinia_enterocolitica_Gp259 \
    Yersinia_enterocolitica_FORC066 Yersinia_enterocolitica_Gp2 Yersinia_enterocolitica_str_YE5303 \
    Yersinia_enterocolitica_Gp200 \
    Yersinia_enterocolitica_NW116 Yersinia_enterocolitica_Gp169 Yersinia_enterocolitica_NW1 \
    Yersinia_enterocolitica_FORC065 Yersinia_frederiksenii_Y225 Yersinia_kristensenii_Y231 \
    Yersinia_rochesterensis_ATCC_33639 Yersinia_rochesterensis_ATCC_BAA-2637 \
    Yersinia_intermedia_SCPM-O-B-9106_C-191 Yersinia_kristensenii_2012N-4030 Yersinia_hibernica_CFS1934 \
    Yersinia_hibernica_LC20 Yersinia_canariae_NCTC_14382 Yersinia_frederiksenii_FDAARGOS_418 \
    Yersinia_alsatica_SCPM-O-B-7604 Yersinia_rohdei_YRA Yersinia_massiliensis_GTA \
    Yersinia_massiliensis_2011N-4075 Yersinia_frederiksenii_FDAARGOS_417 \
    Yersinia_intermedia_SCPM-O-B-8026_C-146 Yersinia_sp_KBS0713 Yersinia_bercovieri_ATCC_43970 \
    Yersinia_aleksiciae_159 Yersinia_mollaretii_ATCC_43969 Yersinia_intermedia_FDAARGOS_729 \
    Yersinia_intermedia_FDAARGOS_730 Yersinia_intermedia_NCTC11469 Yersinia_intermedia_FDAARGOS_358 \
    Yersinia_sp_FDAARGOS_228 Yersinia_intermedia_Y228 Yersinia_intermedia_N6293 \
    Yersinia_intermedia_SCPM-O-B-10209_333 Yersinia_aldovae_670-83 Yersinia_ruckeri_NHV_3758 \
    Yersinia_ruckeri_NVI-10705 Yersinia_ruckeri_NVI-1292 Yersinia_ruckeri_NVI-4570 Yersinia_ruckeri_NVI-6614 \
    Yersinia_ruckeri_NVI-11267 Yersinia_ruckeri_NVI-11294 Yersinia_ruckeri_NVI-10571 Yersinia_ruckeri_NVI-8524 \
    Yersinia_ruckeri_NVI-1176 Yersinia_ruckeri_NVI-701 Yersinia_ruckeri_17Y0412 Yersinia_ruckeri_17Y0414 \
    Yersinia_ruckeri_NVI-492 Yersinia_ruckeri_NVI-9681 Yersinia_ruckeri_SC09 Yersinia_ruckeri_17Y0157 \
    Yersinia_ruckeri_17Y0189 Yersinia_ruckeri_17Y0153 Yersinia_ruckeri_17Y0155 Yersinia_ruckeri_KMM821 \
    Yersinia_ruckeri_16Y0180 Yersinia_ruckeri_NVI-11050 Yersinia_ruckeri_NVI-11076 Yersinia_ruckeri_QMA0440 \
    Yersinia_ruckeri_Big_Creek_74 Yersinia_ruckeri_NVI-5089 Yersinia_ruckeri_NVI-10587 Yersinia_ruckeri_NVI-4840 \
    Yersinia_ruckeri_NVI-4479 Yersinia_ruckeri_17Y0161 Yersinia_ruckeri_17Y0163 Yersinia_ruckeri_NVI-11073 \
    Yersinia_ruckeri_NVI-11065 Yersinia_ruckeri_17Y0159 Yersinia_ruckeri_NVI-8270 Yersinia_ruckeri_YRB \
    Yersinia_entomophaga_MH96; do

  gff="${sample}.gff"
  fna="../assembly/${sample}.fna"

  # Without both inputs the pipeline below would only write empty/broken files.
  if [ ! -f "$gff" ] || [ ! -f "$fna" ]; then
    echo "Skipping ${sample}: missing ${gff} or ${fna}" >&2
    continue
  fi

  # Mark CDS rows so they can be picked out with a plain grep.
  # NOTE: this edits ${sample}.gff in place; the marker stays in that file afterwards.
  sed -i 's/\tCDS\t/_CDS_/g' "$gff"

  # CDS rows: keep the old ID as ID_old and build a new ID from the Parent attribute
  # (taken from the second ';'-separated field of the row).
  grep "_CDS_" "$gff" > "${sample}_CDS.gff"
  sed -i 's/ID=/ID_old=/g' "${sample}_CDS.gff"
  cut -d';' -f2 "${sample}_CDS.gff" > "${sample}_CDS_f2"
  sed -i 's/Parent=gene-/ID=/g' "${sample}_CDS_f2"
  paste -d';' "${sample}_CDS.gff" "${sample}_CDS_f2" > "${sample}_CDS_.gff"
  # Append "_<line number>" to each row so that the new gene IDs cannot collide.
  python enum.py "${sample}_CDS_.gff" > "${sample}_CDS__.gff"

  # Non-CDS rows, without the "###" lines.
  grep -v "_CDS_" "$gff" > "${sample}_nonCDS.gff"
  grep -v "###" "${sample}_nonCDS.gff" > "${sample}_nonCDS_.gff"

  # Reassemble, restore the CDS column, and open the FASTA section.
  cat "${sample}_nonCDS_.gff" "${sample}_CDS__.gff" > "${sample}_nonCDS_CDS.gff"
  sed -i 's/_CDS_/\tCDS\t/g' "${sample}_nonCDS_CDS.gff"
  echo "##FASTA" >> "${sample}_nonCDS_CDS.gff"

  # Keep only the first space-delimited field of every FASTA line (shortens the headers).
  cut -d' ' -f1 "$fna" > "../assembly/${sample}.fasta"

  out_name=$(echo "$sample" | cut -d'_' -f3- | tr " " "_")
  cat "${sample}_nonCDS_CDS.gff" "../assembly/${sample}.fasta" > "../prokka_plus/${out_name}.gff"

  # Remove only this sample's intermediates (no broad globs).
  rm -f "${sample}_CDS.gff" "${sample}_CDS_f2" "${sample}_CDS_.gff" "${sample}_CDS__.gff" \
        "${sample}_nonCDS.gff" "${sample}_nonCDS_.gff" "${sample}_nonCDS_CDS.gff"
done

#for sample in Yersinia_pestis_1045 Yersinia_pestis_SCPM-O-B-6291_C-25 ...; do
#echo $sample | cut -d'_' -f3- | tr " " "_" >> temp
#done
-
After the standard run of the bacto pipeline, we run Roary in this step. Roary is a tool for pan-genome analysis: it takes annotated bacterial genomes in GFF3 format as input and clusters the genes based on sequence similarity.
# Roary options used in both runs (see the Roary documentation):
#   -p 4       number of threads
#   -f ./roary output directory
#   -i N       minimum percentage identity for blastp
#   -cd 99     percentage of isolates a gene must be in to be called core
#   -s         do not split paralogs
#   -e -n      build the core-gene alignment, using the fast MAFFT path
#   -v         verbose output
# NOTE(review): both runs write to ./roary; confirm which run's results the later
# steps (roary/gene_presence_absence.csv, roary/pan_genome_reference.fa) should read.

# Run 1: four genomes, 95 % identity.
roary -p 4 -f ./roary -i 95 -cd 99 -s -e -n -v \
  prokka_plus/1045.gff prokka_plus/SCPM-O-B-6291_C-25.gff prokka_plus/2944.gff prokka_plus/KIM10+.gff

# Run 2: all genomes, 50 % identity.
roary -p 4 -f ./roary -i 50 -cd 99 -s -e -n -v \
  prokka_plus/1045.gff prokka_plus/SCPM-O-B-6291_C-25.gff prokka_plus/2944.gff prokka_plus/KIM10+.gff \
  prokka_plus/M-1482.gff prokka_plus/KIM5.gff prokka_plus/C-781.gff prokka_plus/14D.gff \
  prokka_plus/KM_567.gff prokka_plus/M-1770.gff prokka_plus/C-792.gff prokka_plus/M2086.gff \
  prokka_plus/Harbin_35.gff prokka_plus/Nicholisk_41.gff prokka_plus/Harbin_35_bis.gff \
  prokka_plus/SCPM-O-B-5935_I-1996.gff prokka_plus/I-1252.gff prokka_plus/FDAARGOS_603.gff \
  prokka_plus/195P.gff prokka_plus/Nepal516.gff prokka_plus/S19960127.gff prokka_plus/SCPM-O-B-6530.gff \
  prokka_plus/C-783.gff prokka_plus/A1122.gff prokka_plus/Cadman.gff prokka_plus/A1122_bis.gff \
  prokka_plus/CO92_pgm-_pPCP1-.gff prokka_plus/CO92.gff prokka_plus/Shasta.gff prokka_plus/Dodson.gff \
  prokka_plus/El_Dorado.gff prokka_plus/EV76-CN.gff prokka_plus/EV_NIIEG.gff prokka_plus/Java9.gff \
  prokka_plus/PBM19.gff prokka_plus/20.gff prokka_plus/D182038.gff prokka_plus/D106004.gff \
  prokka_plus/Z176003.gff prokka_plus/Antiqua_bis.gff prokka_plus/FDAARGOS_601.gff prokka_plus/Antiqua.gff \
  prokka_plus/Nairobi.gff prokka_plus/M2085.gff prokka_plus/SCPM-O-B-5942_I-2638.gff prokka_plus/M2029.gff \
  prokka_plus/SCPM-O-DNA-18_I-3113.gff prokka_plus/94.gff prokka_plus/R.gff prokka_plus/790.gff \
  prokka_plus/SCPM-O-B-6899_231.gff prokka_plus/FDAARGOS_602.gff prokka_plus/Pestoides_B.gff \
  prokka_plus/M-1974.gff prokka_plus/91001.gff prokka_plus/Angola.gff prokka_plus/Angola_bis.gff \
  prokka_plus/3770.gff prokka_plus/1412.gff prokka_plus/1413.gff prokka_plus/8787.gff prokka_plus/3067.gff \
  prokka_plus/Pestoides_G.gff prokka_plus/Pestoides_F.gff prokka_plus/Pestoides_F_bis.gff prokka_plus/1522.gff \
  prokka_plus/FDAARGOS_582.gff prokka_plus/NZYP4713.gff prokka_plus/NCTC8480.gff prokka_plus/PB1+_bis.gff \
  prokka_plus/MD67.gff prokka_plus/NCTC10217.gff \
  prokka_plus/NCTC10275.gff prokka_plus/1.gff prokka_plus/IP32953.gff prokka_plus/IP32953_bis.gff \
  prokka_plus/FDAARGOS_583.gff prokka_plus/FDAARGOS_581.gff prokka_plus/ATCC_6904.gff prokka_plus/EP2+.gff \
  prokka_plus/IP31758.gff prokka_plus/598.gff prokka_plus/PA3606.gff prokka_plus/FDAARGOS_665.gff \
  prokka_plus/FDAARGOS_584.gff prokka_plus/YPIII_bis.gff prokka_plus/FDAARGOS_579.gff \
  prokka_plus/IP2666pIB1.gff prokka_plus/FDAARGOS_342.gff prokka_plus/FDAARGOS_580.gff \
  prokka_plus/NCTC3571.gff prokka_plus/228.gff prokka_plus/NCTC13629.gff prokka_plus/MGYG-HGUT-02335.gff \
  prokka_plus/Y1.gff prokka_plus/Y11.gff prokka_plus/NCTC13769.gff prokka_plus/FDAARGOS_1082.gff \
  prokka_plus/2516-87.gff prokka_plus/KNG22703.gff prokka_plus/1055Rr.gff prokka_plus/FDAARGOS_1090.gff \
  prokka_plus/YE1.gff prokka_plus/YE3.gff prokka_plus/YE6.gff prokka_plus/YE7.gff prokka_plus/YE5.gff \
  prokka_plus/YE165.gff prokka_plus/8081.gff prokka_plus/8081_bis.gff prokka_plus/NCTC12982.gff \
  prokka_plus/WA.gff prokka_plus/NW57.gff prokka_plus/NW117.gff prokka_plus/NW51.gff prokka_plus/NW56.gff \
  prokka_plus/NW115.gff prokka_plus/NW67.gff prokka_plus/FORC_002.gff prokka_plus/FORC_002_bis.gff \
  prokka_plus/NW66.gff prokka_plus/MP98.gff prokka_plus/Gp259.gff prokka_plus/FORC066.gff \
  prokka_plus/Gp2.gff prokka_plus/str_YE5303.gff prokka_plus/Gp200.gff prokka_plus/NW116.gff \
  prokka_plus/Gp169.gff prokka_plus/NW1.gff prokka_plus/FORC065.gff prokka_plus/Y225.gff \
  prokka_plus/Y231.gff prokka_plus/ATCC_33639.gff prokka_plus/ATCC_BAA-2637.gff \
  prokka_plus/SCPM-O-B-9106_C-191.gff prokka_plus/2012N-4030.gff prokka_plus/CFS1934.gff \
  prokka_plus/LC20.gff prokka_plus/NCTC_14382.gff prokka_plus/FDAARGOS_418.gff \
  prokka_plus/SCPM-O-B-7604.gff prokka_plus/YRA.gff prokka_plus/GTA.gff prokka_plus/2011N-4075.gff \
  prokka_plus/FDAARGOS_417.gff prokka_plus/SCPM-O-B-8026_C-146.gff prokka_plus/KBS0713.gff \
  prokka_plus/ATCC_43970.gff prokka_plus/159.gff prokka_plus/ATCC_43969.gff prokka_plus/FDAARGOS_729.gff \
  prokka_plus/FDAARGOS_730.gff \
  prokka_plus/NCTC11469.gff prokka_plus/FDAARGOS_358.gff prokka_plus/FDAARGOS_228.gff prokka_plus/Y228.gff \
  prokka_plus/N6293.gff prokka_plus/SCPM-O-B-10209_333.gff prokka_plus/670-83.gff prokka_plus/NHV_3758.gff \
  prokka_plus/NVI-10705.gff prokka_plus/NVI-1292.gff prokka_plus/NVI-4570.gff prokka_plus/NVI-6614.gff \
  prokka_plus/NVI-11267.gff prokka_plus/NVI-11294.gff prokka_plus/NVI-10571.gff prokka_plus/NVI-8524.gff \
  prokka_plus/NVI-1176.gff prokka_plus/NVI-701.gff prokka_plus/17Y0412.gff prokka_plus/17Y0414.gff \
  prokka_plus/NVI-492.gff prokka_plus/NVI-9681.gff prokka_plus/SC09.gff prokka_plus/17Y0157.gff \
  prokka_plus/17Y0189.gff prokka_plus/17Y0153.gff prokka_plus/17Y0155.gff prokka_plus/KMM821.gff \
  prokka_plus/16Y0180.gff prokka_plus/NVI-11050.gff prokka_plus/NVI-11076.gff prokka_plus/QMA0440.gff \
  prokka_plus/Big_Creek_74.gff prokka_plus/NVI-5089.gff prokka_plus/NVI-10587.gff prokka_plus/NVI-4840.gff \
  prokka_plus/NVI-4479.gff prokka_plus/17Y0161.gff prokka_plus/17Y0163.gff prokka_plus/NVI-11073.gff \
  prokka_plus/NVI-11065.gff prokka_plus/17Y0159.gff prokka_plus/NVI-8270.gff prokka_plus/YRB.gff \
  prokka_plus/MH96.gff

#DEL makeblastdb -in fna -dbtype 'nucl' -out fna.db
#DELblastn -db fna.db -query yopK.fasta -out yopK_on_fna.blastn -evalue 10000 -num_threads 15 -outfmt 6
-
Generate yop*_seq.txt from the Roary output: this step extracts the coding sequences (CDS) of specific genes from multiple genome files and saves them to an output file. Input files: roary/pan_genome_reference.fa and roary/gene_presence_absence.csv. For example, for yopM:
grep "yopM" roary/gene_presence_absence.csv #6+19+45=70 --> 71 "yopM","","type III secretion system effector YopM","45","45","1","","","","","","1229","1229","1229","","M486_RS20920_3990","","M479_RS01070_4055","M480_RS01170_4076","","M481_RS01115_4071","","","","","","","","","","","","","LDH65_RS21345_4177","","","","","M478_RS01000_4055","M482_RS01070_4063","M483_RS00915_4013","","","M477_RS21610_4128","","","M484_RS01125_4011","","LDH63_RS21760_4259","","","","","","","","","","YPA_RS22550_4200","CH58_RS00945_4248","","","","","","YPO_RS00170_4130","AK38_RS00930_4114","BAY22_RS21640_4174","YPD4_RS21505_4104","YPD8_RS21525_4060","CH61_RS00195_4143","BZ20_RS00435_4174","M0M60_RS21870_4286","","CH46_RS00070_4122","","","","","","","","EGX53_RS00030_4033","EGX52_RS00260_4348","","","","","EGX46_RS00245_4205","","EGX74_RS00040_4070","","","","","","","","","","","","","YPC_RS21075_4024","CH55_RS00770_4123","","DN756_RS21785_4075","","","","CH62_RS00690_4176","","","CH44_RS00795_4078","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","CH63_RS00700_4106","","","CH59_RS00970_4231","","YPDSF_RS21140_4036","BZ18_RS00325_4042","CH43_RS00040_3994","","LDH64_RS21810_4270","S96127_RS00100_4096","","","GCK71_RS22420_4113","GD372_RS22475_4112","","DJY80_RS22415_4098","GCK69_RS22480_4113","","","","GCK70_RS22160_4053","BZ15_RS00325_4183","","","","","","","","","","","","","","","","YPZ3_RS21220_4056","" "group_5673","yopM","type III secretion system effector 
YopM","19","19","1","","","","","","1103","1103","1103","","","YE105_RS20595_4018","","","","","","","","","","","","","","","","","","","","","CH48_RS00390_4060","","","","YP598_RS21115_4110","","","YE_RS21175_4135","CH49_RS00235_4177","","YP_RS21285_4111","","","","","","","","","YPANGOLA_RS22070_4036","CH56_RS22160_4084","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","XM56_RS20545_4037","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","BZ19_RS21445_4113","","","CH60_RS01070_4100","","","","","","","","","","","","","","","","","","","","YEY1_RS21430_4040","Y11_RS21100_4128","","","","BFS78_RS21580_4258","BB936_RS22285_4398","BED35_RS00500_4353","BED32_RS00030_4182","BED33_RS21910_4325","BED34_RS22270_4407","","","","","" "group_23005","yopM","type III secretion system effector YopM","6","6","1","","","","","","1589","1589","1589","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","EGX47_RS00105_4453","EGX44_RS00020_4153","EGX39_RS00330_3982","","","","","","","","","","","","","","","","","","","","","","YPTB_RS21675_4159","BZ17_RS00175_4115","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","BN7064_RS22100_4159","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","" > yopM_seq.txt for gene_id in M486_RS20920_3990 YE105_RS20595_4018 M479_RS01070_4055 M480_RS01170_4076 M481_RS01115_4071 LDH65_RS21345_4177 CH48_RS00390_4060 M478_RS01000_4055 M482_RS01070_4063 
M483_RS00915_4013 YP598_RS21115_4110 M477_RS21610_4128 YE_RS21175_4135 CH49_RS00235_4177 M484_RS01125_4011 YP_RS21285_4111 LDH63_RS21760_4259 YPANGOLA_RS22070_4036 CH56_RS22160_4084 YPA_RS22550_4200 CH58_RS00945_4248 YPO_RS00170_4130 AK38_RS00930_4114 BAY22_RS21640_4174 YPD4_RS21505_4104 YPD8_RS21525_4060 CH61_RS00195_4143 BZ20_RS00435_4174 M0M60_RS21870_4286 CH46_RS00070_4122 EGX53_RS00030_4033 EGX52_RS00260_4348 EGX47_RS00105_4453 EGX44_RS00020_4153 EGX39_RS00330_3982 EGX46_RS00245_4205 EGX74_RS00040_4070 YPC_RS21075_4024 CH55_RS00770_4123 DN756_RS21785_4075 YPTB_RS21675_4159 BZ17_RS00175_4115 CH62_RS00690_4176 CH44_RS00795_4078 XM56_RS20545_4037 BN7064_RS22100_4159 CH63_RS00700_4106 BZ19_RS21445_4113 CH59_RS00970_4231 CH60_RS01070_4100 YPDSF_RS21140_4036 BZ18_RS00325_4042 CH43_RS00040_3994 LDH64_RS21810_4270 S96127_RS00100_4096 GCK71_RS22420_4113 GD372_RS22475_4112 DJY80_RS22415_4098 GCK69_RS22480_4113 GCK70_RS22160_4053 BZ15_RS00325_4183 CH47_RS00140_4080 YEY1_RS21430_4040 Y11_RS21100_4128 BFS78_RS21580_4258 BB936_RS22285_4398 BED35_RS00500_4353 BED32_RS00030_4182 BED33_RS21910_4325 BED34_RS22270_4407 YPZ3_RS21220_4056; do for gbff in Yersinia_massiliensis_2011N-4075/GCF_013282765.1_ASM1328276v1/GCF_013282765.1_ASM1328276v1_genomic.gbff.gz Yersinia_pestis_EV_NIIEG/GCF_000590535.2_ASM59053v2/GCF_000590535.2_ASM59053v2_genomic.gbff.gz Yersinia_pestis_Shasta/GCF_000834335.1_ASM83433v1/GCF_000834335.1_ASM83433v1_genomic.gbff.gz Yersinia_ruckeri_NVI-492/GCF_023212565.2_ASM2321256v2/GCF_023212565.2_ASM2321256v2_genomic.gbff.gz Yersinia_pestis_Pestoides_G/GCF_000834985.1_ASM83498v1/GCF_000834985.1_ASM83498v1_genomic.gbff.gz Yersinia_pestis_Antiqua_bis/GCF_000834825.1_ASM83482v1/GCF_000834825.1_ASM83482v1_genomic.gbff.gz Yersinia_pestis_91001/GCF_000007885.1_ASM788v1/GCF_000007885.1_ASM788v1_genomic.gbff.gz Yersinia_intermedia_Y228/GCF_000834515.1_ASM83451v1/GCF_000834515.1_ASM83451v1_genomic.gbff.gz 
Yersinia_pestis_Java9/GCF_000834905.1_ASM83490v1/GCF_000834905.1_ASM83490v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP32953_bis/GCF_000834295.1_ASM83429v1/GCF_000834295.1_ASM83429v1_genomic.gbff.gz Yersinia_pseudotuberculosis_YPIII_bis/GCF_000834375.1_ASM83437v1/GCF_000834375.1_ASM83437v1_genomic.gbff.gz Yersinia_enterocolitica_8081_bis/GCF_000834795.1_ASM83479v1/GCF_000834795.1_ASM83479v1_genomic.gbff.gz Yersinia_sp_FDAARGOS_228/GCF_002073315.2_ASM207331v2/GCF_002073315.2_ASM207331v2_genomic.gbff.gz Yersinia_enterocolitica_Gp169/GCF_025758435.1_ASM2575843v1/GCF_025758435.1_ASM2575843v1_genomic.gbff.gz Yersinia_pestis_195P/GCF_002005285.1_ASM200528v1/GCF_002005285.1_ASM200528v1_genomic.gbff.gz Yersinia_frederiksenii_FDAARGOS_418/GCF_002591195.1_ASM259119v1/GCF_002591195.1_ASM259119v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC3571/GCF_900636705.1_43908_A02/GCF_900636705.1_43908_A02_genomic.gbff.gz Yersinia_enterocolitica_FORC_002/GCF_000987925.1_ASM98792v1/GCF_000987925.1_ASM98792v1_genomic.gbff.gz Yersinia_ruckeri_NVI-1292/GCF_026435275.1_ASM2643527v1/GCF_026435275.1_ASM2643527v1_genomic.gbff.gz Yersinia_pestis_3067/GCF_001188795.1_ASM118879v1/GCF_001188795.1_ASM118879v1_genomic.gbff.gz Yersinia_pestis_M2086/GCF_015336695.1_ASM1533669v1/GCF_015336695.1_ASM1533669v1_genomic.gbff.gz Yersinia_ruckeri_16Y0180/GCF_021399215.1_ASM2139921v1/GCF_021399215.1_ASM2139921v1_genomic.gbff.gz Yersinia_pestis_2944/GCF_001188815.1_ASM118881v1/GCF_001188815.1_ASM118881v1_genomic.gbff.gz Yersinia_rochesterensis_ATCC_BAA-2637/GCF_003600645.1_ASM360064v1/GCF_003600645.1_ASM360064v1_genomic.gbff.gz Yersinia_pestis_Z176003/GCF_000022845.1_ASM2284v1/GCF_000022845.1_ASM2284v1_genomic.gbff.gz Yersinia_intermedia_SCPM-O-B-8026_C-146/GCF_026183385.1_ASM2618338v1/GCF_026183385.1_ASM2618338v1_genomic.gbff.gz Yersinia_enterocolitica_YE5/GCF_001708615.1_ASM170861v1/GCF_001708615.1_ASM170861v1_genomic.gbff.gz 
Yersinia_enterocolitica_YE6/GCF_001708595.1_ASM170859v1/GCF_001708595.1_ASM170859v1_genomic.gbff.gz Yersinia_pestis_CO92_pgm-_pPCP1-/GCF_001293415.1_ASM129341v1/GCF_001293415.1_ASM129341v1_genomic.gbff.gz Yersinia_pestis_1412/GCF_001188695.1_ASM118869v1/GCF_001188695.1_ASM118869v1_genomic.gbff.gz Yersinia_pestis_El_Dorado/GCF_000834495.1_ASM83449v1/GCF_000834495.1_ASM83449v1_genomic.gbff.gz Yersinia_enterocolitica_KNG22703/GCF_001305635.1_ASM130563v1/GCF_001305635.1_ASM130563v1_genomic.gbff.gz Yersinia_pestis_M-1770/GCF_015337825.2_ASM1533782v2/GCF_015337825.2_ASM1533782v2_genomic.gbff.gz Yersinia_enterocolitica_MP98/GCF_025758515.1_ASM2575851v1/GCF_025758515.1_ASM2575851v1_genomic.gbff.gz Yersinia_enterocolitica_NCTC13629/GCF_900635745.1_32868_F02/GCF_900635745.1_32868_F02_genomic.gbff.gz Yersinia_pestis_94/GCF_024498395.1_ASM2449839v1/GCF_024498395.1_ASM2449839v1_genomic.gbff.gz Yersinia_kristensenii_Y231/GCF_000834865.1_ASM83486v1/GCF_000834865.1_ASM83486v1_genomic.gbff.gz Yersinia_pestis_C-783/GCF_015337285.1_ASM1533728v1/GCF_015337285.1_ASM1533728v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC8480/GCF_900635715.1_32473_H02/GCF_900635715.1_32473_H02_genomic.gbff.gz Yersinia_enterocolitica_NW57/GCF_025758475.1_ASM2575847v1/GCF_025758475.1_ASM2575847v1_genomic.gbff.gz Yersinia_enterocolitica_YE1/GCF_001708635.1_ASM170863v1/GCF_001708635.1_ASM170863v1_genomic.gbff.gz Yersinia_pestis_790/GCF_001188675.1_ASM118867v1/GCF_001188675.1_ASM118867v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11065/GCF_026435655.1_ASM2643565v1/GCF_026435655.1_ASM2643565v1_genomic.gbff.gz Yersinia_pestis_14D/GCF_015159615.2_ASM1515961v2/GCF_015159615.2_ASM1515961v2_genomic.gbff.gz Yersinia_enterocolitica_NW115/GCF_025758655.1_ASM2575865v1/GCF_025758655.1_ASM2575865v1_genomic.gbff.gz Yersinia_enterocolitica_Gp259/GCF_025758265.1_ASM2575826v1/GCF_025758265.1_ASM2575826v1_genomic.gbff.gz Yersinia_enterocolitica_FORC066/GCF_025340245.1_ASM2534024v1/GCF_025340245.1_ASM2534024v1_genomic.gbff.gz 
Yersinia_pestis_20/GCF_024498415.1_ASM2449841v1/GCF_024498415.1_ASM2449841v1_genomic.gbff.gz Yersinia_pestis_FDAARGOS_602/GCF_003798345.1_ASM379834v1/GCF_003798345.1_ASM379834v1_genomic.gbff.gz Yersinia_aleksiciae_159/GCF_001047675.1_ASM104767v1/GCF_001047675.1_ASM104767v1_genomic.gbff.gz Yersinia_enterocolitica_Gp2/GCF_025758285.1_ASM2575828v1/GCF_025758285.1_ASM2575828v1_genomic.gbff.gz Yersinia_pseudotuberculosis_1/GCF_000834435.1_ASM83443v1/GCF_000834435.1_ASM83443v1_genomic.gbff.gz Yersinia_pestis_3770/GCF_001188775.1_ASM118877v1/GCF_001188775.1_ASM118877v1_genomic.gbff.gz Yersinia_intermedia_FDAARGOS_729/GCF_009730075.1_ASM973007v1/GCF_009730075.1_ASM973007v1_genomic.gbff.gz Yersinia_enterocolitica_NW67/GCF_025758535.1_ASM2575853v1/GCF_025758535.1_ASM2575853v1_genomic.gbff.gz Yersinia_intermedia_SCPM-O-B-10209_333/GCF_026183345.1_ASM2618334v1/GCF_026183345.1_ASM2618334v1_genomic.gbff.gz Yersinia_ruckeri_17Y0414/GCF_021399075.1_ASM2139907v1/GCF_021399075.1_ASM2139907v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-6530/GCF_009295985.1_ASM929598v1/GCF_009295985.1_ASM929598v1_genomic.gbff.gz Yersinia_pseudotuberculosis_EP2+/GCF_000834415.1_ASM83441v1/GCF_000834415.1_ASM83441v1_genomic.gbff.gz Yersinia_pestis_KM_567/GCF_015337445.1_ASM1533744v1/GCF_015337445.1_ASM1533744v1_genomic.gbff.gz Yersinia_ruckeri_Big_Creek_74/GCF_000964565.1_ASM96456v1/GCF_000964565.1_ASM96456v1_genomic.gbff.gz Yersinia_intermedia_FDAARGOS_358/GCF_002983625.1_ASM298362v1/GCF_002983625.1_ASM298362v1_genomic.gbff.gz Yersinia_ruckeri_NVI-9681/GCF_023212445.2_ASM2321244v2/GCF_023212445.2_ASM2321244v2_genomic.gbff.gz Yersinia_kristensenii_2012N-4030/GCF_013282785.1_ASM1328278v1/GCF_013282785.1_ASM1328278v1_genomic.gbff.gz Yersinia_ruckeri_17Y0157/GCF_021399195.1_ASM2139919v1/GCF_021399195.1_ASM2139919v1_genomic.gbff.gz Yersinia_ruckeri_NVI-8270/GCF_026435135.1_ASM2643513v1/GCF_026435135.1_ASM2643513v1_genomic.gbff.gz 
Yersinia_ruckeri_17Y0189/GCF_021399095.1_ASM2139909v1/GCF_021399095.1_ASM2139909v1_genomic.gbff.gz Yersinia_ruckeri_NVI-8524/GCF_026435115.1_ASM2643511v1/GCF_026435115.1_ASM2643511v1_genomic.gbff.gz Yersinia_pestis_M-1482/GCF_015337645.1_ASM1533764v1/GCF_015337645.1_ASM1533764v1_genomic.gbff.gz Yersinia_pestis_Harbin_35_bis/GCF_000834275.1_ASM83427v1/GCF_000834275.1_ASM83427v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC10217/GCF_900635755.1_33467_B01/GCF_900635755.1_33467_B01_genomic.gbff.gz Yersinia_pseudotuberculosis_598/GCF_020889805.1_ASM2088980v1/GCF_020889805.1_ASM2088980v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11267/GCF_026435335.1_ASM2643533v1/GCF_026435335.1_ASM2643533v1_genomic.gbff.gz Yersinia_enterocolitica_NW56/GCF_025758635.1_ASM2575863v1/GCF_025758635.1_ASM2575863v1_genomic.gbff.gz Yersinia_pestis_Angola/GCF_000018805.1_ASM1880v1/GCF_000018805.1_ASM1880v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-DNA-18_I-3113/GCF_009295945.1_ASM929594v1/GCF_009295945.1_ASM929594v1_genomic.gbff.gz Yersinia_enterocolitica_Y11/GCF_000253175.1_ASM25317v1/GCF_000253175.1_ASM25317v1_genomic.gbff.gz Yersinia_pestis_Dodson/GCF_000834775.1_ASM83477v1/GCF_000834775.1_ASM83477v1_genomic.gbff.gz Yersinia_pestis_Cadman/GCF_001693595.1_ASM169359v1/GCF_001693595.1_ASM169359v1_genomic.gbff.gz Yersinia_pestis_KIM5/GCF_000970105.1_ASM97010v1/GCF_000970105.1_ASM97010v1_genomic.gbff.gz Yersinia_ruckeri_NVI-10705/GCF_023212585.2_ASM2321258v2/GCF_023212585.2_ASM2321258v2_genomic.gbff.gz Yersinia_pestis_EV76-CN/GCF_024758685.1_ASM2475868v1/GCF_024758685.1_ASM2475868v1_genomic.gbff.gz Yersinia_intermedia_FDAARGOS_730/GCF_009730055.1_ASM973005v1/GCF_009730055.1_ASM973005v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11073/GCF_026435495.1_ASM2643549v1/GCF_026435495.1_ASM2643549v1_genomic.gbff.gz Yersinia_ruckeri_17Y0161/GCF_021399155.1_ASM2139915v1/GCF_021399155.1_ASM2139915v1_genomic.gbff.gz Yersinia_sp_KBS0713/GCF_005937895.2_ASM593789v2/GCF_005937895.2_ASM593789v2_genomic.gbff.gz 
Yersinia_pestis_SCPM-O-B-6899_231/GCF_009295925.1_ASM929592v1/GCF_009295925.1_ASM929592v1_genomic.gbff.gz Yersinia_ruckeri_NVI-5089/GCF_026435195.1_ASM2643519v1/GCF_026435195.1_ASM2643519v1_genomic.gbff.gz Yersinia_pestis_Nicholisk_41/GCF_000834885.1_ASM83488v1/GCF_000834885.1_ASM83488v1_genomic.gbff.gz Yersinia_enterocolitica_YE7/GCF_001708555.1_ASM170855v1/GCF_001708555.1_ASM170855v1_genomic.gbff.gz Yersinia_intermedia_SCPM-O-B-9106_C-191/GCF_026183365.1_ASM2618336v1/GCF_026183365.1_ASM2618336v1_genomic.gbff.gz Yersinia_canariae_NCTC_14382/GCF_009831415.1_ASM983141v1/GCF_009831415.1_ASM983141v1_genomic.gbff.gz Yersinia_enterocolitica_YE3/GCF_001708655.1_ASM170865v1/GCF_001708655.1_ASM170865v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NCTC10275/GCF_900637475.1_51108_B01/GCF_900637475.1_51108_B01_genomic.gbff.gz Yersinia_enterocolitica_8081/GCF_000009345.1_ASM934v1/GCF_000009345.1_ASM934v1_genomic.gbff.gz Yersinia_ruckeri_NVI-10571/GCF_026435835.1_ASM2643583v1/GCF_026435835.1_ASM2643583v1_genomic.gbff.gz Yersinia_enterocolitica_2516-87/GCF_000834735.1_ASM83473v1/GCF_000834735.1_ASM83473v1_genomic.gbff.gz Yersinia_frederiksenii_FDAARGOS_417/GCF_002591095.1_ASM259109v1/GCF_002591095.1_ASM259109v1_genomic.gbff.gz Yersinia_pestis_I-1252/GCF_015336465.1_ASM1533646v1/GCF_015336465.1_ASM1533646v1_genomic.gbff.gz Yersinia_ruckeri_17Y0155/GCF_021399235.1_ASM2139923v1/GCF_021399235.1_ASM2139923v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_665/GCF_008693365.1_ASM869336v1/GCF_008693365.1_ASM869336v1_genomic.gbff.gz Yersinia_alsatica_SCPM-O-B-7604/GCF_025133195.1_ASM2513319v1/GCF_025133195.1_ASM2513319v1_genomic.gbff.gz Yersinia_pseudotuberculosis_PA3606/GCF_000834945.1_ASM83494v1/GCF_000834945.1_ASM83494v1_genomic.gbff.gz Yersinia_pestis_KIM10+/GCF_000006645.1_ASM664v1/GCF_000006645.1_ASM664v1_genomic.gbff.gz Yersinia_ruckeri_NVI-701/GCF_026435155.1_ASM2643515v1/GCF_026435155.1_ASM2643515v1_genomic.gbff.gz 
Yersinia_enterocolitica_NW117/GCF_025758455.1_ASM2575845v1/GCF_025758455.1_ASM2575845v1_genomic.gbff.gz Yersinia_enterocolitica_FORC065/GCA_025340225.1_ASM2534022v1/GCA_025340225.1_ASM2534022v1_genomic.gbff.gz Yersinia_enterocolitica_NW1/GCF_025758495.1_ASM2575849v1/GCF_025758495.1_ASM2575849v1_genomic.gbff.gz Yersinia_ruckeri_QMA0440/GCF_002192595.1_ASM219259v1/GCF_002192595.1_ASM219259v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_579/GCF_003798305.1_ASM379830v1/GCF_003798305.1_ASM379830v1_genomic.gbff.gz Yersinia_enterocolitica_1055Rr/GCF_000192105.1_ASM19210v1/GCF_000192105.1_ASM19210v1_genomic.gbff.gz Yersinia_hibernica_CFS1934/GCF_004124235.1_ASM412423v1/GCF_004124235.1_ASM412423v1_genomic.gbff.gz Yersinia_pestis_D106004/GCF_000022805.1_ASM2280v1/GCF_000022805.1_ASM2280v1_genomic.gbff.gz Yersinia_enterocolitica_Y1/GCF_004368055.1_ASM436805v1/GCF_004368055.1_ASM436805v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP31758/GCF_000016945.1_ASM1694v1/GCF_000016945.1_ASM1694v1_genomic.gbff.gz Yersinia_pestis_Pestoides_F_bis/GCF_000834315.1_ASM83431v1/GCF_000834315.1_ASM83431v1_genomic.gbff.gz Yersinia_pestis_M-1974/GCF_015336865.1_ASM1533686v1/GCF_015336865.1_ASM1533686v1_genomic.gbff.gz Yersinia_ruckeri_NHV_3758/GCF_002442495.2_ASM244249v2/GCF_002442495.2_ASM244249v2_genomic.gbff.gz Yersinia_ruckeri_17Y0163/GCF_021399115.1_ASM2139911v1/GCF_021399115.1_ASM2139911v1_genomic.gbff.gz Yersinia_pseudotuberculosis_MD67/GCF_000834355.1_ASM83435v1/GCF_000834355.1_ASM83435v1_genomic.gbff.gz Yersinia_pestis_D182038/GCF_000022825.1_ASM2282v1/GCF_000022825.1_ASM2282v1_genomic.gbff.gz Yersinia_enterocolitica_FDAARGOS_1090/GCF_016727905.1_ASM1672790v1/GCF_016727905.1_ASM1672790v1_genomic.gbff.gz Yersinia_bercovieri_ATCC_43970/GCF_013282745.1_ASM1328274v1/GCF_013282745.1_ASM1328274v1_genomic.gbff.gz Yersinia_enterocolitica_WA/GCF_000834195.1_ASM83419v1/GCF_000834195.1_ASM83419v1_genomic.gbff.gz 
Yersinia_ruckeri_NVI-10587/GCF_023212425.2_ASM2321242v2/GCF_023212425.2_ASM2321242v2_genomic.gbff.gz Yersinia_pestis_R/GCF_024498375.1_ASM2449837v1/GCF_024498375.1_ASM2449837v1_genomic.gbff.gz Yersinia_intermedia_N6293/GCF_022637335.1_ASM2263733v1/GCF_022637335.1_ASM2263733v1_genomic.gbff.gz Yersinia_ruckeri_NVI-6614/GCF_026435175.1_ASM2643517v1/GCF_026435175.1_ASM2643517v1_genomic.gbff.gz Yersinia_hibernica_LC20/GCF_000597945.1_ASM59794v2/GCF_000597945.1_ASM59794v2_genomic.gbff.gz Yersinia_ruckeri_17Y0153/GCF_021399175.1_ASM2139917v1/GCF_021399175.1_ASM2139917v1_genomic.gbff.gz Yersinia_aldovae_670-83/GCF_000834395.1_ASM83439v1/GCF_000834395.1_ASM83439v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-5935_I-1996/GCF_009295965.1_ASM929596v1/GCF_009295965.1_ASM929596v1_genomic.gbff.gz Yersinia_ruckeri_YRB/GCF_000834255.1_ASM83425v1/GCF_000834255.1_ASM83425v1_genomic.gbff.gz Yersinia_enterocolitica_FORC_002_bis/GCF_001304755.1_ASM130475v1/GCF_001304755.1_ASM130475v1_genomic.gbff.gz Yersinia_pestis_Antiqua/GCF_000013825.1_ASM1382v1/GCF_000013825.1_ASM1382v1_genomic.gbff.gz Yersinia_pestis_Pestoides_B/GCF_000834925.1_ASM83492v1/GCF_000834925.1_ASM83492v1_genomic.gbff.gz Yersinia_pestis_M2085/GCF_015338045.2_ASM1533804v2/GCF_015338045.2_ASM1533804v2_genomic.gbff.gz Yersinia_pestis_CO92/GCF_000009065.1_ASM906v1/GCF_000009065.1_ASM906v1_genomic.gbff.gz Yersinia_ruckeri_17Y0159/GCF_021399135.1_ASM2139913v1/GCF_021399135.1_ASM2139913v1_genomic.gbff.gz Yersinia_enterocolitica_NCTC12982/GCF_901472495.1_32868_C01/GCF_901472495.1_32868_C01_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-5942_I-2638/GCF_009363195.1_ASM936319v1/GCF_009363195.1_ASM936319v1_genomic.gbff.gz Yersinia_pestis_Nepal516/GCF_000013805.1_ASM1380v1/GCF_000013805.1_ASM1380v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_342/GCF_003546905.1_ASM354690v1/GCF_003546905.1_ASM354690v1_genomic.gbff.gz Yersinia_ruckeri_SC09/GCF_000775355.2_ASM77535v2/GCF_000775355.2_ASM77535v2_genomic.gbff.gz 
Yersinia_mollaretii_ATCC_43969/GCF_013282725.1_ASM1328272v1/GCF_013282725.1_ASM1328272v1_genomic.gbff.gz Yersinia_pestis_Pestoides_F/GCF_000016445.1_ASM1644v1/GCF_000016445.1_ASM1644v1_genomic.gbff.gz Yersinia_pestis_Angola_bis/GCF_000834845.1_ASM83484v1/GCF_000834845.1_ASM83484v1_genomic.gbff.gz Yersinia_ruckeri_17Y0412/GCF_021399055.1_ASM2139905v1/GCF_021399055.1_ASM2139905v1_genomic.gbff.gz Yersinia_pestis_1522/GCF_001188715.1_ASM118871v1/GCF_001188715.1_ASM118871v1_genomic.gbff.gz Yersinia_enterocolitica_MGYG-HGUT-02335/GCF_902385945.1_UHGG_MGYG-HGUT-02335/GCF_902385945.1_UHGG_MGYG-HGUT-02335_genomic.gbff.gz Yersinia_pestis_C-792/GCF_015337085.2_ASM1533708v2/GCF_015337085.2_ASM1533708v2_genomic.gbff.gz Yersinia_ruckeri_NVI-11050/GCF_023212385.2_ASM2321238v2/GCF_023212385.2_ASM2321238v2_genomic.gbff.gz Yersinia_intermedia_NCTC11469/GCF_900635455.1_28307_A01/GCF_900635455.1_28307_A01_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_583/GCF_003798285.1_ASM379828v1/GCF_003798285.1_ASM379828v1_genomic.gbff.gz Yersinia_pestis_M2029/GCF_015336265.1_ASM1533626v1/GCF_015336265.1_ASM1533626v1_genomic.gbff.gz Yersinia_enterocolitica_Gp200/GCF_025758555.1_ASM2575855v1/GCF_025758555.1_ASM2575855v1_genomic.gbff.gz Yersinia_massiliensis_GTA/GCF_003048255.1_ASM304825v1/GCF_003048255.1_ASM304825v1_genomic.gbff.gz Yersinia_pestis_A1122_bis/GCF_000834755.1_ASM83475v1/GCF_000834755.1_ASM83475v1_genomic.gbff.gz Yersinia_pseudotuberculosis_NZYP4713/GCF_900092345.1_YP4713/GCF_900092345.1_YP4713_genomic.gbff.gz Yersinia_pestis_PBM19/GCF_000834235.1_ASM83423v1/GCF_000834235.1_ASM83423v1_genomic.gbff.gz Yersinia_enterocolitica_NW116/GCF_025758575.1_ASM2575857v1/GCF_025758575.1_ASM2575857v1_genomic.gbff.gz Yersinia_ruckeri_KMM821/GCF_017498685.1_ASM1749868v1/GCF_017498685.1_ASM1749868v1_genomic.gbff.gz Yersinia_ruckeri_NVI-4840/GCF_026435215.1_ASM2643521v1/GCF_026435215.1_ASM2643521v1_genomic.gbff.gz 
Yersinia_enterocolitica_FDAARGOS_1082/GCF_016727765.1_ASM1672776v1/GCF_016727765.1_ASM1672776v1_genomic.gbff.gz Yersinia_enterocolitica_NW51/GCF_025758615.1_ASM2575861v1/GCF_025758615.1_ASM2575861v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11076/GCF_023212325.2_ASM2321232v2/GCF_023212325.2_ASM2321232v2_genomic.gbff.gz Yersinia_rohdei_YRA/GCF_000834455.1_ASM83445v1/GCF_000834455.1_ASM83445v1_genomic.gbff.gz Yersinia_pestis_C-781/GCF_015336085.1_ASM1533608v1/GCF_015336085.1_ASM1533608v1_genomic.gbff.gz Yersinia_pestis_Harbin_35/GCF_000186725.1_ASM18672v1/GCF_000186725.1_ASM18672v1_genomic.gbff.gz Yersinia_pseudotuberculosis_ATCC_6904/GCF_000750315.1_ASM75031v1/GCF_000750315.1_ASM75031v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_580/GCF_003798445.1_ASM379844v1/GCF_003798445.1_ASM379844v1_genomic.gbff.gz Yersinia_enterocolitica_str_YE5303/GCF_000968115.1_ASM96811v1/GCF_000968115.1_ASM96811v1_genomic.gbff.gz Yersinia_pestis_FDAARGOS_601/GCF_003798225.1_ASM379822v1/GCF_003798225.1_ASM379822v1_genomic.gbff.gz Yersinia_pestis_SCPM-O-B-6291_C-25/GCF_009296005.1_ASM929600v1/GCF_009296005.1_ASM929600v1_genomic.gbff.gz Yersinia_pestis_Nairobi/GCF_000835005.1_ASM83500v1/GCF_000835005.1_ASM83500v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_584/GCF_003798385.1_ASM379838v1/GCF_003798385.1_ASM379838v1_genomic.gbff.gz Yersinia_similis_228/GCF_000582515.1_ASM58251v1/GCF_000582515.1_ASM58251v1_genomic.gbff.gz Yersinia_pestis_1413/GCF_001188935.1_ASM118893v1/GCF_001188935.1_ASM118893v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_581/GCF_003798425.1_ASM379842v1/GCF_003798425.1_ASM379842v1_genomic.gbff.gz Yersinia_entomophaga_MH96/GCF_001656035.1_ASM165603v1/GCF_001656035.1_ASM165603v1_genomic.gbff.gz Yersinia_ruckeri_NVI-1176/GCF_026435295.1_ASM2643529v1/GCF_026435295.1_ASM2643529v1_genomic.gbff.gz Yersinia_pestis_S19960127/GCF_015190655.1_ASM1519065v1/GCF_015190655.1_ASM1519065v1_genomic.gbff.gz 
Yersinia_ruckeri_NVI-4479/GCF_026435255.1_ASM2643525v1/GCF_026435255.1_ASM2643525v1_genomic.gbff.gz Yersinia_frederiksenii_Y225/GCF_000834215.1_ASM83421v1/GCF_000834215.1_ASM83421v1_genomic.gbff.gz Yersinia_ruckeri_NVI-4570/GCF_026435235.1_ASM2643523v1/GCF_026435235.1_ASM2643523v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP2666pIB1/GCF_003814345.1_ASM381434v1/GCF_003814345.1_ASM381434v1_genomic.gbff.gz Yersinia_pseudotuberculosis_FDAARGOS_582/GCF_003798405.1_ASM379840v1/GCF_003798405.1_ASM379840v1_genomic.gbff.gz Yersinia_enterocolitica_NCTC13769/GCF_900637005.1_46582_C01/GCF_900637005.1_46582_C01_genomic.gbff.gz Yersinia_pestis_A1122/GCF_000222975.1_ASM22297v1/GCF_000222975.1_ASM22297v1_genomic.gbff.gz Yersinia_enterocolitica_YE165/GCF_001708575.1_ASM170857v1/GCF_001708575.1_ASM170857v1_genomic.gbff.gz Yersinia_pseudotuberculosis_IP32953/GCF_000047365.1_ASM4736v1/GCF_000047365.1_ASM4736v1_genomic.gbff.gz Yersinia_pestis_8787/GCF_001188755.1_ASM118875v1/GCF_001188755.1_ASM118875v1_genomic.gbff.gz Yersinia_rochesterensis_ATCC_33639/GCF_000750355.1_ASM75035v1/GCF_000750355.1_ASM75035v1_genomic.gbff.gz Yersinia_pestis_FDAARGOS_603/GCF_003798205.1_ASM379820v1/GCF_003798205.1_ASM379820v1_genomic.gbff.gz Yersinia_pseudotuberculosis_PB1+_bis/GCF_000834475.1_ASM83447v1/GCF_000834475.1_ASM83447v1_genomic.gbff.gz Yersinia_ruckeri_NVI-11294/GCF_026435315.1_ASM2643531v1/GCF_026435315.1_ASM2643531v1_genomic.gbff.gz Yersinia_enterocolitica_NW66/GCF_025758595.1_ASM2575859v1/GCF_025758595.1_ASM2575859v1_genomic.gbff.gz Yersinia_pestis_1045/GCF_001188735.1_ASM118873v1/GCF_001188735.1_ASM118873v1_genomic.gbff.gz; do output=$(python3 extract_CDS_of_a_locus_tag.py ${gbff} $(echo "${gene_id}" | cut -d '_' -f 1-2)) if [[ ! -z "${output}" ]]; then gbff_short=$(echo "${gbff}" | cut -d '/' -f 1) printf "%s\t%s\n" "${gbff_short}" "${output}" >> yopM_seq.txt fi done done
-
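extract the gene sequences from the genome FASTA files according to the NCBI GTF annotations (genes annotated on the minus strand are reverse-complemented with revseq)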
#------------------------------- yopJ (+6) ------------------------------- #grep "yopJ" selected_gtf_files/Yersinia_enterocolitica_2516-87.gtf NZ_CP009837.1 RefSeq gene 69041 69701 . - . gene_id "CH48_RS00445"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "protein_coding"; locus_tag "CH48_RS00445"; old_locus_tag "CH48_4238"; part "2"; NZ_CP009837.1 RefSeq gene 1 206 . - . gene_id "CH48_RS00445"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "protein_coding"; locus_tag "CH48_RS00445"; old_locus_tag "CH48_4238"; part "1"; #grep "yopJ" selected_gtf_files/Yersinia_pestis_790.gtf (NZ_CP006807.1) #grep "yopJ" selected_gtf_files/Yersinia_pestis_Antiqua_bis.gtf NZ_CP009905.1 RefSeq gene 16737 17602 . - . gene_id "CH58_RS00725"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "CH58_RS00725"; old_locus_tag "CH58_4444"; pseudo "true"; #grep "yopJ" selected_gtf_files/Yersinia_pestis_FDAARGOS_602.gtf NZ_CP033695.1 RefSeq gene 36152 37017 . + . gene_id "EGX42_RS00935"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "EGX42_RS00935"; old_locus_tag "EGX42_00930"; pseudo "true"; #grep "yopJ" selected_gtf_files/Yersinia_pestis_Pestoides_B.gtf NZ_CP010022.1 RefSeq gene 23121 23986 . - . gene_id "CH60_RS00825"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "CH60_RS00825"; old_locus_tag "CH60_4301"; pseudo "true"; #grep "yopJ" selected_gtf_files/Yersinia_pseudotuberculosis_EP2+.gtf NZ_CP009758.1 RefSeq gene 33302 34168 . + . 
gene_id "BZ20_RS00215"; transcript_id ""; gbkey "Gene"; gene "yopJ"; gene_biotype "pseudogene"; locus_tag "BZ20_RS00215"; old_locus_tag "BZ20_4189"; pseudo "true"; #under selected_fna_files samtools faidx Yersinia_enterocolitica_2516-87.fna NZ_CP009837.1:69041-69701 > temp.fna samtools faidx Yersinia_enterocolitica_2516-87.fna NZ_CP009837.1:1-206 >> temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' yersinia_enterocolitica_2516-87.rev > temp_.fna samtools faidx Yersinia_pestis_Antiqua_bis.fna NZ_CP009905.1:16737-17602 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 16737-17602.rev > temp_.fna samtools faidx Yersinia_pestis_FDAARGOS_602.fna NZ_CP033695.1:36152-37017 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna samtools faidx Yersinia_pestis_Pestoides_B.fna NZ_CP010022.1:23121-23986 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 23121-23986.rev > temp_.fna samtools faidx Yersinia_pseudotuberculosis_EP2+.fna NZ_CP009758.1:33302-34168 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_enterocolitica_2516-87 ATGATTGGGCCAATATCACAAATAAACAGCTTCGGTGGCTTATCAGAAAAAGAGACCCGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAATACAGTTGGAAACTGATATAGCGGATGGATCCTGGTTCCATAAAAATTATTCACGCCTGGATATAGAAGTCATGCCCGCATTAGTAATTCAGGCGAACAATAAATATCCGGAAATGAATCTTAATTTTGTTACATCTCCCCAGGACCTTTCGATAGAAATAAAAAATGTCATAGAAAATGGAGTTGGATCTTCCCGCTTCATAATTAACATGGGGGAGGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTATTTGAACCAGTAAACTTTAATAGTATGGGGCCAGCGATACTGGCAATAAGTACAAAAACGGCCATTGAACGTTATCAATTACCTGATTGCCATTTTTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTGGCACTGGCAAAAAAACTTTACACCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATAGTGAAAATCCTTTACCCCACAATAAGTTGGATCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCTTTAATAGGTTTGATAACAATAAATCCATTATAGATGGAAAGGAATTATCAGTTTCGGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA Yersinia_pestis_Antiqua_bis 
ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCCAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTAAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA >Yersinia_pestis_FDAARGOS_602 ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCCAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTGAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA >Yersinia_pestis_Pestoides_B 
ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCCAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTGAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA Yersinia_pseudotuberculosis_EP2+ ATGATCGGACCAATATCACAAATAAATATCTCCGGTGGCTTATCAGAAAAAGAGACCAGTTCTTTAATCAGTAATGAAGAGCTTAAAAATATCATAACACAGTTGGAAACTGATATATCGGATGGATCCTGGTTCCATAAAAATTATTCACGTATGGATGTAGAAGTCATGCCCGCATTGGTAATCTAGGCGAACAATAAATATCCGGAAATGAATCTTAATCTTGTTACATCTCCATTGGACCTTTCAATAGAAATAAAAAACGTCATAGAAAATGGAGTTAGATCTTCCCGCTTCATAATTAACATGGGGGAAGGTGGAATACATTTCAGTGTAATTGATTACAAACATATAAATGGGAAAACATCTCTGATATTGTTTGAACCAGCAAACTTTAACAGTATGGGGCCAGCGATGCTGGCAATAAGGACAAAAACGGCTATTGAACGTTATCAATTACCTGATTGCCATTTCTCCATGGTGGAAATGGATATTCAGCGAAGCTCATCTGAATGTGGTATTTTTAGTTTTGCACTGGCAAAAAAACTTTACATCGAGAGAGATAGCCTGTTGAAAATACATGAAGATAATATAAAAGGTATATTAAGTGATGGTGAAAATCCTTTACCCCACGATAAGTTGGACCCGTATCTCCCGGTAACTTTTTACAAACATACTCAAGGTAAAAAACGTCTTAATGAATATTTAAATACTAACCCGCAGGGAGTTGGTACTGTTGTTAACAAAAAAAATGAAACCATCGTTAATAGATTTGATAACAATAAATCCATTGTAGATGGAAAGGAATTATCAGTTTCGGTACATAAAAAGAGAATAGCTGAATATAAAACACTTCTCAAAGTATAA #------------------------------- yopB (+4) ------------------------------- #-- grep "yopB" Yersinia_enterocolitica_YE1.gtf grep "yopB" Yersinia_enterocolitica_YE1.gtf NZ_CP016946.1 RefSeq gene 73029 73029 . + . 
gene_id "BFS78_RS21560"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BFS78_RS21560"; old_locus_tag "BFS78_21560"; part "1"; NZ_CP016946.1 RefSeq gene 1 1205 . + . gene_id "BFS78_RS21560"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BFS78_RS21560"; old_locus_tag "BFS78_21560"; part "2"; #-- grep "yopB" Yersinia_enterocolitica_YE3.gtf NZ_CP016943.1 RefSeq gene 72880 73026 . + . gene_id "BED35_RS00480"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "pseudogene"; locus_tag "BED35_RS00480"; old_locus_tag "BED35_00480"; part "1"; pseudo "true"; NZ_CP016943.1 RefSeq gene 1 1058 . + . gene_id "BED35_RS00480"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "pseudogene"; locus_tag "BED35_RS00480"; old_locus_tag "BED35_00480"; part "2"; pseudo "true"; grep "yopB" Yersinia_enterocolitica_YE5.gtf NZ_CP016939.1 RefSeq gene 73034 73034 . + . gene_id "BED32_RS00010"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BED32_RS00010"; old_locus_tag "BED32_00010"; part "1"; NZ_CP016939.1 RefSeq gene 1 1205 . + . gene_id "BED32_RS00010"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "protein_coding"; locus_tag "BED32_RS00010"; old_locus_tag "BED32_00010"; part "2"; #-- grep "yopB" Yersinia_pestis_Harbin_35_bis.gtf NZ_CP009703.1 RefSeq gene 18869 20075 . + . 
gene_id "CH55_RS00745"; transcript_id ""; gbkey "Gene"; gene "yopB"; gene_biotype "pseudogene"; locus_tag "CH55_RS00745"; old_locus_tag "CH55_4304"; pseudo "true"; #under selected_fna_files samtools faidx Yersinia_enterocolitica_YE1.fna NZ_CP016946.1:73029-73029 > temp.fna samtools faidx Yersinia_enterocolitica_YE1.fna NZ_CP016946.1:1-1205 >> temp.fna samtools faidx Yersinia_enterocolitica_YE3.fna NZ_CP016943.1:72880-73026 > temp.fna samtools faidx Yersinia_enterocolitica_YE3.fna NZ_CP016943.1:1-1058 >> temp.fna samtools faidx Yersinia_enterocolitica_YE5.fna NZ_CP016939.1:73034-73034 > temp.fna samtools faidx Yersinia_enterocolitica_YE5.fna NZ_CP016939.1:1-1205 >> temp.fna samtools faidx Yersinia_pestis_Harbin_35_bis.fna NZ_CP009703.1:18869-20075 > temp.fna Yersinia_enterocolitica_YE1 ATGAGTGCGTTGATAACCCATGATCGCTCAACGCCAGTAACTGGAAGTCTAGTTCCCTACATCGAGACACCAGCGCCCGCCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTGCAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTCACTGAAGGACAACAGCAAGAAATCACTAAATTATTGGAGTCGGTCACCCGCGGCACGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAATTTTACGCTCGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATGGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGGTTATCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGATTGCCTCAGGCGTAATTGGGATGGCGAATATGGCTGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGCAAGTATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTGATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGCTGCTAAAGGAGCCGAGTTTTCAGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGGTAGTTTAACAATGAGCCATGTAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAACAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTT
AA Yersinia_enterocolitica_YE3 ATGAGTGCGTTGATAACCCATGATCGCTCAACGCCAGTAACTGGAAGTCTAGTTCCCTACATCGAGACACCAGCGCCCGCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTGCAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTCACTGAAGGACAACAGCAAGAAATCACTAAATTATTGGAGTCGGTCACCCGCGGCACGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAATTTTACGCTCGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATGGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGGTTATCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGATTGCCTCAGGCGTAATTGGGATGGCGAATATGGCTGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGCAAGTATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTGATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGCTGCTAAAGGAGCCGAGTTTTCAGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGGTAGTTTAACAATGAGCCATGTAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAACAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA Yersinia_enterocolitica_YE5 
ATGAGTGCGTTGATAACCCATGATCGCTCAACGCCAGTAACTGGAAGTCTAGTTCCCTACATCGAGACACCAGCGCCCGCCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTGCAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTCACTGAAGGACAACAGCAAGAAATCACTAAATTATTGGAGTCGGTCACCCGCGGCACGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAATTTTACGCTCGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATGGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGGTTATCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGATTGCCTCAGGCGTAATTGGGATGGCGAATATGGCTGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGCAAGTATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTGATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGCTGCTAAAGGAGCCGAGTTTTCAGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGGTAGTTTAACAATGAGCCATGTAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAACAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA Yersinia_pestis_Harbin_35_bis 
ATGAGTGCGTTGATAACCCATGACCGCTCAACGCCAGTAACTGGAAGTCTACTTCCCTACGTCGAGACACCAGCGCCCGCCCCCCCTTCAGACCCAACAAGTCGCGGGAGAACTGAAGGATAAAAATGGCGGGGTGAGTTCTCAGGGCGTACAGCTCCCTGCACCACTAGCAGTGGTTGCCAGCCAAGTTACTGAAGGACAACAGCAAGAAGTCACTAAATTATTGGAGTCGGTCACCCGCGGCGCGGCAGGATCTCAACTGATATCAAATTATGTTTCAGTGCTAACGAAGTTTACGCTTGCTTCACCTGATACATTTGAGATTGAGTTAGGTAAGCTAGTTTCTAATTTAGAAGAAGTACGCAAAGACATAAAAATCGCTGATATTCAGCGTCTTCATGAACAAAACATGAAGAAAATTGAAGAGAATCAAGAGAAAATCAAAGAAACAGAAGAGAATGCCAAGCAAGTCAAGAAATCCGGCATCGCATCAAAGATTTTTGGCTGGCTCAGCGCCATAGCCTCAGTGATTGTCGGTGCCATCATGGTGGCCTCAGGGGTAGGAGCCGTTGCCGGTGCAATGATGGTTGCCTCAGGCGTAATTGGGATGGCGAATATGGCAGTGAAACAAGCGGCGGAAGATGGCCTGATATCCCAAGAGGCAATGAAAATATTAGGGCCGATACTCACTGCGATTGAAGTCGCATTGACTGTAGTTTCAACCGTAATGACCTTTGGCGGTTCGGCACTAAAATGCCTGGCTAATATTGGCGCAAAACTCGGTGCTAACACCGCAAGTCTTGTGGCTAAAGGAGCCGAGTTTTCGGCCAAAGTTGCCCAAATTTCGACAGGCATATCAAACACTGTCGGGAGTGCAGTGACTAAATTAGGGGGCAGTTTTGCTGGTTTAACAATGAGCCATGCAATCCGTACAGGATCACAGGCAACACAAGTCGCCGTTGGTGTGGGCAGCGGAATAACTCAGACCATCAATAATAAAAAGCAAGCTGATTTACAACATAATAACGCTGATTTGGCCTTGAACAAGGCAGACATGGCAGCGTTACAAAGTATTATTGACCGACTCAAAGAAGAGTTATCCCATTTGTCAGAGTCACATCAACAAGTGATGGAACTGATTTTCCAGATGATTAATGCAAAAGGTGACATGCTGCATAATTTGGCCGGCAGACCCCATACTGTTTAA #------------------------------- yopT (+9) ------------------------------- #grep "yopT" selected_gtf_files/Yersinia_pestis_1412.gtf NZ_CP006780.1 RefSeq gene 43360 44327 . + . gene_id "M479_RS22185"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M479_RS22185"; old_locus_tag "M479_4302"; pseudo "true"; #grep "yopT" selected_gtf_files/Yersinia_pestis_1413.gtf NZ_CP006761.1 RefSeq gene 60310 61277 . + . gene_id "M480_RS22170"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M480_RS22170"; old_locus_tag "M480_4319"; pseudo "true"; #grep "yopT" selected_gtf_files/Yersinia_pestis_1522.gtf NZ_CP006757.1 RefSeq gene 61673 62640 . - . 
gene_id "M481_RS22190"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M481_RS22190"; old_locus_tag "M481_4325"; pseudo "true"; samtools faidx Yersinia_pestis_1412.fna NZ_CP006780.1:43360-44327 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_1412 ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA samtools faidx Yersinia_pestis_1413.fna NZ_CP006761.1:60310-61277 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_1413 
ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA samtools faidx Yersinia_pestis_1522.fna NZ_CP006757.1:61673-62640 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 61673-62640.rev > temp_.fna Yersinia_pestis_1522 
ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #grep "yopT" selected_gtf_files/Yersinia_pestis_3067.gtf NZ_CP006753.1 RefSeq gene 43515 44482 . + . 
gene_id "M482_RS22205"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M482_RS22205"; old_locus_tag "M482_4297"; pseudo "true"; samtools faidx Yersinia_pestis_3067.fna NZ_CP006753.1:43515-44482 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_3067 ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #grep "yopT" selected_gtf_files/Yersinia_pestis_3770.gtf NZ_CP006750.1 RefSeq gene 18136 19103 . + . 
gene_id "M483_RS22135"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M483_RS22135"; old_locus_tag "M483_4264"; pseudo "true"; samtools faidx Yersinia_pestis_3770.fna NZ_CP006750.1:18136-19103 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_3770 ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #grep "yopT" selected_gtf_files/Yersinia_pestis_8787.gtf NZ_CP006747.1 RefSeq gene 55293 56260 . + . 
gene_id "M484_RS21915"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "M484_RS21915"; old_locus_tag "M484_4255"; pseudo "true"; samtools faidx Yersinia_pestis_8787.fna NZ_CP006747.1:55293-56260 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_8787 ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #grep "yopT" selected_gtf_files/Yersinia_pestis_Pestoides_F.gtf NC_009377.1 RefSeq gene 48563 49530 . - . 
gene_id "YPDSF_RS23435"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "YPDSF_RS23435"; old_locus_tag "YPDSF_4001"; pseudo "true"; samtools faidx Yersinia_pestis_Pestoides_F.fna NC_009377.1:48563-49530 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 48563-49530.rev > temp_.fna Yersinia_pestis_Pestoides_F ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #grep "yopT" selected_gtf_files/Yersinia_pestis_Pestoides_F_bis.gtf NZ_CP009713.1 RefSeq gene 53246 54213 . - . 
gene_id "BZ18_RS22165"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "BZ18_RS22165"; old_locus_tag "BZ18_4298"; pseudo "true"; samtools faidx Yersinia_pestis_Pestoides_F_bis.fna NZ_CP009713.1:53246-54213 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 53246-54213.rev > temp_.fna Yersinia_pestis_Pestoides_F_bis ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #grep "yopT" selected_gtf_files/Yersinia_pestis_Pestoides_G.gtf NZ_CP010246.1 RefSeq gene 1551 2518 . + . 
gene_id "CH43_RS22165"; transcript_id ""; gbkey "Gene"; gene "yopT"; gene_biotype "pseudogene"; locus_tag "CH43_RS22165"; old_locus_tag "CH43_4244"; pseudo "true"; samtools faidx Yersinia_pestis_Pestoides_G.fna NZ_CP010246.1:1551-2518 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_Pestoides_G ATGAACAGTATTCACGGACACTACCATATTCAACTATCGAATTATTCTGCCGGTGAAAACCTTCAATCAGTACCCTCACCGAAGGGGTGATTGGCGCACACCGAGTGAAAGTGGAAACAGCACTGTCACACTCAAACCTGCAGAAAAAGTTATCAGCCACCATAAAACATAACCAGTCAGGCCGTTCTATGCTGGATAGAAAGTTGACCAGCGACGGCAAAGCTAACCAACGCAGCAGCTTTACCTTCAGTATGATTATGTATCGCATGATACATTTTGTACTCAGCACTCGTGTGCCCGCGGTGAGAGAGTCTGTTGCAAATTACGGAGGTAACATCAATTTCAAGTTTGCTCAGACCAAAGGGGCTTTTCTTCATAAAATAATAAAACATTCAGACACTGCTAGCGGTGTCTGTGAGGCTTTATGTGCACATTGGATCAGGAACCATGCACAAGGCCAAAGCTTATTTGACCAGCTCTATGTTGGCGGGCGTAAGGGGAAATTCCAGATCGATACACTTTACTCAATTAAACAGTTGCAAATAGATGGTTGTAAAGCAGACGTTGATCAAGATGAGGTAACACTAGATTGGTTCAAGAAAAATGGCATATCAGAACGTATGATTGAACGGCATTGCTTACTGCGTCCAGTTGATGTTACTGGTACGACGGAATCAGAAGGGCTGGATCAATTATTAAACGCTATCCTTGATACTCATGGGATAGGTTACGGTTATAAAAAAATACATCTCTCAGGCCAAATGTCAGCCCACGCCATAGCGGCGTATGTCAACGAAAAGAGTGGTGTTACTTTCTTCGATCCCAATTTCGGTGAATTCCACTTTTCTGATAAGGAAAAGTTCCGCAAATGGTTTACTAACTCATTCTGGGGTAATTCTATGTATCATTATCCTCTGGGGGTGGGGCAGCGTTTTAGAGTGTTAACATTTGACTCCAAGGAGGTTTAA #------------------------------- yopE (+3) ------------------------------- #grep "yopE" selected_gtf_files/Yersinia_pestis_1522.gtf NZ_CP006757.1 RefSeq gene 70902 71507 . - . gene_id "M481_RS24690"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "pseudogene"; locus_tag "M481_RS24690"; old_locus_tag "M481_4336"; part "2"; pseudo "true"; NZ_CP006757.1 RefSeq gene 1 53 . - . 
gene_id "M481_RS24690"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "pseudogene"; locus_tag "M481_RS24690"; old_locus_tag "M481_4336"; part "1"; pseudo "true"; samtools faidx Yersinia_pestis_1522.fna NZ_CP006757.1:70902-71507 > temp.fna samtools faidx Yersinia_pestis_1522.fna NZ_CP006757.1:1-53 >> temp.fna #delete the second ">****" revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 70902-71507.rev > temp_.fna Yersinia_pestis_1522 ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCAGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGA #grep "yopE" selected_gtf_files/Yersinia_pestis_Nicholisk_41.gtf NZ_CP009990.1 RefSeq gene 67916 68552 . + . gene_id "CH63_RS00620"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "CH63_RS00620"; part "1"; NZ_CP009990.1 RefSeq gene 1 23 . + . 
gene_id "CH63_RS00620"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "CH63_RS00620"; part "2"; samtools faidx Yersinia_pestis_Nicholisk_41.fna NZ_CP009990.1:67916-68552 > temp.fna samtools faidx Yersinia_pestis_Nicholisk_41.fna NZ_CP009990.1:1-23 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_Nicholisk_41 ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCAGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCGGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGA #grep "yopE" selected_gtf_files/Yersinia_pseudotuberculosis_FDAARGOS_581.gtf NZ_CP033712.1 RefSeq gene 69663 70035 . + . gene_id "EGX47_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "EGX47_RS00005"; old_locus_tag "EGX47_00005"; part "1"; NZ_CP033712.1 RefSeq gene 1 287 . + . 
gene_id "EGX47_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopE"; gene_biotype "protein_coding"; locus_tag "EGX47_RS00005"; old_locus_tag "EGX47_00005"; part "2"; samtools faidx Yersinia_pseudotuberculosis_FDAARGOS_581.fna NZ_CP033712.1:69663-70035 > temp.fna samtools faidx Yersinia_pseudotuberculosis_FDAARGOS_581.fna NZ_CP033712.1:1-287 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pseudotuberculosis_FDAARGOS_581 ATGAAAATATCATCATTTATTTCTACATCACTGCCCCTGCCGACATCTGTGTCAGGATCTAGCAGCGTAGGAGAAATGTCTGGGCGCTCAGTCTCACAGCAAACAAGTGATCAATATGCAAACAATCTGGCCGGGCGCACTGAAAGCCCTCAGGGTTCCAGCTTAGCCAGCCGTATCATTGAGAGGTTATCATCAGTGGCCCACTCTGTGATTGGGTTTATCCAACGCATGTTCTCGGAGGGGAGCCATAAACCGGTGGTGACACCAGCACCCACACCTGCACAAATGCCAAGTCCTACGTCTTTCAGTGACAGTATCAAGCAACTTGCTGCTGAGACGCTGCCAAAATACATGCAGCAGTTGAATAGCTTGGATGCAGAGATGCTGCAGAAAAATCATGATCAGTTCGCTACGGGCAGCGGCCCTCTTCGTGGCAGTATCACTCAATGCCAAGGGCTGATGCAGTTTTGTGGTGGGGAATTGCAAGCTGAGGCCAGTGCCATCTTAAACACGCCTGTTTGTGGTATTCCCTTCTCGCAGTGGGGAACTATTGGTGGGGCGGCCAGCGCGTACGTCGCCAGTGGCGTTGATCTAACGCAGGCAGCAAATGAGATCAAAGGGCTGGCGCAACAGATGCAGAAATTACTGTCATTGATGTGA #------------------------------- yopD (+2) ------------------------------- #grep "yopD" selected_gtf_files/Yersinia_enterocolitica_YE165.gtf NZ_CP016933.1 RefSeq gene 74497 74497 . + . gene_id "BB936_RS22270"; transcript_id ""; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BB936_RS22270"; old_locus_tag "BB936_22265"; part "1"; NZ_CP016933.1 RefSeq gene 1 920 . + . 
gene_id "BB936_RS22270"; transcript_id ""; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BB936_RS22270"; old_locus_tag "BB936_22265"; part "2"; samtools faidx Yersinia_enterocolitica_YE165.fna NZ_CP016933.1:74497-74497 > temp.fna samtools faidx Yersinia_enterocolitica_YE165.fna NZ_CP016933.1:1-920 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_enterocolitica_YE165 ATGACAATAAATATCAAGACAGACAGCCCAATTATCACGACCGGTTCACAGCTTGATGCCATCACTACAGAGACAGTCGGGCAAAGCGGTGAGGTTAAAAAAACAGAAGACACCCGTCATGAAGCACAAGCAATAAAGAGTAGCGAGGCAAGCTTATCTCGGTCACAGGTGCCTGAATTGATCAAACCGAGTCAGGGAATCAATGTTGCATTACTGAGTAAAAGCCAGGGAGATCTTAATGGTACTTTAAGTATCTTGTTGTTGCTGTTGGAACTGGCACGTAAAGCGCGAGAAATGGGTTTGCAACAAAGGGATATAGAAAATAAAGCTACTATTTCTGCCCAAAAGGAGCAGGTAGCGGAGATGGTCAGCGGTGCAAAACTGATGATCGCCATGGCGGTGGTGTCTGGCATCATGGCTGCTACTTCTACGGTTGCTAGTGCTTTTTCTATAGCGAAAGAGGTGAAAATAGTTAAACAGGAACAAATTCTAAACAGTAACATTGCCGGCCGTGATCAACTTATTGATACAAAAATGCAGCAAATGAGTAACGCTGGTGATAAAGCGGTAAGCAGAGAGGATATCGGGAGAATATGGAAACCAGAGCAGGTAGCGGATCAAAATAAGCTGGCATTATTGGATAAAGAATTCAGAATGACCGACTCAAAAGCCAATGCGTTTAATGCCGCAACGCAGCCGTTAGGACAAATGGCAAACAGTGCGATTCAAGTTCATCAAGGGTATTCTCAAGCCGAGGTCAAAGAAAAAGAAGTCAATGCAAGTATTGCTGCCAACGAGAAGCAAAAAGCCGAAGAGGCGATGAACTATAATGATAACTTTATGAAAGATGTCCTGCGCTTGATTGAACAATATGTTAGCAGTCATACTCACGCCATGAAAGCCGCTTTTGGTGTTGTCTGA #grep "yopD" selected_gtf_files/Yersinia_pseudotuberculosis_IP32953_bis.gtf NZ_CP009711.1 RefSeq gene 68202 68525 . + . gene_id "BZ17_RS00160"; transcript_id ""; db_xref "GeneID:66841050"; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BZ17_RS00160"; part "1"; NZ_CP009711.1 RefSeq gene 1 597 . + . 
gene_id "BZ17_RS00160"; transcript_id ""; db_xref "GeneID:66841050"; gbkey "Gene"; gene "yopD"; gene_biotype "protein_coding"; locus_tag "BZ17_RS00160"; part "2"; samtools faidx Yersinia_pseudotuberculosis_IP32953_bis.fna NZ_CP009711.1:68202-68525 > temp.fna samtools faidx Yersinia_pseudotuberculosis_IP32953_bis.fna NZ_CP009711.1:1-597 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pseudotuberculosis_IP32953_bis ATGACAATAAATATCAAGACAGACAGCCCAATTATCACGACCGGTTCACAGCTTGATGCCATCACTACAGAGACAGTCAAGCAAAGCGGTGAGATTAAAAAAACAGAAGACACCCGTCATGAAGCACAAGCAATAAAGAGTAGCGAGGCAAGCTTATCTCGGTCACAGGTGCCAGAATTGATCAAACCGAGCCAGGGAATCAATGTTGCATTACTGAGTAAAAGCCAGGGTGATCTTAATGGTACTTTAAGTATCTTGTTGTTGCTGTTGGAACTGGCACGTAAAGCGCGAGAAATGGGTTTGCAACAAAGGGATATAGAAAATAAAGCTACTATTACTGCCCAAAAGGAGCAGGTAGCGGAGATGGTCAGCGGTGCAAAACTGATGATCGCCATGGCGGTGGTGTCTGGCATCATGGCTGCTACTTCTACGGTTGCTAGTGCTTTTTCTATAGCGAAAGAGGTGAAAATAGTTAAACAGGAACAAATTCTAAACAGTAATATTGCTGGCCGCGAACAACTTATTGATACAAAAATGCAGCAAATGAGTAACATTGGTGATAAAGCGGTAAGCAGAGAGGATATCGGGAGAATATGGAAACCAGAGCAGGTAGCGGATCAAAATAAGCTGGCATTATTGGATAAAGAATTCAGAATGACCGACTCAAAAGCCAATGCGTTTAATGCCGCAACGCAGCCGTTAGGACAAATGGCAAACAGTGCGATTCAAGTTCATCAAGGGTATTCTCAAGCCGAGGTCAAAGAGAAAGAAGTCAATGCAAGTATTGCTGCCAACGAGAAGCAAAAAGCCGAAGAGGCGATGAACTATAATGATAACTTTATGAAAGATGTCCTGCGCTTGATTGAACAATATGTTAGCAGTCATACTCACGCCATGAAAGCCGCTTTTGGTGTTGTCTGA #------------------------------- yopM (+2) ------------------------------- #grep "yopM" selected_gtf_files/Yersinia_pestis_FDAARGOS_602.gtf NZ_CP033695.1 RefSeq gene 69663 70174 . + . gene_id "EGX42_RS00660"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "EGX42_RS00660"; old_locus_tag "EGX42_00655"; part "1"; NZ_CP033695.1 RefSeq gene 1 592 . + . 
gene_id "EGX42_RS00660"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "EGX42_RS00660"; old_locus_tag "EGX42_00655"; part "2"; samtools faidx Yersinia_pestis_FDAARGOS_602.fna NZ_CP033695.1:69663-70174 > temp.fna samtools faidx Yersinia_pestis_FDAARGOS_602.fna NZ_CP033695.1:1-592 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_FDAARGOS_602 ATGTTCATAAATCCAAGAAATGTATCTAATACTTTTTTGCAAGAACCATTACGTCATTCTTCTAATTTAACTGAGATGCCGGTTGAGGCAGAAAATGTTAAATCTAAGACTGAATATTATAATGCATGGTCGGAATGGGAACGAAATGCCCCTCCGGGGAATGGTGAACAGAGGGAAATGGCGGTTTCAAGGTTACGAGATTGCCTGGACCGACAAGCCCATGAGCTAGAACTAAATAATCTGGGGCTGAGTTCTTTGCCGGAATTACCTCCGCATTTAGAGAGTTTAGTGGCGTCATGTAATTCTCTTACAGAATTACCGGAATTACCGCAGAGCCTGAAATCACTTCTAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCACCTTTACTGGAATATTTAGGTGTCTCTAATAATCAGCTGGAAAAATTGCCAGAGTTGCAAAACTCGTCCTTCTTGAAAATTATTGATGTTGATAACAATTCACTGAAAAAACTACCTGATTTACCTCCTTCACTGGAGTTTATTGCTGCTGGTAATAATCAGCTGGAAGAATTGCCAGAGTTGCAAAACTTGCCCTTCTTGACTACGATTTATGCTGATAACAATTTACTGAAAACATTACCCGATTTACCCCCTTCCCTGGAAGCACTTAATGTCAGAGATAATTATTTAACTGATCTGCCAGAATTACCGCAGAGTTTAACCTTCTTAGATGTTTCTGAAAATATTTTTTCTGGATTATCGGAATTGCCACCAAACTTGTATTATCTCAATGCATCCAGCAATGAAATAAGATCCTTATGCGATTTACCCCCTTCACTGGAAGAACTTAATGTCAGTAATAATAAGTTGATCGAACTGCCAGCGTTACCTCCACGCTTAGAACGTTTAATCGCTTCATTTAATCATCTTGCTGAAGTACCTGAATTGCCGCAAAACCTGAAACAGCTCCACGTAGAGTACAACCCTCTGAGAGAGTTTCCCGATATACCTGAGTCAGTGGAAGATCTTCGGATGAACTCTGAACGTGTAGTTGATCCATATGAATTTGCTCATGAGACTACAGACAAACTTGAAGATGATGTATTTGAGTAG #grep "yopM" selected_gtf_files/Yersinia_pseudotuberculosis_PB1+_bis.gtf NZ_CP009779.1 RefSeq gene 69708 69812 . + . gene_id "BZ16_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "BZ16_RS00005"; old_locus_tag "BZ16_4135"; part "1"; NZ_CP009779.1 RefSeq gene 1 1485 . + . 
gene_id "BZ16_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopM"; gene_biotype "protein_coding"; locus_tag "BZ16_RS00005"; old_locus_tag "BZ16_4135"; part "2"; samtools faidx Yersinia_pseudotuberculosis_PB1+_bis.fna NZ_CP009779.1:69708-69812 > temp.fna samtools faidx Yersinia_pseudotuberculosis_PB1+_bis.fna NZ_CP009779.1:1-1485 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pseudotuberculosis_PB1+_bis ATGTTCATAAATCCAAGAAATGTATCTAATACTTTTTTGCAAGAACCATTACGTCATTCTTCTAATTTAACTGAGATGCCGGTTGAGGCAGAAAATGTTAAATCTAAGACTGAATATTATAATGCATGGTCGGAATGGGAACGAAATGCCCCTCCGGGGAATGGTGAACAGAGGGAAATGGCGGTTTCAAGGTTACGAGATTGCCTGGACCGACAAGCCCATGAGCTAGAACTAAATAATCTGGGGCTGAGTTCTTTGCCGGAATTACCTCCGCATTTAGAGAGTTTAGTGGCGTCATGTAATTCTCTTACAGAATTACCGGAATTGCCGCAGAGCCTGAAATCACTTCAAGTTGAAAATAACAATCTGAAGGCATTACCCGATTTACCCCCTTCCCTGAAAAAACTTCATGTCAGAGAAAATGATTTAACTGATCTGCCAGAATTACCGCAGAGCCTGGAATCACTTCGAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCTCCTTCACTGGAATATCTTACTGCTAGTAGTAATAAGCTGGAAGAATTGCCAGAGTTGCAAAACTTGCCCTTCTTGGCTGCGATTTATGCTGATAACAATTTACTGGAAACATTACCCGATTTACCCCCTTCCCTGAAAAAACTTCATGTCAGAGAAAATGATTTAACTGATCTGCCAGAATTACCGCAGAGCCTGGAATCACTTCAAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCTCCTTCACTGGAATATCTTACTGCTAGTAGTAATAAGCTGGAAGAATTGCCAGAGTTGCAAAACTTGCCCTTCTTGGCTGCGATTTATGCTGATAACAATTTACTGGAAACATTACCCGATTTACCCCCACATTTAGAGATTTTAGTGGCGTCATATAATTCTCTTACTGAATTACCGGAATTGCCGCAGAGCCTGAAATCACTTCGAGTTGATAATAACAATCTGAAGGCATTATCCGATTTACCTCCTTCACTGGAATATCTTACTGCTAGTAGTAATAAGCTGGAAGAATTACCAGAGTTGCAAAACTTGCCCTTCTTGGCTGCGATTTATGCTGATAACAATTTACTGGAAACATTACCCGATTTACCCCCTTCCCTGAAAAAACTTCATGTCAGAGAAAATGATTTAACTGATCTGCCAGAATTACCGCAGAGTTTAACCTTCTTAGATGTTTCTGATAATAATATTTCTGGATTATCGGAATTGCCACCAAACTTGTATTATCTCGATGCATCCAGCAATGAAATAAGATCCTTATGCGATTTACCTCCTTCACTGGTAGACCTTAATGTCAAAAGTAATCAGTTGAGCGAACTGCCAGCGTTACCTCCACACTTAGAACGTTTAATCGCTTCATTTAATTATCTTGCTGAAGTACCTGAATTGCCGCAAAACCTGAAACAGCTCCACGTAGAGCAAAACGCTCTGAGAGAGTTTCCCGATATACCTGAGTCATTGGAAGAGCTTGAGATGGACTCTGAACGTGTAGTTGATCCATA
TGAATTTGCTCATGAGACTACAGACAAACTTGAAGATGATGTATTTGAGTAG #------------------------------- yopO (+9) ------------------------------- #grep "yopO" selected_gtf_files/Yersinia_enterocolitica_YE165.gtf NZ_CP016933.1 RefSeq gene 11705 13893 . - . gene_id "BB936_RS22335"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "BB936_RS22335"; old_locus_tag "BB936_22330"; pseudo "true"; samtools faidx Yersinia_enterocolitica_YE165.fna NZ_CP016933.1:11705-13893 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 11705-13893.rev > temp_.fna Yersinia_enterocolitica_YE165 ATGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCTAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGCTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGAGACACTCCATGCAGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGTTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGAAGCCTTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAATCAGCGGTTGGTGGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTCCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAAGCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGATAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTCATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTTTCAACCCTTCTACATGGTATCGAAGGTTTTGAGAAAGATCCGGAGATAAAACCTAATCAAGGACTGAGATCCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTACCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATTGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACTCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTCTCTGATTTGCTTAGGACGCATTTGAGTAGTGCAGCAACTAAGCAAT
TGGATATGGGGGTGGTTTTGTCGGATCTTGATACCATGTTGGTGACACTCGACAAGGCCGAACGCGAGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGCGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAGTTCCAGTGCGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAGCCTTCACTGCAGAGGATCCAAAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACCGCAAGGGCAGCCCGTGTCCTCTGAAACCTACAGCTTCCTGAATCGATTAGCTGAGGCTAAGGTCACCTTGTCGCAGCAATTGGATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAACTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCCATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGCGGACTCTATTCCACTACTGATTCGACTTGGACGAAGCAGTTTAATAGATGAGCATTTGGTTGAACAGAGAGAGAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_enterocolitica_YE3.gtf NZ_CP016943.1 RefSeq gene 12782 14970 . - . gene_id "BED35_RS00550"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "BED35_RS00550"; old_locus_tag "BED35_00550"; pseudo "true"; samtools faidx Yersinia_enterocolitica_YE3.fna NZ_CP016943.1:12782-14970 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 12782-14970.rev > temp_.fna Yersinia_enterocolitica_YE3 
ATGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCTAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGCTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGAGACACTCCATGCAGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGTTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGAAGCCTTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAATCAGCGGTTGGTGGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTCCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAAGCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGATAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTCATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTTTCAACCCTTCTACATGGTATCGAAGGTTTTGAGAAAGATCCGGAGATAAAACCTAATCAAGGACTGAGATCCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTACCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATTGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACTCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTCTCTGATTTGCTTAGGACGCATTTGAGTAGTGCAGCAACTAAGCAATTGGATATGGGGGTGGTTTTGTCGGATCTTGATACCATGTTGGTGACACTCGACAAGGCCGAACGCGAGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGCGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAGTTCCAGTGCGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAGCCTTCACTGCAGAGGATCCAAAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACCGCAAGGGCAGCCCGTGTCCTCTGAAACCTACAGCTTCCTGAATCGATTAGCTGAGGCTAAGGTCACCTTGTCGCAGCAATTGGATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAACTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCC
ATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGCGGACTCTATTCCACTACTGATTCGACTTGGACGAAGCAGTTTAATAGATGAGCATTTGGTTGAACAGAGAGAGAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_enterocolitica_YE6.gtf NZ_CP016937.1 RefSeq gene 4748707 4750895 . - . gene_id "BED33_RS21960"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "BED33_RS21960"; old_locus_tag "BED33_21960"; pseudo "true"; samtools faidx Yersinia_enterocolitica_YE6.fna NZ_CP016937.1:4748707-4750895 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 4748707-4750895.rev > temp_.fna Yersinia_enterocolitica_YE6 ATGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCTAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGCTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGAGACACTCCATGCAGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGTTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGAAGCCTTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAATCAGCGGTTGGTGGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTCCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAAGCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGATAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTCATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTTTCAACCCTTCTACATGGTATCGAAGGTTTTGAGAAAGATCCGGAGATAAAACCTAATCAAGGACTGAGATCCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTACCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATTGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACTCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAAC
ACCCAAGAAGCTTCGGGAGCTCTCTGATTTGCTTAGGACGCATTTGAGTAGTGCAGCAACTAAGCAATTGGATATGGGGGTGGTTTTGTCGGATCTTGATACCATGTTGGTGACACTCGACAAGGCCGAACGCGAGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGCGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAGTTCCAGTGCGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAGCCTTCACTGCAGAGGATCCAAAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACCGCAAGGGCAGCCCGTGTCCTCTGAAACCTACAGCTTCCTGAATCGATTAGCTGAGGCTAAGGTCACCTTGTCGCAGCAATTGGATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAACTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCCATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGCGGACTCTATTCCACTACTGATTCGACTTGGACGAAGCAGTTTAATAGATGAGCATTTGGTTGAACAGAGAGAGAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_pestis_790.gtf #grep "yopO" selected_gtf_files/Yersinia_pestis_FDAARGOS_601.gtf NZ_CP033697.1 RefSeq gene 68815 70300 . + . gene_id "EGX46_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "EGX46_RS00005"; old_locus_tag "EGX46_00005"; part "1"; NZ_CP033697.1 RefSeq gene 1 713 . + . 
gene_id "EGX46_RS00005"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "EGX46_RS00005"; old_locus_tag "EGX46_00005"; part "2"; samtools faidx Yersinia_pestis_FDAARGOS_601.fna NZ_CP033697.1:68815-70300 > temp.fna samtools faidx Yersinia_pestis_FDAARGOS_601.fna NZ_CP033697.1:1-713 >> temp.fna #delete the second ">****" sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_FDAARGOS_601 ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAA
AGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_pestis_Harbin_35.gtf NC_017263.1 RefSeq gene 49729 51926 . - . gene_id "YPC_RS21300"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "YPC_RS21300"; pseudo "true"; samtools faidx Yersinia_pestis_Harbin_35.fna NC_017263.1:49729-51926 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 49729-51926.rev > temp_.fna Yersinia_pestis_Harbin_35 
ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCC
CATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_pestis_Harbin_35_bis.gtf NZ_CP009703.1 RefSeq gene 55189 57386 . + . gene_id "CH55_RS00985"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "CH55_RS00985"; old_locus_tag "CH55_4357"; pseudo "true"; samtools faidx Yersinia_pestis_Harbin_35_bis.fna NZ_CP009703.1:55189-57386 > temp.fna sed -i -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' temp.fna Yersinia_pestis_Harbin_35_bis ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCG
GGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_pestis_Java9.gtf NZ_CP009995.1 RefSeq gene 76131 77073 . - . gene_id "CH62_RS22640"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "CH62_RS22640"; part "2"; NZ_CP009995.1 RefSeq gene 1 1256 . - . 
gene_id "CH62_RS22640"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "protein_coding"; gene_synonym "ypkA"; locus_tag "CH62_RS22640"; part "1"; samtools faidx Yersinia_pestis_Java9.fna NZ_CP009995.1:76131-77073 > temp.fna samtools faidx Yersinia_pestis_Java9.fna NZ_CP009995.1:1-1256 >> temp.fna #delete the second ">****" revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 76131-77073.rev > temp_.fna Yersinia_pestis_Java9 ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCC
AGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCCCATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA #grep "yopO" selected_gtf_files/Yersinia_pestis_Nicholisk_41.gtf NZ_CP009990.1 RefSeq gene 47448 49645 . - . gene_id "CH63_RS00925"; transcript_id ""; gbkey "Gene"; gene "yopO"; gene_biotype "pseudogene"; gene_synonym "ypkA"; locus_tag "CH63_RS00925"; old_locus_tag "CH63_4306"; pseudo "true"; samtools faidx Yersinia_pestis_Nicholisk_41.fna NZ_CP009990.1:47448-49645 > temp.fna revseq sed -e ':a;N;$!ba;s/\n//g' -e 's/:/\t/g' 47448-49645.rev > temp_.fna Yersinia_pestis_Nicholisk_41 
ATGAAAAGCGTGAAAATCATGGGAACTATGCCACCGTCGATCTCCCTCGCCAAAGCTCATGAGCGCATCAGCCAACATTGGCAAAATCCTGTCGGTGAGCTCAATATCGGAGGAAAACGGTATAGAATTATCGATAATCAAGTGTTGCGCTTGAACCCCCACAGTGGTTTTTCTCTCTTTCGAGAAGGGGTTGGTAAGATCTTTTCGGGGAAGATGTTTAACTTTTCAATTGCTCGTAACCTTACTGACACACTCCATGCGGCCCAGAAAACGACTTCGCAGGAGCTAAGGTCTGATATCCCCAATGCTCTCAGTAATCTCTTTGGAGCCAAGCCACAGACCGAACTGCCGCTGGGTTGGAAAGGGGAGCCCTTGTCAGGAGCTCCGGATCTTGAAGGGATGCGAGTGGCTGAAACCGATAAGTTTGCCGAGGGCGAAAGCCATATTAGTATAATAGAAACTAAGGATAAGCAGCGGTTGGTAGCTAAGATTGAACGCTCCATTGCCGAGGGGCATTTGTTCGCAGAACTGGAGGCTTATAAACACATCTATAAAACCGCGGGCAAACATCCTAATCTTGCCAATGTTCATGGCATGGCTGTGGTGCCATACGGTAACCGTAAGGAGGAAGCATTGCTGATGGATGAGGTGGATGGTTGGCGTTGTTCTGACACACTAAGAACCCTCGCCGATAGCTGGAAGCAAGGAAAGATCAATAGTGAAGCCTACTGGGGAACGATCAAGTTTATTGCCCATCGGCTATTAGATGTAACCAATCACCTTGCCAAGGCAGGGGTAGTACATAACGATATCAAACCCGGTAATGTGGTATTTGACCGCGCTAGCGGAGAGCCCGTTGTTATTGATCTAGGATTACACTCTCGTTCAGGGGAACAACCTAAGGGGTTTACAGAATCCTTCAAAGCGCCGGAGCTTGGAGTAGGAAACCTAGGCGCATCAGAAAAGAGCGATGTTTTTCTCGTAGTGTCAACCCTTCTACATTGTATCGAAGGTTTTGAGAAAAATCCGGAGATAAAGCCTAATCAAGGACTGAGATTCATTACCTCAGAACCAGCGCACGTAATGGATGAGAATGGTTATCCAATCCATCGACCTGGTATAGCTGGAGTCGAGACAGCCTATACACGCTTCATCACAGACATCCTTGGCGTTTCCGCTGACTCAAGACCTGATTCCAACGAAGCCAGACTCCACGAGTTCTTGAGCGACGGAACTATCGACGAGGAGTCGGCCAAGCAGATCCTAAAAGATACCCTAACCGGAGAAATGAGCCCATTATCTACTGATGTAAGGCGGATAACACCCAAGAAGCTTCGGGAGCTATCTGATTTGCTTAGGACGCATTTGAGCAGTGCAGCAACTAAGCAATTGGATATGGGGGGGTTTTGTCGGATCTTGATACCATGTTGGTGGCACTCGACAAGGCCGAACGCGAGGGGGGAGTAGACAAGGATCAGTTGAAGAGTTTTAACAGTTTGATTCTGAAGACTTACAGAGTGATTGAAGACTATGTCAAAGGCAGAGAAGGGGATACCAAGAATTCCAGTACGGAAGTATCCCCCTATCATCGCAGTAACTTTATGCTATCGATCGTCGAACCTTCACTGCAGAGGATCCAGAAGCATCTGGACCAGACACACTCTTTTTCTGATATCGGTTCACTAGTGCGCGCACATAAGCACCTGGAAACGCTTTTAGAGGTCTTAGTCACCTTGTCACAGCAAGGGCAGCCCGTGTCCTCTGAAACCTACGGCTTCCTGAATCGATTAACTGAGGCTAAGATCACCTTGTCGCAGCAATTGAATACTCTCCAGCAGCAGCAGGAGAGTGCGAAAGCGCAATTATCTATTCTGATTAATCGTTCAGGTTCTTGGGCCGATGTTGCTCGTCAGTCCCTGCAGCGTTTTGACAGTACCCGGCCTGTAGTGAAATTCGGCACTGAGCAGTATACCGCAATTCACCGTCAGATGATGGCGGCC
CATGCAGCTATTACGCTACAGGAGGTATCGGAGTTTACTGATGATATGCGAAACTTTACAGTGGACTCTATTCCACTACTGATTCAACTTGGACGAAGCAGTTTAATGGATGAGCATTTGGTTGAACAGAGAGAAAAGTTGCGAGAGCTGACGACCATCGCCGAGCGACTGAACCGGTTGGAGCGGGAATGGATGTGA
-
Manually correct single-nucleotide errors in the sequences according to ${yop}_seq_additional.aln, then add the corrected sequences to ${yop}_seq.txt (time-consuming).
# For every Yop gene: pull the Yersinia_enterocolitica_WA line(s) out of
# ${yop}_seq.txt and align them with mafft (Clustal-formatted output).
yops=(yopJ yopB yopT yopE yopD yopM yopK yopO yopH)

for yop in "${yops[@]}"; do
  grep "Yersinia_enterocolitica_WA" "${yop}_seq.txt" > "${yop}_seq_additional.fasta"
  # --adjustdirection lets mafft reverse-complement sequences where needed.
  mafft --adjustdirection --clustalout "${yop}_seq_additional.fasta" > "${yop}_seq_additional.aln"
done
-
Convert ${yop}_seq.txt -> ${yop}_protein.fasta -> ${yop}_aligned_protein.fasta
# Turn each ${yop}_seq.txt into a protein FASTA and align it.
cd data/yop_files

yops=(yopJ yopB yopT yopE yopD yopM yopK yopO yopH)

for yop in "${yops[@]}"; do
  python3 txt_to_protein.py "${yop}_seq.txt" "${yop}_protein.fasta"
done

for yop in "${yops[@]}"; do
  #NOTE: sometimes the alignment does not work well because manually added sequences are missing bases!
  # The trailing "mafft" is the third argument handed to protein_alignment.py.
  python3 protein_alignment.py "${yop}_protein.fasta" "${yop}_aligned_protein.fasta" mafft
  #awk -F '_' '/^>/ { printf(">%s", $3); for (i = 4; i <= NF; ++i) printf("_%s", $i); printf("\n"); next } { print }' ${yop}_aligned_protein.fasta > ${yop}_aligned_protein_.fasta
done

conda install mamba -c conda-forge #-n base
mamba env create -f environment.yml

# Record counts per gene; the trailing numbers are the counts noted while the
# files were being completed (presumably before --> after adding sequences).
grep ">" yopB_seq.txt | wc -l   # 67 --> 73
grep ">" yopJ_seq.txt | wc -l   #* 67 --> 72
grep ">" yopT_seq.txt | wc -l   # 64 --> 73
grep ">" yopE_seq.txt | wc -l   # 70 --> 73
grep ">" yopD_seq.txt | wc -l   # 71 --> 73
grep ">" yopM_seq.txt | wc -l   # 70 --> 71 --> 73
grep ">" yopK_seq.txt | wc -l   # 73
grep ">" yopO_seq.txt | wc -l   #* 64 --> 72
grep ">" yopH_seq.txt | wc -l   # 73
-
Cluster all sequences in each ${yop}_aligned_protein.fasta (e.g. yopM_aligned_protein.fasta): all 100% identical sequences are grouped into one cluster. For each cluster, output one record as its representative, and produce a table listing all members of every cluster.
# Cluster identical protein sequences per Yop gene, tabulate the cluster
# members, then re-align and sort the cluster representatives.
yops=(yopJ yopB yopT yopE yopD yopM yopK yopO yopH)

# -id 1.0: only sequences that are 100% identical share a cluster.
for yop in "${yops[@]}"; do
  usearch -cluster_fast "${yop}_aligned_protein.fasta" -id 1.0 \
    -centroids "${yop}_clustered.fasta" -uc "${yop}_clusters.uc"
done

for yop in "${yops[@]}"; do
  # Parse the usearch output into a list of members for each cluster.
  python3 ~/Scripts/yop_analysis/parse_uc_file.py "${yop}_clusters.uc" > "${yop}_clusters.txt"
  # Strip the Python list decoration (Members: ['a', 'b']) down to a,b
  sed -i \
    -e "s/Members: \['//g" \
    -e "s/'\]//g" \
    -e "s/', '/, /g" \
    -e "s/, /,/g" \
    "${yop}_clusters.txt"
  # Drop the first comma-separated field and sort the remaining rows.
  cut -d',' -f2- "${yop}_clusters.txt" | sort > "${yop}_clusters_.txt"
done

# One workbook from all genes; the expansion appends _clusters_.txt to every
# element of the array, in the order listed above.
~/Tools/csv2xls-0.4/csv_to_xls.py "${yops[@]/%/_clusters_.txt}" -o yop_clusters.xls

for yop in "${yops[@]}"; do
  python3 protein_alignment.py "${yop}_clustered.fasta" "${yop}_clustered_aligned_protein.fasta" mafft
done

for yop in "${yops[@]}"; do
  python3 sort_fasta2.py "${yop}_clustered_aligned_protein.fasta" "${yop}_sorted_selected_aligned_protein.fasta"
done
-
draw alignments
library(ggmsa)
library(ggplot2)
library(ggtree)
#library(gggenes)
library(ape)
library(Biostrings)
library(ggnewscale)
library(dplyr)
library(ggtreeExtra)
library(phangorn)
library(RColorBrewer)
library(patchwork)
library(ggplotify)
library(aplot)
library(magick)
library(treeio)

#' Draw one protein alignment into alignment_<gene>.png
#'
#' Reads <gene>_sorted_selected_aligned_protein.fasta from the working
#' directory and writes the faceted ggmsa plot to alignment_<gene>.png.
#'
#' @param gene Gene name used in the input and output file names.
#' @param height Height of the PNG in pixels.
#' @param facet_width Number of alignment columns per facet row.
#' @param width Width of the PNG in pixels.
#' @return The ggplot object, invisibly.
plot_alignment <- function(gene, height, facet_width = 50, width = 1100) {
  fasta <- paste0(gene, "_sorted_selected_aligned_protein.fasta")
  tidymsa <- tidy_msa(fasta)

  png(paste0("alignment_", gene, ".png"), width = width, height = height)
  # Close the device even when plotting fails, so no device is left open.
  on.exit(dev.off(), add = TRUE)

  msa_plot <- ggplot() +
    geom_msa(
      data = tidymsa, char_width = 0.5, seq_name = TRUE,
      show.legend = TRUE
    ) +
    theme_msa() +
    facet_msa(facet_width)
  # Explicit print: a ggplot is only drawn automatically when it is evaluated
  # at top level of an interactive session, not inside a function or source().
  print(msa_plot)
  invisible(msa_plot)
}

# The trailing notes are the author's "N --> rows" figures for each gene
# (presumably alignment length --> number of facet rows).
plot_alignment("yopE", height = 800 * 1.2)  # 219 --> 5
plot_alignment("yopJ", height = 192 * 6)    # 288 --> 6
plot_alignment("yopD", height = 192 * 6)    # 306 --> 7
plot_alignment("yopM", height = 192 * 12)   # 529 --> 11
plot_alignment("yopK", height = 192 * 4)    # 182 --> 4
plot_alignment("yopO", height = 192 * 15)   # 732 --> 15

# -- RERUN due to the one-letter-in-last-line bug --
# 401 --> 9 --> 8; 51 columns per row instead of 50
plot_alignment("yopB", height = 192 * 8, facet_width = 51)

# -- RERUN due to "Error in tidy_msa(data) : Sequences must have unique names" --
# 322 --> 7 --> delete the repeated Yersinia_pestis_D182038
#           --> merge the two partial CDS into one
plot_alignment("yopT", height = 192 * 8)
# 468 --> 10 --> delete the repeated Yersinia_enterocolitica_YE6
plot_alignment("yopH", height = 192 * 10)
-
blast search and mauve analysis (mauve should be opened under bengal3_ac3)
# Build a nucleotide BLAST database from Yersinia_pestis_790.fna and search it
# with the yopJ and yopO *_WA.fasta queries.
db=Yersinia_pestis_790.fna
makeblastdb -in "${db}" -dbtype nucl

for yop in yopJ yopO; do
  blastn -query "${yop}_WA.fasta" -db "${db}" -out "${yop}_WA_on_790.txt"
done
prepare virus X14112 gtf for nextflow running
-
install conda environment rnaseq_2021
# Create the rnaseq_2021 environment and install the pinned tool versions.
conda create -n rnaseq_2021 python=3.6.7
conda activate rnaseq_2021

# Channel options shared by every install call below.
channels=(-c conda-forge -c bioconda -c defaults)

conda install "${channels[@]}" nextflow=21.04 fastqc=0.11.8 trim-galore=0.5.0 star=2.6.1d hisat2=2.1.0
conda install "${channels[@]}" picard=2.18.27 csvtk=0.17.0 preseq=2.0.3
conda install "${channels[@]}" samtools=1.9
conda install "${channels[@]}" gffread=0.9.12
conda install "${channels[@]}" subread=1.6.4
conda install "${channels[@]}" deeptools=3.2.0
conda install "${channels[@]}" multiqc=1.7 #*
conda install "${channels[@]}" \
  conda-forge::r-data.table=1.12.0 \
  conda-forge::r-gplots=3.0.1.1 \
  bioconductor-dupradar=1.12.1 \
  bioconductor-edger=3.24.1
conda install "${channels[@]}" stringtie=1.3.5
conda install "${channels[@]}" rseqc=3.0.0
-
prepare the virus gtf-file
# Build the gene-level GTF and BED12 annotation of X14112.1 and run the
# pipeline on it.

# -- processing for virus gtf --
# #gffread X14112.1_gene.gff -T -o X14112.1_gene.gtf
# cp X14112.1.gff X14112.1.gff3
# #gffread -E -F --bed X14112.1.gff3 -o X14112.1.bed (change the name errors in 1 intron and 2 genes)
# grep "^##" X14112.1.gff3 > X14112.1_gene.gff3
#
# # -- try to filter the file with genes --> failed --
# grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff
# cp X14112.1.gff3 X14112.1.gff

# -- generating *_gene.gtf file containing only gene records --
python3 gff2gtf.py

# -- check if gene_id is unique --
# Column 9 holds the attributes; its first ';'-separated field is the id.
cut -f9- -d$'\t' X14112.1_gene.gtf > temp
cut -f1 -d';' temp > temp_   #111
sort temp_ > temp_1
sort temp_ -u > temp_2
# Any line reported here is an id that occurs more than once.
diff temp_1 temp_2
#39d38
#< ID=gene-UL29
#59d57
#< ID=gene-UL43
#--> delete the short ones of the repeated records --> 109 records

python3 extends.py   #generating the file X14112.1_gene_extended.gtf
#Then replace 'transcript_id "exon' --> 'transcript_id "rna' in X14112.1_gene_extended.gtf

gffread -E -F --bed X14112.1_gene_extended.gtf -o X14112.1_gene_extended.bed
#--> bed contains 109 transcript names, for example "rna-gene-RS1-2"
##!!!!OPTIONAL!!!!: no need to change type '\tgene\t' to '\texon\t', since X14112.1_gene_extended.gtf contains exon records.

# --fasta/--gtf/--bed12 name the X14112.1 files prepared above.
nextflow run rnaseq_old/main.nf \
  --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" \
  --fasta "X14112.1.fasta" \
  --gtf "X14112.1_gene_extended.gtf" \
  --bed12 "X14112.1_gene_extended.bed" \
  --singleEnd -profile standard --aligner star \
  --fcGroupFeaturesType gene_biotype --skip_genebody_coverage \
  -resume --saveReference

# -- correct some special records (optional, as the processing above did not generate the records) --
# delete the lines starting with "#"
# replace "X14112.1:146805..151063" to IE175
# replace "X14112.1:133941..146107" to IE68
# add ;gene=IE68 ;gene=IE175 to the corresponding lines

# -- python code to convert gff to gtf --
# open the input file for reading and the output file for writing

# -- scripts choose gene or exon or mRNA --
# python3 gff2gtf.py #X14112.1.gff-->X14112.1.gtf
# replace '; transcript_id "gene' to '; transcript_id "tx'
# !!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'!
# sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene.gff
# since default is --featurecounts_feature_type 'exon'.
-
nextflow command: the input should be *.umi_extract.fastq.gz.
# Two alternative invocations of the pipeline on the X14112.1 reference, noted
# for different hosts / conda environments.
data_dir=/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus

#SUCCESSFUL
# (rnaseq) jhuang@hamburg:~/DATA/Data_Manja_RNAseq_Organoids_Virus$
/home/jhuang/anaconda3/bin/nextflow run rnaseq_old/main.nf \
  --reads "${data_dir}/Raw_Data/*.fastq.gz" \
  --fasta "${data_dir}/X14112.1.fasta" \
  --gtf "${data_dir}/X14112.1_gene_extended.gtf" \
  --bed12 "${data_dir}/X14112.1_gene_extended.bed" \
  --singleEnd -profile standard --aligner hisat2 \
  --fcGroupFeaturesType gene_biotype --skip_genebody_coverage --skip_preseq \
  -resume --saveReference

#NOT_TESTED
# (rnaseq_2021) jhuang@hamm:~/DATA/Data_Manja_RNAseq_Organoids_Virus$
nextflow run rnaseq_old/main.nf \
  --reads "${data_dir}/Raw_Data/*.fastq.gz" \
  --fasta "${data_dir}/X14112.1.fasta" \
  --gtf "${data_dir}/X14112.1_gene_extended.gtf" \
  --bed12 "${data_dir}/X14112.1_gene_extended.bed" \
  --singleEnd -profile standard --aligner star \
  -resume --saveReference
-
snippet of the human hg38 gtf served as a pattern
1 ensembl_havana gene 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 1 ensembl_havana transcript 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; 1 ensembl_havana exon 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; exon_id "ENSE00002324228"; exon_version "3"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; 1 ensembl_havana CDS 685719 686654 . - 0 gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; protein_id "ENSP00000329982"; protein_version "2"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; 1 ensembl_havana gene 1211340 1214153 . - . gene_id "ENSG00000186827"; gene_version "11"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 1 havana transcript 1211340 1214138 . - . 
gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; 1 havana exon 1213983 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2"; 1 havana exon 1212992 1213785 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1211942 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2"; 1 ensembl_havana exon 1211704 1211832 . - . 
gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001333051"; exon_v ersion "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana CDS 1211704 1211832 . - 2 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana exon 1211340 1211625 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001915458"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana CDS 1211558 1211625 . 
- 2 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana stop_codon 1211555 1211557 . - 0 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana five_prime_utr 1214128 1214153 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana three_prime_utr 1211340 1211554 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 havana transcript 1211340 1214138 . - . 
gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; 1 havana exon 1213983 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2"; 1 havana exon 1212992 1213785 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203 "; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1211942 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1211340 1211832 . - . 
gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001923078"; exon_version "1"; transcript_support_level "2"; 1 havana transcript 1212019 1213498 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "3"; 1 havana exon 1213395 1213498 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001680308"; exon_version "1"; transcript_support_level "3"; 1 havana exon 1212992 1213093 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003495433"; exon_version "1"; transcript_support_level "3"; 1 havana exon 1212638 1212704 . - . 
gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "3"; 1 havana exon 1212019 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001723250"; exon_version "1"; transcript_support_level "3";
-
generate wig files
# Export a bigWig and a wig coverage track for every sample.
samtools faidx X14112.1.fasta
# chrom.sizes = the first two columns of the FASTA index.
cut -f1,2 X14112.1.fasta.fai > results/markDuplicates/chrom.sizes
cd results/markDuplicates/

for sample in control_r1 control_r2 HSV.d2_r1 HSV.d2_r2 HSV.d4_r1 HSV.d4_r2 HSV.d6_r1 HSV.d6_r2 HSV.d8_r1 HSV.d8_r2; do
  #bamCoverage -b ${sample}.umi_extract.sorted.markDups.bam -o ${sample}.bw

  # bedGraphToBigWig reads ${sample}.bedgraph, so create it when it is not
  # there yet; an existing non-empty file is reused untouched.
  if [[ ! -s "${sample}.bedgraph" ]]; then
    # Sort by sequence name, then start, in the C locale before conversion.
    bedtools genomecov -ibam "${sample}.umi_extract.sorted.markDups.bam" -bg \
      | LC_ALL=C sort -k1,1 -k2,2n > "${sample}.bedgraph"
  fi

  bedGraphToBigWig "${sample}.bedgraph" chrom.sizes "${sample}.bw"
  bigWigToWig "${sample}.bw" "${sample}.wig"
done
-
input and clean data using R
#BiocManager::install(c("DESeq2"))
requiredPackages1 <- c(
  "AnnotationDbi", "clusterProfiler", "ReactomePA", "org.Hs.eg.db",
  "DESeq2", "gplots", "RColorBrewer"
)

#' Install the packages that are missing, then attach all of them.
#'
#' @param pkg Character vector of package names.
#' @return Named logical vector with one TRUE per attached package; an error
#'   is raised when any package cannot be attached.
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    # Bioconductor packages such as DESeq2 are installed through BiocManager;
    # fall back to install.packages() when BiocManager is not available.
    if (requireNamespace("BiocManager", quietly = TRUE)) {
      BiocManager::install(new.pkg)
    } else {
      install.packages(new.pkg, dependencies = TRUE)
    }
  }
  loaded <- vapply(pkg, require, logical(1), character.only = TRUE)
  if (!all(loaded)) {
    stop("could not load: ", paste(pkg[!loaded], collapse = ", "),
         call. = FALSE)
  }
  loaded
}
ipak(requiredPackages1)
#requiredPackages2 <- c("tidyverse")
#ipak(requiredPackages2)

# Shell preparation of the count table (run beforehand):
#cut -f1-1 merged_gene_counts.txt > col1.txt
#paste -d$'\t' col1.txt merged_gene_counts2.txt > merged_gene_counts3.txt
#sed -i 's/gene-//g' merged_gene_counts3.txt
#replace "X14112.1" to "X14112"; delete "rna-gene-"; get X14112.1_gene_extended2.gtf; using it in IGV

library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)

setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts")

#---- dataset (27) samples (firstly import all samples, then spring to 27-3-1-2) ----
d.raw <- read.delim2("merged_gene_counts3.txt", sep = "\t", header = TRUE,
                     row.names = 1)
#> head(d.raw,0)
# [1] HSV.d4_r2.umi_extract.sorted.bam  HSV.d6_r1.umi_extract.sorted.bam
# [3] HSV.d4_r1.umi_extract.sorted.bam  control_r1.umi_extract.sorted.bam
# [5] HSV.d2_r2.umi_extract.sorted.bam  control_r2.umi_extract.sorted.bam
# [7] HSV.d2_r1.umi_extract.sorted.bam  HSV.d8_r2.umi_extract.sorted.bam
# [9] HSV.d8_r1.umi_extract.sorted.bam  HSV.d6_r2.umi_extract.sorted.bam

# Sample names in the desired column order; the column names of the count
# table, the replicate and the condition are all derived from this vector.
sample_names <- c("control_r1", "control_r2",
                  "HSV.d2_r1", "HSV.d2_r2",
                  "HSV.d4_r1", "HSV.d4_r2",
                  "HSV.d6_r1", "HSV.d6_r2",
                  "HSV.d8_r1", "HSV.d8_r2")
col.order <- paste0(sample_names, ".umi_extract.sorted.bam")

missing_cols <- setdiff(col.order, colnames(d.raw))
if (length(missing_cols) > 0) {
  stop("columns missing from merged_gene_counts3.txt: ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}

d <- d.raw[, col.order]
#reordered.raw
#d <- reordered.raw[rowSums(reordered.raw>3)>2,]
colnames(d) <- sample_names

# Define the replicates and condition of the samples
ids <- factor(sample_names)
replicate <- factor(sub("^.*_", "", sample_names))        # r1 / r2
condition <- factor(sub("_r[0-9]+$", "", sample_names))   # control, HSV.d2, ...

# Construct the DESeqDataSet
cData <- data.frame(row.names = colnames(d), replicate = replicate,
                    condition = condition, ids = ids)
dds <- DESeqDataSetFromMatrix(countData = d, colData = cData,
                              design = ~condition)

# Run DESeq (earlier runs went without this step -- WRONG?)
dds <- DESeq(dds)

# Apply the rlog transformation
rld <- rlogTransformation(dds)
#rld <- vst(dds) #vsd

#-- save raw_data as xls --
write.csv(d, file = "d.csv")
#~/Tools/csv2xls-0.4/csv_to_xls.py d.csv -d$',' -o d.xls
-
plotting pca and heatmap and remove batchEffect
# -- pca --
# PCA of the rlog-transformed counts, coloured by condition.
png("pca.png", 1200, 800)
# Explicit print: the plot object is not drawn automatically when this file
# is run through source().
print(plotPCA(rld, intgroup = c("condition")))
#plotPCA(rld, intgroup = c("condition", "batch"))
#plotPCA(rld, intgroup = c("condition", "ids"))
#plotPCA(rld, "batch")
dev.off()

# -- heatmap --
## generate the pairwise comparison between samples
library(gplots)
library("RColorBrewer")
png("heatmap.png", 1200, 800)
# Euclidean distances between samples (columns of the rlog matrix).
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
#paste( rld$dex, rld$cell, sep="-" )
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,batch, sep=":"))
rownames(mat) <- colnames(mat) <- with(colData(dds),
                                       paste(condition, ids, sep = ":"))
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
# Reversed palette; "margins" is spelled out in full rather than relying on
# partial argument matching.
heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none",
          col = rev(hmcol), margins = c(13, 13))
dev.off()
-
select the differentially expressed genes
#https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
#https://www.biostars.org/p/282295/
#https://www.biostars.org/p/335751/
#> condition
# [1] control control HSV.d2 HSV.d2 HSV.d4 HSV.d4 HSV.d6 HSV.d6 HSV.d8 HSV.d8
#Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
#CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes
setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes")

# Write the DE tables for every contrast in `clist` from the fitted `dds`.
#   dds         fitted DESeqDataSet whose reference level matches the contrasts
#   clist       contrast names without the "condition_" prefix
#   padj_cutoff adjusted p-value threshold for the up/down tables
#   lfc_cutoff  absolute log2 fold-change threshold for the up/down tables
# Writes <contrast>-all.txt, <contrast>-up.txt and <contrast>-down.txt into the
# working directory and returns `clist` invisibly.
export_contrasts <- function(dds, clist, padj_cutoff = 0.05, lfc_cutoff = 1.2) {
  for (i in clist) {
    contrast <- paste("condition", i, sep = "_")
    res <- results(dds, name = contrast)
    res <- res[!is.na(res$log2FoldChange), ]
    res_df <- as.data.frame(res)
    write.csv(res_df[order(res_df$pvalue), ],
              file = paste(i, "all.txt", sep = "-"))
    up <- subset(res_df, padj <= padj_cutoff & log2FoldChange >= lfc_cutoff)
    down <- subset(res_df, padj <= padj_cutoff & log2FoldChange <= -lfc_cutoff)
    write.csv(up[order(up$log2FoldChange, decreasing = TRUE), ],
              file = paste(i, "up.txt", sep = "-"))
    write.csv(down[order(abs(down$log2FoldChange), decreasing = TRUE), ],
              file = paste(i, "down.txt", sep = "-"))
  }
  invisible(clist)
}

# Reference level -> contrasts that become available after refitting with it.
# The tables must be exported right after each refit: the coefficients of the
# previous reference level are gone once dds is refitted, and previously
# `clist` was overwritten four times before a single export loop ran, so only
# HSV.d8_vs_HSV.d6 was ever written.
contrasts_by_reference <- list(
  #---- relevel to control ----
  "control" = c("HSV.d2_vs_control", "HSV.d4_vs_control",
                "HSV.d6_vs_control", "HSV.d8_vs_control"),
  "HSV.d2" = c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2"),
  "HSV.d4" = c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4"),
  "HSV.d6" = c("HSV.d8_vs_HSV.d6")
)
for (reference in names(contrasts_by_reference)) {
  dds$condition <- relevel(dds$condition, reference)
  dds <- DESeq(dds, betaPrior = FALSE)
  print(resultsNames(dds))
  clist <- contrasts_by_reference[[reference]]
  export_contrasts(dds, clist)
}
# After the loop dds is fitted against HSV.d6 and clist is "HSV.d8_vs_HSV.d6",
# the same final state the original statement sequence left behind.

##https://github.com/kevinblighe/EnhancedVolcano
#BiocManager::install("EnhancedVolcano")
library("EnhancedVolcano")

# ---- shell (KONSOLE), not R: prints the R code for the volcano plots ----
# Paste the printed R code into the R session while dds is fitted against the
# matching reference level.
#for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control; do
#for i in HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2; do
#for i in HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4; do
for i in HSV.d8_vs_HSV.d6; do
  echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
  echo "res = results(dds, name=contrast)"
  #echo "res <- res[!is.na(res$log2FoldChange),]"
  echo "res <- na.omit(res)"
  echo "res_df <- as.data.frame(res)"
  #selectLab = selectLab_italics,
  echo "png(\"${i}.png\",width=1200, height=1000)"
  #legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE,
  echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
  echo "dev.off()"
done

#under DIR degenes under KONSOLE
# Prints one csv_to_xls command per contrast (all/up/down tables -> one .xls)
for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
  echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done
9 (optional). clustering the genes and draw heatmap
# Install gplots only when it is missing (an unconditional install.packages()
# re-downloads the package on every run), then attach it.
if (!requireNamespace("gplots", quietly = TRUE)) {
  install.packages("gplots")
}
library("gplots")
# Print (not run) the cut commands that copy the first CSV column of every
# up/down table into a matching .id file.
for contrast in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
  for direction in up down; do
    echo "cut -d',' -f1-1 ${contrast}-${direction}.txt > ${contrast}-${direction}.id"
  done
done
# Union of all .id files, sorted and de-duplicated
sort -u *.id > ids
#add Gene_Id in the first line, delete the ""
# Genes of interest: the Gene_Id column of the merged DEG id list
GOI <- read.csv("ids")$Gene_Id
RNASeq.NoCellLine <- assay(rld)
# Drop ids that are not rows of the expression matrix; indexing with an
# unknown id would otherwise abort with "subscript out of bounds".
unknown.ids <- setdiff(GOI, rownames(RNASeq.NoCellLine))
if (length(unknown.ids) > 0) {
  warning(length(unknown.ids), " gene id(s) from 'ids' are not in the rlog matrix and were dropped: ",
          paste(head(unknown.ids), collapse = ", "), call. = FALSE)
  GOI <- intersect(GOI, rownames(RNASeq.NoCellLine))
}
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat <- RNASeq.NoCellLine[GOI, ]
write.csv(as.data.frame(datamat), file = "significant_gene_expressions.txt")
# Genes (rows) are clustered on 1 - Pearson correlation, samples (columns) on
# 1 - Spearman correlation; only the row dendrogram is drawn below.
hr <- hclust(as.dist(1 - cor(t(datamat), method = "pearson")), method = "complete")
hc <- hclust(as.dist(1 - cor(datamat, method = "spearman")), method = "complete")
# Cut the gene tree at 2/3 of its maximum height to obtain the clusters
mycl <- cutree(hr, h = max(hr$height) / 1.5)
# One colour per cluster. "LIGHTRED" is not a colour name known to R and
# "MAGENTA" was listed twice, so those two entries (positions 11 and 13) were
# replaced by valid, distinct colours; positions 1-10 are unchanged.
mycol <- c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON",
           "LIGHTBLUE", "PINK", "PURPLE", "LIGHTCYAN", "SALMON", "LIGHTGREEN")
# Recycle the palette instead of producing NA colours when there are more
# clusters than colours
mycol <- mycol[(as.vector(mycl) - 1) %% length(mycol) + 1]
#png("DEGs_heatmap.png", width=900, height=800)
#cex.lab=10, labRow="",
png("DEGs_heatmap.png", width = 900, height = 1000)
heatmap.2(as.matrix(datamat), Rowv = as.dendrogram(hr), Colv = NA, dendrogram = 'row',
          scale = 'row', trace = 'none', col = bluered(75),
          RowSideColors = mycol, margins = c(10, 20), cexRow = 1.5, srtCol = 45, lhei = c(2, 8)) #rownames(datamat)
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
dev.off()
#### cluster members #####
# Export the member gene ids of clusters 1-3, one CSV file per cluster.
# Keys are cluster numbers from cutree(); values are the output files.
cluster_files <- c(
  "1" = "cluster1_YELLOW.txt",
  "2" = "cluster2_DARKBLUE.txt",
  "3" = "cluster3_DARKORANGE.txt"
)
for (cluster_id in names(cluster_files)) {
  members <- names(mycl)[which(mycl == cluster_id)]
  write.csv(members, file = cluster_files[[cluster_id]])
}
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
# Convert the exported expression table of the significant genes to .xls
csv_to_xls=~/Tools/csv2xls-0.4/csv_to_xls.py
"${csv_to_xls}" significant_gene_expressions.txt -d',' -o DEGs_heatmap_expression_data.xls
Peak calling using homer combining sicer and macs2
-
nextflow processing data
V_8_1_6_p601_d8_D1_H3K4me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K4me3_D1 V_8_1_5_p601_d8_D2_H3K4me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K4me3_D2 V_8_1_6_p604_d8_D1_H3K4me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K4me3_D1 V_8_1_5_p604_d8_D2_H3K4me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K4me3_D2 V_8_1_6_p601_d8_D1_H3K27me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K27me3_D1 V_8_1_5_p601_d8_D2_H3K27me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K27me3_D2 V_8_1_6_p604_d8_D1_H3K27me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K27me3_D1 V_8_1_5_p604_d8_D2_H3K27me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K27me3_D2 V_8_1_7_p601_d8_D1_H3K9me3.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K9me3_D1 V_8_1_7_p601_d8_D2_H3K9me3.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K9me3_D2 V_8_1_7_p604_d8_D1_H3K9me3.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K9me3_D1 V_8_1_7_p604_d8_D2_H3K9me3.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K9me3_D2 V_8_1_8_p601_d8_D1_H3K27ac.fastq.gz,V_8_1_6_p601_d8_D1_input.fastq.gz,p601_H3K27ac_D1 V_8_1_8_p601_d8_D2_H3K27ac.fastq.gz,V_8_1_5_p601_d8_D2_input.fastq.gz,p601_H3K27ac_D2 V_8_1_8_p604_d8_D1_H3K27ac.fastq.gz,V_8_1_6_p604_d8_D1_input.fastq.gz,p604_H3K27ac_D1 V_8_1_8_p604_d8_D2_H3K27ac.fastq.gz,V_8_1_5_p604_d8_D2_input.fastq.gz,p604_H3K27ac_D2 nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/ChIPseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/Raw_Data_GEO_uploaded/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume nextflow run NGI-ChIPseq/main.nf --reads '/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd 
--blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume (DEBUG: Control doesn't work well!) lrwxrwxrwx 1 jhuang jhuang 37 Mai 19 11:19 NHDF-Ad_H3K4me1_r1.fastq.gz -> ../Raw_Data_orig/SRR568344_1.fastq.gz lrwxrwxrwx 1 jhuang jhuang 37 Mai 19 11:19 NHDF-Ad_H3K4me1_r2.fastq.gz -> ../Raw_Data_orig/SRR568345_1.fastq.gz lrwxrwxrwx 1 jhuang jhuang 37 Mai 19 11:19 NHDF-Ad_H3K27ac_r1.fastq.gz -> ../Raw_Data_orig/SRR227397_1.fastq.gz lrwxrwxrwx 1 jhuang jhuang 37 Mai 19 11:19 NHDF-Ad_H3K27ac_r2.fastq.gz -> ../Raw_Data_orig/SRR227398_1.fastq.gz lrwxrwxrwx 1 jhuang jhuang 37 Mai 19 11:19 NHDF-Ad_Control_r1.fastq.gz -> ../Raw_Data_orig/SRR227590_1.fastq.gz lrwxrwxrwx 1 jhuang jhuang 37 Mai 19 11:19 NHDF-Ad_Control_r2.fastq.gz -> ../Raw_Data_orig/SRR227591_1.fastq.gz
-
make homer directories and findPeaks with HOMER under (myperl)
conda activate myperl
#Why do I need give "-genome hg38" in makeTagDirectory?
#If you don't provide a genome with the -genome option, HOMER will only count the number of tags in each region without any genomic context or sequence information.
#So, it is essential to include this information when creating a tag directory if you plan to perform any genome-based analysis.

# One tag directory per sample and mark. The BAM file prefix depends on the
# sample: H3K27ac files start with V_8_1_8; for input, H3K4me3 and H3K27me3
# the D1 files start with V_8_1_6 and the D2 files with V_8_1_5.
for mark in input H3K4me3 H3K27me3 H3K27ac; do
  for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    case "${mark}:${sample}" in
      H3K27ac:*) run=V_8_1_8 ;;
      *_D1)      run=V_8_1_6 ;;
      *)         run=V_8_1_5 ;;
    esac
    makeTagDirectory ${sample}_${mark} ../results/picard/${run}_${sample}_${mark}.dedup.sorted.bam -genome hg38
  done
done

# bigWig track for every tag directory
for sample in p601_d8_D1_input p601_d8_D2_input p604_d8_D1_input p604_d8_D2_input p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 p601_d8_D1_H3K27ac p601_d8_D2_H3K27ac p604_d8_D1_H3K27ac p604_d8_D2_H3K27ac; do
  makeUCSCfile ${sample} -pseudo 1 -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq -norm 1e7 -normLength 100 -fsize 1
done

# -- not necessary any more: using MACS2 and SICER instead of using findPeaks --
# #factor (transcription factor ChIP-Seq, uses -center, output: peaks.txt, default)
# #histone (histone modification ChIP-Seq, region based, uses -region -size 500 -L 0, regions.txt)
# for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
#   #Finding peaks of size 1000, no closer than 2000
#   findPeaks ${sample}_H3K4me3 -style factor -size 1000 -o auto -i ${sample}_input
#   #-minDist <#> (minimum distance between peaks, default: peak size x2)
#   #findPeaks ${sample}_H3K27me3 -style histone -region -size 3000 -minDist 5000 -o auto -i ${sample}_input
#   #findPeaks ${sample}_H3K27ac -style factor -size 200 -minDist 200 -o auto -i ${sample}_input
#   #findPeaks ${sample}_H3K4me1 -style histone -region -size 1000 -minDist 2500 -o auto -i ${sample}_input
# done

# Input files of the two conversion loops below (a listing, not commands):
#   ./p601_d8_D1_H3K4me3/peaks.txt
#   ./p601_d8_D2_H3K4me3/peaks.txt
#   ./p604_d8_D1_H3K4me3/peaks.txt
#   ./p604_d8_D2_H3K4me3/peaks.txt
#   ./p601_d8_D1_H3K27me3/regions.txt
#   ./p601_d8_D2_H3K27me3/regions.txt
#   ./p604_d8_D1_H3K27me3/regions.txt
#   ./p604_d8_D2_H3K27me3/regions.txt

# Reorder columns 2,3,4,1,6 of the peak file into a BED-like table, then drop
# the lines containing '#' and sort by chromosome and start.
for dir in p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3; do
  awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/peaks.txt > ${dir}_peaks.bed
  grep -v "#" ${dir}_peaks.bed | sort -k1,1 -k2,2n > ${dir}_sorted_peaks.bed
done
for dir in p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3; do
  awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/regions.txt > ${dir}_regions.bed
  grep -v "#" ${dir}_regions.bed | sort -k1,1 -k2,2n > ${dir}_sorted_regions.bed
done

#DEBUG: why the bam files so small?
# Tag directories for the public NHDF data set
for sample in Control_r1 Control_r2 H3K27ac_r1 H3K27ac_r2 H3K4me1_r1 H3K4me1_r2; do
  makeTagDirectory NHDF-Ad_${sample} /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_${sample}.dedup.sorted.bam -genome hg38
done
# Resulting tag directories:
#   NHDF-Ad_Control_r1 NHDF-Ad_Control_r2 NHDF-Ad_H3K27ac_r1 NHDF-Ad_H3K27ac_r2 NHDF-Ad_H3K4me1_r1 NHDF-Ad_H3K4me1_r2
# NOTE: (myperl) environments for HOMER, ~/Tools/diffreps/bin/diffReps.pl, MACS2, ~/Tools/SICER1.1/SICER/SICER.sh
-
combine the diffReps.pl and HOMER annotatePeaks.pl
#Dynamic regions were defined as MACS (H3K4me3, H3K27ac) or SICER (H3K4me1, H3K27me3) peaks overlapping significantly (≥ 2-fold change, adjusted P-value ≤ 0.05) up- or down-regulated differentially enriched regions from diffReps in the three pairwise comparisons WAC vs mock, WA314 vs mock and WAC vs WA314.

#STEP1
#--> not given "--gname hg38"
# Excerpt of the Perl code in diffReps.pl that is skipped when no genome name is given:
## Step4: Annotate differential sites.
#unless($noanno or $gname eq ''){
#  `region_analysis.pl -i $report -r -d refseq -g $gname`;
#}
## Step5: Look for hotspots.
#unless($nohs){
#  my $hotspot = $report . '.hotspot';
#  `findHotspots.pl -d $report -o $hotspot`;
#}
picard=../results/picard
~/Tools/diffreps/bin/diffReps.pl \
  -tr ${picard}/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ${picard}/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed \
  -co ${picard}/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ${picard}/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed \
  --report output_results --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp

#STEP2
#replace Chr to '#Chr'
# Drop the lines containing '#', sort by chromosome and start, then keep
# columns 1-3 and 12 together with a running peak name.
grep -v "#" output_results | sort -k1,1 -k2,2n > output_results_
awk -v OFS='\t' '{print $1, $2, $3, "diffreps_peak_"NR, $12}' output_results_ > H3K4me3.bed
#grep -v "#" H3K4me3.bed | sort -k1,1 -k2,2n > H3K4me3_sorted_peaks.bed

#STEP3 (under myperl) peak calling macs2 for narrow peaks, SICER for broad peaks!
#process the output of diffReps.pl to BED file.
annotatePeaks.pl H3K4me3.bed hg38 > H3K4me3_annotated_peaks.txt
-
combine macs2 to getDifferentialPeaksReplicates.pl
replace the initial peak identification by using your MACS2 output. #http://homer.ucsd.edu/homer/ngs/diffExpression.html #getDifferentialPeaksReplicates.pl = findPeaks + annotatePeaks.pl + getDiffExpression.pl #annotatePeaks.pl tss hg38 -raw -d H3K4me3-Mock-rep1/ H3K4me3-Mock-rep2/ H3K4me3-WNT-rep1/ H3K4me3-WNT-rep3/ > countTable.peaks.txt Here's an outline of how we might be able to replace the initial peak identification by using your MACS2 output. #TODO: using MACS call peaks of the data H3K27ac.
4.1. MACS2 peak calling
#macs2 --> bed --> annotatePeaks.pl
conda activate ngi_chipseq_ac2
# Call H3K4me3 peaks against the matching input for each sample.
# The BAM files of D1 samples start with V_8_1_6, those of D2 with V_8_1_5.
for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
  case ${sample} in
    *_D1) run=V_8_1_6 ;;
    *)    run=V_8_1_5 ;;
  esac
  macs2 callpeak -t ../results/picard/${run}_${sample}_H3K4me3.dedup.sorted.bam -c ../results/picard/${run}_${sample}_input.dedup.sorted.bam -f BAM -g hs -n ${sample} -q 0.05
done
# Keep the first five narrowPeak columns as a tab-separated BED file
for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
  awk -v OFS='\t' '{print $1, $2, $3, $4, $5}' ${sample}_peaks.narrowPeak > ${sample}_peaks.bed
done
#annotatePeaks.pl p601_d8_D1_peaks.bed hg38 > p601_d8_D1_annotated_peaks.txt
#annotatePeaks.pl p601_d8_D2_peaks.bed hg38 > p601_d8_D2_annotated_peaks.txt
#annotatePeaks.pl p604_d8_D1_peaks.bed hg38 > p604_d8_D1_annotated_peaks.txt
#annotatePeaks.pl p604_d8_D2_peaks.bed hg38 > p604_d8_D2_annotated_peaks.txt
4.2. Convert your MACS2 peaks to HOMER-compatible format. You can do this manually or with a script. For example:
It’s possible to use more information from the MACS2 output file to create a more informative peaks.txt file for HOMER. However, it’s important to note that some information that HOMER needs for its differential peak analysis is not available in the MACS2 output (such as Normalized Tag Count, Control Tags, and others). But we can certainly map more of the available MACS2 columns to the corresponding HOMER columns.
#The following awk command can be used to convert more MACS2 information into the HOMER format:
cd macs2
awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"Peak_"NR,$5,$6,$7,$8,$9,$10}' macs2_peaks.bed > macs2_peaks.txt
# Convert each MACS2 <sample>_peaks.xls into a 15-column HOMER-style peak file.
# A MACS2 .xls file starts with a block of '#' comment lines, a blank line and
# a column-header line ("chr start end ..."); `NR > 1` removed only the first
# of these, so all of them are skipped explicitly here.
# NOTE(review): output column 12 still receives abs_summit ($5), a genomic
# position rather than a p-value -- confirm the intended mapping.
for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
  awk 'BEGIN{OFS="\t"} /^#/ || NF == 0 || $1 == "chr" {next} {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' ${sample}_peaks.xls > ${sample}_macs2_peaks.txt
done
This command will:
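* Skip the header line (NR > 1). Note that a MACS2 `_peaks.xls` file also begins with a block of `#` comment lines, a blank line and a column-header line (`chr start end ...`); all of these must be skipped, otherwise they end up as bogus rows in the HOMER file.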
* Map the MACS2 peak name ($10) to the HOMER PeakID
* Map the MACS2 chromosome, start, and end ($1, $2, $3) to the HOMER chr, start, end
* Use a placeholder "+" for the HOMER strand
* Use a placeholder "0" for the HOMER Normalized Tag Count and Focus Ratio
* Map the MACS2 pileup ($6) to the HOMER findPeaks Score and Total Tags
* Use a placeholder "0" for the HOMER Control Tags
* Map the MACS2 fold_enrichment ($8) to the HOMER Fold Change vs Control
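* Map the MACS2 abs_summit ($5) to the HOMER p-value vs Control. Caution: abs_summit is the genomic position of the peak summit, not a p-value, so this column is only a placeholder; the MACS2 p-value is stored as -log10(pvalue) in column $7.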
* Map the MACS2 -log10(qvalue) ($9) to the HOMER Fold Change vs Local
* Use a placeholder "0" for the HOMER p-value vs Local and Clonal Fold Change
This script is limited by the differences in the information provided by MACS2 and HOMER. While it makes use of as much information as possible from the MACS2 output, some columns in the HOMER format still have to be filled with placeholder values.
- A similar awk command can be used to convert more SICER information into the HOMER format (TODO); alternatively, call the peaks directly with HOMER's findPeaks.
- The following awk command can be used to convert more diffReps.pl information into the HOMER format (TODO).
4.3. Associate the converted peak files with their respective tag directories. In HOMER, peak files can be associated with a tag directory by placing them in the tag directory with the filename “peaks.txt”.
# For every H3K4me3 tag directory: keep the existing peaks.txt as
# peaks_raw.txt and put the converted MACS2 peaks in its place.
for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
  tagdir=homer/${sample}_H3K4me3
  mv ${tagdir}/peaks.txt ${tagdir}/peaks_raw.txt
  cp macs2/${sample}_macs2_peaks.txt ${tagdir}/peaks.txt
done
#Repeat this for each of your tag directories.
4.4. The program getDifferentialPeaksReplicates.pl essentially performs three steps; here, step 2 was modified.
First, it will pool the target tag directories and input directories separately into pooled experiments and perform an initial peak identification (using findPeaks). Pooling the experiments is generally more sensitive than trying to merge the individual peak files coming from each experiment (although this can be done using the “-use
#-- Successful modification of the script getDifferentialPeaksReplicates.pl --
#The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
#change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
#mergePeaks -d 30000 temp_sorted | sort
#conda list homer #4.11
# Merge the two replicate peak files with HOMER ...
mergePeaks p601_d8_D1_H3K27me3/peaks.txt p601_d8_D2_H3K27me3/peaks.txt > mergePeaks_res.txt
python3 update_header.py
# ... and independently with bedtools as a cross-check.
cat p601_d8_D1_H3K27me3/peaks.txt p601_d8_D2_H3K27me3/peaks.txt > temp
# Skip '#' header lines so that only real peak rows (chr, start, end, id)
# reach bedtools merge.
awk -v OFS='\t' '!/^#/ {print $2, $3, $4, $1}' temp | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt
python3 adjust_mergePeaks_res.py
#check if the results are correct
cut -d$'\t' -f2-4 filtered_mergePeaks_res.txt > control1
diff control1 bedtools_res.txt
#(myperl) jhuang@hamburg:~/DATA/Data_Denise_LT_DNA_Bindung/results_chipseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/homer$
#getDifferentialPeaksReplicates.pl -use
T细胞与B细胞在免疫反应中的相互作用
T cells and B cells interact in several ways as part of the immune response. Here’s a general overview of their interaction:
Antigen presentation: When a pathogen invades the body, it is engulfed by a type of cell known as an antigen-presenting cell (APC). The APC processes the pathogen and displays fragments of it, known as antigens, on its surface. B cells can also act as antigen-presenting cells.
T cell activation: A type of T cell known as a helper T cell (Th cell) can recognize these antigens. The Th cell binds to the antigen, causing the T cell to become activated. This process usually requires additional signals from the APC, provided through other surface molecules.
B cell activation: Once activated, the Th cell can interact with B cells that are displaying the same antigen. The T cell releases signaling molecules known as cytokines, which help to activate the B cell.
Antibody production: Once activated, the B cell begins to proliferate and differentiate into plasma cells. These plasma cells produce antibodies that are specific to the antigen. These antibodies can then neutralize the pathogen or mark it for destruction by other immune cells.
Memory cells: Some of the B cells and T cells will differentiate into memory cells. These cells “remember” the antigen and can mount a rapid response if the same pathogen invades the body again.
So, the interaction between T cells and B cells is crucial for the adaptive immune response. It allows the immune system to mount a targeted response to specific pathogens and to remember those pathogens in case of future invasions.
T细胞和B细胞在免疫反应中有多种相互作用方式。以下是它们相互作用的一般概述:
抗原呈递:当病原体侵入身体时,会被一种称为抗原呈递细胞(APC)的细胞吞噬。该APC处理病原体并将其碎片,也就是抗原,展示在其表面。B细胞也可以作为抗原呈递细胞。
T细胞激活:一种称为辅助T细胞(Th细胞)的T细胞可以识别这些抗原。Th细胞与抗原结合,导致T细胞被激活。这个过程通常需要APC提供的其他表面分子的额外信号。 CD4+ is a type of T cell often referred to as a helper T cell.
B细胞激活:一旦激活,Th细胞可以与显示相同抗原的B细胞进行交互。T细胞释放称为细胞因子的信号分子,这些分子有助于激活B细胞。
抗体产生:一旦激活,B细胞开始增殖并分化为浆细胞。这些浆细胞产生针对该抗原的特异性抗体。这些抗体可以中和病原体,或对其进行标记,以便由其他免疫细胞清除。
记忆细胞:一部分B细胞和T细胞会分化为记忆细胞。这些细胞“记住”了抗原,并且如果同一病原体再次侵入身体,它们可以快速反应。
因此,T细胞和B细胞之间的相互作用对适应性免疫反应至关重要。它使免疫系统能够对特定的病原体产生针对性的反应,并记住这些病原体以防未来的侵入。
被T细胞识别的表位(epitope)在”T细胞激活”这一步骤中发挥作用。在这个阶段,辅助T细胞(Th细胞)可以识别抗原呈递细胞(APC)表面上的抗原。抗原由MHC分子展示在APC表面,并被T细胞受体(TCR)识别,其中抗原中的特定部分——表位,是被TCR识别的部分。因此,在这个过程中,被TCR识别的抗原表位是关键(表位位于抗原上,而不是位于T细胞上)。这种识别过程触发了T细胞的激活,进而影响了免疫反应的其他步骤,如B细胞的激活和抗体的生成。
抗原是存在于抗原呈递细胞(APC)上的。当病原体,比如细菌或病毒,进入身体后,抗原呈递细胞(如巨噬细胞、树突状细胞等)会捕获并处理病原体,把处理后的病原体的一部分(抗原)放在它们的表面上。接着,T细胞通过自身表面的T细胞受体(TCR)识别并与这些抗原结合,这样就触发了免疫反应。
在这个上下文中,”表位(Epitope)”通常指的是抗原(即病原体蛋白质的一个部分)的一个特定区域,这个区域可以被免疫系统(特别是抗体或T细胞受体)识别和结合。当我们说”表位出现在T细胞上”时,实际上是指T细胞受体(TCR)能够识别并结合到抗原的这个特定区域,而不是把表位物质本身放在T细胞上。 在T细胞激活的过程中,T细胞受体(TCR)会识别和绑定到抗原呈递细胞(APC)表面的抗原表位,然后这个信息(即信号)会传递给T细胞,触发免疫反应。因此,我们可以说,表位是抗原在与T细胞相互作用时所起的关键作用。
Generation of Heatmap from DEGs Data and Annotation of Identified Gene Clusters
This script is structured to process gene expression data, specifically DEGs (Differentially Expressed Genes) and create a heatmap visualizing the patterns of the data. The steps involved are as follows:
-
Package Installation and Library Loading: The script first ensures that essential packages are installed and then loads them. Some of the key packages include “gplots” for generating heatmaps, “readxl” and “writexl” for reading and writing Excel data, and “biomaRt” for fetching gene annotation data from Ensembl.
-
Data Input: It reads in the gene expression data from an Excel file named “DEGs_heatmap_data.xls”.
-
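Hierarchical Clustering: The script performs two hierarchical clusterings with complete linkage: genes (rows) are clustered on 1 − Pearson correlation, and samples (columns) on 1 − Spearman correlation. Only the gene dendrogram is used for the heatmap and for defining the gene clusters.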
-
Heatmap Generation: A heatmap is generated to visualize the clustered data, and this visualization is saved as an image file named “DEGs_heatmap.png”.
-
Annotation and Data Segregation: The genes are further grouped into clusters, and for each cluster, annotation details such as gene ID, gene name, chromosome name, start and end positions, and more are fetched from Ensembl. This annotated data for each cluster is stored with the expression data in distinct data frames.
-
Output: All the processed clusters are then compiled and written to an Excel file named “gene_clusters.xlsx”, with each cluster having its designated sheet.
This script aids in the identification and exploration of gene expression patterns and further provides essential annotations for identified gene clusters.
# Make sure the required packages are available; only the missing ones are
# installed, so re-running the script does not reinstall everything.
cran_packages <- c("gplots", "readxl", "writexl", "dplyr")
missing_cran <- cran_packages[
  !vapply(cran_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_cran) > 0) {
  install.packages(missing_cran)
}
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}

library(gplots)
library(readxl)
library(writexl)
library(dplyr)
library(biomaRt)

# Show the available Ensembl marts (informational output only)
listEnsembl()
listMarts()
# Human gene mart pinned to Ensembl release 104
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version = "104")
datasets <- listDatasets(ensembl)
# Load the expression table from the first sheet of the Excel file; its first
# column supplies the row names and is then removed.
datamat <- as.data.frame(
  read_excel(path = "DEGs_heatmap_data.xls", sheet = 1, col_names = TRUE)
)
rownames(datamat) <- datamat[, 1]
datamat <- datamat[, -1]

# Row tree: 1 - Pearson correlation between rows, complete linkage
row_dist <- as.dist(1 - cor(t(datamat), method = "pearson"))
hr <- hclust(row_dist, method = "complete")
# Column tree: 1 - Spearman correlation between columns, complete linkage
col_dist <- as.dist(1 - cor(datamat, method = "spearman"))
hc <- hclust(col_dist, method = "complete")

# Cut the row tree at max height / 1.2 and colour every row by its cluster
mycl <- cutree(hr, h = max(hr$height) / 1.2)
cluster_palette <- c(
  "YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",
  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN",
  "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN"
)
mycol <- cluster_palette[as.vector(mycl)]

# Row-scaled heatmap with the row dendrogram only (columns keep file order)
png("DEGs_heatmap.png", width = 900, height = 1010)
heatmap.2(
  as.matrix(datamat),
  Rowv = as.dendrogram(hr),
  Colv = NA,
  dendrogram = 'row',
  scale = 'row',
  trace = 'none',
  col = bluered(75),
  RowSideColors = mycol,
  labRow = "",
  srtCol = 30,
  keysize = 0.72,
  cexRow = 2,
  cexCol = 1.4
)
dev.off()
#### cluster members #####
# Ensembl attributes fetched for every cluster member
annotation_attributes <- c('ensembl_gene_id', 'external_gene_name', 'gene_biotype',
                           'entrezgene_id', 'chromosome_name', 'start_position',
                           'end_position', 'strand', 'description')

# Build the annotated expression table of one cluster.
#   cluster_id  cluster number as assigned by cutree() in `mycl`
# Returns a data frame with one row per annotated gene: the Ensembl annotation
# columns followed by the expression columns of `datamat`. Genes for which
# Ensembl returns no annotation are dropped by the merge.
annotate_cluster <- function(cluster_id) {
  gene_ids <- names(mycl)[which(mycl == cluster_id)]
  annotation <- getBM(attributes = annotation_attributes,
                      filters = 'ensembl_gene_id',
                      values = gene_ids,
                      mart = ensembl)
  # getBM can return several rows per gene; keep the first one
  annotation <- distinct(annotation, ensembl_gene_id, .keep_all = TRUE)
  # drop = FALSE keeps a data frame even when there is one expression column
  expression <- datamat[gene_ids, , drop = FALSE]
  expression$ENSEMBL <- rownames(expression)
  merge(annotation, expression, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
}

# Annotate every cluster that cutree() produced instead of a hard-coded 1-6:
# with fewer clusters the fixed stanzas query Ensembl with an empty gene list,
# with more clusters the extra ones never reach the workbook.
cluster_ids <- sort(unique(as.vector(mycl)))
# mycol holds one colour per gene at this point; take each cluster's colour
# from its first member
cluster_colors <- mycol[match(cluster_ids, mycl)]
cluster_tables <- lapply(cluster_ids, annotate_cluster)
names(cluster_tables) <- paste("Cluster", cluster_ids, cluster_colors)

# Keep the per-cluster variables (cluster1_YELLOW, cluster2_DARKBLUE, ...)
# available for code that refers to them by name
invisible(list2env(
  setNames(cluster_tables, paste0("cluster", cluster_ids, "_", cluster_colors)),
  envir = environment()
))
#write.csv(cluster1_YELLOW,file='cluster1_YELLOW.txt')

# One sheet per cluster, named "Cluster <n> <COLOUR>"
write_xlsx(cluster_tables, "gene_clusters.xlsx")