Author Archives: gene_x

RNA-seq data analysis of Yersinia on GRCh38

YopM is a systematic antagonist of YopP in suppressing inflammatory gene expression in Yersinia-infected human macrophages

  1. goal

    # 0th heatmap see manuscript: The 
    # 1.th heatmap mock WAC WAP 1.5 + 6 hours: WAC, WA314 comparing to mock
    # 2.th heatmap deltaYopM, deltaYopP, deltaYopQ Vergleich zu WAP (6 hours). The sample deltaYopMP is completed deleted: deltaM, deltaP, deltaQ to WA314
  2. download the batch 1 data

    #-- PRJEB10086 --
    
    mock_6h_a
    mock_6h_b
    mock_90min_a
    mock_90min_b
    WA314_6h_a
    WA314_6h_b
    WA314_90min_a
    WA314_90min_b
    WA314dYopM_6h_a
    WA314dYopM_6h_b
    WA314dYopM_90min_a
    WA314dYopM_90min_b
    
    #-- PRJEB45780 --
    
    wget ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924425/mock_DoA.fastq.gz
    wget ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924426/mock_DoII.fastq.gz
    #mv mock_DoA.fastq.gz mock_6h_DoA.fastq.gz
    #mv mock_DoII.fastq.gz mock_6h_DoII.fastq.gz
    
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924427/WAC1_5h_DoA.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924428/WAC1_5h_DoII.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924429/WAC6h_DoA.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924430/WAC6h_DoII.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924431/WAP1_5h_DoA.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924421/WAP1_5h_DoII.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924422/WAP6h_DoA.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924423/WAP6h_DoII.fastq.gz
    
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924420/dP6h_DoII.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924424/dP6h_DoIII.fastq.gz
    
    #-- PRJEB44958 --
    
    ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087955/dYopP_1.5h_DonorII.fastq.gz
    ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087956/dYopP_1.5h_DonorIII.fastq.gz
    
    wget ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087951/dYopMP_1.5h_DonorII.fastq.gz
    wget ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087952/dYopMP_1.5h_DonorIII.fastq.gz
    wget ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087953/dYopMP_6h_DonorII.fastq.gz
    wget ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087954/dYopMP_6h_DonorIII.fastq.gz
    
    # ---- 1st. batch: 1.5h + 6h ---- 
    mock
    WAC
    WA314 == WAP
    dYopM,dYopP
    
    E-MTAB-10602
    E-MTAB-10473
    
    Project: PRJEB45780
    Pathogenic bacteria Yersinia enterocolitica injects virulence plasmid-encoded effectors through the type three secretion system into macrophages to modulate gene expression. Here we analyzed the effect on gene expression in primary human macrophages of Y. enterocolitica strains lacking effector YopP (1.5 h infection) or effectors YopP and YopM (1.5 h or 6 h infection) simultaneously using RNA-seq. This is part of a larger sequencing experiment for which other samples can be found in EMBL-EBI (www.ebi.ac.uk/arrayexpress) under accession number E-MTAB-10473 and European Nucleotide Archive (ENA) at http://www.ebi.ac.uk/ena/data/view/PRJEB10086.
    
    ./ena-file-download-read_run-PRJEB10086-submitted_ftp-20231220-1619_sorted.sh
    #? check WA314_90min_a.fastq.gz == WAP1_5h_DoA.fastq.gz
    #? check WA314_90min_b.fastq.gz == WAP1_5h_DoII.fastq.gz
    #? check WA314_6h_a.fastq.gz == WAP6h_DoA.fastq.gz
    #? check WA314_6h_b.fastq.gz == WAP6h_DoII.fastq.gz
    #TODO: if they are not the same, take WA314_* as the WAP-samples in the 1st batch!
    
    cat mock_90min_a_L004_R1_001.fastq.gz mock_90min_a_L005_R1_001.fastq.gz mock_90min_a_L006_R1_001.fastq.gz mock_90min_a_L007_R1_001.fastq.gz mock_90min_a_L008_R1_001.fastq.gz > mock_90min_a.fastq.gz
    cat mock_90min_b_L004_R1_001.fastq.gz mock_90min_b_L005_R1_001.fastq.gz mock_90min_b_L006_R1_001.fastq.gz mock_90min_b_L007_R1_001.fastq.gz mock_90min_b_L008_R1_001.fastq.gz > mock_90min_b.fastq.gz
    cat mock_6h_a_L004_R1_001.fastq.gz mock_6h_a_L005_R1_001.fastq.gz mock_6h_a_L006_R1_001.fastq.gz mock_6h_a_L007_R1_001.fastq.gz mock_6h_a_L008_R1_001.fastq.gz > mock_6h_a.fastq.gz
    cat mock_6h_b_L004_R1_001.fastq.gz mock_6h_b_L005_R1_001.fastq.gz mock_6h_b_L006_R1_001.fastq.gz mock_6h_b_L007_R1_001.fastq.gz mock_6h_b_L008_R1_001.fastq.gz > mock_6h_b.fastq.gz
    
    cat WA314_90min_a_L004_R1_001.fastq.gz WA314_90min_a_L005_R1_001.fastq.gz WA314_90min_a_L006_R1_001.fastq.gz WA314_90min_a_L007_R1_001.fastq.gz WA314_90min_a_L008_R1_001.fastq.gz > WA314_90min_a.fastq.gz
    cat WA314_90min_b_L004_R1_001.fastq.gz WA314_90min_b_L005_R1_001.fastq.gz WA314_90min_b_L006_R1_001.fastq.gz WA314_90min_b_L007_R1_001.fastq.gz WA314_90min_b_L008_R1_001.fastq.gz > WA314_90min_b.fastq.gz
    cat WA314_6h_a_L004_R1_001.fastq.gz WA314_6h_a_L005_R1_001.fastq.gz WA314_6h_a_L006_R1_001.fastq.gz WA314_6h_a_L007_R1_001.fastq.gz WA314_6h_a_L008_R1_001.fastq.gz > WA314_6h_a.fastq.gz
    cat WA314_6h_b_L004_R1_001.fastq.gz WA314_6h_b_L005_R1_001.fastq.gz WA314_6h_b_L006_R1_001.fastq.gz WA314_6h_b_L007_R1_001.fastq.gz WA314_6h_b_L008_R1_001.fastq.gz > WA314_6h_b.fastq.gz
    
    cat WA314dYopM_90min_a_L004_R1_001.fastq.gz WA314dYopM_90min_a_L005_R1_001.fastq.gz WA314dYopM_90min_a_L006_R1_001.fastq.gz WA314dYopM_90min_a_L007_R1_001.fastq.gz WA314dYopM_90min_a_L008_R1_001.fastq.gz > WA314dYopM_90min_a.fastq.gz
    cat WA314dYopM_90min_b_L004_R1_001.fastq.gz WA314dYopM_90min_b_L005_R1_001.fastq.gz WA314dYopM_90min_b_L006_R1_001.fastq.gz WA314dYopM_90min_b_L007_R1_001.fastq.gz WA314dYopM_90min_b_L008_R1_001.fastq.gz > WA314dYopM_90min_b.fastq.gz
    cat WA314dYopM_6h_a_L004_R1_001.fastq.gz WA314dYopM_6h_a_L005_R1_001.fastq.gz WA314dYopM_6h_a_L006_R1_001.fastq.gz WA314dYopM_6h_a_L007_R1_001.fastq.gz WA314dYopM_6h_a_L008_R1_001.fastq.gz > WA314dYopM_6h_a.fastq.gz
    cat WA314dYopM_6h_b_L004_R1_001.fastq.gz WA314dYopM_6h_b_L005_R1_001.fastq.gz WA314dYopM_6h_b_L006_R1_001.fastq.gz WA314dYopM_6h_b_L007_R1_001.fastq.gz WA314dYopM_6h_b_L008_R1_001.fastq.gz > WA314dYopM_6h_b.fastq.gz
    
    #END
    
    ./WAC1_5h_DoA.fastq.gz
    ./WAC1_5h_DoII.fastq.gz
    ./WAC6h_DoA.fastq.gz
    ./WAC6h_DoII.fastq.gz
    
    ./WAP1_5h_DoA.fastq.gz
    ./WAP1_5h_DoII.fastq.gz
    ./WAP6h_DoA.fastq.gz
    ./WAP6h_DoII.fastq.gz
    
    ./dYopP_1.5h_DonorII.fastq.gz
    ./dYopP_1.5h_DonorIII.fastq.gz
    ./dP6h_DoII.fastq.gz
    ./dP6h_DoIII.fastq.gz
    
    mv dP6h_DoII.fastq.gz dYopP_6h_DoII.fastq.gz
    mv dP6h_DoIII.fastq.gz dYopP_6h_DoIII.fastq.gz
    
    mv WAC1_5h_DoA.fastq.gz   WAC_1.5h_DoA.fastq.gz
    mv WAC1_5h_DoII.fastq.gz  WAC_1.5h_DoII.fastq.gz
    mv WAC6h_DoA.fastq.gz     WAC_6h_DoA.fastq.gz
    mv WAC6h_DoII.fastq.gz    WAC_6h_DoII.fastq.gz
    
    mv WAP1_5h_DoA.fastq.gz   WAP_1.5h_DoA.fastq.gz
    mv WAP1_5h_DoII.fastq.gz  WAP_1.5h_DoII.fastq.gz
    mv WAP6h_DoA.fastq.gz     WAP_6h_DoA.fastq.gz
    mv WAP6h_DoII.fastq.gz    WAP_6h_DoII.fastq.gz
    
    mv WA314dYopM_90min_a.fastq.gz dYopM_90min_a.fastq.gz
    mv WA314dYopM_90min_b.fastq.gz dYopM_90min_b.fastq.gz
    mv WA314dYopM_6h_a.fastq.gz    dYopM_6h_a.fastq.gz
    mv WA314dYopM_6h_b.fastq.gz    dYopM_6h_b.fastq.gz
    
    #Desperated!
    #wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924425/mock_DoA.fastq.gz
    #wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR592/ERR5924426/mock_DoII.fastq.gz
    #wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087953/dYopMP_6h_DonorII.fastq.gz
    #wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087954/dYopMP_6h_DonorIII.fastq.gz
    #wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087951/dYopMP_1.5h_DonorII.fastq.gz
    #wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR608/ERR6087952/dYopMP_1.5h_DonorIII.fastq.gz
  3. download the public data and prepare the batch 2 data

    # ---- data public (naive and LPS is public data)---- 
    jhuang@hamburg:/media/jhuang/Elements2/Data_Indra_RNASeq_GSM2262901/raw_data
    
    #Dataset name GEO ID Series Experiment
    
    #- naive macrophages GSM2679941 GSE100382 RNA-seq (RNA_N_rep1)
    #    SRR5746425 4,000,000   204M    136Mb   2017-06-25
    #    SRR5746426 4,000,000   204M    137Mb   2017-06-25
    #    SRR5746427 4,000,000   204M    137.2Mb 2017-06-25
    #    SRR5746428 4,000,000   204M    135.6Mb 2017-06-25
    #    SRR5746429 4,000,000   204M    134.5Mb 2017-06-25
    #    SRR5746430 4,000,000   204M    136.2Mb 2017-06-25
    #    SRR5746431 903,661 46.1M   32.7Mb  2017-06-25
    cp RNA_N_rep1.fastq.gz ../raw_data/naive_rep1.fastq.gz
    
    #- naive macrophages GSM2679942 GSE100382 RNA-seq (RNA_N_rep2)
    #    SRR5746432 14,176,210  723M    350.2Mb 2017-06-25
    #    SRR5746433 14,157,173  722M    353.4Mb 2017-06-25
    cp RNA_N_rep2.fastq.gz ../raw_data/naive_rep2.fastq.gz
    
    #- LPS stimulated macrophages GSM2679944 GSE100382 RNA-seq (RNA_L_rep1)
    #    SRR5746436 4,000,000   204M    135.8Mb 2017-06-25
    #    SRR5746437 4,000,000   204M    137.1Mb 2017-06-25
    #    SRR5746438 4,000,000   204M    136.6Mb 2017-06-25
    #    SRR5746439 4,000,000   204M    135.4Mb 2017-06-25
    #    SRR5746440 4,000,000   204M    135.3Mb 2017-06-25
    #    SRR5746441 2,178,850   111.1M  76.7Mb  2017-06-25
    cp RNA_L_rep1.fastq.gz ../raw_data/LPS_rep1.fastq.gz
    
    #- LPS stimulated macrophages GSM2679945 GSE100382 RNA-seq (RNA_L_rep2)
    #    SRR5746442 17,351,249  884.9M  428Mb   2017-06-25
    #    SRR5746443 17,318,750  883.3M  431.8Mb 2017-06-25
    cp RNA_L_rep2.fastq.gz ../raw_data/LPS_rep2.fastq.gz
    
    #- naive macrophages GSM2262901 GSE85243 RNA-seq (RPMI_d6_6095)
    #    SRR4004025 11,559,434  485.5M  338.3Mb 2016-11-21
    cp RPMI_d6_6095.fastq.gz ../raw_data/naive_rep3.fastq.gz
    
    #- naive macrophages GSM2262902 GSE85243 RNA-seq (RPMI_d6_6718)
    #    SRR4004026 22,911,696  962.3M  635Mb   2016-11-21
    cp RPMI_d6_6718.fastq.gz ../raw_data/naive_rep4.fastq.gz
    
    #- LPS stimulated macrophages GSM2262906 GSE85243 RNA-seq (RPMI_d6_Restim_8749)
    #    SRR4004030 26,566,129  1.1G    681.3Mb 2016-11-21
    cp RPMI_d6_Restim_8749.fastq.gz ../raw_data/LPS_rep3.fastq.gz
    
    #- LPS stimulated macrophages GSM2262907 GSE85243 RNA-seq (RPMI_d6_Restim_8754)
    #    SRR4004031 28,719,690  1.2G    731.2Mb 2016-11-21
    cp RPMI_d6_Restim_8754.fastq.gz ../raw_data/LPS_rep4.fastq.gz
    
    #END
    
    # ---- 2nd. batch: 6h         jhuang@hamburg:~/DATA/Data_Soeren_RNA-seq_2022/Raw_Data ---- 
    mock
    WAC
    WAP
    deltaYopQ -->
    Vergleich zu WAP
    
    mv MOCK_A.fastq.gz ../raw_data/
    mv MOCK_B.fastq.gz ../raw_data/
    mv WAC_A.fastq.gz ../raw_data/
    mv WAC_B.fastq.gz ../raw_data/
    mv WAP_A.fastq.gz ../raw_data/
    mv WAP_B.fastq.gz ../raw_data/
    mv deltaQ_A.fastq.gz ../raw_data/
    mv deltaQ_B.fastq.gz ../raw_data/
  4. run nextflow

        (rnaseq) [jhuang@sage DATA]$ pwd
        /home/jhuang/DATA
        /home/jhuang/REFs/Homo_sapiens/Ensembl
        #"/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/blacklists/hg38-blacklist.bed"
    
        nextflow run rnaseq/main.nf --input samplesheet.csv    --outdir results_GRCh38 --genome GRCh38   -profile test_full -resume --max_memory 300.GB --max_time 2400.h --save_reference --aligner star_salmon  --skip_deseq2_qc --skip_fastqc
    
        ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml
        multiqc -f --config multiqc_config.yaml . 2>&1
        rm multiqc_config.yaml
  5. summary:

        #ENSG00000000419 DPM1    385.616
        #grep "ENSG00000000419" quant.genes.sf
        #ENSG00000000419 1159.45 909.453 15.8259 385.616
    
        Attached, you will find the results of the RNASeq analysis discussed in our last online meeting.
    
        The raw counts are located in Yersinia_RNAseq_results_GRCh38/star_salmon/salmon.merged.gene_counts.tsv. Note that this table was generated using the STAR-Salmon strategy, with STAR aligning the reads and Salmon quantifying transcript abundances. Salmon's advanced probabilistic model assigns fractional counts to transcripts, particularly when reads map to multiple similar transcripts. This is the reason non-integer numbers appear in this table.
    
        The samples included in the analysis are as follows:
    
        batch 1:
    
        ./mock_90min_a.fastq.gz
        ./mock_90min_b.fastq.gz
        ./mock_6h_a.fastq.gz
        ./mock_6h_b.fastq.gz
        ./WAC_1.5h_DoA.fastq.gz
        ./WAC_1.5h_DoII.fastq.gz
        ./WAC_6h_DoA.fastq.gz
        ./WAC_6h_DoII.fastq.gz
        ./WAP_1.5h_DoA.fastq.gz
        ./WAP_1.5h_DoII.fastq.gz
        ./WAP_6h_DoA.fastq.gz
        ./WAP_6h_DoII.fastq.gz
        ./WA314_90min_a.fastq.gz
        ./WA314_90min_b.fastq.gz
        ./WA314_6h_a.fastq.gz
        ./WA314_6h_b.fastq.gz
        ./dYopM_90min_a.fastq.gz
        ./dYopM_90min_b.fastq.gz
        ./dYopM_6h_a.fastq.gz
        ./dYopM_6h_b.fastq.gz
        ./dYopP_1.5h_DonorII.fastq.gz
        ./dYopP_1.5h_DonorIII.fastq.gz
        ./dYopP_6h_DoII.fastq.gz
        ./dYopP_6h_DoIII.fastq.gz
        ./mock_6h_DoA.fastq.gz
        ./mock_6h_DoII.fastq.gz
        ./dYopMP_1.5h_DonorII.fastq.gz
        ./dYopMP_1.5h_DonorIII.fastq.gz
        ./dYopMP_6h_DonorII.fastq.gz
        ./dYopMP_6h_DonorIII.fastq.gz
    
        batch public:
    
        naive_rep1.fastq.gz
        naive_rep2.fastq.gz
        LPS_rep1.fastq.gz
        LPS_rep2.fastq.gz
        naive_rep3.fastq.gz
        naive_rep4.fastq.gz
        LPS_rep3.fastq.gz
        LPS_rep4.fastq.gz
    
        batch 2:
    
        MOCK_A.fastq.gz (6h)
        MOCK_B.fastq.gz (6h)
        WAC_A.fastq.gz  (6h)
        WAC_B.fastq.gz  (6h)
        WAP_A.fastq.gz  (6h)
        WAP_B.fastq.gz  (6h)
        deltaQ_A.fastq.gz (6h)
        deltaQ_B.fastq.gz (6h)

merging of hg38 counts of two tables and cbind virus counts

#-- merge the hg38 counts of 38 samples and the hg38 counts of LT_K331A --

# Load necessary libraries
library(dplyr)

# Read the first table
first_table <- read.csv("merged_gene_counts.csv", stringsAsFactors = FALSE)

# Read the second table
second_table <- read.csv("~/DATA/Data_Denise_LT_K331A_RNASeq/results_GRCh38/star_salmon/gene_name_gene_counts.csv", sep = ',', stringsAsFactors = FALSE)

# Extract the relevant columns from the second table
second_table_filtered <- second_table %>%
  select(gene_name, LT_K331A_DI, LT_K331A_DII)

# Summarise the second table by gene_name, summing the LT_K331A_DI and LT_K331A_DII columns for duplicate gene names
second_table_summarised <- second_table_filtered %>%
  group_by(gene_name) %>%
  summarise(LT_K331A_DI = sum(LT_K331A_DI, na.rm = TRUE), LT_K331A_DII = sum(LT_K331A_DII, na.rm = TRUE))

# Merge the tables by matching the 'gene_name' from the second table to the gene_name of the first table
merged_table <- left_join(first_table, second_table_summarised, by = "gene_name")

# Replace NA values with 0
merged_table[is.na(merged_table)] <- 0

# View the result
print(head(merged_table))

# Optionally, write the result to a new file
write.csv(merged_table, "gene_counts_hg38_30samples.csv", row.names = TRUE)

#-- cbind the hg38 and virus counts --

# Load necessary libraries
library(dplyr)

# Read the first table
gene_counts_hg38_30samples <- read.csv("gene_counts_hg38_30samples.csv", stringsAsFactors = FALSE, row.names=1)

# Read the second table
salmon_merged_gene_counts <- read.csv("~/DATA/Data_Denise_LT_K331A_RNASeq/results_JN707599/star_salmon/salmon.merged.gene_counts.csv", sep = ',', stringsAsFactors = FALSE)

# Rename the columns in the second table as specified
names(salmon_merged_gene_counts)[names(salmon_merged_gene_counts) == "LT_K331A_d8_DonorI"] <- "LT_K331A_DI"
names(salmon_merged_gene_counts)[names(salmon_merged_gene_counts) == "LT.K331A.d8.DII_re"] <- "LT_K331A_DII"

# Determine which columns are in both tables
common_columns <- intersect(names(gene_counts_hg38_30samples), names(salmon_merged_gene_counts))

# Select only the common columns from the second table (ignoring the extra 5 columns)
salmon_merged_gene_counts <- salmon_merged_gene_counts[, common_columns]

# Sort the columns of the second table to match the first table
# Ensuring that 'gene_name' remains the first column
cols_order <- c("gene_name", setdiff(names(gene_counts_hg38_30samples), "gene_name"))
salmon_merged_gene_counts <- salmon_merged_gene_counts[, cols_order]

# Merge the tables by pasting together
# Since the columns are now aligned, this should paste them side by side
merged_table <- cbind(gene_counts_hg38_30samples, salmon_merged_gene_counts[,-1])  # -1 to not duplicate gene_name

# Replace NA values with 0 in the entire table
merged_table[is.na(merged_table)] <- 0

# View the result
print(head(merged_table))

# Optionally, write the result to a new file
write.csv(merged_table, "updated_gene_counts_hg38_30samples.csv", row.names = FALSE)

DNA-damage specific ubiquitination

DNA损伤特异性泛素化是指在DNA损伤响应中,泛素分子特异性地附加到蛋白质上的细胞过程。这一过程是DNA损伤响应(DDR)机制的重要组成部分,DDR是一种复杂的细胞网络,负责检测和修复受损DNA,维护基因组稳定性,防止突变。以下是更详细的概述:

- 泛素化:这是一种翻译后修饰过程,其中一个名为泛素的小蛋白质共价地附加到目标蛋白质上的赖氨酸残基上。这个过程由一系列酶进行:E1(泛素激活酶),E2(泛素结合酶)和E3(泛素连接酶)。这些酶通过级联反应将泛素从E1转移到E2,最终通过E3的作用转移到目标蛋白上。

- DNA损伤:DNA可以由各种因素损伤,如紫外线辐射、化学试剂和代谢副产品。当检测到这种损伤时,细胞启动DDR以修复DNA并防止在细胞分裂过程中传播错误。

- 泛素化在DDR中的作用:对DNA损伤的响应中,涉及DDR途径的特定蛋白质被泛素化。这种泛素化可以具有几种功能:
    * 信号传导:泛素化可以改变被修饰蛋白的功能,激活或失活酶,或作为其他蛋白质识别并结合到受损部位的信号。
    * 招募:它可以帮助招募DNA修复蛋白到损伤部位。例如,泛素链可以被具有泛素结合域的蛋白质识别,导致它们在DNA损伤部位累积。
    * 蛋白酶体降解:在某些情况下,泛素化将蛋白质靶向蛋白酶体降解,蛋白酶体是一个大型的蛋白质复合体,负责降解和回收蛋白质。这可以帮助调节DDR中涉及的蛋白质水平或移除可能妨碍修复过程的蛋白质。

- 泛素链的类型:泛素链的性质(例如,泛素分子之间的连接类型)可以决定泛素化的结果。不同的连接可以导致目标蛋白质的不同命运,包括蛋白酶体降解、蛋白质相互作用的变化或活性的改变。

理解DNA损伤特异性泛素化在分子生物学和遗传学领域至关重要,特别是在研究癌症和其他疾病中,这些疾病通常存在DNA损伤和修复机制的失调。它也是开发治疗药物的目标,因为调节这些途径可以增强癌症治疗(如化疗和放疗)的效果,这些治疗依赖于在癌细胞中引起DNA损伤。

Rad51 Foci and UL36DUB: The formation of Rad51 foci, which are crucial for DNA repair, is not impacted by the deubiquitinating activity of UL36, a protein found in herpesviruses.

DUB Activity in Human Herpesviruses: The deubiquitinating (DUB) activity, which involves the removal of ubiquitin from proteins, is a conserved function across human herpesviruses, influencing their interaction with host cells.

Role of ICP0 in the DNA damage response

Rad51 foci refer to the visible aggregations or clusters of the Rad51 protein within the nucleus of a cell, typically observed using fluorescence microscopy. Rad51 is a crucial protein involved in the homologous recombination repair pathway, a mechanism used by cells to repair double-strand breaks in DNA. The formation of Rad51 foci is an important cellular response to DNA damage, particularly during the S and G2 phases of the cell cycle. Here’s a bit more detail:

- Function of Rad51: Rad51 plays a central role in the search for homology and strand pairing during homologous recombination. It forms a nucleoprotein filament on single-stranded DNA (ssDNA) that is essential for the homologous pairing and strand exchange with a complementary DNA strand. This process is critical for accurate DNA repair.

- Formation of Rad51 Foci: In response to DNA damage, particularly double-strand breaks, Rad51 is recruited to the sites of damage and forms nuclear foci. These foci represent the active sites where DNA repair is taking place.

- Detection of Rad51 Foci: Rad51 foci can be visualized using immunofluorescence techniques, where antibodies specific to Rad51 are used, followed by detection with fluorescent dyes. The presence and number of these foci in a cell can be indicative of the cell’s ability to repair DNA damage effectively.

- Clinical Relevance: The formation of Rad51 foci can be used as a biomarker for assessing the proficiency of homologous recombination repair in cells. This has implications in cancer biology, as cells deficient in homologous recombination repair (such as those lacking functional BRCA1 or BRCA2) exhibit impaired formation of Rad51 foci. These cells are more susceptible to certain types of chemotherapy, such as PARP inhibitors.

In summary, Rad51 foci are indicators of active DNA repair processes within the cell, and their study is important in understanding cellular responses to DNA damage, as well as in the context of cancer diagnosis and treatment.

Early viral and host transcriptome atlas of lytic nuclear DNA virus infection

A “lytic nuclear DNA virus infection” refers to a type of viral infection where a DNA virus enters a host cell, replicates in the nucleus, and eventually causes the lysis (destruction) of the cell. Here’s a breakdown of the process:

  • Infection: The virus attaches to the host cell and injects its DNA into it. For nuclear DNA viruses, this DNA is transported into the nucleus of the host cell.

  • Replication and Transcription: Inside the nucleus, the viral DNA is replicated and transcribed using the host cell’s machinery. This process leads to the production of viral proteins and the replication of the viral genome.

  • Assembly: Newly formed viral genomes and structural proteins are assembled into new virus particles within the host cell.

  • Lysis and Release: In the lytic cycle, once a significant number of new viruses have been produced, the host cell’s membrane is disrupted (lysis), leading to the death of the cell and the release of the new viruses, which can then infect surrounding cells.

This lytic phase is contrasted with the lysogenic phase seen in some viruses, where the viral DNA integrates into the host genome and can remain dormant for a period before becoming active.

Lytic nuclear DNA viruses include many well-known pathogens such as the Herpesviruses (including Herpes Simplex Virus and Varicella-Zoster Virus), Adenoviruses, and some types of Papillomaviruses. These viruses are significant in medicine due to their roles in various diseases, ranging from mild conditions like colds and skin warts to more severe illnesses like certain cancers and encephalitis. Understanding the lytic cycle of these viruses is crucial for developing treatments and preventive measures against these viral infections.

4sU-tagging, ribosome profiling and ATAC-Seq

https://www.virologie.uni-wuerzburg.de/virologie/ags-virologie/ag-doelken/research/

  1. Principle of 4sU-tagging: 4-thiouridine (4sU) is added to cell culture medium to start metabolic labelling of newly transcribed (nascent) RNA. 5 to 60 min later total RNA is isolated. Following thiol-specific biotinylation, total RNA is separated into labelled nascent and unlabelled pre-existing RNA. All three RNA fractions are suitable for down-stream analyses including qRT-PCR, microarrays and RNA-seq.

  2. Principle of ribosome profiling: Following a mild cell lysis, a stringent RNAse digest is performed. The short fragment of RNA molecules actively being translated at the time of cell lysis is protected from RNAse treatment within the ribosome and can then be recovered, cloned and sequenced. Thereby, a detailed picture of translational activity at the time of cell lysis is obtained

  3. Chromatin accessibility profiling by ATAC-Seq

https://www.nature.com/articles/s41596-022-00692-9 https://www.illumina.com/techniques/multiomics/epigenetics/atac-seq-chromatin-accessibility.html

4sU-tagging_and_ribosome_profiling

  • The assay for transposase-accessible chromatin with sequencing (ATAC-Seq) is a popular method for determining chromatin accessibility across the genome. By sequencing regions of open chromatin, ATAC-Seq can help you uncover how chromatin packaging and other factors affect gene expression. ATAC-Seq does not require prior knowledge of regulatory elements, making it a powerful epigenetic discovery tool. It has been used to better understand chromatin accessibility, transcription factor binding, and gene regulation in complex diseases, embryonic development, T-cell activation, and cancer. ATAC-Seq can be performed on bulk cell populations or on single cells at high resolution.

  • How Does ATAC-Seq Work? In ATAC-Seq, genomic DNA is exposed to Tn5, a highly active transposase. Tn5 simultaneously fragments DNA, preferentially inserts into open chromatin sites, and adds sequencing primers (a process known as tagmentation). The sequenced DNA identifies the open chromatin and data analysis can provide insight into gene regulation. Additionally, ATAC-Seq can be combined with other methods, such as RNA sequencing, for a multiomic approach to studying gene expression. Subsequent experiments often include ChIP-Seq, Methyl-Seq, or Hi-C-Seq to further characterize forms of epigenetic regulation.

ATAC-seq

Which hypervariable regions do the amplicon sequences belong to: V3–V4 or V4–V5?

#Questions about V3–V4 primers for 16S rRNA amplicon sequencing, and calculating overlap
https://forum.qiime2.org/t/questions-about-v3-v4-primers-for-16s-rrna-amplicon-sequencing-and-calculating-overlap/20250
#Comparison of Two 16S rRNA Primers (V3–V4 and V4–V5) for Studies of Arctic Microbial Communities
https://pure.mpg.de/rest/items/item_3335003/component/file_3339382/content

#Two different hypervariable regions of the bacterial 16S rRNA gene were amplified using aliquots of the isolated DNA from each sample.
- The V3–V4 region was amplified using the S-D-Bact-0341-b-S-17 (5′-CCTACGGGNGGCWGCAG-3′) and the S-D-Bact-0785-a-A-21 (5′-GACTACHVGGGTATCTAATCC-3′) primers (Klindworth et al., 2013). 
- The V4–V5 regions was amplified using the 515F-Y (5′-GTGYCAGCMGCCGCGGTAA-3′) and the 926R (5′-CCGYCAATTYMTTTRAGTTT-3′) primers (Parada et al., 2016). 

A. Klindworth, E. Pruesse, T. Schweer, J. Peplies, C. Quast, M. Horn, F.O. Glöckner
Evaluation of general 16S ribosomal RNA gene PCR primers for classical and next-generation sequencing-based diversity studies
Nucleic Acids Res., 41 (2013), 10.1093/nar/gks808
e1–e1

# check if the data are from V3-V4 or V4-V5?

mkdir pandaseq.out_V3_V4
for file in cutadapted_16S/filtered_R1/*_R1.fastq.gz; do echo "pandaseq -f ${file} -r ${file/_R1/_R2} -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V3_V4/$(echo $file | cut -d'/' -f3 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq_V3_V4"; done

mkdir pandaseq.out_V4_V5
for file in cutadapted_16S/filtered_R1/*_R1.fastq.gz; do echo "pandaseq -f ${file} -r ${file/_R1/_R2} -l 300 -p GTGYCAGCMGCCGCGGTAA -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V4_V5/$(echo $file | cut -d'/' -f3 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq_V4_V5"; done

#repalce the second _R1 to _R2
pandaseq -f cutadapted_16S/filtered_R1/AH-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-LH_R2.fastq.gz -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V3_V4/AH-LH_merged.fasta >> LOG_pandaseq_V3_V4
pandaseq -f cutadapted_16S/filtered_R1/AH-NLH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-NLH_R2.fastq.gz -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V3_V4/AH-NLH_merged.fasta >> LOG_pandaseq_V3_V4
pandaseq -f cutadapted_16S/filtered_R1/AH-Nose_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-Nose_R2.fastq.gz -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V3_V4/AH-Nose_merged.fasta >> LOG_pandaseq_V3_V4
pandaseq -f cutadapted_16S/filtered_R1/AL-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AL-LH_R2.fastq.gz -l 300 -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V3_V4/AL-LH_merged.fasta >> LOG_pandaseq_V3_V4

pandaseq -f cutadapted_16S/filtered_R1/AH-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-LH_R2.fastq.gz -l 300 -p GTGYCAGCMGCCGCGGTAA -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V4_V5/AH-LH_merged.fasta >> LOG_pandaseq_V4_V5
pandaseq -f cutadapted_16S/filtered_R1/AH-NLH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-NLH_R2.fastq.gz -l 300 -p GTGYCAGCMGCCGCGGTAA -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V4_V5/AH-NLH_merged.fasta >> LOG_pandaseq_V4_V5
pandaseq -f cutadapted_16S/filtered_R1/AH-Nose_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-Nose_R2.fastq.gz -l 300 -p GTGYCAGCMGCCGCGGTAA -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V4_V5/AH-Nose_merged.fasta >> LOG_pandaseq_V4_V5
pandaseq -f cutadapted_16S/filtered_R1/AL-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AL-LH_R2.fastq.gz -l 300 -p GTGYCAGCMGCCGCGGTAA -q GACTACHVGGGTATCTAATCC  -w pandaseq.out_V4_V5/AL-LH_merged.fasta >> LOG_pandaseq_V4_V5

#-p GTGYCAGCMGCCGCGGTAA -q GACTACHVGGGTATCTAATCC --> Used!
#-rwxrwxrwx 1 jhuang jhuang 39590797 Jan 11 17:09 AH-LH_merged.fasta (80630 sequences)
pandaseq -f cutadapted_16S/filtered_R1/AH-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-LH_R2.fastq.gz -l 300   -w pandaseq.out_notfiltering_with_primer/AH-LH_merged.fasta >> LOG_pandaseq_notfiltering_with_primer

EV vs OMV

The terms “EV” and “outer membrane vesicle (OMV)” are related to biological structures, particularly in the context of bacterial or cell biology. They refer to different types of vesicles or membrane-bound structures, and here’s the difference between them:

  • EV (Extracellular Vesicle):

    • Definition: EV is a broad term used to describe vesicles that are released from cells into the extracellular environment. These vesicles can be produced by various cell types, including eukaryotic cells (such as human cells) and some prokaryotic cells (like bacteria).
    • Origin: EVs can originate from different cellular compartments, including the plasma membrane, endosomes, and even the cytoplasm. They are involved in intercellular communication, transporting various molecules (such as proteins, nucleic acids, lipids) between cells.
    • Function: EVs have diverse functions, including cell-to-cell signaling, immune response modulation, and the transfer of genetic material between cells. They play a crucial role in various physiological and pathological processes.
  • Outer Membrane Vesicle (OMV):

    • Definition: OMVs are a specific type of extracellular vesicle produced by Gram-negative bacteria. They are derived from the outer membrane of these bacteria and contain various components from the bacterial cell envelope, including lipopolysaccharides (LPS), outer membrane proteins, and periplasmic contents.
    • Origin: OMVs are produced as blebs or protrusions from the outer membrane of Gram-negative bacteria. They are released into the extracellular environment and can be found in the surrounding medium during bacterial growth.
    • Function: OMVs have multiple functions in bacterial physiology and interactions with their environment. They can serve as a delivery system for bacterial toxins, virulence factors, and other molecules. OMVs also play a role in bacterial communication and host-pathogen interactions.

In summary, “EV” is a general term encompassing vesicles released by various cell types, including both eukaryotic and some prokaryotic cells, for diverse purposes. On the other hand, “OMV” specifically refers to vesicles produced by Gram-negative bacteria, which are derived from their outer membrane and contain components relevant to bacterial biology and interactions. While OMVs are a subset of EVs, not all EVs are OMVs, as EVs can come from different sources and serve different functions.

Utilizing Random Forest to Differentiate Bacterial vs. Viral Infections via Host Gene Expression Signatures

Random Forest is a machine learning model that falls under the category of ensemble learning methods. It is particularly known for classification and regression tasks but can also be used for other machine learning tasks like clustering.

A Random Forest model works by constructing a multitude of decision trees during the training process and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. Random decision forests correct for decision trees’ habit of overfitting to their training set.

The “random” part of the name comes from two key aspects of randomness used in the model:

  • Bootstrap aggregating (bagging): Each tree in the forest is trained on a random subset of the data points, and this subset is drawn with replacement (meaning some samples can be used multiple times).

  • Feature Randomness: When splitting a node during the construction of the tree, the split that is chosen is no longer the best split among all features. Instead, the split that is picked is the best split among a random subset of the features.

This approach of combining multiple trees helps to reduce the variance, leading to better performance and robustness than a single decision tree, especially on complex datasets. Random Forests are widely used because they are easy to use, can handle binary, categorical, and numerical data, require very little input preparation, and often produce a model that performs very well without complex tuning.

Using machine learning, specifically a Random Forest classifier, to distinguish between bacterial and viral infections based on transcript expression levels involves several steps. This process generally includes data preparation, feature selection, model training, validation, and testing. Here’s a more detailed breakdown of how you might implement this:

  1. Data Collection and Preparation

    • Collect Data: Obtain gene expression data from patients with known bacterial and viral infections. This data might come from public databases or your own experimental data.
    • Preprocess Data: Normalize the expression levels to make the data comparable across samples. Handling missing values and filtering out low-variance transcripts can also improve model performance.
  2. Feature Selection

    • Identify Features: The features are the expression levels of various transcripts. You might start with a large number of potential features. (We can choose the published transcripts as starting features!)
    • Reduce Dimensionality: Use techniques like Principal Component Analysis (PCA), variance thresholding, or mutual information to reduce the number of features. This step is crucial to improve the model’s performance and reduce overfitting.
  3. Model Training

    • Split the Data: Divide your dataset into training, validation, and testing sets. A common split ratio is 70% training, 15% validation, and 15% testing.
    • Train the Random Forest: Use the training data to fit a Random Forest classifier. This involves choosing a set of parameters, such as the number of trees in the forest (n_estimators) and the depth of the trees (max_depth). Tools like scikit-learn in Python provide straightforward implementations.
  4. Model Validation and Tuning

    • Cross-Validation: Use cross-validation on the training set to estimate the model’s performance. Adjust the model parameters based on validation results.
    • Hyperparameter Tuning: Techniques like grid search or random search can help identify the best parameters for your Random Forest model.
  5. Testing and Evaluation

    • Test the Model: Use the unseen test data to evaluate the model’s performance. Metrics like accuracy, precision, recall, and the area under the ROC curve (AUC) can provide insight into how well your model distinguishes between bacterial and viral infections.
    • Feature Importance: Random Forest can provide information on which transcripts (features) are most important for classification, offering potential biological insights.
  6. Implementation Tools

    • Python Libraries: Libraries like scikit-learn for machine learning, pandas for data manipulation, and matplotlib or seaborn for visualization are commonly used.
    • R Packages: For users more comfortable with R, packages like randomForest, caret, and tidyverse can be used for similar steps.

    Example Code Snippet (Python with scikit-learn)

      from sklearn.model_selection import train_test_split
      from sklearn.ensemble import RandomForestClassifier
      from sklearn.metrics import accuracy_score
      import pandas as pd
    
      # Assuming `data` is a Pandas DataFrame with the last column as the target label
      X = data.iloc[:, :-1]  # Features: Expression levels
      y = data.iloc[:, -1]   # Target: Infection type (0 for viral, 1 for bacterial)
    
      # Splitting the dataset
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
      # Creating and training the Random Forest model
      model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
      model.fit(X_train, y_train)
    
      # Predicting and evaluating the model
      predictions = model.predict(X_test)
      print("Accuracy:", accuracy_score(y_test, predictions))
    
      #In the context of machine learning, X_train and y_train are variables used during the training phase of a model. They represent the "features" and "target" (or "labels") of your training dataset, respectively.
    
      #X_train: This variable holds the features of your dataset that are used to make predictions. The "X" usually stands for the input variables or independent variables. In the case of transcript expression levels to distinguish between bacterial and viral infections, X_train would contain the expression levels of various transcripts for each sample in the training set. It's often structured as a matrix or DataFrame where each row represents a sample and each column represents a feature (e.g., the level of expression of a specific transcript).
    
      #y_train: This variable contains the target or labels for the training dataset. The "y" represents the output or dependent variable that you are trying to predict. In your case, y_train would contain the classification of each sample as either bacterial or viral, corresponding to the rows in X_train. y_train is typically a vector or Series, where each entry corresponds to the label of a sample in X_train.
    
      #Together, X_train and y_train are used to "train" or fit a machine learning model. The model learns from these training examples: it tries to understand the relationship between the features (X_train) and the labels (y_train) so that it can accurately predict the labels of new, unseen data. After the model is trained, it can then be tested using a separate dataset (not used during training) to evaluate its performance, typically using X_test and y_test, which hold the features and labels of the test dataset, respectively.
    
      #The three parameters you mentioned are among the key hyperparameters used to configure a Random Forest model. Here's what each of them means:
      #* n_estimators
      #  - Definition: The number of trees in the forest.
      #  - Purpose: More trees increase the model's robustness, making its predictions more stable, but also increase computational cost. Finding the right number of trees is a balance between improving model performance and computational efficiency.
      #  - Default Value in scikit-learn: As of version 0.22, the default value is 100.
    
      #* max_depth
      #  - Definition: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
      #  - Purpose: This parameter controls the depth of each tree. Deeper trees can model more complex patterns but also increase the risk of overfitting. Limiting the depth of trees can help in creating simpler models that generalize better.
      #  - Default Value in scikit-learn: The default value is None, meaning the nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
    
      #* random_state
      #  - Definition: Controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True) and the sampling of the features to consider when looking for the best split at each node.
      #  - Purpose: This parameter makes the model's output reproducible. It ensures that the same sequence of random numbers is generated each time you run the model with that specific seed. It's helpful for debugging and for situations where you need reproducibility.
      #  - Default Value in scikit-learn: The default value is None, which means the random number generator is the RandomState instance used by np.random.
    
      #Adjusting these parameters can significantly impact the model's performance and training time. It's common practice to use techniques like cross-validation and grid search to find the optimal values for these and other hyperparameters based on the specific dataset you're working with.
  7. Final Thoughts

    • Iterate: Machine learning is an iterative process. You may need to go back and adjust your preprocessing, feature selection, or model parameters based on test results.
    • Biological Interpretation: Beyond classification, consider how the identified transcripts and their expression levels contribute to the biological understanding of infection responses.

Unicycler vs. Trycycler

  1. prapare the input sequencing data

    NGS.id  Sample.name  ONT_barcode
    jk3332  5179R1  Native Barcode NB01
    jk3333  1585  Native Barcode NB02
    jk3334  1585V  Native Barcode NB03
    jk3335  5179  Native Barcode NB04
    jk3336  HD_05_2  Native Barcode NB05
    jk3337  HD_05_2_K5  Native Barcode NB06
    jk3338  HD_05_2_K6  Native Barcode NB07
  2. assembly using trycycler

    cat FAN41335_pass_barcode01_46d24d87_69a75752_0.fastq.gz FAN41335_pass_barcode01_46d24d87_69a75752_1.fastq.gz > FAN41335_pass_barcode01.fastq.gz
    cat FAN41335_pass_barcode03_46d24d87_69a75752_0.fastq.gz FAN41335_pass_barcode03_46d24d87_69a75752_1.fastq.gz > FAN41335_pass_barcode03.fastq.gz
    cat FAN41335_pass_barcode04_46d24d87_69a75752_0.fastq.gz FAN41335_pass_barcode04_46d24d87_69a75752_1.fastq.gz FAN41335_pass_barcode04_46d24d87_69a75752_2.fastq.gz > FAN41335_pass_barcode04.fastq.gz
    cat FAN41335_pass_barcode05_46d24d87_69a75752_0.fastq.gz FAN41335_pass_barcode05_46d24d87_69a75752_1.fastq.gz > FAN41335_pass_barcode05.fastq.gz
    
    unicycler -1 s-epidermidis-5179-r1_R1.fastq.gz -2 s-epidermidis-5179-r1_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode01/FAN41335_pass_barcode01.fastq.gz --mode normal -t 55 -o 5179R1_normal
    unicycler -1 s-epidermidis-1585_R1.fastq.gz -2 s-epidermidis-1585_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode02/FAN41335_pass_barcode02.fastq.gz --mode normal -t 55 -o 1585_normal
    #3 no short sequencing
    unicycler -1 s-epidermidis-5179_R1.fastq.gz -2 s-epidermidis-5179_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode04/FAN41335_pass_barcode04.fastq.gz --mode normal -t 55 -o 5179_normal
    unicycler -1 HD5_2_S38_R1_001.fastq.gz -2 HD5_2_S38_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode05/FAN41335_pass_barcode05.fastq.gz --mode normal -t 55 -o HD05_2_normal
    unicycler -1 275_K5_Holger_S92_R1_001.fastq.gz -2 275_K5_Holger_S92_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode06/FAN41335_pass_barcode06.fastq.gz --mode normal -t 55 -o HD05_2_K5_normal
    unicycler -1 276_K6_Holger_S95_R1_001.fastq.gz -2 276_K6_Holger_S95_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode07/FAN41335_pass_barcode07.fastq.gz --mode normal -t 55 -o HD05_2_K6_normal
    
    unicycler -1 s-epidermidis-5179-r1_R1.fastq.gz -2 s-epidermidis-5179-r1_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode01/FAN41335_pass_barcode01.fastq.gz --mode bold -t 55 -o 5179R1_bold
    unicycler -1 s-epidermidis-1585_R1.fastq.gz -2 s-epidermidis-1585_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode02/FAN41335_pass_barcode02.fastq.gz --mode bold -t 55 -o 1585_bold
    #3 no short sequencing
    unicycler -1 s-epidermidis-5179_R1.fastq.gz -2 s-epidermidis-5179_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode04/FAN41335_pass_barcode04.fastq.gz --mode bold -t 55 -o 5179_bold
    unicycler -1 HD5_2_S38_R1_001.fastq.gz -2 HD5_2_S38_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode05/FAN41335_pass_barcode05.fastq.gz --mode bold -t 55 -o HD05_2_bold
    unicycler -1 275_K5_Holger_S92_R1_001.fastq.gz -2 275_K5_Holger_S92_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode06/FAN41335_pass_barcode06.fastq.gz --mode bold -t 55 -o HD05_2_K5_bold
    unicycler -1 276_K6_Holger_S95_R1_001.fastq.gz -2 276_K6_Holger_S95_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode07/FAN41335_pass_barcode07.fastq.gz --mode bold -t 55 -o HD05_2_K6_bold
    
    unicycler -1 s-epidermidis-5179-r1_R1.fastq.gz -2 s-epidermidis-5179-r1_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode01/FAN41335_pass_barcode01.fastq.gz --mode bold -t 55 -o 5179R1_bold
    unicycler -1 s-epidermidis-1585_R1.fastq.gz -2 s-epidermidis-1585_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode02/FAN41335_pass_barcode02.fastq.gz --mode bold -t 55 -o 1585_bold
    #3 no short sequencing
    unicycler -1 s-epidermidis-5179_R1.fastq.gz -2 s-epidermidis-5179_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode04/FAN41335_pass_barcode04.fastq.gz --mode bold -t 55 -o 5179_bold
    unicycler -1 HD5_2_S38_R1_001.fastq.gz -2 HD5_2_S38_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode05/FAN41335_pass_barcode05.fastq.gz --mode bold -t 55 -o HD05_2_bold
    unicycler -1 275_K5_Holger_S92_R1_001.fastq.gz -2 275_K5_Holger_S92_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode06/FAN41335_pass_barcode06.fastq.gz --mode bold -t 55 -o HD05_2_K5_bold
    unicycler -1 276_K6_Holger_S95_R1_001.fastq.gz -2 276_K6_Holger_S95_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode07/FAN41335_pass_barcode07.fastq.gz --mode bold -t 55 -o HD05_2_K6_bold
    
    unicycler -1 s-epidermidis-5179-r1_R1.fastq.gz -2 s-epidermidis-5179-r1_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode01/FAN41335_pass_barcode01.fastq.gz --mode conservative -t 55 -o 5179R1_conservative
    unicycler -1 s-epidermidis-1585_R1.fastq.gz -2 s-epidermidis-1585_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode02/FAN41335_pass_barcode02.fastq.gz --mode conservative -t 55 -o 1585_conservative
    #3 no short sequencing
    unicycler -1 s-epidermidis-5179_R1.fastq.gz -2 s-epidermidis-5179_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode04/FAN41335_pass_barcode04.fastq.gz --mode conservative -t 55 -o 5179_conservative
    
    unicycler -1 HD5_2_S38_R1_001.fastq.gz -2 HD5_2_S38_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode05/FAN41335_pass_barcode05.fastq.gz --mode conservative -t 55 -o HD05_2_conservative
    unicycler -1 275_K5_Holger_S92_R1_001.fastq.gz -2 275_K5_Holger_S92_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode06/FAN41335_pass_barcode06.fastq.gz --mode conservative -t 55 -o HD05_2_K5_conservative
    unicycler -1 276_K6_Holger_S95_R1_001.fastq.gz -2 276_K6_Holger_S95_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode07/FAN41335_pass_barcode07.fastq.gz --mode conservative -t 55 -o HD05_2_K6_conservative
    
    ragtag.py scaffold  ../assembly_flye_HD05_2/assembly.fasta assembly.fasta
    ragtag.py patch  ragtag.scaffold.fasta ../../assembly_flye_HD05_2/assembly.fasta
    grep -o 'N' ragtag.patch.fasta | wc -l
    
    makeblastdb -in ../assembly_flye_HD05_2/assembly.fasta -dbtype nucl
    blastn -db ../assembly_flye_HD05_2/assembly.fasta -query 1-4.fasta -out assmbly_vs_flye.blastn -evalue 0.00000000001 -num_threads 15 -outfmt 6 -strand both -max_target_seqs 1 
  3. install the trycycler environment

    nextdenovo_dir="/path/to/NextDenovo"
    nextpolish_dir="/path/to/NextPolish"
    genome_size="2500000" #2 503 927
    /home/jhuang/Tools/canu/build/bin/canu    -p canu -d canu_temp -fast genomeSize="$genome_size" useGrid=false maxThreads="$threads" -nanopore read_subsets/sample_"$i".fastq
    /home/jhuang/Tools/Trycycler/scripts/canu_trim.py    canu_temp/canu.contigs.fasta > assemblies/assembly_"$i".fasta
    /home/jhuang/Tools/Minipolish/miniasm_and_minipolish.sh    read_subsets/sample_"$i".fastq "$threads" > assemblies/assembly_"$i".gfa
    /home/jhuang/Tools/NECAT/Linux-amd64/bin/necat.pl    config config.txt
    /home/jhuang/Tools/NECAT/Linux-amd64/bin/necat.pl    bridge config.txt
    /home/jhuang/Tools/raven/build/bin/raven    --threads "$threads" --disable-checkpoints --graphical-fragment-assembly assemblies/assembly_"$i".gfa read_subsets/sample_"$i".fastq > assemblies/assembly_"$i".fasta
    
    #https://github.com/rrwick (Bandage, Unicycler, Filtlong, Trycycler, Polypolish
    install canu, flye, raven, miniasm, minipolish, any2fasta via 'mamba install'
    #install fastp, medaka, polypolish, masurca (install Polca) with 'mamba install'
    
    install NextDenovo and NextPolish from https://github.com/Nextomics
    wget https://github.com/Nextomics/NextDenovo/releases/latest/download/NextDenovo.tgz
    tar -vxzf NextDenovo.tgz && cd NextDenovo
    #cd NextDenovo && make
    wget https://github.com/Nextomics/NextPolish/releases/download/v1.4.1/NextPolish.tgz
    pip install paralleltask
    tar -vxzf NextPolish.tgz && cd NextPolish   #&& make
    
    git clone https://github.com/rrwick/Minipolish.git
    
    $ wget https://github.com/xiaochuanle/NECAT/releases/download/v0.0.1_update20200803/necat_20200803_Linux-amd64.tar.gz
    $ tar xzvf necat_20200803_Linux-amd64.tar.gz
    $ cd NECAT/Linux-amd64/bin
    $ export PATH=$PATH:$(pwd)
    
    # Install canu and raven under ~/Tools/
    git clone https://github.com/marbl/canu.git
    cd canu/src
    make -j 50  #<number of threads>
    
    git clone https://github.com/lbcb-sci/raven && cd raven
    cmake -S ./ -B./build -DRAVEN_BUILD_EXE=1 -DCMAKE_BUILD_TYPE=Release
    cmake --build build
    
    # Adapt the script trycycler_assembly_extra-thorough.sh with the following complete paths.
    /home/jhuang/Tools/NECAT/Linux-amd64/bin/necat.pl
    /home/jhuang/Tools/Minipolish/miniasm_and_minipolish.sh
  4. assembly using trycycler

    TODO (IMPORTANT): assmeble all genomes using the following methods. compare them to the unicycler results.
    
    (trycycler) jhuang@hamm:~/DATA/Data_Holger_S.epidermidis_1585_5179_HD05$ ./trycycler_assembly_extra-thorough.sh 
    
    #In the HD05 project, we use the following strategies!
    
    I. At first construct the genome only with Trycycler (Trycycler: a consensus long-read assembly tool), 
    
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode01/FAN41335_pass_barcode01.fastq.gz trycycler_5179R1/reads.fastq.gz
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode02/FAN41335_pass_barcode02.fastq.gz trycycler_1585/reads.fastq.gz
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode03/FAN41335_pass_barcode03.fastq.gz trycycler_1585v/reads.fastq.gz
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode04/FAN41335_pass_barcode04.fastq.gz trycycler_5179/reads.fastq.gz
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode05/FAN41335_pass_barcode05.fastq.gz trycycler_HD05_2/reads.fastq.gz
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode06/FAN41335_pass_barcode06.fastq.gz trycycler_HD05_2_K5/reads.fastq.gz
    cp long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode07/FAN41335_pass_barcode07.fastq.gz trycycler_HD05_2_K6/reads.fastq.gz
    
    for sample in trycycler_5179R1 trycycler_1585 trycycler_1585v trycycler_5179 trycycler_HD05_2 trycycler_HD05_2_K5 trycycler_HD05_2_K6; do
    cd ${sample};
    ../trycycler_assembly_extra-thorough.sh;
    cd ..;
    done
    #TODO: further steps (see https://github.com/rrwick/Trycycler/wiki)
    
    for sample in trycycler_5179R1 trycycler_1585 trycycler_1585v trycycler_5179 trycycler_HD05_2 trycycler_HD05_2_K5 trycycler_HD05_2_K6; do
    cd ${sample};
    ../trycycler_assembly_extra-thorough_raven.sh;
    cd ..;
    done
    #TODO: further steps (see https://github.com/rrwick/Trycycler/wiki)
    
    for sample in trycycler_5179R1 trycycler_1585 trycycler_1585v trycycler_5179 trycycler_HD05_2 trycycler_HD05_2_K5 trycycler_HD05_2_K6; do
    cd ${sample};
    ../trycycler_assembly_extra-thorough_canu.sh;
    cd ..;
    done
    #TODO: further steps (see https://github.com/rrwick/Trycycler/wiki)
    
    for sample in trycycler_5179R1 trycycler_1585 trycycler_1585v trycycler_5179 trycycler_HD05_2 trycycler_HD05_2_K5 trycycler_HD05_2_K6; do
    cd ${sample};
    trycycler cluster --threads 55 --assemblies assemblies/*.fasta --reads reads.fastq --out_dir trycycler;
    cd ..;
    done
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    #Error: failed to circularise sequence D_bctg00000000 because its start could not be found in other sequences. You can either trim some sequence off the start of D_bctg00000000 or exclude the sequence altogether
    and try again.
    #Error: failed to circularise sequence E_ctg000010 for multiple reasons. You must either repair this sequence or exclude it and then try running trycycler reconcile again.
    #Error: failed to circularise sequence W_ctg000000 because its start could not be found in other sequences. You can either trim some sequence off the start of W_ctg000000 or exclude the sequence altogether and try again.
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    #Error: failed to circularise sequence K_ctg000000 because its end could not be found in other sequences. You can either trim some sequence off the end of K_ctg000000 or exclude the sequence altogether and try
    #Worst-1kbp: W_Utg714
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    #Error: failed to circularise sequence T_contig_1 because its end could not be found in other sequences. You can either trim some sequence off the end of T_contig_1 or exclude the sequence altogether and try again.
    # Worst-1kbp: D_bctg00000000, J_bctg00000000, P_bctg00000000
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    #Error: failed to circularise sequence A_tig00000003 because its start could not be found in other sequences. You can either trim some sequence off the start of A_tig00000003 or exclude the sequence altogether and try again.
    #Error: failed to circularise sequence E_ctg000000 because its start could not be found in other sequences. You can either trim some sequence off the start of E_ctg000000 or exclude the sequence altogether and try again.
    #Error: failed to circularise sequence Q_ctg000000 because its end could not be found in other sequences. You can either trim some sequence off the end of Q_ctg000000 or exclude the sequence altogether and try again.
    # Worst-1kbp: L_Utg716, X_Utg654
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_001
    
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_002
    #M_tig00000002, S_tig00000003, A_tig00000003, C_utg000003l, G_tig00000002, I_utg000002l
    #E_ctg000000, K_ctg000000, Q_ctg000000
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_003
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_004
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_005
    trycycler reconcile --threads 55 --reads reads.fastq --cluster_dir trycycler/cluster_006
    #
    #--> When finished, Trycycler reconcile will make 2_all_seqs.fasta in the cluster directory, a multi-FASTA file containing each of the contigs ready for multiple sequence alignment.
    
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_001
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_002
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_003
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_004
    trycycler msa --threads 55 --cluster_dir trycycler/cluster_005
    #--> When finished, Trycycler reconcile will make a 3_msa.fasta file in the cluster directory
    
    #generate 4_reads.fastq for each contig!
    trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_*
    #trycycler partition --threads 55 --reads reads.fastq --cluster_dirs trycycler/cluster_001 trycycler/cluster_002 trycycler/cluster_003
    
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_001
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_002
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_003
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_004
    trycycler consensus --threads 55 --cluster_dir trycycler/cluster_005
    
    #!!NOTE that we take the isolates of HD05_2_K5 and HD05_2_K6 assembled by Unicycler instead of Trycycler!!
    
    # TODO (TODAY), generate the 3 datasets below!
    # TODO (IMPORTANT): write a Email to Holger, say the short sequencing of HD5_2 is not correct, since the 3 datasets! However, the MTxxxxxxx is confirmed not in K5 and K6!
    TODO: variant calling needs the short-sequencing, they are not dorable without the correct short-reads! resequencing? It is difficult to call variants only from long-reads since too much errors in long-reads!
    #TODO: check the MT sequence if in the isolates, more deteiled annotations come late!
    #II. Comparing the results of Trycycler with Unicycler.
    #III. Eventually add the plasmids assembled from unicycler to the final results. E.g. add the 4 plasmids to K5 and K6
  5. Polishing after Trycycler

    #1. Oxford Nanopore sequencer (Ignored due to the samtools version incompatibility!)
    # for c in trycycler/cluster_*; do
    #     medaka_consensus -i "$c"/4_reads.fastq -d "$c"/7_final_consensus.fasta -o "$c"/medaka -m r941_min_sup_g507 -t 12
    #     mv "$c"/medaka/consensus.fasta "$c"/8_medaka.fasta
    #     rm -r "$c"/medaka "$c"/*.fai "$c"/*.mmi  # clean up
    # done
    # cat trycycler/cluster_*/8_medaka.fasta > trycycler/consensus.fasta
    
    #2. Short-read polishing
    
    #---- 5179_R1 (2) ----
    #  mean read depth: 205.8x
    #  188 bp have a depth of zero (99.9924% coverage)
    #  355 positions changed (0.0144% of total positions)
    #  estimated pre-polishing sequence accuracy: 99.9856% (Q38.42)
    
    #Step 1: read QC
    fastp --in1 ../../s-epidermidis-5179-r1_R1.fastq.gz --in2 ../../s-epidermidis-5179-r1_R2.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    #Step 2: Polypolish
    for cluster in cluster_001 cluster_002; do
    bwa index ${cluster}/7_final_consensus.fasta
    bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
    bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
    polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    done
    
    #Step 3: POLCA
    for cluster in cluster_001 cluster_002; do
    cd ${cluster}
    polca.sh -a polypolish.fasta -r "../../../s-epidermidis-5179-r1_R1.fastq.gz ../../../s-epidermidis-5179-r1_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    
    #Substitution Errors: 37
    #Insertion/Deletion Errors: 2
    #Assembly Size: 2470001
    #Consensus Quality: 99.9984
    
    #Substitution Errors: 4
    #Insertion/Deletion Errors: 0
    #Assembly Size: 17748
    #Consensus Quality: 99.9775
    
    #Step 4: (optional) more rounds and/or other polishers
    #After one round of Polypolish and one round of POLCA, your assembly should be in very good shape! 
    #However, there may still be a few lingering errors. You can try running additional rounds of Polypolish or POLCA to see if they make any more changes.
    
    for cluster in cluster_001 cluster_002; do
    cd ${cluster}
    polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../s-epidermidis-5179-r1_R1.fastq.gz ../../../s-epidermidis-5179-r1_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    
    Substitution Errors: 13
    Insertion/Deletion Errors: 0
    Assembly Size: 2470004
    Consensus Quality: 99.9995
    
    Substitution Errors: 0
    Insertion/Deletion Errors: 0
    Assembly Size: 17748
    Consensus Quality: 100
    
    for cluster in cluster_001; do
    cd ${cluster}
    polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../s-epidermidis-5179-r1_R1.fastq.gz ../../../s-epidermidis-5179-r1_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2470004
    #Consensus Quality: 100
    
    #---- 1585 (4) ----
    #  mean read depth: 174.7x
    #  8,297 bp have a depth of zero (99.6604% coverage)
    #  271 positions changed (0.0111% of total positions)
    #  estimated pre-polishing sequence accuracy: 99.9889% (Q39.55)
    
    #Step 1: read QC
    fastp --in1 ../../s-epidermidis-1585_R1.fastq.gz --in2 ../../s-epidermidis-1585_R2.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    #Step 2: Polypolish
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      bwa index ${cluster}/7_final_consensus.fasta
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
      polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    done
    
    #Step 3: POLCA
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      cd ${cluster}
      polca.sh -a polypolish.fasta -r "../../../s-epidermidis-1585_R1.fastq.gz ../../../s-epidermidis-1585_R2.fastq.gz" -t 55 -m 120G
      cd ..
    done
    
    #Substitution Errors: 7
    #Insertion/Deletion Errors: 4
    #Assembly Size: 2443174
    #Consensus Quality: 99.9995
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 9014
    #Consensus Quality: 100
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 9014
    #Consensus Quality: 100
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2344
    #Consensus Quality: 100
    
    #Step 4: (optional) more rounds and/or other polishers
    #After one round of Polypolish and one round of POLCA, your assembly should be in very good shape! 
    #However, there may still be a few lingering errors. You can try running additional rounds of Polypolish or POLCA to see if they make any more changes.
    
    for cluster in cluster_001; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../s-epidermidis-1585_R1.fastq.gz ../../../s-epidermidis-1585_R2.fastq.gz" -t 55 -m 120G
      cd ..
    done
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2443176
    #Consensus Quality: 100
    
    #---- 1585 derived from unicycler, under 1585_normal/unicycler (4) ----
    #Step 0: copy chrom and plasmid1, plasmid2, plasmid3 to cluster_001/7_final_consensus.fasta, ...
    
    #Step 1: read QC
    fastp --in1 ../../s-epidermidis-1585_R1.fastq.gz --in2 ../../s-epidermidis-1585_R2.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    #Step 2: Polypolish
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      bwa index ${cluster}/7_final_consensus.fasta
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
      polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    done
    #Polishing 1 (2,443,574 bp):
    #mean read depth: 174.7x
    #8,298 bp have a depth of zero (99.6604% coverage)
    #52 positions changed (0.0021% of total positions)
    #estimated pre-polishing sequence accuracy: 99.9979% (Q46.72)
    #Polishing 2 (9,014 bp):
    #mean read depth: 766.5x
    #3 bp have a depth of zero (99.9667% coverage)
    #0 positions changed (0.0000% of total positions)
    #estimated pre-polishing sequence accuracy: 100.0000% (Q∞)
    #Polishing 7 (2,344 bp):
    #mean read depth: 2893.0x
    #4 bp have a depth of zero (99.8294% coverage)
    #0 positions changed (0.0000% of total positions)
    #estimated pre-polishing sequence accuracy: 100.0000% (Q∞)
    #Polishing 8 (2,255 bp):
    #mean read depth: 2719.6x
    #4 bp have a depth of zero (99.8226% coverage)
    #0 positions changed (0.0000% of total positions)
    #estimated pre-polishing sequence accuracy: 100.0000% (Q∞)
    
    #Step 3: POLCA
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      cd ${cluster}
      polca.sh -a polypolish.fasta -r "../../../s-epidermidis-1585_R1.fastq.gz ../../../s-epidermidis-1585_R2.fastq.gz" -t 55 -m 120G
      cd ..
    done
    
    #Substitution Errors: 7
    #Insertion/Deletion Errors: 4
    #Assembly Size: 2443598
    #Consensus Quality: 99.9995
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 9014
    #Consensus Quality: 100
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2344
    #Consensus Quality: 100
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2255
    #Consensus Quality: 100
    
    #Step 4: (optional) more rounds and/or other polishers
    #After one round of Polypolish and one round of POLCA, your assembly should be in very good shape! 
    #However, there may still be a few lingering errors. You can try running additional rounds of Polypolish or POLCA to see if they make any more changes.
    
    for cluster in cluster_001; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../s-epidermidis-1585_R1.fastq.gz ../../../s-epidermidis-1585_R2.fastq.gz" -t 55 -m 120G
      cd ..
    done
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2443600
    #Consensus Quality: 100
    
    #-- 1585v (1, no short reads, waiting) --
    # TODO!
    
    #-- 5179 (2) --
    #mean read depth: 120.7x
    #7,547 bp have a depth of zero (99.6946% coverage)
    #356 positions changed (0.0144% of total positions)
    #estimated pre-polishing sequence accuracy: 99.9856% (Q38.41)
    
    #Step 1: read QC
    fastp --in1 ../../s-epidermidis-5179_R1.fastq.gz --in2 ../../s-epidermidis-5179_R2.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    #Step 2: Polypolish
    for cluster in cluster_001 cluster_002; do
    bwa index ${cluster}/7_final_consensus.fasta
    bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
    bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
    polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    done
    
    #Step 3: POLCA
    for cluster in cluster_001 cluster_002; do
    cd ${cluster}
    polca.sh -a polypolish.fasta -r "../../../s-epidermidis-5179_R1.fastq.gz ../../../s-epidermidis-5179_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    
    #Substitution Errors: 49
    #Insertion/Deletion Errors: 23
    #Assembly Size: 2471418
    #Consensus Quality: 99.9971
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 17748
    #Consensus Quality: 100
    
    #Step 4: (optional) more rounds POLCA
    for cluster in cluster_001; do
    cd ${cluster}
    polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../s-epidermidis-5179_R1.fastq.gz ../../../s-epidermidis-5179_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    #Substitution Errors: 10
    #Insertion/Deletion Errors: 5
    #Assembly Size: 2471442
    #Consensus Quality: 99.9994
    
    for cluster in cluster_001; do
    cd ${cluster}
    polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../s-epidermidis-5179_R1.fastq.gz ../../../s-epidermidis-5179_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    Substitution Errors: 6
    Insertion/Deletion Errors: 0
    Assembly Size: 2471445
    Consensus Quality: 99.9998
    
    for cluster in cluster_001; do
    cd ${cluster}
    polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../s-epidermidis-5179_R1.fastq.gz ../../../s-epidermidis-5179_R2.fastq.gz" -t 55 -m 120G
    cd ..
    done
    Substitution Errors: 0
    Insertion/Deletion Errors: 0
    Assembly Size: 2471445
    Consensus Quality: 100
    
    #-- HD5_2 (2): without the short-sequencing we cannot correct the base-calling! --
    # !ERROR to be REPORTED, the 
    #Polishing cluster_001_consensus (2,504,140 bp):
    #mean read depth: 94.4x
    #240,420 bp have a depth of zero (90.3991% coverage)
    #56,894 positions changed (2.2720% of total positions)
    #estimated pre-polishing sequence accuracy: 97.7280% (Q16.44)
    
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_1_S37_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_1_S37_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_2_S38_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_2_S38_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_3_S39_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_3_S39_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_4_S40_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_4_S40_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_5_S41_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_5_S41_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_6_S42_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_6_S42_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_7_S43_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_7_S43_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_8_S44_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_8_S44_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_9_S45_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_9_S45_R2_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_10_S46_R1_001.fastq
    /media/jhuang/Elements2/Data_Anna12_HAPDICS_HyAsP/180821_rohde/HD5_10_S46_R2_001.fastq
    #Step 1: read QC
    fastp --in1 ../../HD5_2_S38_R1_001.fastq.gz --in2 ../../HD5_2_S38_R2_001.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    # NOTE that the following steps are not run since the short-reads are not correct!
    # #Step 2: Polypolish
    # for cluster in cluster_001 cluster_005; do
    #   bwa index ${cluster}/7_final_consensus.fasta
    #   bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
    #   bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
    #   polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    # done
    
    # #Step 3: POLCA
    # for cluster in cluster_001 cluster_005; do
    #   cd ${cluster}
    #   polca.sh -a polypolish.fasta -r "../../../HD5_2_S38_R1_001.fastq.gz ../../../HD5_2_S38_R2_001.fastq.gz" -t 55 -m 120G
    #   cd ..
    # done
    
    # #Step 4: (optional) more rounds POLCA
    # for cluster in cluster_001; do
    #   cd ${cluster}
    #   polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../HD5_2_S38_R1_001.fastq.gz ../../../HD5_2_S38_R2_001.fastq.gz" -t 55 -m 120G
    #   cd ..
    # done
    
    # NOTE that the plasmids of HD5_2_K5 and HD5_2_K6 were copied from Unicycler!
    #-- HD5_2_K5 (4) --
    mean read depth: 87.1x
    25 bp have a depth of zero (99.9990% coverage)
    1,085 positions changed (0.0433% of total positions)
    estimated pre-polishing sequence accuracy: 99.9567% (Q33.63)
    
    #Step 1: read QC
    fastp --in1 ../../275_K5_Holger_S92_R1_001.fastq.gz --in2 ../../275_K5_Holger_S92_R2_001.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    #Step 2: Polypolish
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      bwa index ${cluster}/7_final_consensus.fasta
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
      polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    done
    
    #Step 3: POLCA
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      cd ${cluster}
      polca.sh -a polypolish.fasta -r "../../../275_K5_Holger_S92_R1_001.fastq.gz ../../../275_K5_Holger_S92_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 146
    #Insertion/Deletion Errors: 2
    #Assembly Size: 2504401
    #Consensus Quality: 99.9941
    
    #Substitution Errors: 41
    #Insertion/Deletion Errors: 0
    #Assembly Size: 41288
    #Consensus Quality: 99.9007
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 9191
    #Consensus Quality: 100
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2767
    #Consensus Quality: 100
    
    #Step 4: (optional) more rounds POLCA
    for cluster in cluster_001 cluster_002; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../275_K5_Holger_S92_R1_001.fastq.gz ../../../275_K5_Holger_S92_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 41
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2504401
    #Consensus Quality: 99.9984
    
    #Substitution Errors: 8
    #Insertion/Deletion Errors: 0
    #Assembly Size: 41288
    #Consensus Quality: 99.9806
    
    for cluster in cluster_001 cluster_002; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../275_K5_Holger_S92_R1_001.fastq.gz ../../../275_K5_Holger_S92_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 8
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2504401
    #Consensus Quality: 99.9997
    
    #Substitution Errors: 4
    #Insertion/Deletion Errors: 0
    #Assembly Size: 41288
    #Consensus Quality: 99.9903
    
    for cluster in cluster_001 cluster_002; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../275_K5_Holger_S92_R1_001.fastq.gz ../../../275_K5_Holger_S92_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 8
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2504401
    #Consensus Quality: 99.9997
    
    #Substitution Errors: 4
    #Insertion/Deletion Errors: 0
    #Assembly Size: 41288
    #Consensus Quality: 99.9903
    
    #-- HD5_2_K6 (4) --
    #mean read depth: 116.7x
    #4 bp have a depth of zero (99.9998% coverage)
    #1,022 positions changed (0.0408% of total positions)
    #estimated pre-polishing sequence accuracy: 99.9592% (Q33.89)
    
    #Step 1: read QC
    fastp --in1 ../../276_K6_Holger_S95_R1_001.fastq.gz --in2 ../../276_K6_Holger_S95_R2_001.fastq.gz --out1 1.fastq.gz --out2 2.fastq.gz --unpaired1 u.fastq.gz --unpaired2 u.fastq.gz
    
    #Step 2: Polypolish
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      bwa index ${cluster}/7_final_consensus.fasta
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 1.fastq.gz > ${cluster}/alignments_1.sam
      bwa mem -t 55 -a ${cluster}/7_final_consensus.fasta 2.fastq.gz > ${cluster}/alignments_2.sam
      polypolish polish ${cluster}/7_final_consensus.fasta ${cluster}/alignments_1.sam ${cluster}/alignments_2.sam > ${cluster}/polypolish.fasta
    done
    
    #Step 3: POLCA
    for cluster in cluster_001 cluster_002 cluster_003 cluster_004; do
      cd ${cluster}
      polca.sh -a polypolish.fasta -r "../../../276_K6_Holger_S95_R1_001.fastq.gz ../../../276_K6_Holger_S95_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 164
    #Insertion/Deletion Errors: 2
    #Assembly Size: 2504398
    #Consensus Quality: 99.9934
    
    #Substitution Errors: 22
    #Insertion/Deletion Errors: 0
    #Assembly Size: 41288
    #Consensus Quality: 99.9467
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 9191
    #Consensus Quality: 100
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2767
    #Consensus Quality: 100
    
    #Step 4: (optional) more rounds POLCA
    for cluster in cluster_001 cluster_002; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa -r "../../../276_K6_Holger_S95_R1_001.fastq.gz ../../../276_K6_Holger_S95_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 32
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2504400
    #Consensus Quality: 99.9987
    
    #Substitution Errors: 0
    #Insertion/Deletion Errors: 0
    #Assembly Size: 41288
    #Consensus Quality: 100
    
    for cluster in cluster_001; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../276_K6_Holger_S95_R1_001.fastq.gz ../../../276_K6_Holger_S95_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 4
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2504400
    #Consensus Quality: 99.9998
    
    for cluster in cluster_001; do
      cd ${cluster}
      polca.sh -a polypolish.fasta.PolcaCorrected.fa.PolcaCorrected.fa.PolcaCorrected.fa -r "../../../276_K6_Holger_S95_R1_001.fastq.gz ../../../276_K6_Holger_S95_R2_001.fastq.gz" -t 55 -m 120G
      cd ..
    done
    #Substitution Errors: 2
    #Insertion/Deletion Errors: 0
    #Assembly Size: 2504400
    #Consensus Quality: 99.9999
  6. Results by directly using Unicycler

    #----------------------- 5179R1_normal ----------------------- 
    
    >1 length=2468563 depth=1.00x circular=true
    >2 length=17748 depth=1.42x circular=true
    
    Component   Segments   Links   Length      N50         Longest segment   Status
        total          2       2   2,486,311   2,468,563         2,468,563
            1          1       1   2,468,563   2,468,563         2,468,563   complete
            2          1       1      17,748      17,748            17,748   complete
    
    Segment   Length      Depth   Starting gene         Position    Strand    Identity   Coverage
        1   2,468,563   1.00x   UniRef90_Q5HJZ9       1,212,460   forward     100.0%     100.0%
        2      17,748   1.42x   UniRef90_A0A0H2VIR3       4,804   reverse      93.2%      99.7%
    
    # ---- 5179_bold ----
    
    Segment   Length      Depth    Starting gene         Position    Strand    Identity   Coverage
        1   2,469,173    1.00x   UniRef90_Q5HJZ9       1,901,872   reverse     100.0%     100.0%
        2      17,749    2.27x   UniRef90_A0A0H2VIR3       4,771   forward      93.2%      99.7%
        4       4,595   10.19x   none found
        8       2,449   17.14x   none found
    
    >1 length=2469173 depth=1.00x circular=true
    >2 length=17749 depth=2.27x circular=true
    >3 length=4761 depth=0.44x
    >4 length=4595 depth=10.19x circular=true
    >5 length=3735 depth=0.29x
    >6 length=3718 depth=0.42x
    >7 length=3573 depth=0.52x
    >8 length=2449 depth=17.14x circular=true
    >9 length=2411 depth=0.35x
    >10 length=2371 depth=0.32x
    >11 length=2365 depth=0.43x
    >12 length=1637 depth=0.44x
    >13 length=1568 depth=0.66x
    >14 length=1505 depth=0.65x
    >15 length=1403 depth=0.93x
    >16 length=1329 depth=0.55x
    
    makeblastdb -in assembly.fasta -dbtype nucl
    blastn -task blastn-short -db ../HD05_2_K5_conservative/assembly.fasta -query assembly.fasta -out 2-16_vs_1.blastn -evalue 0.00000000001 -num_threads 15 -outfmt 6 -strand both -max_target_seqs 1 
    
    #TODO: manually fill the gap in the HD05_2 genome!
    
    5       1       99.946  3728    1       1       1       3728    1535666 1539392 0.0     7366
    6       1       99.973  3718    0       1       1       3718    702963  706679  0.0     7355
    7       1       99.888  3573    1       3       1       3573    1764622 1768191 0.0     7027
    9       1       100.000 2411    0       0       1       2411    1060914 1063324 0.0     4779
    10      1       100.000 2371    0       0       1       2371    615275  612905  0.0     4700
    11      1       99.958  2365    0       1       1       2365    1088713 1086350 0.0     4672
    12      1       100.000 1637    0       0       1       1637    146635  144999  0.0     3245
    13      1       99.936  1568    0       1       1       1568    2024197 2025763 0.0     3092
    14      1       100.000 1505    0       0       1       1505    2445480 2443976 0.0     2983
    15      1       100.000 1403    0       0       1       1403    197723  196321  0.0     2781
    16      1       99.925  1329    1       0       1       1329    49854   48526   0.0     2627
    
    # -------------------- 1585_normal --------------------
    >1 length=2443574 depth=1.00x circular=true       #contig_1        2442282 10      60      61
    >2 length=9014 depth=3.72x circular=true
    >3 length=4388 depth=0.89x
    >4 length=3443 depth=0.48x
    >5 length=3338 depth=0.48x
    >6 length=3336 depth=0.45x
    >7 length=2344 depth=11.44x circular=true
    >8 length=2255 depth=9.81x circular=true
    >9 length=1929 depth=0.37x
    >10 length=1703 depth=1.67x
    >11 length=1605 depth=0.26x
    >12 length=1381 depth=0.56x
    >13 length=1360 depth=0.39x
    >14 length=1281 depth=0.41x
    >15 length=1163 depth=0.51x
    >16 length=1088 depth=0.24x
    
    2594107
    
    ragtag.py scaffold  ../HD05_2_K5_normal/assembly.fasta assembly.fasta
    ragtag.py patch    ragtag.scaffold.fasta ../../HD05_2_K5_normal/assembly.fasta
    grep -o 'N' ragtag.patch.fasta | wc -l
    
    3       1       99.977  4388    0       1       1       4388    2410738 2406352 0.0     8683
    4       1       99.942  3443    0       2       1       3443    2222741 2219301 0.0     6794
    5       1       99.970  3338    0       1       1       3338    455636  452300  0.0     6601
    6       1       99.940  3336    0       2       1       3336    1617740 1614407 0.0     6581
    9       1       99.948  1929    0       1       1       1929    1321522 1319595 0.0     3808
    10      1       99.941  1703    1       0       1       1703    90503   88801   0.0     3368
    11      1       99.938  1605    0       1       1       1605    2361795 2363398 0.0     3166
    12      1       99.928  1381    0       1       1       1381    241092  242471  0.0     2722
    13      1       100.000 1360    0       0       1       1360    1157897 1159256 0.0     2696
    14      1       100.000 1281    0       0       1       1281    218323  219603  0.0     2539
    15      1       100.000 1163    0       0       1       1163    2077536 2078698 0.0     2305
    16      1       100.000 1088    0       0       1       1088    283284  284371  0.0     2157
    
    >1 length=2503585 depth=1.00x circular=true
    >2 length=41288 depth=3.32x circular=true
    >3 length=9191 depth=8.29x circular=true
    >4 length=2767 depth=9.36x circular=true
    
    >1 length=2503927 depth=1.00x circular=true
    >2 length=41288 depth=3.77x circular=true
    >3 length=9191 depth=7.83x circular=true
    >4 length=2767 depth=10.11x circular=true
    
    #--------------------------
    1585V
    #[2024-01-17 13:42:28] INFO: Assembly statistics:
    
        Total length:   2438882 vs 2443574
        Fragments:  1
        Fragments N50:  2438882
        Largest frg:    2438882
        Scaffolds:  0
        Mean coverage:  47
    
    unicycler -1 s-epidermidis-5179-r1_R1.fastq.gz -2 s-epidermidis-5179-r1_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode01/FAN41335_pass_barcode01.fastq.gz --mode conservative -t 55 -o 5179R1_conservative
    unicycler -1 s-epidermidis-1585_R1.fastq.gz -2 s-epidermidis-1585_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode02/FAN41335_pass_barcode02.fastq.gz --mode conservative -t 55 -o 1585_conservative
    #3 no short sequencing
    unicycler -1 s-epidermidis-5179_R1.fastq.gz -2 s-epidermidis-5179_R2.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode04/FAN41335_pass_barcode04.fastq.gz --mode conservative -t 55 -o 5179_conservative
    
    unicycler -1 HD5_2_S38_R1_001.fastq.gz -2 HD5_2_S38_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode05/FAN41335_pass_barcode05.fastq.gz --mode conservative -t 55 -o HD05_2_conservative
    unicycler -1 275_K5_Holger_S92_R1_001.fastq.gz -2 275_K5_Holger_S92_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode06/FAN41335_pass_barcode06.fastq.gz --mode conservative -t 55 -o HD05_2_K5_conservative
    unicycler -1 276_K6_Holger_S95_R1_001.fastq.gz -2 276_K6_Holger_S95_R2_001.fastq.gz -l long_reads/240109_FAN41335_S_epidermidis/fastq_pass/barcode07/FAN41335_pass_barcode07.fastq.gz --mode conservative -t 55 -o HD05_2_K6_conservative
    
    # ---- 1  5179R1  2469692 ----
    >1 length=2468563 depth=1.00x circular=true
    >2 length=17748 depth=1.42x circular=true
    
    # ---- 2  1585    2442282 ---- (compring to Trycyler chrom is 2443176 nt)
    >1 length=2443574 depth=1.00x circular=true
    >2 length=9014 depth=3.72x circular=true
    >7 length=2344 depth=11.44x circular=true
    >8 length=2255 depth=9.81x circular=true
    
    # ---- 3  1585v   2438882 ----
    #using long sequencing only 1
    
    # ---- 4  5179_bold    2471107+17740 ----
    >1 length=2469173 depth=1.00x circular=true
    >2 length=17749 depth=2.27x circular=true
    >4 length=4595 depth=10.19x circular=true
    >8 length=2449 depth=17.14x circular=true
    
    # ---- 5  HD05_2  2504622 ----
    # Note in HD05_2_bold_hq_lq including the bad long-reads.
    >1 length=965875 depth=0.95x
    >2 length=855325 depth=1.00x
    >3 length=582944 depth=1.02x
    >4 length=183656 depth=1.02x
    >5 length=13570 depth=4.73x circular=true
    >6 length=1503 depth=4.85x
    >7 length=1271 depth=5.06x
    >8 length=227 depth=2.03x
    >9 length=153 depth=0.93x
    >10 length=152 depth=1.09x
    
    # trycycler: 2503231 (yes), 9183 (yes), 22394, 18541 --
    
    # ---- 6  HD05_2_K5  2504656+41290+9191 ----
    conservative
    >1 length=2503585 depth=1.00x circular=true
    >2 length=41288 depth=3.32x circular=true
    >3 length=9191 depth=8.29x circular=true
    >4 length=2767 depth=9.36x circular=true
    
    # ---- 7  HD05_2_K6  2504588+41285+9192 ----
    conservative
    >1 length=2503927 depth=1.00x circular=true
    >2 length=41288 depth=3.77x circular=true
    >3 length=9191 depth=7.83x circular=true
    >4 length=2767 depth=10.11x circular=true
    
    ragtag.py scaffold  ../assembly_flye_HD05_2/assembly.fasta assembly.fasta
    ragtag.py patch  ragtag.scaffold.fasta ../../assembly_flye_HD05_2/assembly.fasta
    grep -o 'N' ragtag.patch.fasta | wc -l
    
    makeblastdb -in ../assembly_flye_HD05_2/assembly.fasta -dbtype nucl
    blastn -db ../assembly_flye_HD05_2/assembly.fasta -query 1-4.fasta -out assmbly_vs_flye.blastn -evalue 0.000001 -num_threads 15 -outfmt 6 -strand minus -max_target_seqs 1 
  7. Submit all genomes to NCBI

    TODO: If 1585V using only the long reads to assemble the genome!
    BioSample accession
            BioProject: PRJNA1038700
            Staphylococcus epidermidis strain:1585v | isolate:1585v Genome sequencing 
                SAMN38198576
                Pathogen: clinical or host-associated sample from Staphylococcus epidermidis 
                    0 SRAs
    
    Status
        To be released
    
    Release date
        2027-11-10
    
    Created
        2023-11-10 15:24
    
    Updated
        2023-11-17 16:57
    
    Sample name
        1585v 
    
    Package
        Pathogen: clinical or host-associated; version 1.0 
    
    Organism
    
            Name:
                Staphylococcus epidermidis
    
            Taxonomy ID:
                1282
    
    Attributes
        Attribute name  Attribute value
    
            collected by
            H R
    
            collection date
            2004
    
            geographic location
            Germany: Hamburg
    
            host
            Homo sapiens
    
            host disease
            port-catheter infection
    
            isolation source
            port-catheter
    
            isolate
            missing
    
            strain
            1585v
    
            latitude and longitude
            53.551672 N 9.955081 E
    
            https://www.ncbi.nlm.nih.gov/genbank/genomesubmit/#run_pgap
  8. Background of 1585v and 1585

    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8346721/ S. epidermidis 1585 is known to be biofilm-negative in laboratory media, but to form biofilm in the presence of human serum.

In contrast, S. epidermidis 1585 v is a variant derived from strain 1585 in which, due to a chromosomal re-arrangement, a 460 kDa isoform of Embp is overexpressed even in TSB, while mutant M135 is an isogenic mutant of 1585 v in which expression of Embp is interrupted by insertion of transposon Tn917.

Staphylococcus epidermidis (S. epidermidis) 1585 is a specific strain of S. epidermidis that is classified as a wild-type strain. Wild-type in bacterial terminology refers to the strain of an organism that is found in nature, as opposed to those that have been modified or mutated in a laboratory setting. Here are some key points about the 1585 wild-type strain of S. epidermidis:

  1. No Embp Production in TSB: One notable characteristic of the S. epidermidis 1585 strain is that it does not produce Embp (extracellular matrix binding protein) when grown in TSB (Tryptic Soy Broth). Embp is a protein that plays a crucial role in the biofilm formation and adherence of bacteria to surfaces. The absence of Embp production in this strain could impact its ability to form biofilms, a common virulence factor in Staphylococcus infections.

  2. Biofilm Formation: S. epidermidis is known for its ability to form biofilms, especially on medical devices, leading to infections that are difficult to treat. The fact that the 1585 strain doesn’t produce Embp in TSB suggests it may have a reduced capacity for biofilm formation under these conditions, which could be significant in understanding and managing such infections.

  3. Research and Clinical Implications: Studying wild-type strains like S. epidermidis 1585 is important for understanding the natural behavior and characteristics of the species. Since this strain behaves differently from other strains in terms of Embp production and possibly biofilm formation, it can provide insights into the mechanisms and genetic factors that control these processes. This knowledge is valuable for developing strategies to prevent and treat infections, especially in hospital and healthcare settings where S. epidermidis infections are common.

  4. Genetic Studies: The 1585 strain can also serve as a baseline or control in genetic studies. By comparing the genome and behavior of 1585 with other strains of S. epidermidis, researchers can identify genetic variations and mutations that may be responsible for different phenotypes, such as increased virulence or antibiotic resistance.

Model Organism for Understanding Staphylococcal Behavior: As a wild-type strain, 1585 offers a model for studying the natural state of S. epidermidis. This is crucial for understanding the fundamental biology of the bacterium, which can help in the development of treatments and interventions against infections caused by more virulent or drug-resistant strains. In summary, the S. epidermidis

1585 wild-type strain is significant in microbiological research due to its natural characteristics, particularly its behavior in biofilm formation and Embp production. Understanding these aspects can contribute to better insights into the pathogenicity and treatment of Staphylococcus infections, particularly in clinical settings where these bacteria are a common source of nosocomial infections.

cfDNA Sequencing: Technological Approaches and Bioinformatic Issues

Cell free circulating DNA (cfDNA) refers to DNA fragments present outside of cells in body fluids such as plasma, urine, and cerebrospinal fluid (CSF). CfDNA was first identified in 1948 from plasma of healthy individuals [1]. Afterward, studies showed that the quantity of this cfDNA in the blood was increased under pathological conditions such as auto-immune diseases [2] but also cancers [3]. In 1989, Philippe Anker and Maurice Stroun, from the University of Geneva, demonstrated that this cfDNA from cancer patients carries the characteristics of the DNA from tumoral cells [4]. Next, using the recently developed technique of PCR, David Sidransky and his team found the same mutations of TP53 in bladder tumoral samples and urine pellets from patients [5]. Then, the research and identification of genomic anomalies specific of a cancer type in the circulating DNA, such as NRAS and KRAS mutations or HER-2 amplifications [6,7,8], started to expand, and for the first time, the term of circulating tumor DNA (ctDNA) appeared.

Since the highlighting of this circulating DNA of tumoral origin, technological developments in molecular biology, from quantitative and digital PCR to Next Generation Sequencing, turned it into a powerful liquid biopsy tool. At the era of precision medicine, it seems crucial to identify molecular alterations that will be able to guide the therapeutic management of patients. As tumors release DNA in the blood or other body fluids such as urine, this circulating tumoral DNA, containing the molecular characteristics of the tumor, can be collected with a simple body fluid sample. Since it is minimally invasive, this liquid biopsy is easily repeatable during follow up and in case of relapse. It is also of major interest in some particular cancers where a tumoral biopsy is difficult to obtain such as primary central nervous system lymphoma [9] or cancer subtypes with tissue biopsy containing very little tumoral cells such as Hodgkin lymphoma (HL) for which Reed–Sternbeg cells represent only 0.1 to 2% of the tumoral mass [10,11]. In these particular conditions and malignancies, the sequencing of ctDNA in body fluids could serve as a surrogate for a tumor biopsy. Other body fluids than blood are often used according to the localization of the tumor, such as urine for bladder cancers or cerebrospinal fluid for cerebral tumors [9,12] but blood is the body fluid most often used in studies.

In blood, average cfDNA concentration in healthy individuals can range between 0 and 100 ng/mL of plasma with an average of 30 ng/mL of plasma and is significantly higher in blood of cancer patients, varying between 0 and 1000 ng/mL, with an average of 180 ng/mL [13]. This concentration is correlated with the stage of the cancer, increasing with higher stages, and the size of the tumor. Circulating DNA of tumoral origin represents from 0.01 to more than 90% of the total cell free DNA found in blood [14]. In different types of cancers, a large scale ctDNA sequencing study has shown an association between ctDNA levels and mutational tumor burden [15]. Moreover, given the spatial heterogeneity observed in tumor tissue, ctDNA analysis can determine the complete molecular landscape of a patient’s tumor and give supplementary information on drug targetable alterations and resistant variants [16]. ctDNA kinetics during follow up is correlated with prognosis, as a drastic reduction in its level after treatment is associated with better prognosis, whereas an increase usually means the evolution of drug resistant clones and an ultimate therapeutic failure [17,18,19,20].

Detection of ctDNA during MRD follow up to predict early relapse and at diagnosis in early stages of cancer continues to be a challenge, as the fraction of tumoral DNA contents in total circulating DNA may be <0.01% [21,22]. The development of sequencing technologies being more and more sensitive allows the detection of alterations present in cfDNA at very low variant allele frequencies (VAF), not only for mutational profiling at diagnosis but also for the early detection of disease recurrence and monitoring for therapy response. However, several parameters can affect the sensitivity of ctDNA detection. First, adequate handling of the blood sample, from blood collection to the quality control of the cfDNA extracted, is crucial in analysis. Next, an important step is the choice of the biomarker (s) and the sequencing technology used to detect it. Then, bioinformatic analysis, using error suppression algorithms, is the ultimate tool to discriminate the true variant from false positives.

无细胞循环DNA(cfDNA)指的是体液中细胞外的DNA碎片,如血浆、尿液和脑脊液(CSF)。cfDNA最早在1948年从健康个体的血浆中被发现[1]。此后,研究表明,这种cfDNA在血液中的数量在如自身免疫性疾病[2]等病理状态下增加,以及癌症[3]。1989年,日内瓦大学的Philippe Anker和Maurice Stroun展示了癌症患者的cfDNA携带了肿瘤细胞DNA的特征[4]。接下来,使用新开发的PCR技术,David Sidransky及其团队在膀胱肿瘤样本和患者的尿沉渣中发现了相同的TP53突变[5]。然后,对循环DNA中特定癌症类型的基因组异常的研究和识别,如NRAS和KRAS突变或HER-2扩增[6,7,8]开始扩展,首次出现了循环肿瘤DNA(ctDNA)这一术语。

自从突出了这种肿瘤来源的循环DNA以来,分子生物学的技术发展,从定量和数字PCR到下一代测序,使其成为了一个强大的液体活检工具。在精准医学时代,识别能够指导患者治疗管理的分子改变似乎至关重要。由于肿瘤释放DNA到血液或其他体液如尿液,这种含有肿瘤分子特征的循环肿瘤DNA可以通过简单的体液样本收集。由于它是微创的,这种液体活检在随访和复发时容易重复。它在某些难以获得肿瘤活检的特定癌症中也非常重要,如原发性中枢神经系统淋巴瘤[9]或组织活检中肿瘤细胞很少的癌症亚型,如霍奇金淋巴瘤(HL),其Reed–Sternbeg细胞仅占肿瘤质量的0.1到2%[10,11]。在这些特殊条件和恶性病中,体液中的ctDNA测序可以作为肿瘤活检的替代品。除了血液外,根据肿瘤的位置,常常使用其他体液,如膀胱癌的尿液或脑瘤的脑脊液[9,12],但血液是研究中最常用的体液。

在血液中,健康个体的平均cfDNA浓度可以在0至100 ng/mL血浆之间,平均为30 ng/mL血浆,而癌症患者的血液中则显著更高,变化在0至1000 ng/mL,平均为180 ng/mL [13]。这个浓度与癌症的阶段相关,随着阶段的提高和肿瘤大小的增加而增加。肿瘤来源的循环DNA占血液中发现的总无细胞DNA的0.01%到90%以上[14]。在不同类型的癌症中,一个大规模ctDNA测序研究显示ctDNA水平与突变肿瘤负担之间存在关联[15]。此外,鉴于在肿瘤组织中观察到的空间异质性,ctDNA分析可以确定患者肿瘤的完整分子景观,并提供关于可药物靶向的改变和耐药变异的补充信息[16]。随访期间ctDNA动态与预后相关,治疗后其水平的急剧减少与更好的预后相关,而增加通常意味着耐药克隆的发展和最终的治疗失败[17,18,19,20]。

在MRD随访期间检测ctDNA以预测早期复发,在癌症早期阶段诊断中继续是一个挑战,因为总循环DNA中的肿瘤DNA比例可能小于0.01%[21,22]。越来越敏感的测序技术的发展允许在非常低的变异等位基因频率(VAF)下检测cfDNA中存在的改变,不仅用于诊断时的突变分析,也用于疾病复发的早期检测和治疗响应的监测。然而,几个参数可能影响ctDNA检测的敏感性。首先,从采血到提取的cfDNA的质量控制,血液样本的适当处理在分析中至关重要。接下来,选择生物标志物和用于检测它的测序技术是一个重要步骤。然后,使用错误抑制算法的生物信息学分析是区分真变异和假阳性的最终工具。

  1. using DAMIAN to analyse the cfDNA sequencing data

    cd ~/Tools/damian/databases/blast
    
    damian.rb --host human3 --type dna -1 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20428/neg_control_S2_R1_001.fastq.gz -2 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20428/neg_control_S2_R2_001.fastq.gz --sample neg_control_S2_megablast --blastn never --blastp never --min_contiglength 200 --threads 55 --force
    damian_report.rb
    zip -r neg_control_S2_megablast.zip neg_control_S2_megablast/
    echo -e "xxxx" | mutt -a "./neg_control_S2_megablast.zip" -s "New results from DAMIAN" -- "xxx@xxx.com"
    
    damian.rb --host human3 --type dna -1 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20429/635724976_S_aureus_epidermidis_S3_R1_001.fastq.gz -2 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20429/635724976_S_aureus_epidermidis_S3_R2_001.fastq.gz --sample 635724976_S_aureus_epidermidis_S3_megablast --blastn never --blastp never --min_contiglength 200 --threads 55 --force
    damian_report.rb
    zip -r 635724976_S_aureus_epidermidis_S3_megablast.zip 635724976_S_aureus_epidermidis_S3_megablast/
    echo -e "xxxx" | mutt -a "./635724976_S_aureus_epidermidis_S3_megablast.zip" -s "New results from DAMIAN" -- "xxx@xxx.com"
    
    damian.rb --host human3 --type dna -1 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R1_001.fastq.gz -2 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R2_001.fastq.gz --sample 635290002_CMV_S4_megablast --blastn never --blastp never --min_contiglength 200 --threads 55 --force
    damian_report.rb
    zip -r 635290002_CMV_S4_megablast.zip 635290002_CMV_S4_megablast/
    echo -e "xxxx " | mutt -a "./635290002_CMV_S4_megablast.zip" -s "New results from DAMIAN" -- "xxx@xxx.com"
    
    damian.rb --host human3 --type dna -1 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R1_001.fastq.gz -2 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R2_001.fastq.gz --sample 635850623_EBV_S5_megablast --blastn never --blastp never --min_contiglength 200 --threads 55 --force
    damian_report.rb
    zip -r 635850623_EBV_S5_megablast.zip 635850623_EBV_S5_megablast/
    echo -e "xxxx " | mutt -a "./635850623_EBV_S5_megablast.zip" -s "New results from DAMIAN" -- "xxx@xxx.com"
    
    damian.rb --host human3 --type dna -1 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20427/635031018_E_faecium_S1_R1_001.fastq.gz -2 ./231114_VH00358_62_AACYCYWM5_cfDNA/p20427/635031018_E_faecium_S1_R2_001.fastq.gz --sample 635031018_E_faecium_S1_megablast --blastn never --blastp never --min_contiglength 200 --threads 55 --force
    damian_report.rb
    zip -r 635031018_E_faecium_S1_megablast.zip 635031018_E_faecium_S1_megablast/
    echo -e "xxxx" | mutt -a "./635031018_E_faecium_S1_megablast.zip" -s "New results from DAMIAN" -- "xxx@xxx.com"
    
    #END
  2. using vrap to analyse the cfDNA sequencing data

    conda activate vrap
    cd vrap_outputs
    ln -s ~/Tools/vrap .
    
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20427/635031018_E_faecium_S1_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20427/635031018_E_faecium_S1_R2_001.fastq.gz -o E_faecium_S1_vrap_out --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20428/neg_control_S2_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20428/neg_control_S2_R2_001.fastq.gz -o neg_control_S2_vrap_out --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20429/635724976_S_aureus_epidermidis_S3_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20429/635724976_S_aureus_epidermidis_S3_R2_001.fastq.gz -o S_aureus_epidermidis_S3_vrap_out --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    
    #txid10358 (https://www.ncbi.nlm.nih.gov/nuccore/?term=Cytomegalovirus)
    #txid10376 https://www.ncbi.nlm.nih.gov/nuccore/?term=Epstein-Barr-Virus
    sed -i -e 's/txid10239/txid10358/g' vrap/download_db.py
    sed -i -e 's/retmax=100000/retmax=10000000/g' vrap/download_db.py
    vrap/vrap.py -u
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R2_001.fastq.gz -o CMV_S4_vrap_out --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200 #[--virus=Cytomegalovirus.fasta]
    mv vrap/database/viral_db vrap/database/viral_db_CMV
    sed -i -e 's/txid10358/txid10376/g' vrap/download_db.py
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R2_001.fastq.gz -o EBV_S5_vrap_out --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200 #[--virus=Epstein-Barr-Virus.fasta]
    mv vrap/database/viral_db vrap/database/viral_db_EBV
    
    mv vrap/database/viral_db_orig vrap/database/viral_db
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R2_001.fastq.gz -o CMV_S4_vrap_out_host_CMV --host vrap/database/viral_db_CMV/nucleotide.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R2_001.fastq.gz -o EBV_S5_vrap_out_host_EBV --host vrap/database/viral_db_EBV/nucleotide.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    #show samtools flagstat mapped and screen of mapped on IGV, the bam and fasta files to her.  
    #END