Author Archives: gene_x

Assembly and Variant Calling in Acinetobacter baumannii Genomes

  1. input data

    #Deciphering Evolutionary Trajectories in Acinetobacter baumannii: A Comparative Study of Isolates ATCC 19606, AYE, and ATCC 17978
    
    #PRJNA1041744
    
    A6 WT - Acinetobacter baumannii ATCC 19606, CP059040
    A10 CraA - Acinetobacter baumannii ATCC 19606, CP059040
    BIT33_RS17375: adeJ
    BIT33_RS17370: adeI
    BIT33_RS11855: craA (MKNIQTTALNRTTLMFPLALVLFEFAVYIGNDLIQPAMLAITED
                 FGVSATWAPSSMSFYLLGGASVAWLLGPLSDRLGRKKVLLSGVLFFALCCFLILLTRQ
                 IEHFLTLRFLQGIGLSVISAVGYAAIQENFAERDAIKVMALMANISLLAPLLGPVLGA
                 FLIDYVSWHWGFVAIALLALLSWVGLKKQMPSHKVSVTKQPFSYLFDDFKKVFSNRQF
                 LGLTLALPLVGMPLMLWIALSPIILVDELKLTSVQYGLAQFPVFLGLIVGNIVLIKII
                 DRLALGKTVLIGLPIMLTGTLILILGVVWQAYLIPCLLIGMTLICFGEGISFSVLYRF
                 ALMSSEVSKGTVAAAVSMLLMTSFFAMIELVRYLYTQFHLWAFVLSAFAFIALWFTQP
                 RLALKREMQERVAQDLH) MFS transporter --> H0N29_01695
    364663 - 365770
    
    A12 AYE - Acinetobacter baumannii AYE,         CU459141
    A19 17978 - Acinetobacter baumannii ATCC 17978  CP033110
    A1S_2736: adeJ
    A1S_2735: adeI
    A1S_3146: craA       
    
    mv A6WT_HQ_R1.fq.gz A6_WT_HQ_R1.fastq.gz 
    mv A6WT_HQ_R2.fq.gz A6_WT_HQ_R2.fastq.gz 
    mv A10CraA_HQ_R1.fq.gz A10_CraA_HQ_R1.fastq.gz 
    mv A10CraA_HQ_R2.fq.gz A10_CraA_HQ_R2.fastq.gz 
    mv A12AYE_HQ_R1.fq.gz A12_AYE_HQ_R1.fastq.gz 
    mv A12AYE_HQ_R2.fq.gz A12_AYE_HQ_R2.fastq.gz 
    mv A1917978_HQ_R1.fq.gz A19_17978_HQ_R1.fastq.gz 
    mv A1917978_HQ_R2.fq.gz A19_17978_HQ_R2.fastq.gz
    
    #Nucleotide Accession Prefixes
    #AP,BS                                                      DDBJ           Genome project data
    #AL,BX,CR,CT,CU,FP,FQ,FR                                    EMBL           Genome project data
    #AE,CP,CY                                                   GenBank        Genome project data
    
    #transposon detection using gubbins
    
    #TODOs: Submit A6, A12 and A19 to NCBI, A10 not, since the assembly is very bad! Could do mapping plot of A10 on A6, and three genebank-files to Tam!     
    #Wuen Ee Foong
    #Jiabin Huang
    #Heng-Keat Tam, Hengyang Medical College, University of South China
    #pA12AYE1
  2. prokka

    for sample in A19_17978_HQ; do
    prokka --force --outdir prokka/${sample} --cpus 2 --usegenus --genus Acinetobacter --kingdom Bacteria --species baumannii --addgenes --addmrna --prefix ${sample} --locustag ${sample} shovill/${sample}/contigs.fa -hmm /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
    done
    
    (prokka on titisee) for the remaining steps (f,f,f,f,f,t,f(variants_calling),t,t,t in bacto-0.1.json)
    
    #TODO: annotate the SNPs with antimicrobial gene database!
    #TODO: ask Qianjie die sample are directly gotten from the patient, oder did some motification on the reference genome?

3.1. vrap (TODO: deploy vrap on xgenes.com, it does not use the database. under (vrap) jhuang@hamm[hamburg])

    #In summary: --(optional) host --> bowtie/host (bowtie_database for host-substraction)
    #            --(optional) virus --> blast/db/virus (blast1, in nt_dbs, custom blastn database before downloaded_db!)
    #            --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (blast2, in nt_dbs)
    #                                                                                                 --> database/viral_db/viral_protein (blast3, in prot_db)
    #            --(optional) nt+nr (blast4 and blast5)

    #Under (vrap) jhuang@hamm or (bengal3_ac3) jhuang@hamburg
    #DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
    #DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py

    #gi|1690449040|gb|CP029454.1|    Bacillus cereus strain FORC087 chromosome, complete genome
    #gi|1621560499|gb|CP039269.1|    Bacillus cereus strain MH19 chromosome, complete genome
    #gi|1783154430|gb|CP040340.1|    Bacillus cereus strain DLOU-Tangshan chromosome, complete genome
    #CP059040        3980852 10      70      71
    #CU459141        3936291 4037743 70      71
    #CP033110        3902037 8030278 70      71

    #CP059040 --> A6 and A10
    #CU459141 --> A12 
    #CP033110 --> A19

    #Command line: /home/jhuang/miniconda3/envs/bengal3_ac3/bin/spades.py -1 A6_WT_HQ_R1.fastq.gz -2 A6_WT_HQ_R2.fastq.gz --careful --trusted-contigs CP059040.fasta -o spades_A6_ATCC19606

    #CP059040_RagTag 3866625 17      3866625 3866626
    #CAP_1_length_17300      17300   3866663 17300   17301
    #CAP_2_length_928        928     3883982 928     929
    #NODE_17_length_11061_cov_141.020669     11061   3884948 11061   11062
    #NODE_19_length_1654_cov_105.893255      1654    3896046 1654    1655
    #NODE_20_length_1102_cov_153.073846      1102    3897737 1102    1103
    #NODE_21_length_1040_cov_172.078861      1040    3898876 1040    1041

    #NOTE that we need to adjust the taxid in download_db.py to download viral_nucleotide and viral_protein before running vrap/vrap.py.

    #A10 
    #Blast with 5 DBs when --virus (blast/db/virus), viral_nucleotide (download_db.rb), viral_protein (download_db.rb), nt, nr
    #Bowtie2 with 1 DB when --host (bowtie/host)
    #NOTE the fasta files given with --host and --virus should be deduplicated!
    #NOTE that in vrap_CP059040.py the trusted-contigs is used "--trusted-contigs /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/CP074710.fasta"
    vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A6
    vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A12
    vrap/vrap_CU459141.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A19 
    vrap/vrap_CP033110.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    # -- A10 LOG --
    START VrAP
    START checking blast db
    START lighter
    START estimate k-mer size
    k-mer size: 15037634
    /home/jhuang/miniconda3/envs/vrap/bin/lighter -r A10_CraA_HQ_trimmed_P_1.fastq.gz -r A10_CraA_HQ_trimmed_P_2.fastq.gz -K 20 15037634 -od A10_vrap_out/lighter -t 55
    START flash
    /home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_1.cor.fq.gz A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_2.cor.fq.gz -o flash -d A10_vrap_out/flash -M 1000

    START bowtie2-build
    START bowtie2 pe
    #MAPPING to 1 DB: bowtie/host

    START spades    
    START cap3
    #/home/jhuang/Tools/vrap/external_tools/cap3 A10_vrap_out/spades/contigs.fasta -y 100  
    START calculating orf density
    START HMMER

    START blast
    #grep ">" A10_vrap_out/blast/custom_viral_seq.fa == "--virus Acinetobacter_baumannii.fasta" 
    #MAPPING to 5 DBs: blast/db/virus[1], viral_nucleotide (defined in the download_db.py)[2], viral_protein (defined in the download_db.py)[3], nt[4], nr[5]
    #INPUT_SEQ: 
    #(Assuming, if annotated with the previous database, will not translate to the next level). For the example below: we have no record finding in [2,1], then we have 1317-0 in blast/blastn.fa; In [3] we find 'cut -f1 blastx.csv | sort -u | wc -l = 567', then we have 1317-567=750; In [4], we 'cut -f1 blastn_nt.csv | sort -u | wc -l=710', then we have 750-710=40 in blast/blastn_nt.fa; In [5], we find 'cut -f1 blastn_nr.csv | sort -u | wc -l=39', then we remain 40-39=1 record in blastx_nr.fa.
    #grep ">" vrap_contig.fasta | wc -l   #1317 
    #grep ">" blast/blastn.fa | wc -l     #1317
    #grep ">" blast/blastx.fa | wc -l     #750
    #grep ">" blast/blastn_nt.fa | wc -l  #40
    #grep ">" blast/blastx_nr.fa | wc -l  #1
    #--
    #grep ">" vrap_contig.fasta | wc -l   #2370 
    #grep ">" blast/blastn.fa | wc -l     #2306 --> 64 contigs should belong to A. baumannii! can be annotated with --custom_virus (3 speicies of Acinetobacter baumannii) and downloaded [virus_nt_db] (2.6 G Acinetobacter baumannii)
    #grep ">" blast/blastx.fa | wc -l     #1245: 1061 contigs can be annotated with downloaded [virus_aa_db]
    #grep ">" blast/blastn_nt.fa | wc -l  #41: 1204 contigs can be annotated with nt
    #grep ">" blast/blastx_nr.fa | wc -l  #1: 40 contigs can be annotated with nr 
    #--> extract 64 contigs using 'python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_extracted_contigs.fa'

    [1_INDEX]/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [2,1]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [3]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [4]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.fa -db "/mnt/h1/jhuang/blast/nt"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [5]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.fa -db "/mnt/h1/jhuang/blast/nr"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx_nr.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    VrAP pipeline finished.
    Thank you for using VrAP!

3.2. ragtag.py ((bengal3_ac3) jhuang@hamm)

    #ragtag.py patch ragtag_output_scaffold/ragtag.scaffold.fasta CP059040.fasta

    ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
        Acinetobacter baumannii strain ATCC 19606 plasmid unnamed, complete sequence    Acinetobacter baumannii     19653   39248   62%     0.0     99.97%  18598   CP074586.1

    python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
    ragtag.py scaffold CP059040.fasta A10_extracted_contigs.fa
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
    # Check the converage to determine if they are chr or plasmids!

    ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
        Acinetobacter baumannii str. AYE plasmid p2ABAYE, complete genome   Acinetobacter baumannii AYE     16835   18307   100%    0.0     99.99%  9661    CU459138.1
        Acinetobacter baumannii str. AYE plasmid p1ABAYE, complete genome   Acinetobacter baumannii AYE     9494    10856   100%    0.0     100.00%     5644    CU459137.1
        Acinetobacter baumannii str. AYE plasmid p4ABAYE, complete genome   Acinetobacter baumannii AYE     3709    5265    100%    0.0     99.95%  2726    CU459139.1
        Acinetobacter baumannii str. AYE plasmid p3ABAYE, complete genome   Acinetobacter baumannii AYE     1.384e+05   1.838e+05   100%    0.0     100.00%     94413   CU459140.1

    ragtag.py scaffold CP033110.fasta A19_vrap_out/vrap_contig.fasta
    #using CP074710.1 in the round2 (see below)
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed1, complete sequence   Acinetobacter baumannii     2.373e+05   2.463e+05   100%    0.0     100.00%     148956  CP074709.1
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence   Acinetobacter baumannii     24842   50362   100%    0.0     100.00%     38118   CP074711.1
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence   Acinetobacter baumannii     20953   37147   100%    0.0     100.00%     20351   CP074707.1
        Acinetobacter baumannii ATCC 17978 chromosome, complete genome  Acinetobacter baumannii ATCC 17978  13245   16527   100%    0.0     100.00%     4005343     CP053098.1

3.3 vrap round2

    #A10 
    vrap/vrap_A6_chr.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A19 
    vrap/vrap_CP074710.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

3.4 ragtag round2

    python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_vrap_out/A10_extracted_contigs.fa
    ragtag.py scaffold A6_chr.fasta A10_vrap_out/A10_extracted_contigs.fa
    mv ragtag_output A10_ragtag_output
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa

    ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
    Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence   Acinetobacter baumannii     24842   50362   100%    0.0     100.00%     38118   CP074711.1
    Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence   Acinetobacter baumannii     20953   37147   100%    0.0     100.00%     20351   CP074707.1

3.5 ragtag round3

    ragtag.py patch CP059040.fasta A6_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta

    python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
    ragtag.py scaffold CP059040.fasta A10_vrap_out/A10_extracted_contigs.fa
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
    # Check the converage to determine if they are chr or plasmids!
    #>CAP_12_length_2972
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   4691    5472    100%    0.0     99.73%  5304537     CP119629.1
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   9762    11557   97%     0.0     99.87%  5304537     CP119629.1
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   14883   1.353e+05   100%    0.0     99.98%  5304537     CP119629.1
    #CP119629.1 Submitted (10-MAR-2023) College of Animal Science, Ahau, Chagnjianxilu, Anhui 230000, China

    ragtag.py patch CU459141.fasta A12_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta

    ragtag.py patch CP074710.fasta A19_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta

    A10:
    CP059040_RagTag 3941335 17      60      61
    CAP_11_length_2972      2972    4007061 60      61
    CAP_1_length_17300      17300   4010103 60      61
    samtools faidx A10_ragtag_contigs_2000.fa CAP_1_length_17300 > A10_plasmid.fasta

3.6 To map reads from one isolate (isolate 2) to the genome of another isolate (isolate 1), check coverage, and generate a consensus FASTA from the alignment, with regions not covered by the mapping masked as ‘N’, you’ll need to follow a series of steps using bioinformatics tools.

    # -- Mapping to CP059040 --
    5447038 + 0 in total (QC-passed reads + QC-failed reads)
    5447038 + 0 primary
    0 + 0 secondary
    0 + 0 supplementary
    0 + 0 duplicates
    0 + 0 primary duplicates
    3346945 + 0 mapped (61.45% : N/A)
    3346945 + 0 primary mapped (61.45% : N/A)
    4361512 + 0 paired in sequencing
    2180756 + 0 read1
    2180756 + 0 read2
    2579038 + 0 properly paired (59.13% : N/A)
    2681114 + 0 with itself and mate mapped
    21988 + 0 singletons (0.50% : N/A)
    0 + 0 with mate mapped to a different chr
    0 + 0 with mate mapped to a different chr (mapQ>=5)

    # -- Mapping to A6 --
    5447044 + 0 in total (QC-passed reads + QC-failed reads)
    5447044 + 0 primary
    0 + 0 secondary
    0 + 0 supplementary
    0 + 0 duplicates
    0 + 0 primary duplicates
    3437574 + 0 mapped (63.11% : N/A)
    3437574 + 0 primary mapped (63.11% : N/A)
    4361524 + 0 paired in sequencing
    2180762 + 0 read1
    2180762 + 0 read2
    2646686 + 0 properly paired (60.68% : N/A)
    2753738 + 0 with itself and mate mapped
    22055 + 0 singletons (0.51% : N/A)
    1484 + 0 with mate mapped to a different chr
    1390 + 0 with mate mapped to a different chr (mapQ>=5)

    #- Sort and Index the BAM File:
    #    samtools sort -o sorted.bam input.bam
    #    samtools index sorted.bam
    #- Call Variants:
    #    bcftools mpileup -Ou -f reference.fasta sorted.bam | bcftools call -mv -Oz -o calls.vcf.gz
    #    bcftools index calls.vcf.gz
    #- Generate Consensus FASTA:
    #    bcftools consensus -f reference.fasta calls.vcf.gz -o consensus.fasta

    Step 1: Align Reads to Reference Genome
        First, align the reads from isolate 2 to the genome of isolate 1. For example, using BWA:

            bwa mem reference_genome.fasta isolate2_reads.fastq > aligned.sam

        or, use vrap to generate input.bam

            #for bowtie (1 DB): --host Bacillus_cereus.fasta 
            #for blast (5 DBs): --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr, and two default download_db.py
            #A6_chr_plasmid.fasta reads filtered, should only the contamination reads remaining! 
            vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_host_A6 --host A6_chr_plasmid.fasta --noblast -t 55

    Step 2: Convert SAM to BAM, Sort and Index
        Convert the SAM file to BAM, sort it, and create an index:

            samtools view -Sb mapped > aligned.bam
            samtools sort aligned.bam -o sorted.bam
            samtools index sorted.bam

    Step 3: Coverage Analysis
        Check the coverage of the genome regions. Regions with no coverage will be masked later.

            #bedtools genomecov -ibam sorted.bam -bg > coverage.bed
            #BUG above: By default, Bedtools doesn't explicitly list regions with zero coverage in the output of genomecov. To get around this, you can use the -bga option with genomecov. This option ensures that regions with zero coverage are also included in the output. Here's how you can modify your command:
            bedtools genomecov -ibam sorted.bam -bga > coverage.bed

    Step 4: Create a Bed File for Masking
        Create a BED file that specifies regions with no or low coverage. Assume that any region with coverage less than a certain threshold (e.g., 1x) should be masked:

            awk '$4 < 1 {print $1 "\t" $2 "\t" $3}' coverage.bed > low_coverage.bed

    Step 5: Generate Consensus Sequence
        Generate a consensus sequence, using bcftools, and mask regions with low or no coverage:

            bcftools mpileup -Ou -f ../../CP059040.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
            bcftools index calls.vcf.gz
            bcftools consensus -f ../../CP059040.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
            #The site CP059040:4 overlaps with another variant, skipping...
            #The site CP059040:3125036 overlaps with another variant, skipping...
            #Applied 58 variants

            bcftools mpileup -Ou -f ../../A6_chr_plasmid.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
            bcftools index calls.vcf.gz
            bcftools consensus -f ../../A6_chr_plasmid.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
            #The site seq00000000:4 overlaps with another variant, skipping...
            #The site seq00000000:3125036 overlaps with another variant, skipping...
            #Applied 61 variants
            #--> TODO: could report the following regions are not covered in A10 comparing to A6.
            #seq00000000     364666  365765
            #seq00000000     2810907 2813323
            #seq00000000     2813552 2813779
            #seq00000000     2813923 2819663
            #seq00000000     2821839 2831921
            #seq00000000     2832750 2832753
            #seq00000000     2833977 2860354
            #seq00000000     2860505 2861780
            #seq00000000     2861931 2861951
            #seq00000000     2862102 2862234
            #seq00000000     2862385 2863434
            #seq00000000     3124939 3124943
            #seq00000000     3125009 3125010

        In this command, the -m option in bcftools consensus is used to specify a BED file for masking low-coverage regions.

4.1, generate genebank in snpEff

    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP059040
    cp db/CP059040.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP059040/genes.gbk
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CU459141 
    cp db/CU459141.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CU459141/genes.gbk
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP079931 
    cp db/CP079931.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/CP079931/genes.gbk

    vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CP059040      -d
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CU459141      -d
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank CP079931      -d

4.2, using spandx calling variants

    gzip A6_WT_HQ_trimmed_P_1.fastq A6_WT_HQ_trimmed_P_2.fastq A10_CraA_HQ_trimmed_P_1.fastq A10_CraA_HQ_trimmed_P_2.fastq
    gzip A19_17978_HQ_trimmed_P_1.fastq A19_17978_HQ_trimmed_P_2.fastq

    ln -s /home/jhuang/Tools/spandx/ spandx
    #nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC19606.fasta --annotation  --database ATCC19606 -resume
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CP059040.fasta --annotation --database CP059040 -resume

    #TODO:   --structural   Set to true if you would like to identify structural variants
    #         Note that this step can take a considerable amount of time if
    #         you have deep sequencing data (default: false).
    #Done! Result files are in --> ./Outputs
    #If further analysis is required, bam alignments are in --> ./Outputs/bams
    #Phylogenetic tree and annotated merged variants are in --> ./Outputs/Phylogeny_and_annotation
    #Individual variant files are in --> ./Outputs/Variants/VCFs

    gzip A12_AYE_HQ_trimmed_P_1.fastq A12_AYE_HQ_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    #nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref AYE.fasta --annotation --database AYE -resume
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CU459141.fasta --annotation --database CU459141 -resume
    cd ../A19_bacto_out/
    ln -s /home/jhuang/Tools/spandx/ spandx
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ../db/CP079931.fasta --annotation --database CP079931 -resume

    # LOG
    #-t
    #snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf

    # DEBUG: "--mixtures --structural" does not work well!
    #pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out                 
    #pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out     

5, manually compare the results of snippy and spandx

    Variant Types
    Type    Name    Example
    snp     Single Nucleotide Polymorphism  A => T
    mnp     Multiple Nuclotide Polymorphism     GC => AT
    ins     Insertion   ATT => AGTT
    del     Deletion    ACGG => ACG
    complex     Combination of snp/mnp  ATTC => GTTA

    #in snippy/A6_WT_HQ/A6_WT_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CP059040,136447,snp,T,A,A:338 T:0
    CP059040,152318,snp,T,A,A:337 T:0,CDS,-,106/213,36/70,missense_variant c.106A>T p.Thr36Ser,H0N29_00700,,DUF1737 domain-containing protein
    CP059040,171965,snp,T,A,A:312 T:0,CDS,+,167/798,56/265,missense_variant c.167T>A p.Val56Asp,H0N29_00780,,MBL fold metallo-hydrolase
    CP059040,194020,ins,C,CT,CT:317 C:0,CDS,-,3260/3497,1087/1164,frameshift_variant c.3260dupA p.Arg1088fs,H0N29_00885,,PAS-domain containing protein
    CP059040,375297,snp,A,T,T:268 A:0

    CP059040,468939,complex,ATAATATATTCTTTATAGATGAATACATTTTTTCTTTA,TATAATATATTCTTTATAGATGAATACATTTTTTCTTTAT,TATAATATATTCTTTATAGATGAATACATTTTTTCTTTAT:111 ATAATATATTCTTTATAGATGAATACATTTTTTCTTTA:0

    CP059040,609979,snp,A,T,T:301 A:0,CDS,+,1009/1212,337/403,stop_gained c.1009A>T p.Lys337*,H0N29_02925,,SAM-dependent methyltransferase
    CP059040,1036730,ins,T,TA,TA:206 T:1
    CP059040,1059847,snp,C,T,T:245 C:0,CDS,-,119/642,40/213,missense_variant c.119G>A p.Gly40Asp,H0N29_04880,,DUF2238 domain-containing protein
    CP059040,1272639,snp,T,G,G:195 T:0
    CP059040,1300706,snp,A,G,G:202 A:0,CDS,+,351/1428,117/475,synonymous_variant c.351A>G p.Ser117Ser,H0N29_06100,,M48 family metalloprotease

    CP059040,1970200,snp,T,C,C:163 T:0,CDS,+,281/2136,94/711,missense_variant c.281T>C p.Val94Ala,H0N29_09260,,TonB-dependent siderophore receptor
    CP059040,2383727,ins,A,AT,AT:129 A:0
    CP059040,2477628,ins,T,TA,TA:90 T:0
    CP059040,2525852,ins,A,AT,AT:189 A:0,CDS,-,529/836,177/277,frameshift_variant c.529dupA p.Ile177fs,H0N29_11845,,hypothetical protein
    CP059040,3016359,ins,A,AT,AT:178 A:0,CDS,-,2136/3236,712/1077,frameshift_variant c.2136dupA p.Phe713fs,H0N29_14350,cas3f,type I-F CRISPR-associated helicase Cas3
    CP059040,3111299,snp,A,G,G:263 A:0,CDS,-,957/1185,319/394,synonymous_variant c.957T>C p.Ile319Ile,H0N29_14775,,hypothetical protein
    CP059040,3310021,ins,C,CT,CT:197 C:0
    CP059040,3542741,del,GC,G,G:307 GC:0,CDS,+,1746/2231,582/742,frameshift_variant c.1746delC p.Arg583fs,H0N29_16865,,primosomal protein N'
    CP059040,3542934,del,CT,C,C:308 CT:0,CDS,+,1938/2231,646/742,frameshift_variant c.1938delT p.Met647fs,H0N29_16865,,primosomal protein N'
    CP059040,3570717,del,AC,A,A:292 AC:0,CDS,+,187/1315,63/437,frameshift_variant c.187delC p.Gln63fs,H0N29_16960,,FAD-dependent monooxygenase
    CP059040,3629616,ins,C,CT,CT:212 C:0
    CP059040,3873573,snp,A,G,G:363 A:0,CDS,+,1705/2187,569/728,missense_variant c.1705A>G p.Asn569Asp,H0N29_18380,,polysaccharide biosynthesis tyrosine autokinase

    #in snippy/A10_CraA_HQ/A10_CraA_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CP059040,136447,snp,T,A,A:201 T:0
    CP059040,152318,snp,T,A,A:181 T:0,CDS,-,106/213,36/70,missense_variant c.106A>T p.Thr36Ser,H0N29_00700,,DUF1737 domain-containing protein
    CP059040,171965,snp,T,A,A:183 T:0,CDS,+,167/798,56/265,missense_variant c.167T>A p.Val56Asp,H0N29_00780,,MBL fold metallo-hydrolase
    CP059040,194020,ins,C,CT,CT:160 C:0,CDS,-,3260/3497,1087/1164,frameshift_variant c.3260dupA p.Arg1088fs,H0N29_00885,,PAS-domain containing protein
    CP059040,375297,snp,A,T,T:127 A:0

    CP059040,468931,ins,A,AT,AT:65 A:1
    CP059040,468976,ins,A,AT,AT:72 A:0

    CP059040,609979,snp,A,T,T:139 A:0,CDS,+,1009/1212,337/403,stop_gained c.1009A>T p.Lys337*,H0N29_02925,,SAM-dependent methyltransferase
    CP059040,1036730,ins,T,TA,TA:101 T:0
    CP059040,1059847,snp,C,T,T:109 C:0,CDS,-,119/642,40/213,missense_variant c.119G>A p.Gly40Asp,H0N29_04880,,DUF2238 domain-containing protein
    CP059040,1272639,snp,T,G,G:94 T:0
    CP059040,1300706,snp,A,G,G:136 A:0,CDS,+,351/1428,117/475,synonymous_variant c.351A>G p.Ser117Ser,H0N29_06100,,M48 family metalloprotease

    CP059040,1527276,del,TTGAACC,T,T:29 TTGAACC:1,CDS,+,1327/2115,443/704,conservative_inframe_deletion c.1327_1332delGAACCT p.Glu443_Pro444del,H0N29_07175,,DNA polymerase III subunit gamma/tau

    CP059040,1970200,snp,T,C,C:92 T:0,CDS,+,281/2136,94/711,missense_variant c.281T>C p.Val94Ala,H0N29_09260,,TonB-dependent siderophore receptor
    CP059040,2383727,ins,A,AT,AT:56 A:0
    CP059040,2477628,ins,T,TA,TA:27 T:1
    CP059040,2525852,ins,A,AT,AT:124 A:0,CDS,-,529/836,177/277,frameshift_variant c.529dupA p.Ile177fs,H0N29_11845,,hypothetical protein
    CP059040,3016359,ins,A,AT,AT:112 A:0,CDS,-,2136/3236,712/1077,frameshift_variant c.2136dupA p.Phe713fs,H0N29_14350,cas3f,type I-F CRISPR-associated helicase Cas3
    CP059040,3111299,snp,A,G,G:152 A:0,CDS,-,957/1185,319/394,synonymous_variant c.957T>C p.Ile319Ile,H0N29_14775,,hypothetical protein
    CP059040,3310021,ins,C,CT,CT:94 C:0
    CP059040,3542741,del,GC,G,G:152 GC:0,CDS,+,1746/2231,582/742,frameshift_variant c.1746delC p.Arg583fs,H0N29_16865,,primosomal protein N'
    CP059040,3542934,del,CT,C,C:165 CT:0,CDS,+,1938/2231,646/742,frameshift_variant c.1938delT p.Met647fs,H0N29_16865,,primosomal protein N'
    CP059040,3570717,del,AC,A,A:177 AC:0,CDS,+,187/1315,63/437,frameshift_variant c.187delC p.Gln63fs,H0N29_16960,,FAD-dependent monooxygenase
    CP059040,3629616,ins,C,CT,CT:125 C:0
    CP059040,3873573,snp,A,G,G:190 A:0,CDS,+,1705/2187,569/728,missense_variant c.1705A>G p.Asn569Asp,H0N29_18380,,polysaccharide biosynthesis tyrosine autokinase

    #in snippy/A12_AYE_HQ/A12_AYE_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CU459141,169174,snp,G,A,A:296 G:0,CDS,-,74/441,25/146,missense_variant c.74C>T p.Thr25Met,ABAYE0155,,conserved hypothetical protein

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A12_AYE_HQ_trimmed
    CU459141        169174  .       G       A       11566.04        PASS    AC=1;AF=1.00;AN=1;DP=307;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=31.04;SOR=0.763;ANN=A|missense_variant|MODERATE|ABAYE0155|ABAYE0155|transcript|ABAYE0155|protein_coding|1/1|c.74C>T|p.Thr25Met|74/441|74/441|25/146||  GT:AD:DP:GQ:PL  1:0,294:294:99:11576,0

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A12_AYE_HQ_trimmed
    CU459141        519645  .       TA      T       175.01  PASS    AC=1;AF=1.00;AN=1;DP=15;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=31.88;QD=29.17;SOR=3.912 GT:AD:DP:GQ:PL  1:0,6:6:99:185,0
    CU459141        1213004 .       C       CACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA   3656.01 PASS    AC=1;AF=1.00;AN=1;BaseQRankSum=-0.754;DP=160;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=59.15;MQRankSum=-0.555;QD=29.30;ReadPosRankSum=-1.188;SOR=0.846;ANN=CACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA|intragenic_variant|MODIFIER|ABAYEtRNA60|ABAYEtRNA60|gene_variant|ABAYEtRNA60|||n.1213004_1213005insACCTGGGCTCGAACCAGGGACCTGCGGATTAACAGTCCGTCGCTCTACCGACTGAGCTATGGGAGATTAGTAGATGGCTCTCCA||||||     GT:AD:DP:GQ:PL  1:2,93:95:99:3666,0

    #in snippy/A19_17978_HQ/A19_17978_HQ.csv
    CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
    CP079931,697999,snp,A,C,C:319 A:0,CDS,-,370/723,124/240,missense_variant c.370T>G p.Cys124Gly,KZA74_03385,,folate-binding Fe/S cluster repair protein
    CP079931,1594519,snp,A,C,C:232 A:0,CDS,-,558/1122,186/373,missense_variant c.558T>G p.Asn186Lys,KZA74_07445,,ATP-binding protein
    CP079931,2140669,snp,A,C,C:180 A:0,CDS,-,1412/1791,471/596,missense_variant c.1412T>G p.Val471Gly,KZA74_10170,,ABC transporter substrate-binding protein
    CP079931,2415049,del,CA,C,C:190 CA:0
    CP079931,3583600,snp,T,C,C:402 T:0

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A19_17978_HQ_trimmed
    CP079931        499908  .       G       A       3372.04 PASS    AC=1;AF=1.00;AN=1;DP=95;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=35.85;QD=28.47;SOR=9.087 GT:AD:DP:GQ:PL  1:0,93:93:99:3382,0
    CP079931        499966  .       C       T       5744.04 PASS    AC=1;AF=1.00;AN=1;DP=150;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=37.54;QD=28.10;SOR=10.008       GT:AD:DP:GQ:PL  1:0,148:148:99:5754,0
    CP079931        697999  .       A       C       12568.04        PASS    AC=1;AF=1.00;AN=1;DP=336;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=26.58;SOR=0.712;ANN=C|missense_variant|MODERATE|KZA74_03385|KZA74_03385|transcript|KZA74_03385|protein_coding|1/1|c.370T>G|p.Cys124Gly|370/723|370/723|124/240||       GT:AD:DP:GQ:PL  1:0,323:323:99:12578,0
    CP079931        1594519 .       A       C       9115.04 PASS    AC=1;AF=1.00;AN=1;DP=239;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=28.16;SOR=0.782;ANN=C|missense_variant|MODERATE|KZA74_07445|KZA74_07445|transcript|KZA74_07445|protein_coding|1/1|c.558T>G|p.Asn186Lys|558/1122|558/1122|186/373||     GT:AD:DP:GQ:PL  1:0,232:232:99:9125,0
    CP079931        2140669 .       A       C       7145.04 PASS    AC=1;AF=1.00;AN=1;DP=184;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=25.70;SOR=0.965;ANN=C|missense_variant|MODERATE|KZA74_10170|KZA74_10170|transcript|KZA74_10170|protein_coding|1/1|c.1412T>G|p.Val471Gly|1412/1791|1412/1791|471/596||WARNING_TRANSCRIPT_NO_START_CODON GT:AD:DP:GQ:PL  1:0,180:180:99:7155,0
    CP079931        3583600 .       T       C       16124.04        PASS    AC=1;AF=1.00;AN=1;DP=426;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=28.42;SOR=0.779;ANN=C|upstream_gene_variant|MODIFIER|rplL|KZA74_16990|transcript|KZA74_16990|protein_coding||c.-23A>G|||||23|  GT:AD:DP:GQ:PL  1:0,409:409:99:16134,0

    #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  A19_17978_HQ_trimmed
    CP079931        499924  .       AT      A       4174.01 PASS    AC=1;AF=1.00;AN=1;DP=118;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=36.44;QD=30.23;SOR=9.455        GT:AD:DP:GQ:PL  1:0,112:112:99:4184,0
    CP079931        2415049 .       CA      C       6579.01 PASS    AC=1;AF=1.00;AN=1;DP=213;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=60.00;QD=32.57;SOR=1.085        GT:AD:DP:GQ:PL  1:0,202:202:99:6589,0
    CP079931        3695212 .       AT      A       342.01  PASS    AC=1;AF=1.00;AN=1;DP=10;FS=0.000;MLEAC=1;MLEAF=1.00;MQ=35.05;QD=34.20;SOR=4.804 GT:AD:DP:GQ:PL  1:0,10:10:99:352,0

    ./Outputs/bams/A10_CraA_HQ_trimmed.dedup.bam
    ./Outputs/bams/A6_WT_HQ_trimmed.dedup.bam
    ./snippy/A10_CraA_HQ/A10_CraA_HQ.bam
    ./snippy/A6_WT_HQ/A6_WT_HQ.bam

    TODO after meeting: import the two bam files and annotation of genes into IGV, go to the position craA gene and screenshot show if the gene regions are completely removed? Send the table and screenshot to Tam, say the annotation of genom of NCBI is still pending!

7 (optional), filtering process of SPANDx results

    #Input file is Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt (61203)
    #- Step1: filtering $6!=$7 (330)
    awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    #- Step2: restricted the first 7 columns (240)
    cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    #- Step3: filtering examples as "G/C" (1450)
    grep -v "/" f1_7 > f1_7_
    #- Step4: filtering examples as ".       A" (170)
    grep -v "\." f1_7_ > f1_7__
    #- Step5: filtering examples as "T       *" (162), since we want to have only SNPs differing between BK20399 and GE3138.
    grep -v "*" f1_7__ > f1_7___
    #- Step6: only SNPs (50).
    grep -v "INDEL" f1_7___ > f1_7____
    #- Step7: get the complete rows with the positions
    cut -d$'\t' -f2-2 f1_7____ > f2
    replace "\n" with " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
    grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    grep "46936" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    ....
    #- Step8: Missense mutations ()
    grep "missense_variant" All_SNPs_annotated.txt

8 (optional), merge results from two variant calling methods!

    #-- post-processing --
    awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    grep -v "/" f1_7 > f1_7_
    grep -v "\." f1_7_ > f1_7__
    grep -v "*" f1_7__ > f1_7___

    grep -v "INDEL" f1_7___ > f1_7_SNPs
    cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
    \n --> " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
    #grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    #grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    #grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt

    grep -v "SNP" f1_7___ > f1_7_indels
    cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
    \n --> " All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt\ngrep "
    #grep "CHROM" All_SNPs_indels_annotated_.txt > All_indels_annotated.txt
    #grep "49848" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
    #grep "53872" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt

9 (optional), PubMLST, ResFinder or RGI calling https://cge.cbs.dtu.dk/services/ https://cge.cbs.dtu.dk/services/MLST/ https://pubmlst.org/bigsdb?db=pubmlst_pacnes_seqdef&page=batchProfiles&scheme_id=3 https://cge.food.dtu.dk/services/ResFinder/ https://card.mcmaster.ca/home https://www.pseudomonas.com/ https://www.pseudomonas.com/amr/list https://card.mcmaster.ca/analyze/rgi The antimicrobial resistance genes were predicted by the tool RGI 6.0.0 which utilizes the database CARD 3.2.5 (PMID: 31665441). !!!!#check the difference of SNPs in the two files, and then look for the SNPs in my tables comparing the two isolates!!!!

    cut -d$'\t' -f9-21 BK20399_contigs.fa.txt > f9_21
    cut -d$'\t' -f2-6 BK20399_contigs.fa.txt > f2_6
    #Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
    paste f9_21 f2_6 > BK20399_.txt
    #grep "CARD_Protein_Sequence" BK20399_.txt > header
    grep -v "CARD_Protein_Sequence" BK20399_.txt > BK20399__.txt
    sort BK20399__.txt > BK20399_sorted.txt

    cut -d$'\t' -f9-21 GE3138_contigs.fa.txt > f9_21
    cut -d$'\t' -f2-6 GE3138_contigs.fa.txt > f2_6
    paste f9_21 f2_6 > GE3138_.txt
    grep -v "CARD_Protein_Sequence" GE3138_.txt > GE3138__.txt
    sort GE3138__.txt > GE3138_sorted.txt

    cut -d$'\t' -f1-6 BK20399_sorted.txt > BK20399_f1_6.txt  
    cut -d$'\t' -f1-6 GE3138_sorted.txt > GE3138_f1_6.txt
    diff BK20399_f1_6.txt GE3138_f1_6.txt
    #-->< APH(3'')-Ib   99.63   3002639 protein homolog model   n/a n/a

    cat header BK20399_sorted.txt > BK20399.txt
    cat header GE3138_sorted.txt > GE3138.txt

    ~/Tools/csv2xls-0.4/csv_to_xls.py BK20399.txt GE3138.txt -d$'\t' -o ARG_calling.xls

    #---- another example ----
    cut -d$'\t' -f9-21 AW27149_before.txt > f9_21
    cut -d$'\t' -f2-6 AW27149_before.txt > f2_6
    #Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
    paste f9_21 f2_6 > AW27149_before_.txt
    grep "CARD_Protein_Sequence" AW27149_before_.txt > header
    grep -v "CARD_Protein_Sequence" AW27149_before_.txt > AW27149_before__.txt
    sort AW27149_before__.txt > AW27149_before_sorted.txt

    cut -d$'\t' -f9-21 AW48641_after.txt > f9_21
    cut -d$'\t' -f2-6 AW48641_after.txt > f2_6
    paste f9_21 f2_6 > AW48641_after_.txt
    grep -v "CARD_Protein_Sequence" AW48641_after_.txt > AW48641_after__.txt
    sort AW48641_after__.txt > AW48641_after_sorted.txt

    cut -d$'\t' -f1-6 AW27149_before_sorted.txt > AW27149_before_f1_6.txt  
    cut -d$'\t' -f1-6 AW48641_after_sorted.txt > AW48641_after_f1_6.txt
    diff AW27149_before_f1_6.txt AW48641_after_f1_6.txt
    #48c48
    #< Pseudomonas aeruginosa CpxR   100.0   3004054 protein homolog model   n/a     n/a
    #---
    #> Pseudomonas aeruginosa CpxR   99.56   3004054 protein homolog model   n/a     n/a

    cat header AW27149_before_sorted.txt > ../RGI_calling_AW27149_before.txt
    cat header AW48641_after_sorted.txt > ../RGI_calling_AW48641_after.txt

    #mlst add header "isolate   species ST  acsA    aroE    guaA    mutL    nuoD    ppsA    trpE"
    mv 
    ~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_annotated.csv RGI_calling_AW27149_before.txt RGI_calling_AW48641_after.txt mlst.txt -d$'\t' -o SNP_ARG_calling.xls

    #Find the common SNP between SNP calling and RGI calling, since the SNP calling has bad annotation, marked yellow.

核糖体与自噬体:细胞平衡的关键

“自噬体”(Autophagosome)和”核糖体”(Ribosome)是细胞生物学中的两个重要结构,它们在细胞的功能中扮演着关键角色。以下是它们的中文解释:

自噬体(Autophagosome):自噬体是细胞在自噬过程中形成的一种双层膜结构。在自噬过程中,细胞会包裹并隔离那些需要被分解和回收的细胞内部组成部分,如损坏的细胞器或过时的蛋白质。这些被包裹的物质随后被运送到溶酶体,那里它们会被降解并回收利用。自噬体的形成和功能对于细胞的健康和稳定至关重要,特别是在细胞遭受压力或损伤时。

核糖体(Ribosome):核糖体是细胞中的一种复杂的分子机器,主要负责蛋白质的合成。核糖体由RNA和蛋白质组成,它们根据信使RNA(mRNA)的指令,将氨基酸连接起来,形成蛋白质链。核糖体在所有生命形式中都存在,是实现遗传信息表达和蛋白质生产的关键部分。在真核生物中,核糖体可以存在于细胞质中,也可以附着在内质网的表面。

自噬体(Autophagosome)和核糖体(Ribosome)在细胞中的关系主要体现在蛋白质合成和降解这两个细胞代谢和稳态调控的关键方面。

核糖体的作用:核糖体是细胞内负责蛋白质合成的机器。它们将信使RNA(mRNA)翻译成多肽链,从而制造出执行多种功能的蛋白质。核糖体对细胞的生长和维护至关重要。

自噬体的作用:自噬体则涉及到细胞成分的降解和回收,包括蛋白质。当蛋白质受损、错误折叠或不再需要时,它们可以被自噬体包裹。自噬体随后与溶酶体融合,其中的内容物被分解成基本成分(如氨基酸),这些成分可以被细胞重新利用。

在细胞健康和应激反应中的相互作用:这两部分的相互作用对于细胞的健康和应对压力至关重要。当细胞处于压力状态(如营养匮乏或损伤)时,自噬过程会被上调。这不仅有助于清除受损的蛋白质和细胞器,还为细胞提供了从这些降解成分中回收的基本构建块。在特定条件下,核糖体也可能通过自噬被降解,特别是在细胞需要在压力下降低蛋白质合成时。

质量控制:这两个系统都是细胞质量控制机制的一部分。核糖体确保蛋白质的准确合成,而自噬则移除可能有害或不再需要的蛋白质。

疾病含义:这两个系统的调控失常可能导致疾病。例如,自噬功能受损可能导致受损蛋白质和细胞器的积累,从而促成神经退行性疾病。同样,核糖体功能缺陷也可能导致与蛋白质合成相关的疾病。

总之,核糖体和自噬体以协调的方式工作,以维持细胞内新蛋白质的合成与旧或受损蛋白质的降解之间的平衡。这种平衡对细胞的健康和生存至关重要。

线粒体(Mitochondria)是细胞内的一种重要细胞器,常被称为“细胞的能量工厂”。它们在细胞呼吸过程中发挥关键作用,是大多数细胞能量(以三磷酸腺苷,即ATP的形式)的主要来源。线粒体的主要功能包括: 能量产生:通过氧化磷酸化过程,线粒体将营养物质转化为ATP,为细胞提供能量。 细胞代谢调控:线粒体参与多种代谢途径,包括脂肪酸的氧化、氨基酸的代谢等。 细胞凋亡:线粒体在调控细胞死亡过程中也起着重要作用。在某些情况下,线粒体会释放促凋亡因子,引导细胞走向程序性死亡。 钙离子储存:线粒体能够储存和释放钙离子,参与调节细胞内的钙平衡,影响多种细胞功能。 线粒体具有自己的DNA(称为线粒体DNA或mtDNA),与细胞核的DNA分开。这些mtDNA编码一些必要的线粒体蛋白质和RNA。线粒体的起源被普遍认为是一种古老的共生关系,即原始真核细胞通过内共生过程吞噬了一种原核细胞,后者逐渐演化成为现代细胞的线粒体。

Bubble plot for 1457∆atlE vs 1457-M10 vs 1457 vs mock

  1. R code for bubbleplot

    library(ggplot2)
    library(dplyr)
    library(readxl)
    
    # Assuming you have already read the data with read_excel
    mydat <- read_excel("Pathway_KEGG_1457_vs_mock_Top10.xlsx")
    
    # Custom function to convert fraction to decimal
    convert_fraction_to_decimal <- function(fraction) {
        parts <- strsplit(as.character(fraction), "/")[[1]]
        as.numeric(parts[1]) / as.numeric(parts[2])
    }
    
    mydat$GeneRatio <- sapply(mydat$Ratio, convert_fraction_to_decimal)
    mydat$Description <- factor(mydat$Description, levels = unique(mydat$Description))
    mydat$Category <- factor(mydat$Category, levels=c("Up-regulated","Down-regulated"))
    
    description_order <- rev(c("TNF signaling pathway","Legionellosis","Cytokine-cytokine receptor interaction","Protein processing in endoplasmic reticulum","Toxoplasmosis","Fluid shear stress and atherosclerosis","Pathways in cancer","JAK-STAT signaling pathway","IL-17 signaling pathway","Influenza A","Transcriptional misregulation in cancer","Glycine serine and threonine metabolism","Antifolate resistance","Base excision repair","Metabolic pathways","Acute myeloid leukemia","Homologous recombination","Fanconi anemia pathway","Primary immunodeficiency","MAPK signaling pathway"))
    mydat$Description <- factor(mydat$Description, levels = description_order)
    
    # Set the size for axis labels larger than the axis text
    axis_label_size <- 24
    
    # Now, create the plot        
    png("bubble_plot.png", width = 1000, height = 800)
    ggplot(mydat, aes(x = GeneRatio, y = Description)) +
      geom_point(aes(color = Category, size = Count, alpha = abs(log10(FDR)))) +
      scale_color_manual(values = c("Up-regulated" = "red", "Down-regulated" = "blue")) +
      scale_size_continuous(range = c(4, 10)) +
      labs(x = "GeneRatio", y = "Pathway name", color="Category", size="Count", alpha="-log10(FDR)") +
      theme(
        axis.text.x = element_text(angle = 20, vjust = 0.5, size = 20),
        axis.text.y = element_text(size = 20),
        axis.title.x = element_text(size = axis_label_size),
        axis.title.y = element_text(size = axis_label_size),
        legend.text = element_text(size = 20),
        legend.title = element_text(size = 20),
        plot.title = element_text(size = axis_label_size)
      ) +
      guides(color = guide_legend(override.aes = list(size = 10)), alpha = guide_legend(override.aes = list(size = 10)))
    dev.off()
  2. R code for bubbleplot2

    library(readxl)
    library(ggplot2)
    library(dplyr)
    library(magrittr)
    library(tidyr)
    library(forcats)
    
    # Read data from an Excel file
    mydat <- read_excel("1457_M10_atlE_DEGs_all_pathway-2.xlsx")
    
    mydat$Comparison <- factor(mydat$Comparison, levels=c("1457","1457-M10","1457∆atlE"))
    
    description_order <- rev(c("Protein processing in endoplasmic reticulum","TNF signaling pathway","Legionellosis","Epstein-Barr virus infection","Toxoplasmosis","Osteoclast differentiation","Proteasome","Influenza A","Herpes simplex infection","HIF-1 signaling pathway","NOD-like receptor signaling pathway","Apoptosis","C-type lectin receptor signaling pathway","MAPK signaling pathway","Endocytosis","Neurotrophin signaling pathway","Ubiquitin mediated proteolysis","Pancreatic cancer"))
    mydat$Description <- factor(mydat$Description, levels = description_order)
    
    png("bubble_plot2.png", 1000, 800)
    ggplot(mydat, aes(y = Description, x = Comparison)) +
      geom_point(aes(color = p.adjust), size = 10) + # Set fixed size for points
      labs(x = "", y = "", alpha="-log10(p.adjust)") +
      theme(axis.text.x = element_text(angle = 20, vjust = 0.5)) +
      theme(axis.text = element_text(size = 20)) +
      theme(legend.text = element_text(size = 20)) +
      theme(legend.title = element_text(size = 20)) +
      guides(size = "none") # Turn off size in legend
    dev.off()
  3. Input Excel for bubbleplot

    Description Size    Expect  Ratio   P Value FDR Category    Count
    TNF signaling pathway   110 93/373  42/839  2.22E-12    7.24E-10    Up-regulated    42
    Legionellosis   55  46/686  55/691  2.9E-10 4.72E-08    Up-regulated    55
    Cytokine-cytokine receptor interaction  294 24/956  26/046  1.95E-09    2.12E-07    Up-regulated    26
    Protein processing in endoplasmic reticulum 165 14/006  30/701  1.01E-07    8.27E-06    Up-regulated    30
    Toxoplasmosis   113 95/919  35/447  2.2E-07 1.43E-05    Up-regulated    35
    Fluid shear stress and atherosclerosis  139 11/799  31/359  1.52E-06    8.26E-05    Up-regulated    31
    Pathways in cancer  526 44/649  19/037  1.92E-05    0.000896    Up-regualted    19
    JAK-STAT signaling pathway  162 13/751  27/634  4.35E-05    0.00177 Up-regulated    27
    IL-17 signaling pathway 93  78/942  32/935  0.000285    1.0327E-06  Up-regulated    32
    Influenza A 171 14/515  25/491  0.000687    2.241E-06   Up-regulated    25
    Transcriptional misregulation in cancer 186 83/425  23/974  0.0002368   0.038559    Down-regulated  23
    Glycine serine and threonine metabolism 40  17/941  44/591  0.00032864  0.038559    Down-regulated  44
    Antifolate resistance   31  13/904  50/345  0.00035484  0.038559    Down-regulated  50
    Base excision repair    33  14/801  47/294  0.00053358  0.043487    Down-regulated  47
    Metabolic pathways  1305    58/532  12/984  0.0075183   0.40292 Down-regulated  12
    Acute myeloid leukemia  66  29/602  27/025  0.008932    0.40292 Down-regulated  27
    Homologous recombination    41  18/389  32/628  0.0092737   0.40292 Down-regulated  32
    Fanconi anemia pathway  54  24/220  28/902  0.0098875   0.40292 Down-regulated  28
    Primary immunodeficiency    37  16/595  30/129  0.023604    0.77755 Down-regulated  30
    MAPK signaling pathway  295 13/231  15/871  0.023851    0.77755 Down-regulated  15
  4. Input Excel for bubbleplot2

    Comparison  Description p.adjust
    1457    Protein processing in endoplasmic reticulum 6.7681E-08
    1457-M10    Protein processing in endoplasmic reticulum 4.6253E-06
    1457∆atlE   Protein processing in endoplasmic reticulum 2.0787E-05
    1457    TNF signaling pathway   2.6941E-05
    1457-M10    TNF signaling pathway   4.5734E-06
    1457∆atlE   TNF signaling pathway   3.3099E-06
    1457    Legionellosis   0.0062434
    1457-M10    Legionellosis   4.6253E-06
    1457∆atlE   Legionellosis   0.0073192
    1457    Epstein-Barr virus infection    0.0062434
    1457-M10    Epstein-Barr virus infection    1.6635E-06
    1457∆atlE   Epstein-Barr virus infection    0.00049454
    1457    Toxoplasmosis   0.0064469
    1457    Osteoclast differentiation  4.6509E-06
    1457-M10    Osteoclast differentiation  2.6616E-05
    1457    Proteasome  1.0391E-05
    1457    Influenza A 1.4677E-05
    1457    Herpes simplex infection    1.5915E-05
    1457∆atlE   Herpes simplex infection    1.857E-06
    1457    HIF-1 signaling pathway 1.6873E-05
    1457-M10    NOD-like receptor signaling pathway 2.22E-06
    1457∆atlE   NOD-like receptor signaling pathway 0.0096378
    1457-M10    Apoptosis   9.54E-06
    1457-M10    C-type lectin receptor signaling pathway    1.37E-05
    1457-M10    MAPK signaling pathway  5.3439E-05
    1457-M10    Endocytosis 5.49E-05
    1457∆atlE   Endocytosis 1.857E-06
    1457∆atlE   Neurotrophin signaling pathway  0.00049454
    1457∆atlE   Ubiquitin mediated proteolysis  0.0088734
    1457∆atlE   Pancreatic cancer   1.857E-06

肺部疾病概述:从肺移植到支气管扩张症

几种肺部疾病概述:

  • 肺移植 (Lung Transplantation) (Lungentransplantationsambulanz): 这通常是针对终末期肺部疾病的最后治疗手段,意味着患者的肺功能已经严重衰竭,需要通过移植新的肺部来维持生命。

  • 重度慢性阻塞性肺病 (Severe Chronic Obstructive Pulmonary Disease, COPD) (Ambulanz für schwere COPD): 这是COPD的一种严重形式,影响患者的呼吸功能,常常伴随有持续性气流阻塞和其他并发症。

  • 肺囊性纤维化 (Cystic Fibrosis) (Ambulanz für Cystische Fibrose): 一种遗传性疾病,主要影响肺部,导致持续性肺部感染和粘液堆积,严重影响患者的呼吸和消化系统。

  • 肺动脉高压 (Pulmonary Hypertension) (Ambulanz für pulmonale Hypertonie): 是指肺部血管的压力异常升高,这种情况可能导致心脏负担加重,甚至心力衰竭。

  • 间质性肺部疾病 (Interstitial Lung Disease) (Ambulanz für interstitielle Lungenerkrankungen): 这是一组影响肺部间质(肺组织中的支持结构)的疾病,可能导致持续性肺部纤维化和气体交换功能障碍。

  • 支气管扩张症 (Bronchiectasis) (Bronchiektasen): 这种疾病导致支气管永久性扩张,引发反复的肺部感染和痰液积聚,长期影响患者的呼吸功能。

表皮葡萄球菌atlE基因及其医疗影响

“atlE” 基因在人类遗传学中并不广为人知,但在微生物学领域,特别是在研究表皮葡萄球菌(Staphylococcus epidermidis)时,这个基因具有重要意义。表皮葡萄球菌是一种通常存在于人类皮肤上的细菌,是我们自然菌群的重要组成部分。虽然通常情况下它是无害的,但在医院环境中,尤其是对于免疫系统较弱或植入医疗设备的患者来说,它可能成为一个问题。

以下是对表皮葡萄球菌中的 “atlE” 基因的简要概述:

  1. 功能和作用:atlE 基因编码自溶素蛋白 AtlE,这是细菌细胞壁的主要成分。自溶素是参与细胞壁代谢和分裂的酶,它们参与分解肽聚糖,肽聚糖是细菌细胞壁的主要成分。

  2. 在附着中的重要性:AtlE 在表皮葡萄球菌最初附着到表面(如医疗设备)的过程中尤为重要。这种附着是生物膜形成的关键步骤,生物膜是由细菌构成的结构化社群。生物膜之所以重要,是因为它们对抗生素具有很强的抵抗力,并且是医院环境中慢性感染和并发症的主要原因。

  3. 生物膜形成:表皮葡萄球菌在植入人体的医疗设备(如导尿管和人工关节)上形成生物膜的能力,使其成为医院获得性感染的显著原因。atlE 基因通过其产物 AtlE 促进这一过程,成为理解和可能减轻这些感染的关注目标。

  4. 研究意义:理解像 atlE 这样的基因在细菌附着和生物膜形成中的作用,可以帮助开发新的策略来预防或治疗由生物膜形成细菌引起的感染。这在医院环境中尤为重要,因为与植入医疗设备有关的感染是一个主要的关注点。

总而言之,虽然 atlE 基因在人类遗传学中可能并不相关,但它在研究某些细菌物种,特别是在理解感染机制和开发对抗医院获得性感染的策略方面非常重要。

GSVA plots & tables for RNA-seq and NanoString

  1. preparing gene expression matrix: calculate DESeq2 results

    #Input: merged_gene_counts.txt
    
    setwd("/home/jhuang/DATA/Data_Susanne_Carotis_RNASeq/run_2023_GSVA_table/")
    
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    #BiocManager::install("org.Hs.eg.db")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    
    d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1)
    
    colnames(d.raw)<-c("gene_name", "leer_mock_2h_r2", "Ace2_mock_2h_r2", "leer_inf_24h_r1", "Ace2_inf_2h_r1", "leer_inf_24h_r2", "leer_inf_2h_r1", "leer_mock_2h_r1", "leer_inf_2h_r2", "Ace2_inf_2h_r2", "Ace2_mock_2h_r1", "Ace2_inf_24h_r2", "Ace2_inf_24h_r1")
    
    col_order <- c("gene_name", "leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2")
    
    reordered.raw <- d.raw[,col_order]
    reordered.raw$gene_name <- NULL
    #d <- d.raw[rowSums(reordered.raw>3)>2,]
    
    condition = as.factor(c("leer_mock_2h","leer_mock_2h","leer_inf_2h","leer_inf_2h","leer_inf_24h","leer_inf_24h","Ace2_mock_2h","Ace2_mock_2h","Ace2_inf_2h","Ace2_inf_2h","Ace2_inf_24h","Ace2_inf_24h"))
    ids = as.factor(c("leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2"))
    
    #cData = data.frame(row.names=colnames(reordered.raw), condition=condition,  batch=batch, ids=ids)
    #dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~batch+condition)
    cData = data.frame(row.names=colnames(reordered.raw), condition=condition, ids=ids)
    dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~condition)
    
    #----more detailed and specific with the following code!----
    dds$condition <- relevel(dds$condition, "Ace2_mock_2h")
    dds = DESeq(dds, betaPrior=FALSE)  # betaPrior default value is FALSE
    resultsNames(dds)
  2. preparing selected_geneSets in gsva(exprs, selected_geneSets, method=”gsva”). Note that methods are different than methods for nanoString, here are ENSEMBL listed.

    #Input: "Signatures.xls" + "Signatures_additional.xls"
    library(readxl)
    library(gridExtra)
    library(ggplot2)
    library(GSVA)
    # Paths to the Excel files
    file_paths <- list("Signatures.xls", "Signatures_additional.xls")
    # Get sheet names for each file
    sheet_names_list <- lapply(file_paths, excel_sheets)
    
    # Initialize an empty list to hold gene sets
    geneSets <- list()
    # Loop over each file path and its corresponding sheet names
    for (i in 1:length(file_paths)) {
      file_path <- file_paths[[i]]
      sheet_names <- sheet_names_list[[i]]
      # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
      for (sheet in sheet_names) {
        # Read the sheet
        data <- read_excel(file_path, sheet = sheet)
    
        # Process the GeneSet names (replacing spaces with underscores, for example)
        gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
    
        # Add ENSEMBL IDs to the list
        geneSets[[gene_set_name]] <- as.character(data$ENSEMBL)
      }
    }
    
    # Print the result to check
    #print(geneSets)
    #summary(geneSets)
    ##desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
    #desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome")
    #selected_geneSets <- geneSets[desired_geneSets]
    selected_geneSets <- geneSets
    # Print the selected gene sets
    print(selected_geneSets)
  3. prepare violin plots and p-value table for RNA-seq data

    # 0. for RNAseq, the GSVA input requires a gene expression matrix where rows are genes and columns are samples. This matrix must be in non-log space.
    exprs <- counts(dds, normalized=TRUE)
    
    # 1. Compute GSVA scores:
    gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
    
    # 2. Convert to data.frame for ggplot:
    gsva_df <- as.data.frame(t(gsva_scores))
    
    # 3. Add conditions to gsva_df:
    gsva_df$Condition <- dds$condition
    
    # 4. Filter the gsva_df to retain only the desired conditions:
    #group 1 vs. group 3 in the nanostring data
    gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]
    
    # 5. Define a function to plot violin plots:
    # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
    gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "Group3", gsva_df_filtered$Condition)  #group3=mock
    gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "Group1a", gsva_df_filtered$Condition)  #group1a=infection
    gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
    
    #plot_violin <- function(data, gene_name) {
    #  # Calculate the t-test p-value for the two conditions
    #  condition1_data <- data[data$Condition == "Group1a", gene_name]
    #  condition2_data <- data[data$Condition == "Group3", gene_name]
    #  p_value <- t.test(condition1_data, condition2_data)$p.value
    #  # Convert p-value to annotation
    #  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
    #  rounded_p_value <- paste0("p = ", round(p_value, 2))
    #  plot_title <- gsub("_", " ", gene_name)
    #  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
    #    geom_violin(linewidth=1.2) + 
    #    scale_fill_manual(values = custom_colors) +
    #    labs(title=plot_title, y="GSVA Score") +
    #    ylim(-1, 1) +
    #    theme_light() +
    #    theme(
    #      axis.title.x = element_text(size=12),
    #      axis.title.y = element_text(size=12),
    #      axis.text.x  = element_text(size=10),
    #      axis.text.y  = element_text(size=10),
    #      plot.title   = element_text(size=12, hjust=0.5),
    #      legend.position = "none" # Hide legend since the colors are self-explanatory
    #    )
    #  # Add p-value annotation to the plot
    #  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
    #  return(p)
    #}
    ## 6. Generate the list of plots in a predefined order:
    #genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    #genes <- genes[match(desired_order, genes)]
    #genes <- genes[!is.na(genes)]
    #first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
    
    # -- This following code does not have the colors in the figure --
    plot_violin <- function(data, gene_name) {
      # Calculate the t-test p-value for the two conditions
      condition1_data <- data[data$Condition == "Group1a", gene_name]
      condition2_data <- data[data$Condition == "Group3", gene_name]
      p_value <- t.test(condition1_data, condition2_data)$p.value
    
      # Convert p-value to annotation
      p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
      rounded_p_value <- paste0("p = ", round(p_value, 2))
    
      plot_title <- gsub("_", " ", gene_name)
      p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
        geom_violin(linewidth=1.2) + 
        labs(title=plot_title, y="GSVA Score") +
        ylim(-1, 1) +
        theme_light() +
        theme(
          axis.title.x = element_text(size=12),
          axis.title.y = element_text(size=12),
          axis.text.x  = element_text(size=10),
          axis.text.y  = element_text(size=10),
          plot.title   = element_text(size=12, hjust=0.5)
        )
    
      # Add p-value annotation to the plot
      p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
    
      #return(p)
    
      # Return both the plot and the p-value
      list(plot = p, p_value = p_value)
    }
    
    # 6. Generate the list of plots in a predefined order:
    #desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome")
    genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    #genes <- genes[match(desired_order, genes)]
    #first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
    
    # Correct the creation of p_values_df
    p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
    p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
    
    # Calculate adjusted p-values
    #p_values_df$Adjusted_P_Value <- p.adjust(p_values_df$P_Value, method = "fdr")
    
    write.xlsx(p_values_df, "p_values_df.xlsx")

5.1. preparing selected_geneSets in gsva(exprs, selected_geneSets, method=”gsva”) for NanoString

    #Input: Signatures.xls

    library(readxl)
    library(gridExtra)
    library(ggplot2)
    library(GSVA)

    # Paths to the Excel files
    file_paths <- list("Signatures.xls", "Signatures_additional.xls")

    # Get sheet names for each file
    sheet_names_list <- lapply(file_paths, excel_sheets)

    # Initialize an empty list to hold gene sets
    geneSets <- list()

    # Loop over each file path and its corresponding sheet names
    for (i in 1:length(file_paths)) {
      file_path <- file_paths[[i]]
      sheet_names <- sheet_names_list[[i]]

      # Loop over each sheet, extract the ENSEMBL IDs, and add to the list
      for (sheet in sheet_names) {
        # Read the sheet
        data <- read_excel(file_path, sheet = sheet)

        # Process the GeneSet names (replacing spaces with underscores, for example)
        gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])

        # Add ENSEMBL IDs to the list
        geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
      }
    }

    # Print the result to check
    summary(geneSets)

    #desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
    desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome", "T_cells",    "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
    selected_geneSets <- geneSets[desired_geneSets]
    # Print the selected gene sets
    print(selected_geneSets)

5.2. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1)

    #library(rmarkdown); render("run.Rmd", c("html_document")) under jhuang@hamburg:~/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023_2_GSVA_table with R

    # 0. for Nanostring, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples. This matrix must be in non-log space.
    exprs <- exprs(target_m666Data)  #18677    45
    #exprs <- exprs(filtered_or_neg_target_m666Data)  #2274   45
    # 1. Compute GSVA scores:
    gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
    # 2. Convert to data.frame for ggplot:
    gsva_df <- as.data.frame(t(gsva_scores))
    # 3. Add conditions to gsva_df:
    identical(rownames(pData(target_m666Data)), rownames(gsva_df))
    gsva_df$Condition <- pData(target_m666Data)$Grp
    #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
    gsva_df$SampleID <- pData(target_m666Data)$SampleID
    # 4. Filter the gsva_df to retain only the desired conditions:
    #group 1 vs. group 3 in the nanostring data
    gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
    # 5. Define a function to plot violin plots:
    # Define custom colors
    custom_colors <- c("Group1" = "lightblue", "Group1a" = "red", "Group3" = "grey")
    #To implement the custom colors, and make the adjustments to abbreviate "Inflammatory" and "Suppressive", as well as increase the font size for the groups on the x-axis, we can modify the plot_violin function as follows:
    gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
    plot_violin <- function(data, gene_name) {
      # Calculate the t-test p-value for the two conditions
      condition1_data <- data[data$Condition == "Group1", gene_name]
      condition2_data <- data[data$Condition == "Group3", gene_name]
      p_value <- t.test(condition1_data, condition2_data)$p.value
      # Convert p-value to annotation
      p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
      rounded_p_value <- paste0("p = ", round(p_value, 2))
      plot_title <- gsub("_", " ", gene_name)
      p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
        geom_violin(linewidth=1.2) + 
        scale_fill_manual(values = custom_colors) +
        labs(title=plot_title, y="GSVA Score") +
        ylim(-1, 1) +
        theme_light() +
        theme(
          axis.title.x = element_text(size=12),
          axis.title.y = element_text(size=12),
          axis.text.x  = element_text(size=10),
          axis.text.y  = element_text(size=10),
          plot.title   = element_text(size=12, hjust=0.5),
          legend.position = "none" # Hide legend since the colors are self-explanatory
        )
      # Add p-value annotation to the plot
      p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
      #return(p)

      # Return both the plot and the p-value
      list(plot = p, p_value = p_value)
    }

    library(openxlsx)

    desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex",   "NLRP3_inflammasome", "T_cells",    "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
    genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    genes <- genes[match(desired_order, genes)]
    genes <- genes[!is.na(genes)]
    #second_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

    # Correct the creation of p_values_df
    p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
    p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))

    write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1_19sig.xlsx")

5.3. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1a)

    # 2. Convert to data.frame for ggplot:
    gsva_df <- as.data.frame(t(gsva_scores))
    # 3. Add conditions to gsva_df:
    identical(rownames(pData(target_m666Data)), rownames(gsva_df))
    gsva_df$Condition <- pData(target_m666Data)$Group
    #identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
    gsva_df$SampleID <- pData(target_m666Data)$SampleID
    # 4. Filter the gsva_df to retain only the desired conditions:
    #group 1 vs. group 3 in the nanostring data
    gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]
    # 5. Define a function to plot violin plots:
    # Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
    gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
    gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
    plot_violin <- function(data, gene_name) {
      # Calculate the t-test p-value for the two conditions
      condition1_data <- data[data$Condition == "Group1a", gene_name]
      condition2_data <- data[data$Condition == "Group3", gene_name]
      p_value <- t.test(condition1_data, condition2_data)$p.value
      # Convert p-value to annotation
      p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
      rounded_p_value <- paste0("p = ", round(p_value, 2))
      plot_title <- gsub("_", " ", gene_name)
      p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
        geom_violin(linewidth=1.2) + 
        scale_fill_manual(values = custom_colors) +
        labs(title=plot_title, y="GSVA Score") +
        ylim(-1, 1) +
        theme_light() +
        theme(
          axis.title.x = element_text(size=12),
          axis.title.y = element_text(size=12),
          axis.text.x  = element_text(size=10),
          axis.text.y  = element_text(size=10),
          plot.title   = element_text(size=12, hjust=0.5),
          legend.position = "none" # Hide legend since the colors are self-explanatory
        )
      # Add p-value annotation to the plot
      p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
      #return(p)

      # Return both the plot and the p-value
      list(plot = p, p_value = p_value)
    }
    # 6. Generate the list of plots in a predefined order:
    genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
    genes <- genes[match(desired_order, genes)]
    genes <- genes[!is.na(genes)]

    # Correct the creation of p_values_df
    p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
    p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))

    write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1a_19sig.xlsx")

Small RNA sequencing processing in the example of smallRNA_7

  1. adapter sequence

    Lexogen small RNA-Seq kit
    
    some common adapter sequences from different kits for reference:
    
        - TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
        - Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
        - Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
        - NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
        - LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
    
    [Header],,,,,
    IEMFileVersion,4,,,,
    InvestigatorName,ag96,,,,
    ExperimentName,ag96,,,,
    Date,16.10.2023,,,,
    Workflow,GenerateFASTQ,,,,
    Application,NextSeqFASTQOnly,,,,
    Assay,TruSeq HT,,,,
    Description,pcr,,,,
    Chemistry,Amplicon,,,,
    ,,,,,
    [Reads],,,,,
    82,,,,,
    ,,,,,
    ,,,,,
    [Settings],,,,,
    Adapter,TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC,,,,
    ,,,,,
    ,,,,,
    [Data],,,,,
    Sample_ID,Sample_Name,I7_Index_ID,index,Sample_Project,Description
    nf930,01_0505_WaGa_wt_EV_RNA,SRi7001,CAGCGT,2023_064_nf_ute,smallRNA-Seq
    nf931,02_0505_WaGa_sT_DMSO_EV_RNA,SRi7002,GATCAC,2023_064_nf_ute,smallRNA-Seq
    nf932,03_0505_WaGa_sT_Dox_EV_RNA,SRi7003,ACCAGT,2023_064_nf_ute,smallRNA-Seq
    nf933,04_0505_WaGa_scr_DMSO_EV_RNA,SRi7004,TGCACG,2023_064_nf_ute,smallRNA-Seq
    nf934,05_0505_WaGa_scr_Dox_EV_RNA,SRi7005,ACATTA,2023_064_nf_ute,smallRNA-Seq
    nf935,06_1905_WaGa_wt_EV_RNA,SRi7006,GTGTAG,2023_064_nf_ute,smallRNA-Seq
    nf936,07_1905_WaGa_sT_DMSO_EV_RNA,SRi7007,CTAGTC,2023_064_nf_ute,smallRNA-Seq
    nf937,08_1905_WaGa_sT_Dox_EV_RNA,SRi7008,TGTGCA,2023_064_nf_ute,smallRNA-Seq
    nf938,09_1905_WaGa_scr_DMSO_EV_RNA,SRi7009,TCAGGA,2023_064_nf_ute,smallRNA-Seq
    nf939,10_1905_WaGa_scr_Dox_EV_RNA,SRi7010,CGGTTA,2023_064_nf_ute,smallRNA-Seq
    nf940,11_control_MKL1,SRi7011,TTAACT,2023_064_nf_ute,smallRNA-Seq
    nf941,12_control_WaGa,SRi7012,ATGAAC,2023_064_nf_ute,smallRNA-Seq
  2. input data

    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz         0505_WaGa_wt.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz    0505_WaGa_sT_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz     0505_WaGa_sT_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz   0505_WaGa_scr_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz    0505_WaGa_scr_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz         1905_WaGa_wt.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz    1905_WaGa_sT_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz     1905_WaGa_sT_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz   1905_WaGa_scr_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz   1905_WaGa_scr_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf940/11_control_MKL1_S11_R1_001.fastq.gz               control_MKL1.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf941/12_control_WaGa_S12_R1_001.fastq.gz               control_WaGa.fastq.gz
  3. run cutadapt

    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 -o ${sample}_cutadapted.fastq.gz --minimum-length 5 --trim-n ${sample}.fastq.gz >> LOG
    done
    #jhuang@hamburg:~/DATA/Data_Ute/Data_Ute_smallRNA_7$ fastp -i 0505_WaGa_wt.fastq.gz -o 0505_WaGa_wt3.fastq.gz -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
  4. run COMPSRA

    ln -s ../Data_Ute_smallRNA_3/bundle_v1 .
    #change the reference of 'hg38 -> hg38_orig' to 'hg38 -> hg38_miRNA_JN707599' (other options are hg38_WaGa_KJ128379 and hg38_MKL1_FJ173815)
    cd bundle_v1/db/stars; rm hg38; ln -s hg38_miRNA_JN707599 hg38;
    
    # DEBUG_1: Make sure the file COMPSRA.jar under Data_Ute_smallRNA_7
    # DEBUG_2: "-qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4" does not work! Using cutadapt -a xxxx -q 20 replace those parameters!
    
    mkdir out_out
    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      echo "mkdir our_out/${sample}_cutadapted/"
    done
    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      echo "java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in ${sample}_cutadapted.fastq.gz -out ./our_out/"
    done
    
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in control_MKL1_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in control_WaGa_cutadapted.fastq.gz -out ./our_out/
    #END
    
    #4.2.3 -rb/-rm_bias n
    #To remove n random bases in both 5’ (5-prime) and 3’ (3-prime) ends after removing the adapter sequence.
    #4.2.4 -rh/-rm_low_quality_head score
    #To remove the low quality bases with the score less than score from 5’ (5-prime) end.
    #4.2.5 -rt/-rm_low_quality_tail score
    #To remove the low quality bases with the score less than score from 3’ (3-prime) end.
    #4.2.6 -rr/-rm_low_quality_read score
    #To remove the low quality reads with the average score less than score.
    #4.6.3 -fdclass/-fun_diff_class A1,A2,...,An
    #To set the small RNAs that will be performed the differential expression analysis. The format is the same as the parameter -ac/-ann_class A1,A2,...,An.
    #4.6.4 -fdcase/-fun_diff_case ID1,ID2,...,IDn
    #To set the IDs of case samples.
    #4.6.5 -fdctrl/-fun_diff_control ID1,ID2,...,IDn
    #To set the IDs of control samples.
    #4.4.2 -ac/-ann_class A1,A2,...,An
    #To set the small RNA categories that will be annotated. The index of small RNA is listed:
    #    1 miRNA
    #    2 piRNA
    #    3 tRNA
    #    4 snoRNA
    #    5 snRNA
    #    6 circRNA
    
    java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-5 -fdclass 1,2,3,4,5 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
    java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5 -fdcase 1-2 -fdctrl 3-6 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
  5. get the read number of virus genome (see also ~/DATA/Data_Ute/Data_Ute_smallRNA_3/README_virusgenome)

    #rsync -a -P jhuang@newton.ibvt.uni-stuttgart.de:~/COMPSRA_V1.0.2/our_out/pfsk1_mcvsyn_TSQ ./
    #!!NOTE that the used COMPSRA.jar --> Data_Ute_smallRNA_3/COMPSRA_V1.0.2, the database (changable) --> Data_Ute_smallRNA_3/bundle_v1!
    
    #sample.list
    #0505_WaGa_sT_DMSO
    #1905_WaGa_sT_DMSO
    #0505_WaGa_sT_Dox
    #1905_WaGa_sT_Dox
    #0505_WaGa_scr_DMSO
    #1905_WaGa_scr_DMSO
    #0505_WaGa_scr_Dox
    #1905_WaGa_scr_Dox
    #0505_WaGa_wt
    #1905_WaGa_wt
    #control_MKL1
    #control_WaGa
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      mv ${sample}_cutadapted ${sample}
    done
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
        cd ${sample}
        #samtools sort ${sample}_cutadapted_STAR_Aligned.out.bam > sorted.bam
        #samtools index sorted.bam
        #samtools view -h sorted.bam "JN707599:1-5387" > JN707599.sam
        #samtools view -Sb JN707599.sam > JN707599.bam
        #samtools view -h sorted.bam "JN707599:1-5387" | samtools view -Sb - > JN707599.bam
        samtools view -h sorted.bam | grep -E 'JN707599' | samtools view -Sb - > JN707599.bam
        #samtools view -h sorted.bam | grep -E '^@|chr' | samtools view -b > hg38.bam
        cd ..
    done
    
    # #https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
    # #https://www.researchgate.net/post/How_to_find_number_of_reads_aligned_to_a_particular_region_of_a_chromosome_from_sam_file
    # echo -e 'JN707599\t4347\t4368' > region_5p.bed
    # intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_5p.bed -bed | wc -l
    # #382487    -->  382487
    # echo -e 'JN707599\t4381\t4402' > region_3p.bed
    # intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_3p.bed -bed | wc -l
    # #172659
    # #Total = 382487+172659=555146   
    # echo -e 'JN707599\t4332\t4415' > region_primary.bed
    # intersectBed -abam JN707599.bam -b region_primary.bed -bed | wc -l
    # #NOTE that 556752 > 555146
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_JN707599_M1
      cd ${sample}
      intersectBed -abam JN707599.bam -b ../region_all.bed -bed | wc -l >> ../LOG_JN707599_M1
      intersectBed -abam JN707599.bam -b ../region_primary.bed -bed | wc -l >> ../LOG_JN707599_M1
      cd ..
    done
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_read_number
      zcat ${sample}2.fastq.gz | echo $((`wc -l`/4)) >> LOG_read_number
    done
    
    cd our_out_on_hg38+JN707599
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_hg38
      echo "---- ${sample} ----" >> LOG_JN707599
      echo "---- ${sample} ----" >> LOG_hg38_alignments
      echo "---- ${sample} ----" >> LOG_JN707599_alignments
      cd ${sample}
      #-F 260: Excludes secondary alignments (flag 256) and unmapped reads (flag 4).
      samtools view -c -F 260 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38
      samtools view -c -F 260 sorted.bam JN707599 >> ../LOG_JN707599
      samtools view -c -F 4 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38_alignments
      samtools view -c -F 4 sorted.bam JN707599 >> ../LOG_JN707599_alignments
      cd ..
    done
    
    #for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
    #  echo "---- ${sample} ----" >> LOG_hg38
    #  cd ${sample}
    #  samtools flagstat ${sample}_cutadapted_STAR_Aligned.out.bam >> ../LOG_hg38
    #  cd ..
    #done
    #for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
    #  echo "---- ${sample} ----" >> LOG_JN707599
    #  cd ${sample}
    #  samtools flagstat JN707599.bam >> ../LOG_JN707599
    #  cd ..
    #done
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_alignments
      cd ${sample}
      samtools flagstat ${sample}2_STAR_Aligned.out.bam >> ../LOG_alignments
      cd ..
    done
    grep "mapped (" LOG_alignments
    
    #The following results calculate raw counts (Note: If you only want to merge the count files, you can use -fm -fms.)
    java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-12 -fdclass 1,2,3,4,5,6 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
    ##The following command calculate statistical test after defining case and control.
    #java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5,6 -fdcase 1-10 -fdctrl 11-12 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
    
    # sed -i -e 's/_August/-08/g' COMPSRA_MERGE_0_miRNA.txt
    # sed -i -e 's/_November/-11/g' COMPSRA_MERGE_0_miRNA.txt
    # sed -i -e 's/_June/-06/g' COMPSRA_MERGE_0_miRNA.txt
    sed -i -e 's/_cutadapted_STAR_Aligned_miRNA.txt//g' COMPSRA_MERGE_0_miRNA.txt
    # #sed -i -e 's/_piRNA.txt//g' COMPSRA_MERGE_0_piRNA.txt
    # #sed -i -e 's/_tRNA.txt//g' COMPSRA_MERGE_0_tRNA.txt
    # #sed -i -e 's/_snoRNA.txt//g' COMPSRA_MERGE_0_snoRNA.txt
    # #sed -i -e 's/_snRNA.txt//g' COMPSRA_DEG_0_snRNA.txt
    # sed -i -e 's/_August/-08/g' COMPSRA_DEG_0_miRNA.txt
    # sed -i -e 's/_November/-11/g' COMPSRA_DEG_0_miRNA.txt
    # sed -i -e 's/_June/-06/g' COMPSRA_DEG_0_miRNA.txt
    # sed -i -e 's/_STAR_Aligned_miRNA.txt//g' COMPSRA_DEG_0_miRNA.txt
    
    #python3
    import pandas as pd
    df = pd.read_csv('COMPSRA_MERGE_0_miRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    # Assuming df is your DataFrame
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_miRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_piRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_piRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_snoRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_snoRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_snRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_snRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_tRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_tRNA_.txt', sep='\t')
    #samtools flagstat **.bam
    #47217410 + 0 in total (QC-passed reads + QC-failed reads)
    #45166321 + 0 mapped (95.66% : N/A)
    #2051089 + 0 in total (QC-passed reads + QC-failed reads)
    #TODO: check the microRNA in the paper mentioned in which position?
    #Single publications on EVs as transport vehicles for specific miRNAs in the pathogenesis of Merkel cell carcinoma have also been reported, such as miR-375 and its function in proliferation
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_MERGE_0_miRNA_.txt \
    COMPSRA_MERGE_0_piRNA_.txt \
    COMPSRA_MERGE_0_tRNA_.txt \
    COMPSRA_MERGE_0_snoRNA_.txt \
    COMPSRA_MERGE_0_snRNA_.txt \
    -d$'\t' -o raw_counts_hg38.xls;
    
    # # sorting the DEG table, change the sheet labels to 'miRNA_between_columns_B-C_and_columns_D-G'
    # ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_DEG_0_miRNA.txt -d$'\t' -o normalized_and_significance_test_miRNA.xls;
    
    ##merging the row counts and statical values
    #cut -f1-1 COMPSRA_MERGE_0_snoRNA.txt > f1_MERGE
    #cut -f1-1 COMPSRA_DEG_0_snoRNA.txt > f1_DEG
    #cut -f1-1 COMPSRA_MERGE_0_miRNA.txt > f1_MERGE
    #cut -f1-1 COMPSRA_DEG_0_miRNA.txt > f1_DEG
    #diff f1_MERGE f1_DEG
    
    sT: This abbreviation could stand for various things depending on the context of the research. In some studies, it might refer to "short-term," "signal transducer," "small T antigen," or other terms relevant to the specific area of study. The exact meaning would depend on the context in which these samples are being used.
    
    scr: This is likely short for "scramble" or "scrambled." In the context of genetic research, it often refers to a control group where cells have been treated with a scrambled sequence of RNA or DNA. This serves as a baseline to compare against other groups where specific genes are targeted.
    
    Dox: This usually stands for "Doxorubicin," which is a type of chemotherapy medication used in cancer treatment. It's also commonly used in research to study cellular responses to chemotherapy.
    
    DMSO: Short for "Dimethyl Sulfoxide," DMSO is an organic solvent that is often used as a vehicle in biological experiments. It's used to dissolve other substances and carry them into cells. In control experiments, DMSO is used without the active drug/compound to differentiate the effects of the solvent from the effects of the drug/compound being tested.
    
    WaGa: This could be a specific cell line or experimental condition used in your research. Cell lines often have specific names that are abbreviated in this way.
    
    wt: This typically stands for "wild type." In genetic research, "wild type" refers to organisms or cells that have not undergone any genetic modification and are thus considered to be the standard or normal phenotype.
    
    Control_MKL1 and Control_WaGa (parental cell): These seem to be control samples for the experiment. "Control" indicates that these samples are used as standards or baselines for comparison. "MKL1" and "WaGa" are likely specific identifiers for the controls used.
    
    sT vs scr (condition): This comparison could be named based on the nature of the genetic or molecular treatment in the samples. If "sT" and "scr" refer to different genetic constructs or treatments (such as a specific gene modification versus a scrambled sequence), you could name this comparison something like "Genetic Modification Comparison" or "Treatment Type Comparison." The exact name would depend on what "sT" and "scr" specifically represent in your study.
    
    Dox vs DMSO (treatment): Since "Dox" likely stands for Doxorubicin (a chemotherapy drug) and "DMSO" is a solvent used as a control, this comparison is between a drug treatment and a control treatment. You could name this comparison "Drug Treatment vs Control" or more specifically "Doxorubicin Treatment vs Solvent Control."
    
    WaGa vs MKL-1 (cell line): As you've identified these as cell lines, this comparison is straightforward. It's a comparison between two different cell lines. You might name it "Cell Line Comparison: WaGa vs MKL-1."
    
    0505 vs 1905 (donor): ****
  6. draw plots with R using DESeq2 (~/DATA/Data_Ute/Data_Ute_smallRNA_3_nextflow/README_draw_miRNA_plots)

    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    setwd("/home/jhuang/DATA/Data_Ute/Data_Ute_smallRNA_7/our_out_on_hg38+JN707599/")
    
    d.raw<- read.delim2("COMPSRA_MERGE_0_miRNA.txt",sep="\t", header=TRUE, row.names=1)
    d.raw$X <- NULL
    #colnames(d.raw)<- c("WaGa_EV", "MKL1_EV", "WaGa_wt", "MKL1_wt")
    d.raw[] <- lapply(d.raw, as.numeric)
    
    #MCPyV-M1   0   0   29  0   23  0   30  8   0   0   202 196
    # New row to add
    new_row <- data.frame(
      X0505_WaGa_sT_DMSO = 0,
      X1905_WaGa_sT_DMSO = 0,
      X0505_WaGa_sT_Dox = 29,
      X1905_WaGa_sT_Dox = 0,
      X0505_WaGa_scr_DMSO = 23,
      X1905_WaGa_scr_DMSO = 0,
      X0505_WaGa_scr_Dox = 30,
      X1905_WaGa_scr_Dox = 8,
      X0505_WaGa_wt = 0,
      X1905_WaGa_wt = 0,
      control_MKL1 = 202,
      control_WaGa = 196,
      row.names = "MCPyV-M1"
    )
    
    # Add the new row to the data frame
    d.raw <- rbind(d.raw, new_row)
    
    #cell_line = as.factor(c("WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "MKL1","WaGa"))
    #condition = as.factor(c("sT","sT", "sT","sT", "scr","scr", "scr","scr", "wt","wt", "control","control"))
    #treatment = as.factor(c("DMSO","DMSO", "Dox","Dox", "DMSO","DMSO", "Dox","Dox", "wt","wt", "control","control"))
    
    EV_or_parental = as.factor(c("EV","EV", "EV","EV", "EV","EV", "EV","EV", "EV","EV", "parental","parental"))
    donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
    replicates = as.factor(c("sT_DMSO","sT_DMSO", "sT_Dox","sT_Dox", "scr_DMSO","scr_DMSO", "scr_Dox","scr_Dox", "wt","wt", "control","control"))
    ids = as.factor(c("0505_WaGa_sT_DMSO","1905_WaGa_sT_DMSO","0505_WaGa_sT_Dox","1905_WaGa_sT_Dox","0505_WaGa_scr_DMSO","1905_WaGa_scr_DMSO","0505_WaGa_scr_Dox","1905_WaGa_scr_Dox","0505_WaGa_wt","1905_WaGa_wt","control_MKL1","control_WaGa"))
    
    cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, donor=donor, EV_or_parental=EV_or_parental)
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
    
    rld <- rlogTransformation(dds)
    
    #The normalized counts are calculated from the 'median of ratios method' (see the calulcation process at the paragraph 'DESeq2-normalized counts: Median of ratios method' at https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html).
    
    # -- before pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("replicates"))
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    #plotPCA(rld, "batch")
    dev.off()
    
    png("pca2.png", 1200, 800)
    plotPCA(rld, intgroup=c("donor"))
    dev.off()
    
    # -- before heatmap --
    ## generate the pairwise comparison between samples
    library(gplots) 
    library("RColorBrewer")
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    #paste( rld$dex, rld$cell, sep="-" )
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,batch, sep=":"))
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
    # -- remove batch effect --
    # #show the results which delete the batches effect
    # #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-after-vst-are-there-still-batches-in-the-pca-plot
    # mat <- assay(rld)
    # mat <- limma::removeBatchEffect(mat, rld$donor)
    # assay(rld) <- mat
    
    #TODO: next time using rld
    #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-do-i-use-vst-or-rlog-data-for-differential-testing
    #It uses the design formula to calculate the within-group variability (if blind=FALSE) or the across-all-samples variability (if blind=TRUE).
    #- It is possible to visualize the transformed data with batch variation removed, using the removeBatchEffect function from limma.
    #- This simply removes any shifts in the log2-scale expression data that can be explained by batch.
    #IMPROVE: ~batch+replicates
    #https://support.bioconductor.org/p/116375/
    #Have a look at the manual pages of these functions. The first sentence of that for varianceStabilizingTransformation says "This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) and then transforms the count data (normalized by division by the size factors or normalization factors)." For rlog, it says "This function transforms the count data to the log2 scale in a way which minimizes differences between samples for rows with small counts, and which normalizes with respect to library size."
    #Do try to read the documentation and a little bit about the underlying methods, you'll find that you'll be more efficient and have much more fun with the software.
    mat <- assay(rld)
    mm <- model.matrix(~replicates, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
    assay(rld) <- mat
    #plotPCA(rld)
    
    ##https://www.biostars.org/p/403053/
    ##?? DESeq() function should work after removeBatchEffect ??
    #dds <- DESeqDataSetFromMatrix(countData=counts, colData=factors, design = ~ Batch + Covariate)
    #dds <- DESeq(dds)
    #rld <- vst(dds, blind=FALSE)
    #mat <- assay(rld)
    #mat <- limma::removeBatchEffect(mat, rld$Batch)
    #assay(rld) <- mat
    #counts_batch_corrected <- assay(rld)
    
    # -- after pca --
    png("pca_after_donor_correction.png", 1200, 800)
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    plotPCA(rld, intgroup=c("replicates"))
    dev.off()
    
    png("pca_after_donor_correction2.png", 1200, 800)
    plotPCA(rld, intgroup = c("donor"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    #plotPCA(rld, intgroup=c("replicates"))
    dev.off()
    
    # -- after heatmap --
    ## generate the pairwise comparison between samples
    png("heatmap_after_donor_correction.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,donor, sep=":"))
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
    #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
    sizeFactors(dds)
    #NULL
    dds <- estimateSizeFactors(dds)
    sizeFactors(dds)
    normalized_counts <- counts(dds, normalized=TRUE)
    write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    #cp COMPSRA_MERGE_0_miRNA.txt raw_counts.txt
    #~/Tools/csv2xls-0.4/csv_to_xls.py raw_counts.txt normalized_counts.txt -d$'\t' -o raw_and_normalized_counts.xls;
    ~/Tools/csv2xls-0.4/csv_to_xls.py normalized_counts.txt -d$'\t' -o normalized_counts_miRNA.xls;
    
    #unter konsole
    #      WaGa WaGa_EV_r1 WaGa_EV_r2       MKL1 MKL1_EV_r1 MKL1_EV_r2 
    # 1/1,2948728  1/0,4790155  1/0,9996732  1/1,5667017  1/1,0727251  1/1,0755253 
    
    > sizeFactors(dds)
    X0505_WaGa_sT_DMSO  X1905_WaGa_sT_DMSO   X0505_WaGa_sT_Dox   X1905_WaGa_sT_Dox 
              0.7042043           0.9986761           1.3277555           1.4950649 
    X0505_WaGa_scr_DMSO X1905_WaGa_scr_DMSO  X0505_WaGa_scr_Dox  X1905_WaGa_scr_Dox 
              0.5406249           1.5657442           0.4216588           1.1627786 
          X0505_WaGa_wt       X1905_WaGa_wt        control_MKL1        control_WaGa 
              0.3560245           0.6982057           3.2447398           3.2147966 
    
    1/1,2616592=0,792607069
    1/0,5302187=1,886014205
    1/1,0476709=0,954498211
    1/1,5243221=0,656029326
    1/1,0000000=1,0
    1/1,0856832=0,921079004
    
    #bamCoverage --bam ${sample}Aligned.sortedByCoord.out.bam -o ../bigWigs/${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
    
    bamCoverage --bam WaGa_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_norm.bw --binSize 10 --scaleFactor 0.792607069         --effectiveGenomeSize 2864785220
    bamCoverage --bam WaGa_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r1_norm.bw --binSize 10 --scaleFactor 1.886014205 --effectiveGenomeSize 2864785220
    bamCoverage --bam WaGa_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r2_norm.bw --binSize 10 --scaleFactor 0.954498211 --effectiveGenomeSize 2864785220
    bamCoverage --bam MKL_1_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_norm.bw --binSize 10 --scaleFactor 0.656029326        --effectiveGenomeSize 2864785220
    bamCoverage --bam MKL_1_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r1_norm.bw --binSize 10 --scaleFactor 1.0 --effectiveGenomeSize 2864785220
    bamCoverage --bam MKL_1_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r2_norm.bw --binSize 10 --scaleFactor 0.921079004 --effectiveGenomeSize 2864785220
    
    #setwd("/home/jhuang/DATA/Data_Ute_RNASeq/results/featureCounts/degenes")
    #---- * to untreated ----
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~EV_or_parental+donor)
    dds$EV_or_parental <- relevel(dds$EV_or_parental, "parental")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("EV_vs_parental")
    
    for (i in clist) {
      contrast = paste("EV_or_parental", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
      res$padj <- ifelse(is.na(res$padj), 1, res$padj)
    
      res_df <- as.data.frame(res)
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    
      up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    EV_vs_parental-all.txt \
    EV_vs_parental-up.txt \
    EV_vs_parental-down.txt \
    -d$',' -o EV_vs_parental.xls;
    
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
    dds$replicates <- relevel(dds$replicates, "sT_DMSO")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("sT_Dox_vs_sT_DMSO")
    
    dds$replicates <- relevel(dds$replicates, "scr_Dox")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("sT_Dox_vs_scr_Dox")
    
    dds$replicates <- relevel(dds$replicates, "scr_DMSO")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("scr_Dox_vs_scr_DMSO", "sT_DMSO_vs_scr_DMSO")
    
    for (i in clist) {
      contrast = paste("replicates", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
      res$padj <- ifelse(is.na(res$padj), 1, res$padj)
    
      res_df <- as.data.frame(res)
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    
      up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    sT_Dox_vs_sT_DMSO-all.txt \
    sT_Dox_vs_sT_DMSO-up.txt \
    sT_Dox_vs_sT_DMSO-down.txt \
    -d$',' -o sT_Dox_vs_sT_DMSO.xls;
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    sT_Dox_vs_scr_Dox-all.txt \
    sT_Dox_vs_scr_Dox-up.txt \
    sT_Dox_vs_scr_Dox-down.txt \
    -d$',' -o sT_Dox_vs_scr_Dox.xls;
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    scr_Dox_vs_scr_DMSO-all.txt \
    scr_Dox_vs_scr_DMSO-up.txt \
    scr_Dox_vs_scr_DMSO-down.txt \
    -d$',' -o scr_Dox_vs_scr_DMSO.xls;
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    sT_DMSO_vs_scr_DMSO-all.txt \
    sT_DMSO_vs_scr_DMSO-up.txt \
    sT_DMSO_vs_scr_DMSO-down.txt \
    -d$',' -o sT_DMSO_vs_scr_DMSO.xls;
    
    for i in EV_vs_wt; do
      # read files to geness_res
      echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"
    
      echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
      echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
      echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
      echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
      echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
      echo "geness_res\$Color[geness_res\$external_gene_name %in% intersect_891_471\$Gene] <- \"lysosomal & TFEB\""
      echo "geness_res\$Color[geness_res\$external_gene_name %in% G891_only\$Gene] <- \"lysosomal\""
      echo "geness_res\$Color[geness_res\$external_gene_name %in% G471_only\$Gene] <- \"TFEB\""
      echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P-adj < 0.05\", \"lysosomal & TFEB\", \"lysosomal\", \"TFEB\"))"
      echo "write.csv(geness_res, \"${i}_with_Category.csv\")"
    
      # pick top genes for either side of volcano to label
      # order genes for convenience:
      echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
      echo "top_g <- c()"
      echo "top_g <- c(top_g, \
                geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
                geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
      echo "top_g <- unique(top_g)"
      echo "geness_res <- geness_res[, -1*ncol(geness_res)]"  # remove invert_P from matrix
    
      # Graph results
      echo "png(\"${i}.png\",width=1200, height=2000)"
      echo "ggplot(geness_res, \
          aes(x = log2FoldChange, y = -log10(pvalue), \
              color = Color, label = external_gene_name)) + \
          geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
          geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
          geom_point() + \
          labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
          scale_color_manual(values = c(\"P < 0.05\"=\"darkgray\",\"P-adj < 0.05\"=\"red\",\"lysosomal\"=\"lightblue\",\"TFEB\"=\"green\",\"lysosomal & TFEB\"=\"cyan\",\"NS or log2FC < 2.0\"=\"gray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
          geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
          theme_bw(base_size = 16) + \
          theme(legend.position = \"bottom\")"
      echo "dev.off()"
    done
    
    library(ggplot2)
    library(ggrepel)
    
    geness_res <- read.csv(file = paste("EV_vs_wt", "all.txt", sep="-"), row.names=1)
    
    external_gene_name <- rownames(geness_res)
    geness_res <- cbind(geness_res, external_gene_name)
    #top_g <- c("hsa-miR-10b-5p","hsa-miR-1226-5p","hsa-miR-30c-5p","hsa-mir-182","hsa-miR-182-5p",  "hsa-mir-1291","hsa-miR-1291","hsa-mir-3607","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-1244","hsa-mir-1248")
    top_g <- c("hsa-mir-3607","hsa-mir-1291","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-30c-5p","hsa-miR-10b-5p","hsa-miR-182-5p","hsa-miR-1291","hsa-mir-182","hsa-miR-1244","hsa-mir-1248","hsa-miR-1226-5p","hsa-mir-1224","hsa-miR-423-5p","hsa-miR-3653-3p","hsa-miR-375","hsa-miR-15a-3p","hsa-mir-3651","hsa-miR-1246","hsa-miR-4521","hsa-miR-30a-5p","hsa-miR-425-5p","hsa-miR-9-5p","hsa-miR-664a-5p","hsa-let-7f-5p","hsa-miR-146b-5p","hsa-miR-320a","hsa-miR-3607-5p","hsa-mir-3687-1","hsa-mir-3687-2","hsa-let-7a-3","hsa-mir-6723","hsa-miR-342-3p")
    
    subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
    geness_res$Color <- "NS or log2FC < 2.0"
    geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
    geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
    geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
    
    #geness_res$Color[geness_res$external_gene_name %in% intersect_891_471$Gene] <- "lysosomal & TFEB"
    #geness_res$Color[geness_res$external_gene_name %in% G891_only$Gene] <- "lysosomal"
    #geness_res$Color[geness_res$external_gene_name %in% G471_only$Gene] <- "TFEB"
    #geness_res$Color <- factor(geness_res$Color, levels = c("NS or log2FC < 2.0", "P-adj < 0.05", "lysosomal & TFEB", "lysosomal", "TFEB"))
    write.csv(geness_res, "EV_vs_wt_with_Category.csv")
    geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
    #top_g <- c()
    #top_g <- c(top_g,              geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]],              geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])
    #top_g <- unique(top_g)
    
    geness_res <- geness_res[, -1*ncol(geness_res)]
    
    #png("EV_vs_wt.png",width=1200, height=2000)
    svg("EV_vs_wt.svg",width=12, height=14)
    ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
    dev.off()
    
    #print all top_g-gene labels including 
    svg("EV_vs_wt.svg",width=12, height=14)
    ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
    dev.off()
    
    ###################################################################
    ##### STEP3: prepare all_genes #####
    rld <- rlogTransformation(dds)
    mat <- assay(rld)
    mm <- model.matrix(~replicates, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
    assay(rld) <- mat
    RNASeq.NoCellLine <- assay(rld)
    # reorder the columns
    colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
    col.order <-c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
    RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]
    
    #Option1: filter to genes with overall lfc > 2       
    datamat  <- RNASeq.NoCellLine[apply(RNASeq.NoCellLine,1,function(x){max(x)-min(x)})>=2,]
    
    #Option2: not using the automatical comparing between max and min, rather than using manually selected from DESeq2 between conditions.
    RNASeq.NoCellLine_  <- RNASeq.NoCellLine[top_g,]
    colnames(RNASeq.NoCellLine_) <- c("MKL-1 Cell", "WaGa Cell", "MKL-1 EVs", "WaGa EVs") 
    datamat <- RNASeq.NoCellLine_
    
    #Option3: take all_genes        
    datamat <- RNASeq.NoCellLine
    
    #Option4: manully defining
    for i in EV_vs_parental sT_Dox_vs_sT_DMSO sT_Dox_vs_scr_Dox scr_Dox_vs_scr_DMSO sT_DMSO_vs_scr_DMSO; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    datamat = RNASeq.NoCellLine[GOI, ]
    
    ######################################################################
    ##### STEP4: clustering the genes and draw heatmap #####
    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.05)
    mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
    
    mycol = mycol[as.vector(mycl)]
    #sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
    #names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
    png("miRNA_heatmap.png", width=800, height=1000)
    #svg("DEGs_heatmap.svg", width=6, height=8)
    heatmap.2(as.matrix(datamat), 
      Rowv=as.dendrogram(hr), 
      Colv=NA, 
      dendrogram='row', 
      labRow="", 
      scale='row', 
      trace='none', 
      col=bluered(75), 
      RowSideColors=mycol, 
      srtCol=20, 
      lhei=c(1,8), 
      #cexRow=1.2,   # Increase row label font size
      cexCol=1.7,    # Increase column label font size
      margin=c(7,1)
     )
    dev.off()
    #ColSideColors = sampleCols,
    heatmap.2(datamat,Rowv=as.dendrogram(hr),Colv=as.dendrogram(hc),
                scale='row',trace='none',col=bluered(75),
                RowSideColors = mycol, labRow="",
                main=sprintf('%s', "RNA-Seq for all DEGs"), srtCol=30)
    dev.off()
    #margin=c(5,5),
    heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none",, sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.4, lhei=c(0.25,5))
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none')
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.2, lhei=c(0.1,5))
    dev.off()
    
    #### cluster members #####
    write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
    write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters.xls
  7. display viral transcripts found in mRNA-seq (or small RNA) MKL-1, WaGa EVs compared to parental cells (http://xgenes.com/article/article-content/87/display-viral-transcripts-found-in-mrna-seq-mkl-1-waga-evs-compared-to-cells/)

Diversity and Complexity in Plant Genomes

In the world of plant genomics, it’s indeed true that not all plants have diploid genomes. Unlike many animals which are typically diploid (having two sets of chromosomes, one from each parent), plants can exhibit a range of genomic complexities. Here are some key points:

  1. Polyploidy: Many plants are polyploid, meaning they have more than two sets of chromosomes. Polyploidy is common in the plant kingdom and includes triploidy (three sets), tetraploidy (four sets), hexaploidy (six sets), and even higher levels of chromosome sets. Wheat, for example, is hexaploid.

  2. Haploidy: Some plants can be haploid, having only a single set of chromosomes. This is less common than diploidy or polyploidy but is found in certain species and can be a stage in the plant life cycle.

  3. Mixoploidy: This condition, where different cells in the same organism have different numbers of chromosome sets, can also occur in plants.

  4. Genome Duplication Events: Many plants have undergone ancient genome duplication (or polyploidization) events, which significantly contribute to their genetic diversity and complexity. This is a key factor in the adaptability and evolution of plant species.

  5. Breeders and Geneticists: Plant breeders and geneticists often exploit these genomic characteristics of plants for crop improvement, creating varieties with desired traits such as increased size, enhanced nutritional content, or resistance to pests and diseases.

In summary, plant genomes are diverse. While some plants are diploid like humans, many others exhibit polyploidy or other variations in chromosome number, contributing to the rich genetic diversity observed in the plant kingdom.

在植物基因组学领域,确实有许多植物并不是二倍体。与许多动物通常是二倍体(即有两套染色体,各来自一个亲本)不同,植物展现出多种基因组复杂性。以下是一些关键点:

  1. 多倍体性(Polyploidy):许多植物是多倍体,意味着它们拥有两套以上的染色体。在植物界,多倍体是常见现象,包括三倍体(三套染色体)、四倍体(四套)、六倍体(六套),甚至更高级别的染色体套数。例如,小麦就是六倍体。

  2. 单倍体(Haploidy):一些植物可能是单倍体,只有一套染色体。这种情况不如二倍体或多倍体常见,但在某些物种中存在,可以是植物生命周期中的一个阶段。

  3. 混合倍性(Mixoploidy):在同一植物不同细胞中出现不同染色体套数的情况也可能发生。

  4. 基因组复制事件(Genome Duplication Events):许多植物经历了古代的基因组复制(或多倍体化)事件,这显著贡献于它们的遗传多样性和复杂性。这是植物种类适应性和进化的关键因素。

  5. 育种家和遗传学家:植物育种家和遗传学家经常利用植物的这些基因组特性进行作物改良,创造出具有期望特征的品种,如增大体积、提高营养含量或抗虫害和疾病。

总之,植物基因组多样性丰富。虽然有些植物像人类一样是二倍体,但还有许多其他植物表现出多倍体或其他染色体数目的变异,这为植物界的丰富遗传多样性做出了贡献。

Workstation Configuration

1. 1485 * RECT Workstation WS-2290C 1 Stück 15.150,00

    - 2x Intel Xeon Gold 6338 Processor (2,00 GHz, 32 Cores, 64 Threads, 48 MB Cache);
    - High-Efficiency Dual Noctua Silent CPU-Kühler;
    - 1 TB DDR4-3200 RAM (16x RDIMM 64 GB PC4-25600 ECC Reg.);
    - Workstation-Mainboard mit Intel C621A Chipsatz (E-ATX), 
      on Board: VGA, Audio, 2x 1 Gbit-LAN (Intel i210),
      2x M.2 und 4x U.2 (PCIe 4.0), 8x S-ATA 6Gb/s, RAID;
    - Nvidia GeForce RTX 3060 Ti Chipsatz mit 8 GB GDDR6-RAM;
    - 2x 4 TB M.2 NVMe SSD Seagate FireCuda 530 (1.000.000 IOPS, 5100 TBW);
    - 4x 3,84 TB U.2 NVMe Samsung PM9A3 (1.000.000 IOPS, 7008 TBW, 1 DWD);
    - 6x 22 TB Western Digital Ultrastar DC HC570 (512 MB, 7.200, S-ATA 6Gb/s);
    - Individuelle RAID-Konfiguration;
    - 3,2 TB NVMe SSD Samsung PM1735 Series (1.500.000 IOPS, 17520 TBW, PCIe 4.0 x8);
    - Netgear AC1200 WLAN USB Adapter (Dualband, bis 1200 Mbit/s);
    - Intel Dual-Port 10 Gbit Ethernet Network Adapter X550-T2 (2x RJ45);
    - 5,0m Patchkabel Cat.6A (grau);
    - Creative Sound BlasterX AE-5 Plus (PCIe x1);
    - Big-Tower Gehäuse (schwarz, gedämmt);
    - Blu-ray Brenner 12fach;
    - High-efficiency Single Netzteil 80Plus Platinum (2000W);
    - Ihr Betriebssystem vorinstalliert;
    - Logitech Wireless Desktop-Set MK540 Advanced deutsch (schwarz, UnifyingTM USB-Empfänger);
    - Enthaltene Urheberrechtsabgabe: Workstation;
    - 60 Monate Gewährleistung (Vor-Ort 3BD)

    Summe RECT Workstation WS-2290C
    *15.150,00
    15.150,00

2. 9900 * Aufrechnung Rabatt Forschung und Lehre (5%) Summe Aufrechnung 1,00 Wert 15.150,00 -0,050 -757,50

  • -757,50

3. 9400 Versand UPS / UPS Standard 1x Computer-Box * Summe Versand ** Gesamtwert EURO (zzgl. ges. MwSt.) 1,00 Wert 60,00 60,00

  • 60,00

4. ** 14.452,50

Setup conda environments

  1. install rnaseq on sage

    #-- version 2 --
    conda install -c conda-forge mamba
    mamba create -n rnaseq -c conda-forge -c bioconda -c defaults python fastqc trim-galore star hisat2 picard csvtk preseq rseqc samtools
    conda activate rnaseq
    pip3 install deeptools
    pip3 install multiqc
    mamba install -c conda-forge -c bioconda -c defaults -c r stringtie subread gffread r-data.table r-gplots bioconductor-dupradar bioconductor-edger
    #(rnaseq) [jhuang@sage ~]$ which nextflow
    #/usr/local/bin/nextflow
    conda install -c bionconda fq
    mamba install -c bioconda ucsc-bedclip ucsc-bedgraphtobigwig
    mamba install -c bioconda rsem
    mamba install -c bioconda salmon
    mamba install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots bioconductor-dupradar bioconductor-edger bioconductor-deseq2
    mamba install -c conda-forge openjdk=17
    conda install -c bioconda bedtools qualimap
    #under R
    install.packages("BiocManager")
    BiocManager::install("tximport")
    BiocManager::install("tximeta")        
    install.packages("optparse")            
    install.packages(“pheatmap”)
    (rnaseq2) [jhuang@sage Data_Soeren_RNA-seq_2023]$ nextflow run rnaseq/main.nf --input samplesheet.csv    --outdir results_GRCh38 --genome GRCh38   -profile test_full -resume --max_memory 300.GB --max_time 2400.h --save_reference --aligner star_salmon  --skip_deseq2_qc
    (rnaseq) nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_1585 --fasta 1585.fasta --gtf 1585_m_.gtf -profile test_full -resume --max_memory 200.GB --max_time 2400.h --save_reference --aligner star_salmon   --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --min_mapped_reads 0
    #--gtf_extra_attributes gene_name
    
    #-- version 1 --
    #conda env create -f ~/Tools/rnaseq/environment.yml
    conda create -n rnaseq -c conda-forge -c bioconda -c defaults python=3.6 fastqc trim-galore star=2.6.1d hisat2
    conda activate rnaseq
    conda install -c conda-forge -c bioconda -c defaults picard csvtk preseq rseqc samtools
    pip3 install deeptools
    pip3 install multiqc
    conda install -c bioconda stringtie subread gffread
    conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
    conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
    conda install nextflow=20.04
  2. install chipseq

    conda create -n chipseq -c conda-forge -c bioconda -c defaults -c r python=3.9 fastqc cutadapt trim-galore bwa samtools 
    conda activate chipseq
    conda install picard bedtools phantompeakqualtools 
    conda install -c conda-forge -c bioconda -c defaults -c r r-base 
    #ERROR_SINCE_R_NGSPLOT_ONLY_FOR_PYTHON2 conda install -c conda-forge -c bioconda -c defaults -c r r-ngsplot
    sudo apt install macs
    conda install -c bioconda multiqc
    pip3 install deeptools
    conda install nextflow=20.04
    
    #conda env chipseq2 is still NOT working.
    conda create -n chipseq2 -c conda-forge -c bioconda -c defaults python=2.7 fastqc cutadapt trim-galore bwa samtools 
    conda activate chipseq2
    conda install -c conda-forge -c bioconda -c defaults picard bedtools phantompeakqualtools 
    conda install -c conda-forge -c bioconda -c defaults -c r deeptools r-base r-ngsplot macs multiqc
    conda install -c conda-forge -c bioconda -c defaults -c r r-ade4 r-assertthat r-base r-bh r-biocmanager r-bit r-bit64 r-bitops r-blob r-catools r-codetools r-crayon r-curl r-dbi r-digest r-domc r-foreach r-formatr r-futile.logger r-futile.options r-glue r-hms r-httr r-hwriter r-idr r-iterators r-jsonlite r-lambda.r r-lattice r-latticeextra r-lazyeval r-magrittr r-mass r-matrix r-matrixstats r-memoise r-mime r-openssl r-pkgconfig r-plogr r-prettyunits r-progress r-r6 r-rcolorbrewer r-rcpp r-rcurl r-rlang r-rsqlite r-segmented r-seqinr r-snow r-spp r-stringi r-stringr r-survival r-venndiagram r-xml
    conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-annotationdbi bioconductor-annotationfilter bioconductor-biobase bioconductor-biocgenerics bioconductor-biocparallel bioconductor-biomart bioconductor-biostrings bioconductor-bsgenome bioconductor-chippeakanno bioconductor-delayedarray bioconductor-ensembldb bioconductor-genomeinfodb bioconductor-genomeinfodbdata bioconductor-genomicalignments bioconductor-genomicfeatures bioconductor-genomicranges bioconductor-go.db bioconductor-graph bioconductor-iranges bioconductor-limma bioconductor-multtest bioconductor-protgenerics bioconductor-rbgl bioconductor-regioner bioconductor-rsamtools bioconductor-rsubread bioconductor-rtracklayer bioconductor-s4vectors bioconductor-shortread bioconductor-summarizedexperiment bioconductor-xvector bioconductor-zlibbioc
    conda install nextflow=20.04
    
    #first_try
    (chipseq) nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/enhancer_analysis/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    #--notrim
    (chipseq) nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/enhancer_analysis/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveAlignedIntermediates --notrim --saveTrimmed --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    #test
    (chipseq) nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/Raw_Data/p783_*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
  3. install spandx

    conda create --name spandx2 python=3.8
    conda activate spandx2
    conda install -c conda-forge -c bioconda -c defaults art trimmomatic bwa bedtools=2.28.0 seqtk pindel mosdepth samtools=1.9 picard gatk4 snpeff=4.3.1t nextflow=22 fasttree
    #[genbank copying]
    mkdir ~/miniconda3/envs/spandx2/share/snpeff-4.3.1t-5/data/WA_plasmid
    cp WA_plasmid.gb ~/miniconda3/envs/spandx2/share/snpeff-4.3.1t-5/data/WA_plasmid/genes.gbk
    vim ~/miniconda3/envs/spandx2/share/snpeff-4.3.1t-5/snpEff.config
    /home/jhuang/miniconda3/envs/spandx2/bin/snpEff build -genbank WA_plasmid      -d
    
    nextflow run spandx/main.nf --fastq "Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz" --ref LT_wt.fasta --annotation --database LT_wildtype --pairing SE -resume
    (spandx2) nextflow run spandx/main.nf --fastq "raw_data/*_R{1,2}.fastq.gz" --ref WA_chr.fasta --annotation --database WA_chr -resume
    (spandx2) nextflow run spandx/main.nf --fastq "raw_data/*_R{1,2}.fastq.gz" --ref WA_plasmid.fasta --annotation --database WA_plasmid -resume
    #Note that the files in variants contain only SNPs, but the files in snippy contain SNPs+INDELs which can be used to consensus the results of SPANDx, resulting in the final results of SNPs+INDELs.
    ~/DATA/Data_Benjamin_Yersinia_SNP/variants_WA_chr$ vim snippy.core.vcf
    ~/DATA/Data_Benjamin_Yersinia_SNP/snippy_WA_chr/Wacton_S96/Wacton_S96.txt
    ~/DATA/Data_Benjamin_Yersinia_SNP/snippy_WA_chr/Wacton_S96/Wacton_S96.filt.vcf 
    
    #FOLLOWING INSTALLATION CANNOT FINISH CALCULATION --> MAYBE IT NOW WORKS, BUT NEEDS TO GIVE AT LEAST SAMPLES, AS SAME AS IN SPANDX2
    conda create --name spandx python=3.8
    #conda install -c conda-forge -c bioconda -c defaults art trimmomatic bwa bedtools seqtk pindel mosdepth samtools picard gatk4 snpeff nextflow fasttree
    #-->picard-3.0.0-1, gatk4-4.4.0.0-0, snpeff-5.1-2, trimmomatic-0.39-2 installed!
    #DEBUG1: Due to using snpeff-5.1-2 by deleting the option '-t' in all 'snpEff eff -t ...' since '-t' is not a option in the new version.
    #DEBUG2: Don't login spandx after login base, since in the case the env uses /home/jhuang/anaconda3/bin/java.
    #  - /usr/bin/java: openjdk version "11.0.18" 2023-01-17
    #  - /home/jhuang/anaconda3/bin/java: openjdk version "1.8.0_152-release"
    #  - /home/jhuang/anaconda3/envs/spandx/bin/java: openjdk version "17.0.3-internal" 2022-04-19
    #DEBUG3: using '-' instead of '_'
    #  mv V_8_2_4_p600_d8_DonorI.fastq.gz    control-d8-DI.fastq.gz
  4. install bengal3_ac3

    conda create -n bengal3_ac3 python=3.6
    conda activate bengal3_ac3
    conda install -c conda-forge -c bioconda -c defaults  prokka 
    conda install -c conda-forge -c bioconda -c defaults  shovill mlst 
    conda install -c conda-forge -c bioconda -c defaults  trimmomatic fastqc    
    conda install -c conda-forge -c bioconda -c defaults  roary
    conda install -c conda-forge -c bioconda -c defaults  snippy
    conda install -c conda-forge -c bioconda -c defaults  fasttree raxml-ng  
    conda install -c conda-forge -c bioconda -c defaults  gubbins snp-sites
    conda install -c conda-forge -c bioconda -c defaults openjdk=11
    TODO: snippy is still NOT working in hamm, we have to run the modul  "variants_calling" on hamburg.
  5. install qiime1

    conda create -n qiime1 -c bioconda qiime
  6. export environments

    #!/bin/bash
    
    # Get a list of conda environments
    env_list=$(conda env list | awk '{print $1}' | tail -n +4)
    
    # Iterate through each environment and export it to a YAML file
    for env in $env_list; do
        echo "conda activate $env"
        echo "conda env export > \"${env}_exported.yml\""
        echo "conda deactivate"
    done
    
    #conda create --name spandx2 --clone spandx
    #conda env remove --name spandx
    
    #conda activate spandx
    #conda env export > spandx.yml
    #rsync -a -P *.yml xxx@xxx:~/xxx/xxx
    #conda env create -f spandx.yml
    
    conda activate base
    conda env export > "base_exported.yml"
    conda deactivate
    conda activate HLCA_mapping_env
    conda env export > "HLCA_mapping_env_exported.yml"
    conda deactivate
    conda activate assembly2
    conda env export > "assembly2_exported.yml"
    conda deactivate
    conda activate bac3
    conda env export > "bac3_exported.yml"
    conda deactivate
    conda activate bactopia
    conda env export > "bactopia_exported.yml"
    conda deactivate