Author Archives: gene_x

Tn-seq analysis pipeline (improved)

  1. quality control

    ./240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr13/LB_culture_a_2_S7_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr13/LB_culture_a_2_S7_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr15/growthout_control_24h_a_2_S8_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr15/growthout_control_24h_a_2_S8_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr17/extracellular_mutants_24h_a_2_S9_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr17/extracellular_mutants_24h_a_2_S9_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R2_001.fastq.gz
    
    #from fastqc results of initial_mutants
    49821406
    35-161
    49821406
    35-161
  2. modify the tpp scripts

    vim ~/.local/lib/python3.10/site-packages/pytpp/tpp_tools.py
    #search for "DEBUG"
    #-maxreads 10000 or not_given for take all!
    #-primer AGATGTGTATAAGAGACAG     the default primer of Tn5 is TAAGAGACAG!
    #-primer-start-window 0,159  set 0,159 as default!
  3. run Transposon Position Profiling (TPP) https://transit.readthedocs.io/en/latest/transit_running.html; https://orca2.tamu.edu/tom/iLab.html

    # Break-down of total reads (49821406):
    #  29481783 reads (59.2%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20339623):
    
    #primer AGCTTCAGGGTTGAGATGTGTATAAGAGACAG
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA
    
    #kr11.trimmed1_failed_trim    22072406
    #-rw-rw-r-- 1 jhuang jhuang  2,7G Jun 11 15:44 kr11.trimmed1    20339623
    #-rw-rw-r-- 1 jhuang jhuang  2,9G Jun 11 15:46 kr11.trimmed2    20339623
    #cat initial_mutants_a_2_S6_R1_001.fastq | echo $((`wc -l` / 4))  49821406=29481783 reads (59.2%) + 20339623 = 49821406
    
    #vs.
    
    # Break-down of total reads (49821406):
    #  29356859 reads (58.9%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20464547):
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA
    # read_length: 100 bp
    # mean_R1_genomic_length: 73.4 bp
    # mean_R2_genomic_length: 86.4 bp
    
    #conda deactivate
    ##Test AGCTTCAGGGTTGAGATGTGTATAAGAGACAG --> TAAGAGACAG, the results are similar!
    #python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m_.fasta -reads1 240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R1_001.fastq.gz -reads2 240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R2_001.fastq.gz -output kr11_10nt_primer -primer TAAGAGACAG -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    #mv tpp.cfg kr11_10nt_primer_tpp.cfg
    
    #for initial_mutants, note that "-primer-start-window 0,161" is invalid!
    # NOTE(review): step 2 sets 0,159 as the default window and the statistics
    # recorded below are labelled "primer_start_window 0,159", yet the commands
    # pass 0,161 -- confirm which window is intended before re-running.

    # Replicon IDs shared by every TPP run (defined once instead of being
    # repeated in each command).
    REPLICON_IDS=contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    # Directory of the sequencing run that holds the per-sample subdirectories.
    RUN_DIR=./240606_VH00358_96_AAFCFJGM5

    # Usage: run_tpp <subdir/fastq_prefix> <output_name>
    #   $1  path below RUN_DIR up to (not including) "_R1_001.fastq.gz"
    #   $2  base name passed to -output and used to rename tpp.cfg
    # Runs TPP on the R1/R2 pair of one sample, then saves the tpp.cfg written
    # by that run as <output_name>_tpp.cfg. Every sample needs its own
    # <output_name>; reusing one makes a later run overwrite an earlier one.
    run_tpp() {
        python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m_.fasta -reads1 "${RUN_DIR}/${1}_R1_001.fastq.gz" -reads2 "${RUN_DIR}/${1}_R2_001.fastq.gz" -output "${2}" -primer AGCTTCAGGGTTGAGATGTGTATAAGAGACAG -mismatches 1 -primer-start-window 0,161 -bwa-alg mem -replicon-ids "${REPLICON_IDS}"
        mv tpp.cfg "${2}_tpp.cfg"
    }

    #for initial_mutants
    run_tpp kr11/initial_mutants_a_2_S6 initial_mutants

    #primer_start_window 0,159
    # Break-down of total reads (49821406):
    #  29481783 reads (59.2%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20339623):
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA

    #for LB_culture
    run_tpp kr13/LB_culture_a_2_S7 LB_culture

    #for growthout_control_24h
    run_tpp kr15/growthout_control_24h_a_2_S8 growthout_control_24h

    #for extracellular_mutants_24h (previously written to -output initial_mutants)
    run_tpp kr17/extracellular_mutants_24h_a_2_S9 extracellular_mutants_24h

    #for intracellular_mutants_24h (previously written to -output initial_mutants)
    run_tpp kr19/intracellular_mutants_24h_a_2_S10 intracellular_mutants_24h
    
    ##Ignoring the reads_R1
    #python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref contig_1_1.fna -reads1 original_reads_rc_R2.fastq -output contig_1_1_rc_only_reads2 -primer CCTACAACAAAGCTCTCATCAAC -mismatches 1 -bwa-alg mem -replicon-ids contig_1_1
    #mv tpp.cfg contig_1_1_rc_only_reads2.tpp.cfg
  4. generate statistics tables in Excel-format

    # Step 4 workflow: copy the .tn_stats file, trim the copy by hand, rename two
    # labels with sed, convert the result to .xlsx, then bundle the .wig files.
    # NOTE(review): the file names here carry a "_rep1" suffix, whereas the TPP
    # commands in step 3 use -output names without it -- confirm the names match.

    # Work on a copy (trailing underscore) so the original .tn_stats stays untouched.
    cp initial_mutants_rep1.tn_stats initial_mutants_rep1.tn_stats_
    #Delete all general statistics before the table data in initial_mutants_rep1.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    # Replace two labels in place in every *.tn_stats_ copy in this directory.
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    # parse_tn_stats.py lives one directory up and is not shown in these notes;
    # presumably it takes <edited stats file> <xlsx file to write> -- TODO confirm.
    python3 ../parse_tn_stats.py initial_mutants_rep1.tn_stats_ initial_mutants_rep1.tn_stats.xlsx
    # NOTE(review): only the initial_mutants copy is created above; presumably
    # LB_culture_rep1.tn_stats_ was copied and trimmed the same way -- confirm.
    python3 ../parse_tn_stats.py LB_culture_rep1.tn_stats_ LB_culture_rep1.tn_stats.xlsx
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"  441057 and 276060
    # Collect the .wig files into one directory and zip it.
    mkdir initial_mutants_rep1_wig
    # NOTE(review): the glob moves every .wig in the current directory, not only
    # those belonging to initial_mutants_rep1 -- confirm this is intended.
    mv *.wig initial_mutants_rep1_wig/
    zip -r initial_mutants_rep1_wig.zip initial_mutants_rep1_wig/
    #The counts files are too big, so it is not necessary to send them:
    #~/Tools/csv2xls-0.4/csv_to_xls.py initial_mutants_rep1.tn_stats_ *.counts -d$',' -o initial_mutants_rep1.stats.xls;
    #   contig_1_10 699 411
    #   contig_1_8  3206    2031
    #   contig_1_7  3787    2376
    #   contig_1_6  4604    2871
    #   contig_1_5  2794    1765
    #   contig_1_4  83  58
    #   contig_1_3  2944    1882
    #   contig_1_2  15446   9678
    #   contig_1_1  14391   8954

Bildschirmarbeitsplatzbrille vs Gleitsichtbrille (Progressive Lenses)

  1. Sehtest Ergebnisse

    #Im Jahr 2024 von Alster Optik
    ????
    
    #Im Jahr 2024 von SICHTWERK Augenärzte
    F R  +0.75    -1.50     160
        L  +0.50    -0.75     170
    N R  ADD      +1.50
        L  ADD      +1.50
    
    #Im Jahr 2023 von Apollo
    F R  +0.00    -1.25     180
        L  +0.00    -1.00     170
    N R  ADD      +1.00
        L  ADD      +1.00
    
    # Sehleistung (visual performance) 2024
    R,        R+L,  L
    0.8,    1.0,    0.8
    # Sehleistung (visual performance) 2023
    0.63, 0.8, 0.63
  2. Brillen von 2023

    #Brille 1, annotated as Mehrstärke-Brille (Auto-Brille):
    GOLD: VARIVIEW-ZS35 N&D-SAPPH
        x*  25,00  y*  21,00
        Sphäre  Zylinder  Achse
    F R   0.00    -1.25     180
        L   0.00    -1.00     180
    N R Addition   1,00
        L Addition   1,00
    
    #Brille 2, annotated as Arbeitsplatz-Brille:
    GOLD: MULTIKOMFORT-NAEHE-ZS11
        x*  25,00  y*  20,00
    F R   1.00    -1.25     180
        L   1.00    -1.00     180
    N R Addition   0,00
        L Addition   0,00
    
    #Brille 3, annotated as Nah-Brille:
    141583 HKN7 54#18-140 SE SC/SC    ING
    inkl. Basis-Glaspaket
        x*  25,00  y*  16,54
        Sphäre  Zylinder  Achse
    F R   1.00    -1.25     180
        L   1.00    -1.00     180
    N R Addition   0,00
        L Addition   0,00
    
    感谢您提供的更新信息。根据这些详细的验光参数,以下是对两副眼镜的重新分析以及适用场景的建议:
    
    第一副眼镜(标注为 Mehrstärke-Brille)
    
        验光参数:
            右眼 (R):
                球镜度数 (Sphäre): 0.00
                柱镜度数 (Zylinder): -1.25
                轴位 (Achse): 180
                近用附加度数 (Addition): 1.00
            左眼 (L):
                球镜度数 (Sphäre): 0.00
                柱镜度数 (Zylinder): -1.00
                轴位 (Achse): 180
                近用附加度数 (Addition): 1.00
    
    第二副眼镜(标注为 Arbeitsplatz-Brille)
    
        验光参数:
            右眼 (R):
                球镜度数 (Sphäre): 1.00
                柱镜度数 (Zylinder): -1.25
                轴位 (Achse): 180
                近用附加度数 (Addition): 0.00
            左眼 (L):
                球镜度数 (Sphäre): 1.00
                柱镜度数 (Zylinder): -1.00
                轴位 (Achse): 180
                近用附加度数 (Addition): 0.00
    
    分析和建议
    开车用眼镜
    
        推荐使用第一副眼镜(标注为 Mehrstärke-Brille):
            理由:这副眼镜是渐进多焦点眼镜,设计用于多个视距(远、中、近)的视力矫正。它具有1.00的近用附加度数,可以帮助您在驾驶时清晰地看到远处的路况,同时也可以查看近处的仪表盘。
            由于球镜度数为0.00,它不会影响您的远距离视力,因此非常适合开车使用。
    
    工作用眼镜
    
        推荐使用第二副眼镜(标注为 Arbeitsplatz-Brille):
            理由:这副眼镜的球镜度数为1.00,适合中近距离视物,特别适合长时间在电脑前工作或阅读文档。尽管没有近用附加度数,但其设计仍然非常适合办公室工作环境。
    
    结论
    
        开车用眼镜:第一副眼镜(GOLD: VARIVIEW-ZS35 N&D-SAPPH, 标注为 Mehrstärke-Brille),适合多焦点需求,特别是需要远近视力切换的场合,如开车。
        工作用眼镜:第二副眼镜(GOLD: MULTIKOMFORT-NAEHE-ZS11, 标注为 Arbeitsplatz-Brille),适合中近距离视物,特别适合在办公室或电脑前工作。
  3. Empfehlung von einer Brille für Arbeit

    #为什么第二副眼镜加了球镜度数 1.00,而第一副为 0.00?The results of the eye test I took today are as follows. Please suggest glasses for monitor work; note that I have to work at a monitor (60 cm away) for 8 hours each day.
    
    #Sehhilfenverordnung Ergebnis 2024
    
    F R  +0.75    -1.50     160
        L  +0.50    -0.75     170
    N R  ADD      +1.50
        L  ADD      +1.50
    
    Sehtest-Ergebnis 2024
    
        Fernwerte:
            Rechtes Auge (R):
                Sphäre: +0,75
                Zylinder: -1,50
                Achse: 160
            Linkes Auge (L):
                Sphäre: +0,50
                Zylinder: -0,75
                Achse: 170
        Additionswerte (Nahzusatz):
            Rechtes Auge (R): +1,50
            Linkes Auge (L): +1,50
    
    Brille für die Bildschirmarbeit (60 cm) - Empfehlungen
    
    Um acht Stunden täglich in einer Entfernung von 60 cm am Monitor zu arbeiten, benötigen Sie eine spezielle Brille, die für diese mittlere Entfernung optimiert ist.
    Empfohlene Brillengläser-Typen
    
    1. Bildschirmarbeitsplatzbrille
    
    Diese Brille ist speziell für mittlere und nahe Entfernungen entwickelt und eignet sich besonders für lange Arbeitszeiten am Computer.
    
        Sehstärken:
            Rechtes Auge (R):
                Sphäre: +2,25 (+0,75 Fernwert plus +1,50 Nahzusatz)
                Zylinder: -1,50
                Achse: 160
            Linkes Auge (L):
                Sphäre: +2,00 (+0,50 Fernwert plus +1,50 Nahzusatz)
                Zylinder: -0,75
                Achse: 170
    
    Diese Werte gewährleisten eine klare Sicht auf mittlere Entfernungen, wie sie bei der Bildschirmarbeit üblich sind.
    
    2. Gleitsichtbrille (Progressive Lenses)
    
    Diese Brillen bieten eine stufenlose Sicht von fern bis nah und sind ideal für den täglichen Gebrauch, einschließlich Bildschirmarbeit.
    
        Sehstärken:
            Fernwerte gemäß den oben angegebenen Sehtestergebnissen:
                Rechtes Auge (R): +0,75 / -1,50 / 160
                Linkes Auge (L): +0,50 / -0,75 / 170
            Additionswerte: +1,50
    
    Empfehlung
    
        Bildschirmarbeitsplatzbrille: Diese Brille ist speziell für mittlere Entfernungen entwickelt und bietet die beste Lösung für langes Arbeiten am Computer in einer Entfernung von 60 cm.
        Gleitsichtbrille: Wenn Sie eine Brille für verschiedene Entfernungen benötigen, ist eine Gleitsichtbrille eine gute Wahl, obwohl sie möglicherweise nicht so effektiv für die Bildschirmarbeit ist wie eine spezielle Arbeitsplatzbrille.
    
    Fazit
    
    * Empfohlen wird eine Bildschirmarbeitsplatzbrille mit folgenden Werten:
    
        Rechtes Auge (R): +2,25 (+0,75 Fernwert plus +1,50 Nahzusatz) / -1,50 / 160
        Linkes Auge (L): +2,00 (+0,50 Fernwert plus +1,50 Nahzusatz) / -0,75 / 170
    
    Diese Brille bietet Ihnen die beste Unterstützung für die Arbeit am Monitor in einer Entfernung von 60 cm, reduziert die Augenbelastung und erhöht die Arbeitseffizienz. Bei weiteren Fragen wenden Sie sich bitte an Ihren Augenarzt oder Optiker.
    
    根据您提供的2024年的验光结果,以下是详细信息以及对适合工作在电脑前(60厘米距离)使用的眼镜的建议:
    2024年的验光结果
    
        远用度数:
            右眼 (R):
                球镜度数 (Sphäre): +0.75
                柱镜度数 (Zylinder): -1.50
                轴位 (Achse): 160
            左眼 (L):
                球镜度数 (Sphäre): +0.50
                柱镜度数 (Zylinder): -0.75
                轴位 (Achse): 170
        近用附加度数 (Addition):
            右眼 (R): +1.50
            左眼 (L): +1.50
    
    适合在电脑前工作的眼镜建议
    
    为了在60厘米距离的电脑前工作8小时,建议使用一副专门设计的工作眼镜,这种眼镜应优化中近距离视物。
    建议的工作眼镜类型
    
    1. 单独的工作眼镜
    
    这种眼镜专门为中近距离视物设计,特别适合长时间在电脑前工作。
    
        验光参数:
            右眼 (R):
                球镜度数 (Sphäre): +2.25(+0.75的远用度数加上+1.50的近用附加度数)
                柱镜度数 (Zylinder): -1.50
                轴位 (Achse): 160
            左眼 (L):
                球镜度数 (Sphäre): +2.00(+0.50的远用度数加上+1.50的近用附加度数)
                柱镜度数 (Zylinder): -0.75
                轴位 (Achse): 170
    
    这种配方的眼镜将使您在中近距离(如60厘米的电脑屏幕)视物更加清晰。
    2. 渐进多焦点眼镜(Progressive Lenses)
    
    这种眼镜可以提供从远到近的连续视距,适合需要多种视距的日常活动,包括电脑工作。
    
        验光参数:
            远用度数与上面提供的验光结果相同,即:
                右眼 (R): +0.75 / -1.50 / 160
                左眼 (L): +0.50 / -0.75 / 170
            近用附加度数:+1.50
    
    选择合适的眼镜
    
        单独的工作眼镜:这种眼镜专门为中近距离设计,适合在固定距离(如60厘米的电脑屏幕)长时间工作。
        渐进多焦点眼镜:如果您的日常生活需要多种视距,可以选择渐进多焦点眼镜,它能提供从远到近的连续视距,虽然在电脑前工作的效果可能不如单独的工作眼镜。
    
    结论
    
    推荐使用单独的工作眼镜,即根据以下配方:
    
        右眼 (R): +2.25(+0.75的远用度数加上+1.50的近用附加度数)/ -1.50 / 160
        左眼 (L): +2.00(+0.50的远用度数加上+1.50的近用附加度数)/ -0.75 / 170
    
    这种眼镜将为您在60厘米距离的电脑前工作提供最佳的视力支持,减少眼睛疲劳,提高工作效率。如果有进一步的疑问,请咨询您的眼科医生或验光师。
  4. Berechnung der Zwischenwerte

    #Ich bin nicht 100% sicher: Darf man bei der ersten Lösung einfach die Additionswerte zu den Sphärenwerten addieren?
    
    Nein, das direkte Addieren der Sphärenwerte mit den Additionswerten ist nicht die korrekte Vorgehensweise für die Anpassung einer Bildschirmarbeitsplatzbrille. Stattdessen gibt es spezielle Berechnungen und Anpassungen, die ein Optiker vornimmt, um sicherzustellen, dass die Brille optimal für die mittlere Entfernung (in diesem Fall 60 cm) geeignet ist.
    
    Hier sind die Schritte, die normalerweise bei der Anpassung einer Bildschirmarbeitsplatzbrille durchgeführt werden:
    1. Fernwerte und Additionswerte
    
    Ihre aktuellen Fernwerte und Additionswerte sind:
    
        Rechtes Auge (R):
            Sphäre: +0,75
            Zylinder: -1,50
            Achse: 160
            Addition: +1,50
        Linkes Auge (L):
            Sphäre: +0,50
            Zylinder: -0,75
            Achse: 170
            Addition: +1,50
    
    2. Berechnung der Zwischenwerte
    
    Ein Optiker kann die Zwischenwerte berechnen, die für die mittlere Entfernung (60 cm) optimal sind. Dies wird oft als „Computerbrille“ oder „Arbeitsplatzbrille“ bezeichnet. Die Berechnung erfolgt üblicherweise durch die Reduktion der Additionswerte, da diese Brille für eine mittlere Distanz und nicht für die Nähe (Lesen) gedacht ist.
    3. Optimierung der Werte
    
    Ein Beispiel für eine Berechnung könnte folgendermaßen aussehen:
    
        * Rechtes Auge (R):
            Sphäre: +1,50 (+0,75 Fernwert + 50% von +1,50 Addition für mittlere Entfernung)
            Zylinder: -1,50
            Achse: 160
        * Linkes Auge (L):
            Sphäre: +1,25 (+0,50 Fernwert + 50% von +1,50 Addition für mittlere Entfernung)
            Zylinder: -0,75
            Achse: 170
    
    Diese Berechnung basiert auf der Annahme, dass Sie etwa die Hälfte des Additionswertes für die mittlere Entfernung verwenden. Es ist jedoch wichtig, dass diese Berechnung individuell angepasst wird.
    4. Beratung durch einen Optiker
    
    Es ist entscheidend, dass Sie diese Berechnung von einem Optiker durchführen lassen. Ein Optiker kann:
    
        Die genauen Werte für Ihre individuelle Sehentfernung bestimmen.
        Berücksichtigen, wie Sie Ihren Monitor nutzen.
        Ihre Augenstellung und Ihre typische Arbeitsposition in die Berechnung einbeziehen.
    
    Fazit
    
    Während die oben angegebenen Werte ein guter Ausgangspunkt sein könnten, ist es wichtig, dass Sie eine professionelle Anpassung durch einen Optiker vornehmen lassen. Eine Bildschirmarbeitsplatzbrille muss präzise angepasst werden, um optimalen Sehkomfort und Augenschutz zu gewährleisten.
    
    Ich empfehle Ihnen, mit Ihren aktuellen Sehtest-Ergebnissen zu einem Optiker zu gehen und ihn um die Anpassung einer Bildschirmarbeitsplatzbrille für die Arbeit in einer Entfernung von 60 cm zu bitten. Der Optiker wird sicherstellen, dass die berechneten Werte korrekt und auf Ihre Bedürfnisse abgestimmt sind.
  5. +Blaulichtfilteroptionen?

    Ja, Sie können eine Computerbrille mit Ihrem aktuellen Refraktionsvorschlag und zusätzlichen Blaulichtfilteroptionen anpassen lassen. Hier sind die Schritte und Überlegungen, um dies zu erreichen:
    Schritte zur Anpassung einer Computerbrille
    
        Refraktionsvorschlag: Nutzen Sie Ihre aktuellen Sehtestergebnisse als Grundlage für die Anpassung der Computerbrille.
            Rechtes Auge (R):
                Sphäre (Ferne): +0,75
                Zylinder: -1,50
                Achse: 160
                Addition: +1,50
            Linkes Auge (L):
                Sphäre (Ferne): +0,50
                Zylinder: -0,75
                Achse: 170
                Addition: +1,50
    
        Spezifische Anpassung für Computerarbeit: Ein Optiker kann Ihre Sehstärke für den spezifischen Arbeitsabstand (ca. 60 cm) berechnen. Dies wird oft als Zwischenbereichsstärke oder Computerbrillenstärke bezeichnet.
            Beispiel für die Anpassung:
                Rechtes Auge (R): +1,50 (Ferne +0,75 + Hälfte der Addition +0,75)
                Linkes Auge (L): +1,25 (Ferne +0,50 + Hälfte der Addition +0,75)
    
        Blaulichtfilter hinzufügen: Bitten Sie den Optiker, Gläser mit einem Blaulichtfilter zu verwenden. Diese Filter helfen, die Belastung der Augen durch das blaue Licht von Bildschirmen zu reduzieren und verbessern den Sehkomfort bei längerer Bildschirmarbeit.
    
    Vorteile der Kombination
    
        Erhöhter Sehkomfort: Die Kombination von korrekt angepassten Refraktionswerten und einem Blaulichtfilter sorgt für weniger Augenbelastung und besseren Sehkomfort während der Arbeit am Computer.
        Reduzierte Ermüdung: Ein Blaulichtfilter kann die Ermüdung der Augen verringern und möglicherweise auch die Schlafqualität verbessern, da er das schädliche blaue Licht blockiert, das den Schlaf-Wach-Rhythmus stören kann.
        Individuelle Anpassung: Die genaue Anpassung der Brille an Ihre spezifischen Sehbedürfnisse und Arbeitsabstände sorgt für eine optimale visuelle Erfahrung.
    
    Beratung und Anpassung
    
        Optikerbesuch: Vereinbaren Sie einen Termin bei einem Optiker und bringen Sie Ihre aktuellen Sehtestergebnisse mit. Besprechen Sie Ihre spezifischen Anforderungen für eine Computerbrille, einschließlich des Arbeitsabstands und der Blaulichtfilteroption.
        Anpassung und Testen: Der Optiker kann Ihre Augen erneut vermessen, um sicherzustellen, dass die Brille perfekt auf Ihre Bedürfnisse abgestimmt ist. Sie können auch verschiedene Gläser und Beschichtungen testen, um die beste Wahl zu treffen.
    
    Fazit
    
    Ja, Sie können eine Computerbrille mit Ihren aktuellen Refraktionswerten und einem Blaulichtfilter anpassen lassen. Dies bietet Ihnen optimalen Sehkomfort und Schutz vor den negativen Auswirkungen des blauen Lichts während langer Arbeitszeiten am Computer. Ein Besuch beim Optiker wird Ihnen helfen, die beste Lösung für Ihre spezifischen Bedürfnisse zu finden.
  6. Was bedeutet das Wort Refraktionsvorschlag?

    #In meiner Sehhilfenverordnung steht das Wort „Refraktionsvorschlag“. Bezieht es sich auf den Anti-Blau-Filter oder auf die Anti-Reflex-Beschichtung?
    In Ihrer Sehhilfenverordnung bezieht sich das Wort „Refraktionsvorschlag“ nicht auf den Anti-Blau-Filter oder die Anti-Reflex-Beschichtung. Stattdessen handelt es sich bei einem Refraktionsvorschlag um die Verschreibung Ihrer Brillengläserwerte, also die spezifischen Korrekturwerte (Sphäre, Zylinder, Achse und Addition) basierend auf Ihrer Sehschärfenmessung.
    Erklärung der Begriffe:
    
        Refraktionsvorschlag:
            Bedeutung: Dies bezieht sich auf die Messergebnisse Ihrer Sehschärfe und die vorgeschlagenen Korrekturwerte für Ihre Brillengläser, um Ihre Fehlsichtigkeit (wie Kurzsichtigkeit, Weitsichtigkeit oder Astigmatismus) zu korrigieren.
            Inhalt: Typischerweise umfasst dies die Werte für Sphäre (SPH), Zylinder (CYL), Achse (AXIS) und Addition (ADD) für beide Augen.
    
        Anti-Blau-Filter (Blaufilter):
            Bedeutung: Ein spezieller Filter in den Brillengläsern, der das blaue Licht von digitalen Bildschirmen (wie Computern, Smartphones und Tablets) reduziert.
            Vorteil: Hilft, die Augenbelastung zu verringern und den Schlaf-Wach-Rhythmus zu verbessern.
    
        Anti-Reflex-Beschichtung (Entspiegelung):
            Bedeutung: Eine Beschichtung auf den Brillengläsern, die Reflexionen von Lichtquellen minimiert.
            Vorteil: Erhöht die Klarheit und den Sehkomfort, besonders bei Nachtfahrten und bei der Arbeit unter künstlichem Licht.
    
    Zusammenfassung:
    
        Refraktionsvorschlag: Bezieht sich auf die optischen Korrekturwerte Ihrer Brille.
        Anti-Blau-Filter: Zusätzliche Option für den Schutz gegen blaues Licht.
        Anti-Reflex-Beschichtung: Zusätzliche Option zur Reduktion von Reflexionen.
    
    Was tun?
    
        Wenn Sie eine Brille für die Computerarbeit benötigen, sollten Sie den Refraktionsvorschlag (Ihre Korrekturwerte) nutzen und mit Ihrem Optiker besprechen, dass Sie zusätzlich einen Anti-Blau-Filter und eine Anti-Reflex-Beschichtung wünschen.
        Diese zusätzlichen Optionen können in Ihre Brillengläser integriert werden, um den Sehkomfort zu maximieren und Ihre Augen zu schützen.
    
    Besprechen Sie Ihre Bedürfnisse und Optionen mit Ihrem Optiker, um sicherzustellen, dass Ihre Brille optimal für Ihre Arbeitsbedingungen angepasst ist.
    
    Refraktion 意思
    
    在眼科和验光学中,“Refraktion” 是一个专业术语,指的是眼睛对光线的折射能力,具体来说,它是测量和矫正视力的一种方法。通过折射测量,可以确定眼睛需要什么样的镜片度数来纠正视力缺陷,如近视、远视或散光。
    详细解释
    
        Refraktion(折射):
            定义:指光线通过眼睛时被弯曲(折射)的过程。正常的眼睛会将光线准确地聚焦在视网膜上,从而产生清晰的视觉图像。
            折射误差:如果光线不能正确地聚焦在视网膜上,就会导致视力问题,如近视(光线聚焦在视网膜前)、远视(光线聚焦在视网膜后)或散光(光线聚焦不规则)。
    
        Refraktionsbestimmung(折射测定):
            过程:这是通过验光程序测量眼睛的屈光状态以确定需要的镜片度数。这通常包括使用验光仪和试镜架来测量眼睛的屈光度数(即球镜度数、柱镜度数和轴位)。
            结果:折射测定的结果是一个处方(Refraktionsvorschlag),其中包含矫正视力所需的具体镜片度数。
    
        Refraktionsvorschlag(折射建议):
            内容:这是根据折射测定得出的具体处方,包括以下参数:
                Sphäre (SPH):球镜度数,表示近视或远视的矫正度数。
                Zylinder (CYL):柱镜度数,表示散光的矫正度数。
                Achse (AXIS):轴位,表示散光矫正的方向。
                Addition (ADD):附加度数,通常用于双焦或渐进镜片,表示近距离视力矫正的度数。
    
    总结
    
    “Refraktion” 主要是指眼睛对光线的折射过程及其测量和矫正方法。在验光过程中,通过测量折射,可以得到具体的镜片度数,以矫正视力问题。这个过程的结果即为“Refraktionsvorschlag”,它是为您提供合适矫正眼镜的依据。
  7. Computerbrille und Gleitsichtbrille: Welche ist teurer?

    In der Regel sind Gleitsichtbrillen teurer als Computerbrillen. Hier sind die Hauptgründe und Unterschiede:
    Gleitsichtbrille
    
        Kosten: Gleitsichtbrillen sind meist teurer, da sie eine spezielle Technologie verwenden, um mehrere Sehbereiche (Fern-, Zwischen- und Nahsicht) in einem Glas zu kombinieren. Die Herstellung und Anpassung dieser Gläser ist komplexer und erfordert präzise Messungen und hochwertige Materialien.
        Vorteil: Sie bieten eine nahtlose Sicht für verschiedene Entfernungen und sind ideal für den täglichen Gebrauch, da sie das Sehen in verschiedenen Entfernungen ohne Wechsel der Brille ermöglichen.
        Nachteile: Der Übergang zwischen den Sehbereichen kann anfangs gewöhnungsbedürftig sein, und es kann eine Eingewöhnungszeit erforderlich sein.
    
    Computerbrille
    
        Kosten: Computerbrillen sind in der Regel günstiger als Gleitsichtbrillen. Sie sind speziell für die mittlere Sehentfernung (wie die Arbeit am Computer) optimiert und haben daher weniger komplexe Anforderungen als Gleitsichtbrillen.
        Vorteil: Sie bieten klare Sicht für den Arbeitsbereich am Computer und sind komfortabel für längere Bildschirmarbeit. Sie können auch mit einem Blaulichtfilter ausgestattet sein, um die Augen vor dem schädlichen blauen Licht von Bildschirmen zu schützen.
        Nachteile: Sie sind hauptsächlich für mittlere Entfernungen optimiert und bieten nicht die gleiche Flexibilität wie Gleitsichtbrillen für unterschiedliche Sehentfernungen.
    
    Kostenfaktoren
    
    Die genauen Kosten können variieren, abhängig von verschiedenen Faktoren wie:
    
        Glasqualität: Hochwertigere Gläser kosten mehr, bieten aber besseren Sehkomfort und Langlebigkeit.
        Beschichtungen: Zusätzliche Beschichtungen wie Antireflex, Kratzfestigkeit und Blaulichtfilter erhöhen die Kosten.
        Rahmen: Der Preis des Rahmens kann erheblich variieren, je nach Marke, Material und Design.
        Anpassung und Beratung: Die individuelle Anpassung durch den Optiker kann ebenfalls die Kosten beeinflussen.
    
    Preisbeispiele (geschätzte Kosten in Deutschland)
    
        Gleitsichtbrille: Kann zwischen 300 und 1000 Euro oder mehr kosten, abhängig von den oben genannten Faktoren.
        Computerbrille: Kann zwischen 150 und 500 Euro kosten, abhängig von den gewählten Optionen und der Qualität der Gläser und des Rahmens.
    
    Fazit
    
    Gleitsichtbrillen sind aufgrund ihrer Vielseitigkeit und der komplexeren Herstellung in der Regel teurer als Computerbrillen. Wenn Sie hauptsächlich am Computer arbeiten und eine kostengünstigere Lösung suchen, könnte eine speziell angepasste Computerbrille die bessere Wahl sein. Für den täglichen Gebrauch und verschiedene Sehentfernungen wäre eine Gleitsichtbrille jedoch vorteilhafter, wenn auch teurer.
    
    Es ist ratsam, sich von einem Optiker beraten zu lassen, um die beste Lösung für Ihre spezifischen Bedürfnisse und Ihr Budget zu finden.
  8. Was genau bedeutet Dioptrien (dpt)? 这个数值 Sphäre (Ferne): +0,75,是表示中文里的度数吗?是指远视 75度吗?

    在验光和眼科学中,"Dioptrien" (缩写为 "dpt") 是度数的单位,用于表示眼睛的屈光度数,也就是镜片的度数。这个数值表示镜片矫正视力所需要的光学强度。
    详细解释
    
        Dioptrien (dpt):
            定义:是镜片屈光度数的单位,用于描述镜片对光线的折射能力。屈光度 (dpt) 等于以米 (m) 为单位的镜片焦距的倒数。例如,1 dpt 表示镜片的焦距为1米,2 dpt 表示焦距为0.5米。
            应用:用于表示近视、远视和散光的矫正度数。
    
        Sphäre (SPH):
            定义:表示眼睛的球镜度数,用于矫正近视或远视。正值表示远视,负值表示近视。
            示例:+0,75 dpt 表示远视,需要一个焦距为1/0.75米(约1.33米)的凸透镜来矫正。
    
    中文中的“度数”
    
    在中文中,眼镜度数通常也以“度”来表示,尽管有时具体转换会根据不同的标准有细微差别。
    
        近视度数:负值 (例如:-1.00 dpt 表示近视100度)
        远视度数:正值 (例如:+0.75 dpt 表示远视75度)
    
    具体解释您的数值
    
        Sphäre (Ferne): +0,75:
            中文解释:表示远视75度。这个数值表明您需要一个度数为 +0.75 dpt 的凸透镜来矫正您的远视。
    
    总结
    
        Dioptrien (dpt):是表示眼镜镜片度数的单位,1 dpt 等于1米焦距的倒数。
        Sphäre (SPH):表示眼睛的球镜度数,正值为远视,负值为近视。
        +0,75 dpt:表示远视75度,需要使用 +0.75 dpt 的凸透镜进行矫正。
    
    希望这能帮助您更好地理解验光处方中的度数。如有进一步问题,请随时询问!
  9. 远视 (Hyperopie) 和加视度 (Addition, ADD) 的区别

    远视(Hyperopie)和加视度(Addition, ADD)在眼镜处方中是两个不同的概念,它们矫正不同类型的视力问题。以下是详细解释:
    
        远视 (Hyperopie):
            定义:远视是一种屈光不正,表示眼睛在放松状态下,平行光线在视网膜后聚焦。远视患者在看远处时通常能看清楚,但看近处时会有困难。
            矫正:使用凸透镜(正度数)来将光线前移,使其正确聚焦在视网膜上。
            表示:在处方中用“球镜度数 (Sphäre, SPH)”表示,正值表示远视。例如,+0.75 dpt 表示需要屈光度为 +0.75 dpt(俗称75度)的凸透镜来矫正远视。
    
        加视度 (Addition, ADD):
            定义:加视度是指在双焦点或渐进多焦点眼镜中,用于矫正老花眼(Presbyopie)的额外度数。老花眼是由于年龄增长导致眼睛的调节能力下降,看近处时需要额外的矫正。
            矫正:在眼镜的下方区域增加一个附加度数(ADD),帮助患者看清近处物体。
            表示:在处方中用“Addition”表示,通常为正值。例如,ADD +1.50 表示需要额外 +1.50 dpt 的度数来帮助近距离视物。
    
    示例分析
    
    根据您的处方:
    
        Sphäre (Ferne):
            右眼 (R): +0.75
            左眼 (L): +0.50
        Addition (ADD): +1.50
    
    这意味着:
    
        远视:您的右眼需要 +0.75 dpt,左眼需要 +0.50 dpt 的凸透镜来矫正远视。
        加视度:当您需要看近处时,例如阅读或在电脑上工作,您需要额外 +1.50 dpt 的度数。这通常用于老花眼的矫正。
    
    适合您的眼镜类型
    
    考虑到您的视力需求,您可能需要以下类型的眼镜:
    
        渐进多焦点眼镜(Progressive Lenses):
            优点:提供从远到近的连续视距,适合全天佩戴,包括看远处、看中距离(如电脑)和看近处(如阅读)。
            示例:
                远视矫正:上部分镜片矫正远视。
                加视度矫正:下部分镜片提供额外的加视度,帮助近距离视物。
    
        双焦点眼镜(Bifocal Lenses):
            优点:上下两部分分别矫正远距离和近距离视力,适合看远处和看近处。
            缺点:中间视距(如电脑)可能不如渐进多焦点眼镜方便。
    
    结论
    
        远视 (Hyperopie) 和 加视度 (Addition, ADD) 是不同的矫正需求,分别用于远距离和近距离视力矫正。
        您的处方中的 +0.75 和 +0.50 表示远视矫正,而 ADD +1.50 表示老花眼的近距离视力矫正。
        推荐使用渐进多焦点眼镜以满足您的多种视力需求,包括远视和老花眼。
    
    请咨询您的眼科医生或验光师,确保根据您的具体需求选择最佳的眼镜类型。
  10. “远视”和“老花”是一回事吗?

    通过这张图我们不难发现眼睛其实挺像一架照相机。光从外界进入眼球,要经过两个会变焦的镜头--角膜和晶状体,其中晶状体起主要的调节作用。然后光再经过玻璃体,最终在视网膜上聚焦。这个光经过的通路要保持透明,也就是眼科大夫口中的屈光介质透明,才能保证清晰的成像。然后这些获得的图像转换成信号通过神经传导到大脑再经过复杂的处理,我们才能看清楚东西。
    
    当角膜和晶状体曲率过大或者眼球太长,光最终聚焦在视网膜前的时候,就是近视。这时候,你看近处没问题,看远处不清楚。需要使用凹透镜来矫正,使得光可以正常聚焦在视网膜上。凹透镜一般是用负号(-)来表示,比如近视200度,写为-2.00D。
    
    理解了近视,我们再来看远视就不难了。远视就是当角膜和晶状体曲率太小或者眼球太短,光最终聚焦在了视网膜的后面。远视度数小的时候,一般可以通过调节来代偿;但是远视大的话,则会看远看近都不清楚,很容易导致眼酸,头疼等症状。需要使用凸透镜来矫正,使得光可以正常聚焦在视网膜上。凸透镜一般是用正号(+)来表示,比如远视300度,会写为+3.00D。
    
    那么散光又是什么?简单来说,就是由于眼球形态不是那么完美的规整,当光通过眼球折射后所成像并非一个焦点。一般通过柱镜予以矫正。
    
    以上提到的近视,远视,散光都属于屈光不正。
    
    老视,也就是一般说的老花。很多人觉得老花就是看不清楚近处,容易跟远视弄混。我们开始提过,光进入眼球主要通过晶状体的调节来实现,而随着年龄的增长,晶状体的调节能力下降,导致看近不清楚。一般在40岁以后出现,随着年龄的增长,所需的花镜度数也在增加。
    
    #--
    
    生活中人们常常把远视和老花搞混了,因为远视眼和老花眼所配的眼镜是同一类型,即都是凸透镜。其实,远视眼和老花眼是两个概念,它们之间既有区别又有关联。什么是远视眼呢?远视的定义是:当眼的调节放松时,平行光线经过眼的屈光系统后聚焦在视网膜之后,称为远视(如果聚焦在视网膜之前,就是近视)。老视即老花,是我们随着年龄的增长,眼内的晶状体逐渐硬化,弹性减弱,眼睛的睫状肌功能也逐渐减低,从而引起眼睛的调节功能减退。大约40~50岁开始,就慢慢出现阅读等近距离工作困难。这种由于年龄增长所致的生理性眼调节功能减弱,称为老视,也就是我们说的老花。老花眼有哪些表现呢?早期,老花患者会逐渐发现在往常习惯的距离阅读,字体小一点就比较吃力,看不清楚。与近视眼相反,老花患者会不自觉地把头后仰或者把书报拿开一点才能看清楚,而且随着年龄的增加,阅读的距离越来越远。另外老花眼的阅读需要更亮的光线,因为足够的光线既增加了书本和文字的对比度,又使眼睛的瞳孔缩小,景深加大,提高视力。还有,老花眼视近不能持久。许多老花眼的人说,看报纸时刚开始还可以,但不能持久,勉强看下去易串行,字迹成双,最后便读不下去了。有些患者甚至出现眼胀、流泪、头痛等视疲劳症状。
    
    远视眼和老花眼有什么异同呢?相同的是在配镜矫正时,都是用的凸透镜;不同的是有远视眼的人一般出现老花的时间较早,相比同龄的人老花程度高,有的看远时也要戴一副轻度的凸透镜才能看得更清楚。记住,老花是一种生理现象,不论屈光状态如何,每个人均会发生老花。那为什么近视眼看近距离的东西不用戴老花镜呢?其实近视眼在中老年以后,如果还戴着原来看远的眼镜看书报,也会出现老花的症状,摘掉近视眼镜看书,就相当于配了一副老花镜,用通俗地话说,就是抵消了部分老花。
  11. 去年我配了两幅下面的眼镜,他们说一幅开车用,一幅工作用,根据账单,请告诉我,哪一幅工作用好,哪一幅用来开车?

正手拉球的技巧

  1. 来球时引拍时跟球一起往回转,然后身体跟手臂一起再往前转(这里需要🈶夹紧的感觉),才能自然的转腰;

  2. 手臂不能夹太紧+手远一点+拍型外展=就是一条线;

  3. 收回到眉心,注意向上(往上挑的感觉);

  4. 加分点:在触⚽️时加速⏩ 【这个有可能被全世爆误导了,制动,所以太早减速了!】

  5. (As Jan mentioned) 核心是快速应激反应:⚠️快速下腰准备,夹臂向右上↗️转腰(因为手臂跟腰同步,手臂自然就转动了) + 收臂至眉心!⚠️也就是;以转腰为框架➕收臂!

  6. Arylate-Carbon 与 Zylon+碳纤维的区别: Arylate-Carbon 和 Zylon+碳纤维 是两种常用于制造乒乓球拍的高性能材料。以下是它们之间的主要区别:

    1. 材料成分

      Arylate-Carbon (ALC): Arylate-Carbon 由芳纶纤维和碳纤维组合而成。芳纶是一种以抗冲击性著称的聚合物纤维,而碳纤维则以其刚性和强度闻名。 Zylon+碳纤维: Zylon 是一种超硬的合成纤维,被认为是市面上最强的纤维之一。Zylon 与碳纤维的结合形成了一种既强韧又轻质的材料。相比其他合成纤维,Zylon 还具有更高的耐温性。

    2. 打球特性

      Arylate-Carbon: 提供了柔韧性和刚性的良好平衡,带来更好的控制和良好的手感。它以减震性能著称,能提供更舒适的握拍体验。 Zylon+碳纤维: 以极高的刚性和反应速度闻名,能带来更高的速度和击球力量。然而,它可能不如 Arylate-Carbon 容错性高,控制性稍逊。

    3. 应用和偏好

      Arylate-Carbon: 适合寻求控制和平衡速度的球员,是防守和全能型球员的热门选择。 Zylon+碳纤维: 适合需要最大速度和冲击力的攻击型球员,通常被顶级球员使用,不妥协性能。

    4. 价格

      Arylate-Carbon: 相对较便宜。 Zylon+碳纤维: 由于生产成本高和材料性能优越,价格通常较高。

    结论

    两种材料各有优势,球员可根据自己的需求和打法选择。Arylate-Carbon 适合寻求良好控制和手感的球员,而 Zylon+碳纤维则适合优先考虑速度和刚性的球员。 参考资料:

    Butterfly Arylate-Carbon 解释
    Zylon 纤维特性
    Zylon 材料性能及用途
    
        * Timo Boll ALC
        DAS HOLZ VON TIMO BOLL
    
        Reaktionswert 11.8
        Vibrationswert 10.3
        Furnieraufbau 5-schichtig + 2 ALC
        Blattdicke 5,8 mm
        Blattgröße 157 x 150 mm
        Griffmaße FL: 100x24x34 mm / ST: 100x22x27,5 mm / AN: 100x24x34 mm
    
        * Franziska Innerforce ZLC
        Offizielles Spielholz von Patrick Franziska
    
        Reaktionswert 10.5
        Vibrationswert 9.5
        Furnieraufbau 5-schichtig + 2 ZLC
        Blattdicke 5.7 mm
        Blattgröße 157 x 150 mm
        Griffmaße FL: 100×24×34 mm / ST: 100×23×28 mm / AN: 100×24×34 mm
        FL: 100×24×34 mm / ST: 100×23×28 mm / AN: 100×24×34 mm
    
        * Fan Zhendong ALC
        Das Spielmaterial von Fan Zhendong
    
        Reaktionswert 11.8
        Vibrationswert 10.3
        Furnieraufbau 5-schichtig + 2 ALC
        Blattdicke 5.8 mm
        Blattgröße 157 x 150 mm (CS: 161 x 150)
        Griffmaße FL: 100×25×34 mm / ST: 100×23×28 mm / CS: 82×24×32 mm

IS Elements and Bacterial Genomic DNA Structure

IS元件(Insertion Sequence,简称IS)是最小且最常见的自主转座元件(TEs)。它们是一种短的DNA片段,长度在400到10,000个碱基对之间,通常编码一个转座酶(Tpase),该酶能够催化IS元件在基因组内或基因组间的移动。

IS元件(Insertion Sequence): IS元件是一种短的DNA序列,通常包含一个或多个基因,其中至少包括一个编码转座酶的基因。这些基因使得IS元件能够通过转座酶的作用,在基因组内或不同基因组之间移动。

IR序列(Inverted Repeat): IR序列是IS元件两端带有的短的不完全反向重复序列。这些序列通常是6到40个碱基对长,具有特定的结构,形成了元件的终端结构。IR序列在IS元件的移动过程中起着重要的作用,因为它们是转座酶识别和切割的靶标。

TEs(转座元件): 转座元件(Transposable Elements,TEs)是基因组中能够移动位置的DNA序列。除了IS元件外,TEs还包括更大的转座子和复合转座子等不同类型。它们在基因组结构和进化中起到重要作用,可以导致基因的插入、删除或重排,从而影响基因的表达和功能。

IR在IS元件中的作用: 在IS元件中,IR序列是转座酶识别和介导移动的关键部分。当转座酶结合到IR序列时,它会切割元件的DNA,将元件插入到新的位置或者复制并插入到不同的基因组中。IR序列的特定结构使得转座酶能够高效地介导这些转座过程,确保元件的准确定位和移动。

总结来说,IS元件是基因组中最小的自主转座元件之一,其结构包括编码转座酶的基因和两端的IR序列。这些元件通过转座酶介导的移动过程,影响基因组的结构和功能,对生物体的遗传变异和进化起着重要作用。

IS1元件之所以可以进行转座,尽管只有一个倒转重复(IR),主要依赖于其特殊的转座机制。以下是IS1元件转座的基本过程和机制:

结构特点:

* 单个倒转重复序列:IS1元件在一个端点有一个倒转重复序列(IR),例如:

        5'- CTAG -3'
        3'- GATC -5'
    这个IR序列在转座过程中起到关键作用,它是转座酶识别和切割的靶标。

* 另一端的序列:在另一端,IS1元件没有典型的倒转重复序列,而是具有一种不是倒转的序列结构。

转座过程:

* 转座酶活性:IS1元件编码一种特定的转座酶(transposase),这种酶能够识别并结合IR序列。
* 切割和粘贴机制:转座酶首先结合到IR序列,然后切割IS1元件与其周围DNA的连接。这导致IS1元件从其原始位置被剪切出来。
* 插入到新位置:一旦IS1元件被剪切出来,转座酶会帮助它插入到新的基因组位置。虽然IS1元件没有在另一端具有典型的倒转重复序列,但转座酶可能会利用其他序列或机制来在新位置上完成插入。

逆转座抑制:

* 在某些情况下,IS1元件的转座过程可能会受到宿主细胞逆转座抑制机制的影响。这些机制可以防止IS1元件在不合适的时间或位置进行转座,从而维持基因组的稳定性。

结论: 尽管IS1元件只有一个端点具有明确的倒转重复序列,但它仍然能够通过其特有的转座机制实现移动。转座酶的作用是关键,它能够识别和切割IR序列,并促进IS1元件的精确插入到新的基因组位置,这种机制确保了IS1元件在细菌基因组中的活跃和传播。

In bacterial cells, the genomic DNA is typically double-stranded, similar to eukaryotic cells. Bacteria, like all living organisms, store their genetic information in double-stranded DNA molecules. Here are some key points regarding bacterial DNA structure:

  • Double-Stranded DNA: Bacterial genomes consist of double-stranded DNA (dsDNA), where two complementary strands of nucleotides are bound together by hydrogen bonds between adenine (A) and thymine (T), and between cytosine (C) and guanine (G).

  • Haploid Genome: Bacteria are often considered haploid because they usually have a single circular chromosome. This means that they have one copy of each gene per cell, although bacteria can sometimes carry plasmids which can be present in multiple copies.

  • Plasmids: Apart from the chromosomal DNA, bacteria can also harbor extrachromosomal DNA elements called plasmids. Plasmids are small circular DNA molecules that can replicate independently of the chromosomal DNA. They can carry genes that provide additional functions such as antibiotic resistance or virulence factors.

  • Replication: Bacterial DNA undergoes replication before cell division, ensuring that each daughter cell receives a complete copy of the genome. The process of DNA replication in bacteria involves the unwinding of the double helix, synthesis of new complementary strands, and reformation of double-stranded DNA.

  • Genetic Material: The double-stranded nature of bacterial DNA allows for the faithful replication and transmission of genetic information to offspring cells during cell division and to progeny during reproduction.

In summary, bacteria possess double-stranded DNA as their genetic material, similar to eukaryotic organisms. This DNA is typically organized into a single circular chromosome and can also include plasmids. These characteristics enable bacteria to maintain and propagate their genetic information through generations.

Viral variant calling

1: trimming using trimmomatic

    mkdir trimmed bams
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
        java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 ./raw_data/${sample}_R1.fastq.gz ./raw_data/${sample}_R2.fastq.gz trimmed/${sample}_R1.fastq.gz trimmed/${sample}_unpaired_R1.fastq.gz trimmed/${sample}_R2.fastq.gz trimmed/${sample}_unpaired_R2.fastq.gz  ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; \
    done

    #(optional) #The reference alignment can be downloaded from the ViPR site:
    #https://www.viprbrc.org/brc/workbenchSequenceSearch.spg?#uploadedFileId=20272&decorator=flavi&method=SubmitForm
    ##-- deduplicate fasta --
    ##awk 'BEGIN{RS=">"}NR>1{sub("\n","\t"); gsub("\n",""); print RS$0}' RSV.fa | awk '!seen[$0]++' | awk -v OFS="\n" '{for(i=2;i<NF;i++) head = head " " $i; print $1 " " head,$NF; head = ""}'
    ##sed -e '/^>/s/$/@/' -e 's/^>/#/' RSV.fa | tr -d '\n' | tr "#" "\n" | tr "@" "\t" | sort -u -k1,1 | sed -e 's/^/>/' -e 's/\t/\n/' > RSV_dedup.fa
    #from Bio import SeqIO
    #with open('RSV_dedup.fa', 'a') as outFile:
    #    record_ids = list()
    #    for record in SeqIO.parse('RSV.fa', 'fasta'):
    #        if record.id not in record_ids:
    #            record_ids.append( record.id )
    #            SeqIO.write(record, outFile, 'fasta')
  1. mapping

    mv PP810610.1.fasta PP810610.1.fa
    ref_fa="PP810610.1.fa";
    # raw mapping
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
        bwa index ${ref_fa}; \
        bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_R1.fastq.gz trimmed/${sample}_R2.fastq.gz | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
        #for table filling using the following commands! -->3000000 \
        #bwa mem -M -t 14 ${ref_fa} ${sample}_R1.fastq.gz ${sample}_R2.fastq.gz | samtools view -bS -F 256 - > bams/${sample}_uniqmap.bam; \
    done

3: AddOrReplaceReadGroups is an IMPORTANT step; otherwise the viral-ngs step cannot run correctly

    #MODIFIED
    #default_jvm_mem_opts="-Xms512m -Xmx1g"
    #--> default_jvm_mem_opts="-Xms256g -Xmx512g"
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
    picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
    done
  1. set values in samples-*.txt before running viral-ngs

    ln -s ~/Tools/viral-ngs/Snakefile Snakefile
    ln -s  ~/Tools/viral-ngs/bin bin
    cp  ~/Tools/viral-ngs/refsel.acids refsel.acids
    cp  ~/Tools/viral-ngs/lastal.acids lastal.acids
    cp  ~/Tools/viral-ngs/config.yaml config.yaml
    cp  ~/Tools/viral-ngs/samples-runs.txt samples-runs.txt
    cp  ~/Tools/viral-ngs/samples-depletion.txt samples-depletion.txt
    cp  ~/Tools/viral-ngs/samples-metagenomics.txt samples-metagenomics.txt
    cp  ~/Tools/viral-ngs/samples-assembly.txt samples-assembly.txt
    cp  ~/Tools/viral-ngs/samples-assembly-failures.txt samples-assembly-failures.txt
    mkdir data
    cd data
    mkdir 00_raw
    cd ../..
    
    conda update --all
    
    #all bin/tools can be installed automatically via scripts:
    conda deactivate
    
    #This script installs every Tool needed by viral-ngs
    bin/easy-deploy-script/easy-deploy-viral-ngs.sh setup
    sudo apt update
    sudo apt install curl
    #environment location: /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/mc3
    #DEBUG: I installed a environment using script under the environment location: /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/mc3, how to activate the new installed env mc3?
    
    #viral-ngs parent directory found
    #Activating viral-ngs environment...
    #Miniconda installed.
    #Prepending miniconda to PATH...
    #Linking to current viral-ngs install...
    #GATK jar could not be found on this system for GATK version 3.8
    #Please activate the viral-ngs conda environment and 'gatk-register /path/to/GenomeAnalysisTK.jar'
    
    ## Initialize Conda if not already done
    #conda init
    #source ~/.bashrc  # or source ~/.zshrc
    
    # Activate the new environment
    conda activate /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
    
    # Verify the active environment
    conda env list
                          *  /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
    base                     /home/jhuang/miniconda3
    viral-ngs-env            /home/jhuang/miniconda3/envs/viral-ngs-env
    viral-ngs-env-py37       /home/jhuang/miniconda3/envs/viral-ngs-env-py37
    
    conda install -c bioconda gatk
    gatk3 -version
    #3.8-1-0-gf15c1c3ef
    
    conda list
    #conda install -c conda-forge -c bioconda -c defaults biopython pysam bmtagger picard pybedtools
    #cdhit
    mamba install -c conda-forge -c bioconda -c defaults biopython pyyaml muscle=3.8 trinity tbl2asn snpeff=4.1 megan spades last  vphaser2 blast mvicuna diamond bwa krona bmtagger gap2seq mummer mummer4 kraken fastqc mafft picard prinseq openjdk=8 python=3.6 matplotlib
    #The following tools are referenced manually via hard-coded paths and do not need to be installed: novoalign trimmomatic gatk samtools
    
    #MODIFIED
    bin/tools/spades.py          #hard-coded /usr/bin/spades.py
    bin/tools/mummer.py
    bin/tools/novoalign.py       #hard-coded Novoalign path: /home/jhuang/Tools/novocraft_v3/novoalign
    #in config.yaml
    NOVOALIGN_PATH: "/home/jhuang/Tools/novocraft_v3"
    bin/tools/trimmomatic.py     #hard-coded Trimmomatic path: /usr/local/bin/trimmomatic
    bin/tools/gatk.py: /usr/local/bin/gatk  #!!!!!!! BIG_BIG_BIG_BIG_BUG !!!!!!!: JAVA only with 1.8 + conda install openjdk=8 and /home/jhuang/Tools/SPANDx_v3.2/GenomeAnalysisTK.jar (version 3.2-2-gec30cee) + "mamba install python=3.8"
    #---> using python=3.8, NO ERROR generated in "bin/reports.py consolidate_fastqc reports/fastqc/hCoV229E_Rluc/cleaned reports/fastqc/p10_DMSO/cleaned reports/fastqc/p10_K22/cleaned reports/fastqc/p10_K7523/cleaned reports/summary.fastqc.cleaned.txt"
    bin/assembly.py              #'Seq' object has no attribute 'ungap', solution len(seg.seq.ungap('N'))-->len(seg.seq.replace("N", ""))
    bin/intrahost.py             #'Seq' object has no attribute 'ungap', solution len(seq.seq.ungap('-'))-->len(seq.seq.replace("-", ""))
    bin/reports.py
    bin/samtools.py              #hard-coded /home/jhuang/Tools/samtools-1.9/samtools
    ## Install samtools 1.9
    #wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2
    #tar -xvjf samtools-1.9.tar.bz2
    #cd samtools-1.9
    #./configure
    #make
    #sudo make install
    picard.py currently does not need to be modified! #Picard 2.20 prints a warning, but still works! ********** NOTE: Picard's command line syntax is changing.
    
    #DEBUG1: a pretty env is automatically installed: "/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env"? with bin/tools/__init__.py
    #  "prefix": "/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env",
    #  "success": true
    
    #DEBUG2: What did the following command do?
    /home/jhuang/miniconda3/bin/python /home/jhuang/miniconda3/condabin/conda remove -q -y --json -p /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env mafft[snpeff]
    
    #DEBUG3: Possibly need to manually annotate the vcf file with snpEff /home/jhuang/Tools/SPANDx_v3.2/snpEff/snpEff.jar
    #bin/interhost.py snpEff inVcf=data/04_intrahost/isnvs.vcf.gz genomes=['PP810610'] outVcf=data/04_intrahost/isnvs.annot2.vcf.gz emailAddress=j.huang@uke.de loglevel=DEBUG
    mkdir ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/data/PP810610
    cp PP810610.1.gb ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/data/PP810610/genes.gbk
    vim ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/snpEff.config
    /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/bin/snpEff build -genbank PP810610      -d
    #-t
    snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf
    
    snakemake --printshellcmds --cores 50
  2. get statistics from snakemake-output

    samtools flagstat data/02_align_to_self/hCoV229E_Rluc.mapped.bam
    samtools flagstat data/02_align_to_self/p10_DMSO.mapped.bam
    samtools flagstat data/02_align_to_self/p10_K22.mapped.bam
    samtools flagstat data/02_align_to_self/p10_K7523.mapped.bam
    
    671888 + 0 properly paired (99.65% : N/A)
    739432 + 0 properly paired (99.66% : N/A)
    496190 + 0 properly paired (99.61% : N/A)
    549030 + 0 properly paired (99.60% : N/A)
  3. generate variant_annot.xls and coverages.xls

    # -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost
    cp isnvs.annot.txt isnvs.annot_complete.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls
    #delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file.
    
    awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5
    cut -f1-4 isnvs.annot_complete.csv > f1_4
    cut -f6- isnvs.annot_complete.csv > f6_
    paste f1_4 f5 > f1_5
    paste f1_5 f6_ > isnvs_annot_complete_.txt
    cat header isnvs_annot_complete_.txt > isnvs_annot_complete__.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete__.txt -d$'\t' -o variant_annot.xls
    
    #MANUALLY generate variant_annot_0.05.csv
    
    #[OPTIONAL]: automatically generate the file variant_annot_0.05.csv
    #awk ' $5 >= 0.05 ' isnvs_annot_complete__.txt > 0.05.csv
    cut -f2-2 xP0/0.05.csv > xP0_ids
    cut -f2-2 xDMSO/0.05.csv > xDMSO_ids
    cut -f2-2 xComp28/0.05.csv > xComp28_ids
    cut -f2-2 xComp29/0.05.csv > xComp29_ids
    cut -f2-2 xComp32/0.05.csv > xComp32_ids
    cut -f2-2 xLona/0.05.csv > xLona_ids
    cat *_ids | sort -u -n > ids
    replace \n with \\t" isnvs_annot_complete__.txt >> isnvs_annot_0.01.txt\ngrep -P "MK816924\\t  in ids
    mv ids get_0.02.sh
    ~/Tools/csv2xls-0.4/csv_to_xls.py variant_annot_0.05.csv isnvs_annot_complete__.txt -d$'\t' -o variant_annot.xls
    
    # -- calculate the coverage
    samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt
    samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt
    samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt
    samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls
  4. calling variants using bengal3_ac3 (not useful)

    git clone https://github.com/huang/bacto
    mv bacto/* ./
    rm -rf bacto
    #prepare raw_data and bacto-0.1.json
    conda activate bengal3_ac3
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
    CHR     POS     REF     hCoV229E_Rluc   p10_DMSO        p10_K22 p10_K7523
    PP810610        1464    T       C       C       C       C
    PP810610        1699    C       T       T       T       T
    PP810610        6691    C       T       T       T       T
    PP810610        6919    C       G       G       G       G
    PP810610        6922    A       G       G       G       G
    PP810610        6925    G       C       C       C       C
    PP810610        7294    T       A       A       A       A
    PP810610        14472   T       C       C       C       C
    PP810610        15458   T       C       C       C       C
    PP810610        16035   C       A       A       A       A
    PP810610        17430   T       C       C       C       C
    PP810610        19289   G       G       G       T       G
    PP810610        21183   T       G       G       G       G
    PP810610        22636   T       G       G       G       G
    PP810610        23022   T       C       C       C       C
    PP810610        24781   C       T       T       T       T
    PP810610        25163   C       T       T       T       T
    PP810610        25264   C       T       T       T       T
    PP810610        26838   G       T       T       T       T
  5. calling variants using spandx (almost the same results as those from viral-ngs!)

    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
    cp PP810610.1.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build  PP810610     -d
    gzip hCoV229E_Rluc_trimmed_P_1.fastq hCoV229E_Rluc_trimmed_P_2.fastq p10_DMSO_trimmed_P_1.fastq p10_DMSO_trimmed_P_2.fastq p10_K22_trimmed_P_1.fastq p10_K22_trimmed_P_2.fastq p10_K7523_trimmed_P_1.fastq p10_K7523_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref PP810610.fasta --annotation --database PP810610 -resume
    
    ->PP810610  1492    T   A   SNP T/A T/A T/A T/A     MODIFIER
    ->PP810610  8289    C   A   SNP C/A C/A C   C/A     MODIFIER
    ->PP810610  8294    A   G   SNP A/G A   A/G A       MODIFIER
    PP810610    8376    G   T   SNP G/T G   G   G       MODIFIER
    PP810610    9146    T   C   SNP T   T   T   T/C     MODIFIER
    ->PP810610  9174    G   A   SNP G   G   G   G/A     MODIFIER
    PP810610    10145   A   G   SNP A   A   A   A/G     MODIFIER
    ->PP810610  10239   T   G   SNP T   T   T/G T       MODIFIER
    ->PP810610  10310   G   A   SNP G   G   G   G/A     MODIFIER
    ->PP810610  10871   C   T   SNP C   C/T T   C/T     MODIFIER
    ->PP810610  10898   G   A   SNP G   G/A G   G/A     MODIFIER
    ->PP810610  11577   A   C   SNP A   A/C A   A       MODIFIER
    PP810610    18640   T   G   SNP T   T   T   T/G     MODIFIER
    ->PP810610  18646   C   T   SNP C   C   C   C/T     MODIFIER
    PP810610    18701   A   G   SNP A   A   A   A/G     MODIFIER
    PP810610    19028   C   T   SNP C   C   C   C/T     MODIFIER
    PP810610    19289   G   T   SNP G   G   T   G       MODIFIER
    -->PP810610 21027   C   T   SNP C   C/T C   C/T     MODIFIER
    ->PP810610  21633   T   C   SNP T   T/C T   T       MODIFIER
    ->PP810610  22215   T   G   SNP T   T   T   T/G     MODIFIER
    ->PP810610  23435   C   T   SNP C   C   T   C/T     MODIFIER
    PP810610    24738   C   *,A SNP C   C   */A C/A     MODIFIER
    PP810610    25025   C   T   SNP C   C/T C   C       MODIFIER
    ->PP810610  25592   T   C   SNP T   T/C T   T       MODIFIER
  6. Consensus sequences of each and of all isolates

    cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa
    cp data/02_assembly/*.fasta ./
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
    mv ${sample}.fasta ${sample}.fa
    cat all.fa ${sample}.fa >> all.fa
    done
    
    cat RSV_dedup.fa all.fa > RSV_all.fa
    mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln
    snp-sites RSV_all.aln -o RSV_all_.aln

Copy a Conda environment from one computer to another

1, Export the Conda Environment

On the original computer, you can export the environment to a YAML file. This file will contain all the necessary details about the environment, including the packages and versions.

Open a terminal and activate the environment you want to copy:

    conda activate bengal3_ac3
    conda env export > bengal3_ac3.yml
    conda activate spandx
    conda env export > spandx.yml
    conda activate chipseq
    conda env export > chipseq.yml
    conda activate rnaseq
    conda env export > rnaseq.yml
    conda activate homer
    conda env export > homer.yml
    conda activate qiime1
    conda env export > qiime1.yml
    conda activate trycycler
    conda env export > trycycler.yml
    conda activate vrap
    conda env export > vrap.yml
    #--------
    conda activate chipseq2
    conda env export > chipseq2.yml
    conda activate r
    conda env export > r.yml
    conda activate rnaseq_2021
    conda env export > rnaseq_2021.yml
    conda activate rnaseq_old
    conda env export > rnaseq_old.yml
    conda activate snippy_env
    conda env export > snippy_env.yml
    conda activate spandx2
    conda env export > spandx2.yml
    #--------
    conda activate rnaseq
    conda env export > rnaseq_sage.yml
    conda activate rnaseq2
    conda env export > rnaseq2_sage.yml
    #--------
    TODO: export the yml files from notebook

2, Transfer the YAML File

Copy the exported .yml files from the original computer to the new computer. You can use methods such as USB drives, cloud storage, or direct file transfer over the network (e.g., using SCP or rsync if both machines are accessible over a network):

    rsync -a -P jhuang@hamm:/home/jhuang/*.yml ./

3, Create the Environment on the New Computer

    # Recreate every exported environment from its YAML file.
    #or, e.g. for spandx: conda env update --name spandx --file spandx.yml
    for env in bengal3_ac3 spandx chipseq rnaseq homer qiime1 trycycler vrap; do
        conda env create -f ${env}.yml
    done

    # -- for viral variant calling
    /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
    viral-ngs
    # -- The results can be confirmed by the bacterial variant calling pipelines
    spandx
    bengal3_ac3  # for the env, we need snakemake_4_3_1.
        conda update -n base conda
        conda create -n snakemake_4_3_1 snakemake=4.3.1
        /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds

Tn-seq analysis pipeline (improved2)

tnseq_principle

circos

  1. Overview of Data Processing Procedure

    1. Convert .fastq files to .fasta format (.reads).
    
    "AGCTTCAGGGTTGAGATGTGTATAAGAGACAG", allowed a mismatch of 1 nt
    2. Identify reads with the transposon prefix in R1. The sequence searched for is "AGCTTCAGGGTTGAGATGTGTATAAGAGACAG", allowing a mismatch of 1 nt, which must start between cycles 5 and 10 (inclusive). (Note that this ends in the canonical terminus of the Himar1 transposon, TGTTA.) The “staggered” position of this sequence is due to the insertion of a few nucleotides of variable length in the primers used in the Tn-Seq sample prep protocol (e.g. 4 variants of Sol_AP1_57, etc.). The number of mismatches allowed in searching reads for the transposon sequence pattern can be adjusted as an option in the interface; the default is 1.
    
    #What are TACCACGACCA?
    3. Extract genomic part of read 1. This is the suffix following the transposon sequence pattern above. However, for reads coming from fragments shorter than the read length, the adapter might appear at the other end of R1, TACCACGACCA. If so, the adapter suffix is stripped off. (These are referred to as “truncated” reads, but they can still be mapped into the genome just fine by BWA.) The length of the genomic part must be at least 20 bp.
    
    3. Extract barcodes from read 2. Read 2 is searched for “GATGGCCGGTGGATTTGTGnnnnnnnnnnTGGTCGTGGTAT”. The length of the barcode is typically 10 bp, but can be variable, and must be between 5-15 bp.
    
    4. Extract genomic portions of read 2. This is the part following TGGTCGTGGTAT…. It is often the whole suffix of the read. However, if the read comes from a short DNA fragment that is shorter than the read length, the adapter on the other end might appear, in which case it is stripped off, leaving the nucleotides in the middle that represent the genomic insert: TGGTCGTGGTATxxxxxxxTAACAGGTTGGCTGATAAG. The insert must be at least 20 bp long (inserts shorter than this are discarded, as they might map to spurious locations in the genome).
    
    5. Map genomic parts of R1 and R2 into the genome using BWA. Mismatches are allowed, but indels are ignored. No trimming is performed. BWA is run in ‘sampe’ mode (treating reads as pairs). Both reads of a pair must map (on opposite strands) to be counted.
    
    6. Count the reads mapping to each TA site in the reference genome (or all sites for Tn5).
    
    7. Reduce raw read counts to unique template counts. Group reads by barcode AND mapping location of read 2 (aka fragment “endpoints”).
    
    8. Output template counts at each TA site in a .wig file.
    
    9. Calculate statistics like insertion_density and NZ_mean. Look for the site with the max template count. Look for reads matching the primer or vector sequences.
  2. quality control

    ./240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr13/LB_culture_a_2_S7_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr13/LB_culture_a_2_S7_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr15/growthout_control_24h_a_2_S8_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr15/growthout_control_24h_a_2_S8_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr17/extracellular_mutants_24h_a_2_S9_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr17/extracellular_mutants_24h_a_2_S9_R2_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R1_001.fastq.gz
    ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R2_001.fastq.gz
    
    #from fastqc results of initial_mutants
    49821406
    35-161
    49821406
    35-161
    
    https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=1194086
    https://www.ncbi.nlm.nih.gov/Traces/wgs/AKKR01?
    https://www.ncbi.nlm.nih.gov/Traces/wgs/AKKR01?display=download
    #Found 4,131 proteins
  3. modify the tpp scripts

    vim ~/.local/lib/python3.10/site-packages/pytpp/tpp_tools.py
    #search for "DEBUG"
    #-maxreads 10000 or not_given for take all!
    #-primer AGATGTGTATAAGAGACAG     the default primer of Tn5 is TAAGAGACAG!
    #-primer-start-window 0,159  set 0,159 as default!
    #delete the code that imports the barcode-file, since the files are already demultiplexed!
    #  pattern for read 2...
    #    TAGTGGATGATGGCCGGTGGATTTGTG GTAATTACCA TGGTCGTGGTAT CCCAGCGCGACTTCTTCGGCGCACACACC TAACAGGTTGGCTGATAAGTCCCCG?AGAT AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGT
    #    -----const1---------------- --barcode- ---const2--- ------genomic---------------- ------const3--------------------------------------------------------------
  4. run Transposon Position Profiling (TPP) on multiple contigs

    #https://transit.readthedocs.io/en/latest/transit_running.html;
    #https://orca2.tamu.edu/tom/iLab.html
    
    # Break-down of total reads (49821406):
    #  29481783 reads (59.2%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20339623):
    
    #primer AGCTTCAGGGTTGAGATGTGTATAAGAGACAG
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA
    
    #kr11.trimmed1_failed_trim    22072406
    #-rw-rw-r-- 1 jhuang jhuang  2,7G Jun 11 15:44 kr11.trimmed1    20339623
    #-rw-rw-r-- 1 jhuang jhuang  2,9G Jun 11 15:46 kr11.trimmed2    20339623
    #cat initial_mutants_a_2_S6_R1_001.fastq | echo $((`wc -l` / 4))  49821406=29481783 reads (59.2%) + 20339623 = 49821406
    
    #vs.
    
    # Break-down of total reads (49821406):
    #  29356859 reads (58.9%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20464547):
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA
    # read_length: 100 bp
    # mean_R1_genomic_length: 73.4 bp
    # mean_R2_genomic_length: 86.4 bp
    
    #conda deactivate
    ##Test AGCTTCAGGGTTGAGATGTGTATAAGAGACAG --> TAAGAGACAG, the results are similar!
    # Note that "-primer-start-window 0,161" is invalid and cannot replace the default value!
    #python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m_.fasta -reads1 240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R1_001.fastq.gz -reads2 240606_VH00358_96_AAFCFJGM5/kr11/initial_mutants_a_2_S6_R2_001.fastq.gz -output kr11_10nt_primer -primer TAAGAGACAG -mismatches 1 -bwa-alg mem -replicon-ids contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    #mv tpp.cfg kr11_10nt_primer_tpp.cfg
    
    # Replicon (contig) IDs passed to -replicon-ids; the same list is used for all five samples.
    REPLICONS=contig_2_10,contig_2_9,contig_2_8,contig_2_7,contig_2_6,contig_2_5,contig_2_3,contig_2_2,contig_5_10,contig_5_11,contig_5_12,contig_5_13,contig_5_15,contig_5_16,contig_5_17,contig_5_18,contig_5_9,contig_5_8,contig_5_7,contig_5_6,contig_5_5,contig_5_4,contig_5_3,contig_5_2,contig_5_1,contig_4_2,contig_4_1,contig_3_59,contig_3_58,contig_3_57,contig_3_56,contig_3_55,contig_3_54,contig_3_53,contig_3_52,contig_3_51,contig_3_50,contig_3_49,contig_3_48,contig_3_47,contig_3_46,contig_3_44,contig_3_43,contig_3_42,contig_3_41,contig_3_40,contig_3_39,contig_3_38,contig_3_37,contig_3_36,contig_3_35,contig_3_34,contig_3_33,contig_3_32,contig_3_31,contig_3_30,contig_3_29,contig_3_28,contig_3_27,contig_3_26,contig_3_25,contig_3_24,contig_3_23,contig_3_22,contig_3_21,contig_3_20,contig_3_17,contig_3_16,contig_3_15,contig_3_14,contig_3_13,contig_3_12,contig_3_11,contig_3_9,contig_3_8,contig_3_7,contig_3_6,contig_3_5,contig_3_3,contig_3_2,contig_3_1,contig_1_48,contig_1_47,contig_1_46,contig_1_45,contig_1_44,contig_1_43,contig_1_42,contig_1_41,contig_1_40,contig_1_39,contig_1_38,contig_1_37,contig_1_34,contig_1_33,contig_1_32,contig_1_31,contig_1_28,contig_1_27,contig_1_26,contig_1_25,contig_1_24,contig_1_22,contig_1_20,contig_1_19,contig_1_18,contig_1_17,contig_1_16,contig_1_15,contig_1_14,contig_1_13,contig_1_12,contig_1_10,contig_1_8,contig_1_7,contig_1_6,contig_1_5,contig_1_4,contig_1_3,contig_1_2,contig_1_1,contig_C8715,contig_C8943,contig_C9371,contig_C8939,contig_C9357,contig_C8991,contig_C9445,contig_C8689
    
    # One TPP run per sample; each entry is <run folder>:<sample name>:<S-number in the FASTQ name>.
    for entry in kr11:initial_mutants:S6 kr13:LB_culture:S7 kr15:growthout_control_24h:S8 kr17:extracellular_mutants_24h:S9 kr19:intracellular_mutants_24h:S10; do
        IFS=: read -r run sample snum <<< "${entry}"
        prefix="./240606_VH00358_96_AAFCFJGM5/${run}/${sample}_a_2_${snum}"
        # tpp.cfg is renamed only when tpp succeeded, so a stale tpp.cfg from an
        # earlier run is never saved under the wrong sample name.
        python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref WA-314_m_.fasta -reads1 "${prefix}_R1_001.fastq.gz" -reads2 "${prefix}_R2_001.fastq.gz" -output "${sample}" -primer AGCTTCAGGGTTGAGATGTGTATAAGAGACAG -mismatches 1 -bwa-alg mem -replicon-ids "${REPLICONS}" \
            && mv tpp.cfg "${sample}_tpp.cfg"
    done
    
    #reported for initial_mutants:
    #primer_start_window 0,159
    # Break-down of total reads (49821406):
    #  29481783 reads (59.2%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20339623):
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA
  5. generate statistics tables in Excel format from the multi-contig runs.

    # Prints (does not execute) the post-processing commands for each sample:
    # copy the .tn_stats file, rename two statistics labels with sed, convert the
    # table to .xlsx with parse_tn_stats.py, and collect/zip the wig files.
    # The echoed "#..." lines are reminders for manual steps (trimming the copied
    # file, summing columns in the spreadsheet).
    # NOTE(review): each printed list starts with "cd ${sample}" but never changes
    # back, so return to the parent directory before running the next sample's list.
    for sample in initial_mutants LB_culture growthout_control_24h extracellular_mutants_24h intracellular_mutants_24h; do
        echo "cd ${sample}"
        echo "cp ${sample}.tn_stats ${sample}.tn_stats_"
        echo "#Delete all general statistics before the table data in ${sample}.tn_stats_; delete the content after \"# FR_corr (Fwd templates vs. Rev templates):\""
        echo "sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_"
        echo "sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_"
        echo "python3 ~/Scripts/parse_tn_stats.py ${sample}.tn_stats_ ${sample}.tn_stats.xlsx"
        echo "#calculate the sum of the first and second columns by \"=SUM(B2:B130)\" and \"=SUM(C2:C130)\""
        echo "mkdir ${sample}_wig"
        echo "mv *.wig ${sample}_wig/"
        echo "zip -r ${sample}_wig.zip ${sample}_wig/"
    done
    
    # Per-sample conversion of the TPP statistics into an Excel table: copy the
    # .tn_stats file, trim it by hand, rename two labels with sed, then run
    # parse_tn_stats.py. The numbers after each stanza are the recorded column sums.
    # NOTE(review): every stanza begins with "cd <sample>" and none returns to the
    # parent directory; run "cd .." between stanzas or confirm the directory layout.
    cd initial_mutants
    cp initial_mutants.tn_stats initial_mutants.tn_stats_
    #Delete all general statistics before the table data in initial_mutants.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    python3 ~/Scripts/parse_tn_stats.py initial_mutants.tn_stats_ initial_mutants.tn_stats.xlsx
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"
    #16,228,513 and 2,454,346
    
    cd LB_culture
    cp LB_culture.tn_stats LB_culture.tn_stats_
    #Delete all general statistics before the table data in LB_culture.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    python3 ~/Scripts/parse_tn_stats.py LB_culture.tn_stats_ LB_culture.tn_stats.xlsx
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"
    #19735541, 3266320
    
    cd growthout_control_24h
    cp growthout_control_24h.tn_stats growthout_control_24h.tn_stats_
    #Delete all general statistics before the table data in growthout_control_24h.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    python3 ~/Scripts/parse_tn_stats.py growthout_control_24h.tn_stats_ growthout_control_24h.tn_stats.xlsx
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"
    #23812866, 3487969
    
    cd extracellular_mutants_24h
    cp extracellular_mutants_24h.tn_stats extracellular_mutants_24h.tn_stats_
    #Delete all general statistics before the table data in extracellular_mutants_24h.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    python3 ~/Scripts/parse_tn_stats.py extracellular_mutants_24h.tn_stats_ extracellular_mutants_24h.tn_stats.xlsx
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"
    #6491071, 236041
    
    cd intracellular_mutants_24h
    cp intracellular_mutants_24h.tn_stats intracellular_mutants_24h.tn_stats_
    #Delete all general statistics before the table data in intracellular_mutants_24h.tn_stats_; delete the content after "# FR_corr (Fwd templates vs. Rev templates):"
    sed -i 's/read_count (TA sites only, for Himar1)/read_counts/g' *.tn_stats_
    sed -i 's/NZ_mean (among templates)/NZ_mean (mean template count over non-zero TA sites)/g' *.tn_stats_
    python3 ~/Scripts/parse_tn_stats.py intracellular_mutants_24h.tn_stats_ intracellular_mutants_24h.tn_stats.xlsx
    #calculate the sum of the first and second columns by "=SUM(B2:B130)" and "=SUM(C2:C130)"
    #20619934, 1402849
    
    # For each sample: move the wig files into <sample>_wig/, run the sample-specific
    # update_wig_<sample>.py script from inside that folder, then zip the folder.
    for sample in initial_mutants LB_culture growthout_control_24h extracellular_mutants_24h intracellular_mutants_24h; do
        mkdir ${sample}_wig
        mv *.wig ${sample}_wig/
        cd ${sample}_wig/
        python3 ~/Scripts/update_wig_${sample}.py
        cd ..
        zip -r ${sample}_wig.zip ${sample}_wig/
    done
    
    zip -r genbank_files.zip genbank_files
  6. run Transposon Position Profiling (TPP) on merged_genome

    # One TPP run per sample against merged_genome.fasta (no -replicon-ids is passed);
    # each entry is <run folder>:<sample name>:<S-number in the FASTQ name>.
    for entry in kr11:initial_mutants:S6 kr13:LB_culture:S7 kr15:growthout_control_24h:S8 kr17:extracellular_mutants_24h:S9 kr19:intracellular_mutants_24h:S10; do
        IFS=: read -r run sample snum <<< "${entry}"
        prefix="./240606_VH00358_96_AAFCFJGM5/${run}/${sample}_a_2_${snum}"
        # tpp.cfg is renamed only when tpp succeeded, so a stale tpp.cfg from an
        # earlier run is never saved under the wrong sample name.
        python3 ~/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref merged_genome.fasta -reads1 "${prefix}_R1_001.fastq.gz" -reads2 "${prefix}_R2_001.fastq.gz" -output "${sample}_run2" -primer AGCTTCAGGGTTGAGATGTGTATAAGAGACAG -mismatches 1 -bwa-alg mem \
            && mv tpp.cfg "${sample}_tpp_run2.cfg"
    done
    
    # "=SUM(B2:B130)" 20619934; "=SUM(C2:C130)" 1402849
    # command: python /home/jhuang/.local/bin/tpp -bwa /usr/bin/bwa -protocol Tn5 -ref merged_genome.fasta -reads1 ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R1_001.fastq.gz -reads2 ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R2_001.fastq.gz -output intracellular_mutants_24h_run2 -primer AGCTTCAGGGTTGAGATGTGTATAAGAGACAG -mismatches 1 -bwa-alg mem
    # transposon type: Tn5
    # protocol type: Tn5
    # bwa flags:
    # read1: ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R1_001.fastq
    # read2: ./240606_VH00358_96_AAFCFJGM5/kr19/intracellular_mutants_24h_a_2_S10_R2_001.fastq
    # ref_genome: merged_genome.fasta
    # replicon_ids:
    # total_reads (or read pairs): 51244639
    # truncated_reads 0 (genomic inserts shorter than the read length; ADAP2 appears in read1)
    # trimmed_reads (reads with valid Tn prefix, and insert size>20bp): 23204461 *
    # reads1_mapped: 20987078
    # reads2_mapped: 20915836
    # mapped_reads (both R1 and R2 map into genome, and R2 has a proper barcode): 20627374 * (since the barcode check is deleted, this only filters out the records in which R2 does not contain bacterial genome sequence!)
    # read_count (TA sites only, for Himar1): 20627374
    # template_count: 1405508
    # template_ratio (reads per template): 14.68
    # TA_sites: 4537463
    # TAs_hit: 93859
    # density: 0.021
    # max_count (among templates): 285
    # max_site (coordinate): 31693
    # NZ_mean (among templates): 15.0
    # FR_corr (Fwd templates vs. Rev templates): 0.003
    # BC_corr (reads vs. templates, summed over both strands): 0.919
    # Break-down of total reads (51244639):
    #  28040178 reads (54.7%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (23204461):
    #  primer_matches: 0 reads (0.0%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)
    #  vector_matches: 0 reads (0.0%) contain CTAGACCGTCCAGTCTGGCAGGCCGGAAAC (phiMycoMarT7)
    #  adapter_matches: 0 reads (0.0%) contain GATCGGAAGAGCACACGTCTGAACTCCAGTCAC (Illumina/TruSeq index)
    #  misprimed_reads: 0 reads (0.0%) contain Himar1 prefix but don't end in TGTTA
    # read_length: 130 bp
    # mean_R1_genomic_length: 75.6 bp
    # mean_R2_genomic_length: 88.5 bp
    
    ./initial_mutants.tn_stats
    # Break-down of total reads (49821406):
    #  29481783 reads (59.2%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (20339623): --> #16,228,513 and 2,454,346
    
    ./LB_culture.tn_stats
    # Break-down of total reads (43486192):
    #  20855173 reads (48.0%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (22631019): --> #19,735,541 and 3,266,320
    
    ./growthout_control_24h.tn_stats
    # Break-down of total reads (70663823):
    #  43886543 reads (62.1%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (26777280):
    
    ./extracellular_mutants_24h.tn_stats
    # Break-down of total reads (47473664):
    #  38115004 reads (80.3%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (9358660):
    
    ./intracellular_mutants_24h.tn_stats
    # Break-down of total reads (51244639):
    #  28040178 reads (54.7%) lack the expected Tn prefix
    # Break-down of trimmed reads with valid Tn prefix (23204461):
    
    #grep "AGCTTCAGGGTTGAGATGTGTATAAGAGACAG" intracellular_mutants_24h_a_2_S10_R1_001.fastq | wc -l
    #28404198
    #grep "AGCTTCAGGGTTGAGATGTGTATAAGAGACAG" intracellular_mutants_24h_a_2_S10_R2_001.fastq | wc -l
    #29
    
    #AGCTTCAGGGTTGAGATGTGTATAAGAGACAG
    #NOTE_IMPORTANT: explain that some multi-mapped reads have to be deleted for the downstream analysis!
  7. Prepare the wig files on merged_genome for transit running

    #change all wigs title to WA314
    #./initial_mutants_run2.wig
    #./LB_culture_run2.wig
    #./growthout_control_24h_run2.wig
    #./intracellular_mutants_24h_run2.wig
    #./extracellular_mutants_24h_run2.wig
    sed -i 's/chrom=merged_genome/chrom=WA314/g' *_run2.wig
  8. Prepare the sample-metadata file for transit running

    #my
    ID  Condition   Treatment   Filename
    initial_mutants initial_mutants control initial_mutants_run2.wig
    LB_culture  LB_culture  control LB_culture_run2.wig
    growthout_control_24h   growthout_control_24h   control growthout_control_24h_run2.wig
    intracellular_mutants_24h   intracellular_mutants_24h   treated intracellular_mutants_24h_run2.wig
    extracellular_mutants_24h   extracellular_mutants_24h   treated extracellular_mutants_24h_run2.wig
    
    #Doc
    Id      Condition    Filename
    glyc1   glycerol     /Users/example_data/glycerol_rep1.wig
    glyc2   glycerol     /Users/example_data/glycerol_rep2.wig
    chol1   cholesterol  /Users/example_data/cholesterol_rep1.wig
    chol2   cholesterol  /Users/example_data/cholesterol_rep2.wig
    chol3   cholesterol  /Users/example_data/cholesterol_rep3.wig
  9. Run Transit on merged_genome

    #https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/tnseq/tutorial.html#compare-the-essential-genes-between-two-conditions
    
    #File --> Export --> combined_wig, or IGV, or Mean_Gene_Counts.
    #     --> Convert --> ...... [1]
    #View --> Scatter_Plot (only two datasets are allowed)
    #     --> Track_View
    #     --> Quality_Control
    #Analysis
    --> Himar1_Methods
    
        * gumbel
        * resampling
        * hmm
        * example
        * binomial
        * griffin
        * randproduct
        * utest
        * gi
        * normalize
        * tnseq_stats
        * corrplot
        * heatmap
        * ttnfitness
    
    --> Tn5_Methods
    
        * resampling: Resampling test of conditional essentiality between two conditions
        transit resampling -c combined.wig samples.metadata LB_culture intracellular_mutants_24h merged_genome.prot_table resampling_results_test.txt -s 10000 -n TTR -h -a -l -winz
    
            [resampling] site_restricted=False
            [resampling] Starting resampling Method
            [resampling] Winsorizing insertion counts
            [resampling] Getting Data
            Reading combined wig data...
            Filtering wigs by conditions...
            Checking condition: initial_mutants, included_conditions: ['lb_culture', 'intracellular_mutants_24h']
            Checking condition: LB_culture, included_conditions: ['lb_culture', 'intracellular_mutants_24h']
            Checking condition: growthout_control_24h, included_conditions: ['lb_culture', 'intracellular_mutants_24h']
            Checking condition: intracellular_mutants_24h, included_conditions: ['lb_culture', 'intracellular_mutants_24h']
            Checking condition: extracellular_mutants_24h, included_conditions: ['lb_culture', 'intracellular_mutants_24h']
            ['LB_culture' 'intracellular_mutants_24h']
            Creating data_ctrl and data_exp arrays...
            Shapes of data_ctrl and data_exp:
            data_ctrl.shape: (1, 4537463)
            data_exp.shape: (1, 4537463)
            [resampling] Preprocessing Ctrl data...
            [resampling] Normalizing using: TTR
            [resampling] Performing LOESS Correction
            /home/jhuang/.local/lib/python3.10/site-packages/pytransit/stat_tools.py:453: RuntimeWarning: invalid value encountered in divide
            normalized_Y[window*i:window*(i+1)] = Y[window*i:window*(i+1)] / (ysmooth[i]/mline)
            [resampling] Preprocessing Exp data...
            [resampling] Normalizing using: TTR
            [resampling] Performing LOESS Correction
            Creating Genes objects...
            Running resampling...
            [resampling] Running Resampling Method... 100.0%
            [resampling] Performing Benjamini-Hochberg Correction
            Writing output...
            [resampling] Number of significant conditionally essential genes (Padj<0.05): 37
            [resampling] Time: 748.52s
            [resampling] Finished resampling Method
    
        #Resampling
        #Console: python3 /home/jhuang/.local/bin/transit resampling -c combined.wig samples.metadata LB_culture intracellular_mutants_24h merged_genome.prot_table resampling_results.txt -s 10000 -n TTR -h -a -l -winz
        #Parameters: samples=10000, norm=TTR, histograms=True, adaptive=True, excludeZeros=False, pseudocounts=1.0, LOESS=True, trim_Nterm=0.0, trim_Cterm=0.0, site_restricted=False, winsorize=True
        #Control Data: b'lb_culture'
        #Experimental Data: b'intracellular_mutants_24h'
        #Annotation path: b'merged_genome.prot_table'
        #Number of significant conditionally essential genes (Padj<0.05): 37
    
    * utest: Mann-Whitney U-test of conditional essentiality between two conditions. This is a method for comparing datasets from a TnSeq library evaluated in two different conditions, analogous to resampling.
    transit utest LB_culture_run2.wig intracellular_mutants_24h_run2.wig merged_genome.prot_table utest_out -n TTR -l
            [utest] Starting Mann-Whitney U-test Method
            [utest] Getting Data
            [utest] Normalizing using: TTR
            [utest] Performing LOESS Correction
            /home/jhuang/.local/lib/python3.10/site-packages/pytransit/stat_tools.py:453: RuntimeWarning: invalid value encountered in divide
            normalized_Y[window*i:window*(i+1)] = Y[window*i:window*(i+1)] / (ysmooth[i]/mline)
            [utest] Running Mann-Whitney U-test Method... 100.0%
            [utest] Performing Benjamini-Hochberg Correction
            [utest] Adding File: utest_out_l
            [utest] Finished Mann-Whitney U-test Method
    transit utest LB_culture_run2.wig intracellular_mutants_24h_run2.wig merged_genome.prot_table utest_out_without_l -n TTR
            [utest] Starting Mann-Whitney U-test Method
            [utest] Getting Data
            [utest] Normalizing using: TTR
            [utest] Running Mann-Whitney U-test Method... 100.0%
            [utest] Performing Benjamini-Hochberg Correction
            [utest] Adding File: utest_out
            [utest] Finished Mann-Whitney U-test Method
    
            #utest
            #Console: python3 /home/jhuang/.local/bin/transit utest LB_culture_run2.wig intracellular_mutants_24h_run2.wig merged_genome.prot_table utest_out -n TTR -l
            #Control Data: b'LB_culture_run2.wig'
            #Experimental Data: b'intracellular_mutants_24h_run2.wig'
            #Annotation path: b'merged_genome.prot_table'
            #Time: 56.52693510055542
    
            #-l              :=  Perform LOESS Correction; Helps remove possible genomic position bias. Default: Turned Off.
    
    * ZINB (command line only, If you want to compare more than two conditions, see ZINB.): The ZINB (Zero-Inflated Negative Binomial) method is used to determine which genes exhibit statistically significant variability across multiple conditions, in either the magnitude of insertion counts or local saturation, agnostically (in any one condition compared to the others). Like ANOVA, the ZINB method takes a combined_wig file (which combines multiple datasets in one file) and a samples_metadata file (which describes which samples/replicates belong to which experimental conditions).
        transit zinb combined.wig samples.metadata merged_genome.prot_table zinb_out -n TTR --condition Condition --include-conditions LB_culture,intracellular_mutants_24h
        #grep "not analyzed" zinb_out | wc -l  #WARNING: The run completed successfully, but 4097 records were not analyzed!
    #TODO: R is called by Transit for certain commands, such as ZINB, corrplot, and heatmap.
    #install R (tested on v3.5.2)
    #R packages: MASS, pscl, corrplot, gplots (run the following commands in R):
        install.packages(c("MASS", "pscl", "corrplot", "gplots"))
        install.packages("remotes")
        remotes::install_version("MASS", version = "7.3-60")
    #Python packages (for python3): rpy2 (v>=3.0) (run "pip3 install rpy2" on the command line)
    
    ** ANOVA (command line only): The Anova (Analysis of variance) method is used to determine which genes exhibit statistically significant variability of insertion counts across multiple conditions. Unlike other methods which take a comma-separated list of wig files as input, the method takes a combined_wig file (which combines multiple datasets in one file) and a samples_metadata file (which describes which samples/replicates belong to which experimental conditions).
    transit anova combined.wig samples.metadata merged_genome.prot_table anova_out -n TTR --include-conditions LB_culture,intracellular_mutants_24h --ref LB_culture -PC 5 -alpha 1000 -winz
        [anova] Starting Anova analysis
        [anova] Getting Data
        [anova] Normalizing using: TTR
        [anova] Winsorizing insertion counts
        [anova] Running Anova
        /home/jhuang/.local/lib/python3.10/site-packages/scipy/stats/_axis_nan_policy.py:531: ConstantInputWarning: Each of the input arrays is constant; the F statistic is not defined or infinite
        res = hypotest_fun_out(*samples, **kwds)
        [anova] Adding File: anova_out. 100.0%
        [anova] Finished Anova analysis
        [anova] Time: 105.4s
        #Console: python3 /home/jhuang/.local/bin/transit anova combined.wig samples.metadata merged_genome.prot_table anova_out -n TTR --include-conditions LB_culture,intracellular_mutants_24h --ref LB_culture -PC 5 -alpha 1000 -winz
        #parameters: normalization=TTR, trimming=0.0/0.0% (N/C), pseudocounts=5, alpha=1000.0
    
        #--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
    transit anova combined.wig samples.metadata merged_genome.prot_table anova_5samples_ref_LB_culture_out -n TTR --include-conditions initial_mutants,LB_culture,growthout_control_24h,intracellular_mutants_24h,extracellular_mutants_24h --ref LB_culture -PC 5 -alpha 1000 -winz
    #TODO_MERGE: merge combined.wig and combined_normalized.wig into an Excel file as the per-gene read counts!
    #Rv Gene TAs Mean_initial_mutants Mean_LB_culture Mean_growthout_control_24h Mean_intracellular_mutants_24h Mean_extracellular_mutants_24h LFC_initial_mutants LFC_LB_culture LFC_growthout_control_24h LFC_intracellular_mutants_24h LFC_extracellular_mutants_24h Fstat Pval Padj status
    Orf         Gene ID.
    Name        Name of the gene.
    TAs         Number of TA sites in Gene
    Means…      Mean readcounts for each condition
    LFCs…       Log-fold-changes of counts in each condition vs mean across all conditions
    MSR         Mean-squared residual
    MSE+alpha   Mean-squared error, plus moderation value
    p-value     P-value calculated by the Anova test.
    p-adj       Adjusted p-value controlling for the FDR (Benjamini-Hochberg)
    status      Debug information (If any)
    transit example initial_mutants_run2.wig merged_genome.prot_table initial_mutants_mean_read-counts_per_gene.txt
    #TODO: MERGE anova+example together, delete all headers of the results, save as the Excel-file!
#Console: python3 /home/jhuang/.local/bin/transit anova combined.wig 5samples.metadata merged_genome.prot_table anova_out -n TTR –include-conditions LB_culture,intracellular_mutants_24h –ref LB_culture -PC 5 -alpha 1000 -winz #parameters: normalization=TTR, trimming=0.0/0.0% (N/C), pseudocounts=5, alpha=1000.0 #Rv Gene TAs Mean_LB_culture Mean_intracellular_mutants_24h LFC_LB_culture LFC_intracellular_mutants_24h MSR MSE+alpha Fstat Pval Padj status YWA314_00005 307 185.55 381.95 0.000 1.022 5434118.726573 4314598.914305 1.259473 0.262191 1.000000 – YWA314_00010 243 44.93 32.98 0.000 -0.395 7853.462976 476245.960914 0.016490 0.897874 1.000000 – YWA314_00015 103 0.00 0.00 0.000 0.000 0.000000 0.000000 -1.000000 1.000000 1.000000 No counts in all conditions YWA314_00020 190 1.67 0.00 0.000 -0.415 263.556577 1131.081049 0.233013 0.629578 1.000000 – # —- commands for one sample —- ** normalize: Normalization method: python transit.py norm glycerol_H37Rv_rep1.wig,glycerol_H37Rv_rep2.wig H37Rv.prot_table glycerol_TTR.txt -n TTR – TTR: Trimmed Total Reads (TTR), normalized by the total read-counts (like totreads), but trims top and bottom 5% of read-counts. This is the recommended normalization method for most cases, as it has the benefit of compensating for differences in saturation (which is especially important for resampling). – nzmean: Normalizes datasets to have the same mean over the non-zero sites. – totreads: Normalizes datasets by total read-counts, and scales them to have the same mean over all counts. – zinfnb: Fits a zero-inflated negative binomial model, and then divides read-counts by the mean. The zero-inflated negative binomial model will treat some empty sites as belonging to the “true” negative binomial distribution responsible for read-counts while treating the others as “essential” (and thus not influencing its parameters). – quantile: Normalizes datasets using the quantile normalization method described by Bolstad et al. (2003). 
In this normalization procedure, datasets are sorted, an empirical distribution is estimated as the mean across the sorted datasets at each site, and then the original (unsorted) datasets are assigned values from the empirical distribution based on their quantiles. This actually doesn’t work well on TnSeq data if a large fraction of TA sites have counts of 0 (ties). – betageom: Normalizes the datasets to fit an “ideal” Geometric distribution with a variable probability parameter p. Specially useful for datasets that contain a large skew. – nonorm: No normalization is performed. ** tnseq_stats: Statistical Metrics for TnSeq datasets #Typically a skew < 50 is desired #total_cts Sum of total read-counts in the sample. Indicates how much sequencing material was obtained. Typically >1M reads is desired for Himar1 datasets. transit tnseq_stats -c combined.wig -o tnseq_stats dataset density mean_ct NZmean NZmedian max_ct total_cts skewness kurtosis pickands_tail_index initial_mutants_run2.wig 0.025 0.5 21.7 13 306.0 2458212 2.3 7.1 -0.078 LB_culture_run2.wig 0.025 0.7 29.0 19 307.0 3270725 1.9 4.8 -0.160 growthout_control_24h_run2.wig 0.024 0.8 31.9 22 333.0 3492192 1.8 4.2 -0.082 intracellular_mutants_24h_run2.wig 0.021 0.3 15.0 6 285.0 1405508 3.0 11.3 0.080 extracellular_mutants_24h_run2.wig 0.011 0.1 4.7 3 159.0 236640 5.2 66.9 0.585 * example: Example method that calculates mean read-counts per gene. * transit example initial_mutants_run2.wig,LB_culture_run2.wig growthout_control_24h_run2.wig intracellular_mutants_24h_run2.wig extracellular_mutants_24h_run2.wig merged_genome.prot_table mean_read-counts_per_gene.txt #–>ERROR! 
#Orf Name Desc k n mean nzmean YWA314_00005 IS1329 transposase A 15 307 1.11 22.80 YWA314_00010 transposase B 7 243 0.30 10.57 YWA314_00015 hypothetical protein 0 103 0.00 0.00 YWA314_00020 phage protein 2 190 0.02 2.00 YWA314_00025 putative phage endopeptidase 1 418 0.05 20.00 * rankproduct: Rank product test for determining conditional essentiality. transit rankproduct LB_culture_run2.wig intracellular_mutants_24h_run2.wig merged_genome.prot_table rankproduct_out #-s 100 -n TTR -h -a -l #warnings.warn(“\nOne or more of your .wig files does not include any empty sites (i.e. sites with zero read-counts). Proceeding as if data was Tn5 (all other sites assumed to be zero)!\n”) #ERROR! #Tn5Gaps method: https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/tnseq/tutorial.html#predict-the-essentiality-of-genes ** tn5gaps: It is based on a Gumbel analysis method Griffin et al. 2011 and adapted to Tn5 transposon specificity. The main difference comes from the fact that Tn5 transposon can insert everywhere, thus creating libraries with lower insertion rates. #transit tn5gaps initial_mutants_run2.wig merged_genome.prot_table initial_mutants_tn5gaps_out #-m 2 -r Sum -iN 5 -iC 5; for sample in initial_mutants LB_culture growthout_control_24h intracellular_mutants_24h extracellular_mutants_24h; do transit tn5gaps ${sample}_run2_normalized.wig merged_genome.prot_table ${sample}_tn5gaps_trimmed.dat -m 2 -r Sum -iN 5 -iC 5; done #grep “Essential” initial_mutants_tn5gaps_trimmed.dat | wc -l #298 #grep “Non-essential” initial_mutants_tn5gaps_trimmed.dat | wc -l #3834 ~/Tools/csv2xls-0.4/csv_to_xls.py initial_mutants_tn5gaps_trimmed.dat LB_culture_tn5gaps_trimmed.dat \ growthout_control_24h_tn5gaps_trimmed.dat intracellular_mutants_24h_tn5gaps_trimmed.dat extracellular_mutants_24h_tn5gaps_trimmed.dat -d$’\t’ -o Tn5Gaps.xls; draw graphics to explain the r, ovr and lenovr based on the information below. 
    #Orf Name Desc k n(length) r ovr lenovr pval padj call
    YWA314_00005 IS1329 transposase A 15 278 102 104 157 1.00000 1.00000 Non-essential
    YWA314_00010 transposase B 7 220 54 55 70 1.00000 1.00000 Non-essential
    YWA314_00015 hypothetical protein 0 94 94 96 502 0.29242 1.00000 Non-essential
    YWA314_00020 phage protein 2 173 113 114 502 0.29242 1.00000 Non-essential
    YWA314_00025 putative phage endopeptidase 1 379 315 317 439 0.81781 1.00000 Non-essential
    YWA314_17634 dnaA chromosomal replication initiation protein 1 1250 1137 1141 1287 0.00000 0.00000 Essential
    k: Number of Transposon Insertions Observed within the ORF.
    n: Total Number of TA dinucleotides within the ORF.
    r: Length of the Maximum Run of Non-Insertions observed.
    #TODO1_DEL: ovr: The number of nucleotides in the overlap with the longest run partially covering the gene.
    lenovr: The length of the above run with the largest overlap with the gene.
    pval: P-value calculated by the permutation test.
    padj: Adjusted p-value controlling for the FDR (Benjamini-Hochberg).
    call: Essentiality call for the gene. Depends on FDR corrected thresholds. Essential or Non-Essential.
    r (Run of Non-Insertions): This value represents the length of the longest continuous region within an ORF (open reading frame) where no transposon insertions are observed. Graphically, this could be shown as a long unbroken line or bar within a longer gene representation, highlighting the absence of marks (insertions).
    ovr (Overlap with Run): This is the number of nucleotides that overlap with the longest run, which might partially cover the gene. It is not the total length of the run, but how much of it overlaps with the gene. In a graphic, this could be illustrated by overlapping two segments: one for the gene and another for the run, with the overlapping part distinctly colored or shaded.
    lenovr (Length of Overlap with Run): This measures the full length of the run that has the largest overlap with the gene.
    Visually, this could be depicted as a separate longer line or bar that extends beyond the gene boundaries but is highlighted where it overlaps with the gene.
    * #WARNING: Since gumbel_out cannot be generated, the ttnfitness output cannot be generated either (ttnfitness requires the Gumbel output as input)!
    * TTN-Fitness (TTNFitness method that calculates mean read-counts per gene): Typically with individual TnSeq datasets, Gumbel and HMM are the methods used for evaluating essentiality.
    – Gumbel distinguishes ES (essential) from NE (non-essential).
    – HMM adds the GD (growth-defect; suppressed counts; mutant has reduced fitness) and GA (growth advantage; inflated counts; mutant has selective advantage) categories.
    – Quantifying the magnitude of the fitness defect is risky because the counts at individual TA sites can be noisy. Sometimes the counts at a TA site in a gene can span a wide range of very low to very high counts. The TTN-Fitness gives a more fine-grained analysis of the degree of fitness effect by taking into account the insertion preferences of the Himar1 transposon.
    – These insertion preferences are influenced by the nucleotide context of each TA site. The TTN-Fitness method uses a statistical model based on surrounding nucleotides to estimate the insertion bias of each site. Then, it corrects for this to compute an overall fitness level as a Fitness Ratio, where the ratio is 0 for ES genes, 1 for typical NE genes, between 0 and 1 for GD genes and above 1 for GA genes.
transit ttnfitness transit ttnfitness initial_mutants_run2_normalized.wig,LB_culture_run2_normalized.wig,growthout_control_24h_run2_normalized.wig,intracellular_mutants_24h_run2_normalized.wig,extracellular_mutants_24h_run2_normalized.wig merged_genome.prot_table merged_genome.fasta gumbel_out ttnfitness_out1 ttnfitness_out2 transit ttnfitness initial_mutants_run2_normalized.wig,LB_culture_run2_normalized.wig,growthout_control_24h_run2_normalized.wig,intracellular_mutants_24h_run2_normalized.wig,extracellular_mutants_24h_run2_normalized.wig merged_genome.prot_table – gumbel output file:* The Gumbel method must be run first on the dataset.The output of the Gumbel method is provided as an input to this method. ES (essential by Gumbel) and EB (essential by Binomial) is calculated in the TTN-Fitness method via this files * Genetic Interactions: The genetic interactions (GI) method is a comparative analysis used used to determine genetic interactions. It is a Bayesian method that estimates the distribution of log fold-changes (logFC) in two strain backgrounds under different conditions, and identifies significantly large changes in enrichment (delta_logFC) to identify those genes that imply a genetic interaction. * Pathway Enrichment Analysis: Pathway Enrichment Analysis provides a method to identify enrichment of functionally-related genes among those that are conditionally essential (i.e. significantly more or less essential between two conditions). transit pathway_enrichment [-M ] [- … ** corrplot (Correlation among TnSeq datasets, command line only): A useful tool when evaluating the quality of a collection of TnSeq datasets is to make a correlation plot of the mean insertion counts (averaged at the gene-level) among samples. 
#INCOMPLETE cut -f1-6 combined.wig > combined_.wig transit corrplot combined_.wig corrplot.png #INCOMPLETE cut -f1-6 combined_normalized.wig > combined_normalized_.wig transit corrplot combined_normalized_.wig corrplot_normalized.png transit corrplot anova_5samples_ref_LB_culture_out corrplot_anova.png -anova #[corrplot] Starting Corrplot correlations based on 74 genes [corrplot] Finished Corrplot ** heatmap (Heatmap among Conditions, command line only): The output of ANOVA or ZINB can be used to generate a heatmap that simultaneously clusters the significant genes and clusters the conditions, which is especially useful for shedding light on the relationships among the conditions apparent in the data. transit heatmap anova_5samples_ref_LB_culture_out heatmap.png -anova -qval 0.05 -low_mean_filter 3 #heatmap based on 74 genes transit heatmap anova_5samples_ref_LB_culture_out heatmap.png -anova -qval 0.1 -low_mean_filter 3 #– convert gbk to prot_table — python3 ~/Scripts/gbk_to_prottable.py merged_genome.gbk merged_genome.prot_table #transit export #https://transit.readthedocs.io/en/latest/transit_normalization_tutorial.html #-1- transit export combined_wig -n TTR #RUN transit export combined_wig initial_mutants_run2.wig,LB_culture_run2.wig,growthout_control_24h_run2.wig,intracellular_mutants_24h_run2.wig,extracellular_mutants_24h_run2.wig merged_genome.prot_table combined.wig -n nonorm transit export combined_wig initial_mutants_run2.wig,LB_culture_run2.wig,growthout_control_24h_run2.wig,intracellular_mutants_24h_run2.wig,extracellular_mutants_24h_run2.wig merged_genome.prot_table combined_normalized.wig -n TTR #-2- transit export igv initial_mutants_run2.wig,LB_culture_run2.wig,growthout_control_24h_run2.wig,intracellular_mutants_24h_run2.wig,extracellular_mutants_24h_run2.wig merged_genome.prot_table combined_normalized.igv -n TTR #TODO: replace merged_genome to WA314! #DEBUG: how to run it? 
#-3- transit export mean_counts -c combined.wig combined.mean_counts # note: append -c if inputing a combined_wig file # — detect essential genes (DnaA is essential, which is why there are no insertion counts in the first few TA sites) #https://transit.readthedocs.io/en/v3.2.5/method_ttnfitness.html #对于 transit gumbel 分析,通常期望输入的是已经规范化(normalized)的 .wig 文件。规范化的数据可以减少由于不同实验条件或测序深度导致的偏差,使得后续的统计分析更加可靠 #– Normalization — #https://transit.readthedocs.io/en/latest/method_normalization.html for sample in initial_mutants LB_culture growthout_control_24h intracellular_mutants_24h extracellular_mutants_24h; do transit normalize ${sample}_run2.wig ${sample}_run2_normalized.wig -n TTR #betageom done #https://transit.readthedocs.io/en/latest/method_tnseq_stats.html #pre-normalization for sample in initial_mutants LB_culture growthout_control_24h intracellular_mutants_24h extracellular_mutants_24h; do transit tnseq_stats ${sample}_run2.wig done #post-normalization for sample in initial_mutants LB_culture growthout_control_24h intracellular_mutants_24h extracellular_mutants_24h; do transit tnseq_stats ${sample}_run2_normalized.wig done
  10. Reports

    #1. For gene_based reports: example (mean_read-counts_per_gene.txt) + ANOVA (anova_5samples_ref_LB_culture_out)
            #TAs: Number of TA sites in Gene
    
            #mean: Average read-count, including empty sites.
            #nzmean: Average read-count, excluding empty sites.
    
            Orf: Gene ID
            Name: Name of the gene.
            Desc    Gene description.
            k: Number of Transposon Insertions Observed within the ORF.
            n: Total Number of TA sites within the ORF
            Normalized_initial_mutants: Mean read counts for the condition initial_mutants (normalized with TTR)
            Normalized_LB_culture: Mean read counts for the condition LB_culture (normalized with TTR)
            Normalized_growthout_control_24h: Mean read counts for the condition growthout_control_24h (normalized with TTR)
            Normalized_intracellular_mutants_24h: Mean read counts for the condition intracellular_mutants_24h (normalized with TTR)
            Normalized_extracellular_mutants_24h: Mean read counts for the condition extracellular_mutants_24h (normalized with TTR)
    
    Orf    Name    Desc    k       n       mean    nzmean
    Rv     Gene    TAs     Normalized_initial_mutants    Normalized_LB_culture Normalized_growthout_control_24h      Normalized_intracellular_mutants_24h  Normalized_extracellular_mutants_24h
    
    #check that column 1 (gene ID) is identical and in the same order in both files before pasting them together;
    cut -f4-8 anova_5samples_ref_LB_culture_out > f4_8;
    paste mean_read-counts_per_gene.txt f4_8 > overview_gene_based.txt
    #delete the columns mean and nzmean.
    
    #2. For the essential gene report: transit tn5gaps
    TODO: delete the headers; DEL ovr; also merge the tn5gaps results into the gene-based table, adding the columns r, ovr, lenovr, pval, padj, call.
    
    #Tn5Gaps method: https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/tnseq/tutorial.html#predict-the-essentiality-of-genes
    #dinucleotides
    ORF     Gene ID.
    Name    Name of the gene.
    Desc    Gene description.
    k   Number of Transposon Insertions Observed within the ORF.
    n   Total Number of TA sites within the ORF.
    r   Length of the Maximum Run of Non-Insertions observed.
    pval    P-value calculated by the permutation test.
    padj    Adjusted p-value controlling for the FDR (Benjamini-Hochberg).
    call    Essentiality call for the gene. Depends on FDR corrected thresholds. Essential or Non-Essential.
    
    #3. Only use ANOVA for the DEG reports, since its results can be plotted and are consistent with the gene-based view!
    # delete MSR and MSE+alpha.
    
    Orf: Gene ID
    Name: Name of the gene
    TAs: Number of TA sites in the gene
    Means…: Mean read counts for each condition
    LFCs…: Log-fold changes of counts in each condition vs. the mean across all conditions
    p-value: P-value calculated by the ANOVA test
    p-adj: Adjusted p-value controlling for the FDR (False Discovery Rate, Benjamini-Hochberg method)
    status: Debug information
    
    #4. Two plots (do not send for now: the plots are based on 74 genes, whereas we have only 30+ significant genes between LB_culture and intracellular_mutants_24h!)
    corrplot: A useful tool when evaluating the quality of a collection of TnSeq datasets is to make a correlation plot of the mean insertion counts (averaged at the gene-level) among samples.
    heatmap: The output of ANOVA can be used to generate a heatmap that simultaneously clusters the significant genes and clusters the conditions, which is especially useful for shedding light on the relationships among the conditions apparent in the data.
    
    #5. (Do not send for now, since the previous wig files can also be used!) five wig-files and merged_genome.gb and merged_genome.fa
    #resampling
    #Orf    Name    Desc    Sites   Mean Ctrl   Mean Exp    log2FC  Sum Ctrl    Sum Exp Delta Mean  p-value Adj. p-value
    YWA314_00463        chaperone protein DnaJ  1129    38.5    1.3 -4.13   43427.8 1413.92 -37.2   0   0
    
    #utest
    #Orf    Name    Desc    Sites   Mean Ctrl   Mean Exp    log2FC  U-Statistic p-value Adj. p-value
    YWA314_00463        chaperone protein DnaJ  1129    1696.7  339.9   -2.32   635 0   0
    
    #ANOVA
    #Rv Gene    TAs Mean_LB_culture Mean_intracellular_mutants_24h  LFC_LB_culture  LFC_intracellular_mutants_24h   MSR MSE+alpha   Fstat   Pval    Padj    status
    
    YWA314_05189        2506    106.58  0   0   -4.48   14054884.5521   338643.330313   41.503503   0   0.000001
    YWA314_00518    carB    3232    194.02  31.97   0   -2.428  42162662.154189 1121416.346908  37.597688   0   0.000002
    YWA314_16630        2146    34.88   0   0   -2.996  1261390.068193  36940.341092    34.146682   0   0.000008
    YWA314_00463        1129    49.05   1.87    0   -2.976  1051922.250591  110643.16916    9.50734 0.002071    0.213913
    
    YWA314_00463            1129    58.38   49.05   19.61   1.87    22.56   0.230   0.000   -1.135  -2.976  -0.972  489073.319278   169569.370262   2.884208        0.021237        0.669899
  11. plot Figure 1. Overview of the Yersinia enterocolitica subsp. enterocolitica WA-314 Transposon Mutant Library This figure illustrates the distribution of transposon insertion sites across the genome. The outermost black circle represents the WA-314 genome in kilobase pairs (Kbp). The middle circles, which feature scatter points, indicate the normalized number of sequencing reads at each unique transposon insertion site, with each axis line representing an increment of 50,000 in data values. The circles are color-coded to represent different conditions: green for extracellular mutants, blue for intracellular mutants, red for growthout control, purple for LB culture, and yellow for initial mutants. The innermost orange circle highlights the locations of genes identified as essential.

    #circos -conf circos.conf
    
    <<include /etc/circos/colors_fonts_patterns.conf>>
    
    karyotype = circos_data/karyotype.microbe.txt
    
    chromosomes_units           = 10000
    chromosomes_display_default = yes
    <ideogram>
    show = yes
    <spacing>
    default = 0.01r
    break   = 0.5r #8r # Adds a gap between the first and last position of the single chromosome
    </spacing>
    radius           = 0.9r
    thickness        = 25p
    fill             = no
    stroke_thickness = 2
    stroke_color     = black
    show_bands       = yes
    fill_bands       = yes
    band_transparency = 0
    show_label       = yes
    label_font       = default
    label_radius     = 1.05r
    label_size       = 75
    label_parallel   = yes
    orientation = 100 # Rotate the plot by 90 degrees
    </ideogram>
    #<<include ticks.conf>>
    show_ticks          = yes
    show_tick_labels    = yes

    show_grid          = no
    grid_start         = dims(ideogram,radius_inner)-0.5r
    grid_end           = dims(ideogram,radius_inner)
skip_first_label = yes skip_last_label = no radius = dims(ideogram,radius_outer) tick_separation = 2p min_label_distance_to_edge = 0p label_separation = 5p label_offset = 5p label_size = 8p multiplier = 0.001 color = black thickness = 3p size = 20p size = 10p spacing = 1u color = black show_label = no label_size = 12p format = %.2f grid = no grid_color = lblue grid_thickness = 1p size = 15p spacing = 5u color = black show_label = yes label_size = 16p format = %s grid = yes grid_color = lgrey grid_thickness = 1p size = 18p spacing = 10u color = black show_label = yes label_size = 16p format = %s grid = yes grid_color = grey grid_thickness = 1p spacing = 100u color = black show_label = yes suffix = ” kb” label_size = 36p format = %s grid = yes grid_color = dgrey grid_thickness = 1p <> <> <>
    # -- Scatter plot 1 --
type = scatter file = circos_data/extracellular_mutants.txt r1 = 0.99r r0 = 0.79r #python3 identify_min_max.py circos_data/initial_mutants.txt min = 0 max = 400000 glyph = circle glyph_size = 5 color = dgreen spacing = 50000 color = lgrey # # #spacing = 0.1r #color = lgrey # # #spacing = 0.1 #size = 10p #thickness = 2p #color = lgrey #show_label = yes #label_size = 20p #label_offset = 5p #format = %0.1f # # # # # # #condition = var(value) > 10000 #stroke_color = dred #fill_color = red #glyph = rectangle #glyph_size = 2 # # # — Scatter plot 2 — type = scatter file = circos_data/intracellular_mutants.txt r1 = 0.79r r0 = 0.69r min = 0 max = 200000 glyph = circle glyph_size = 5 color = dblue spacing = 50000 color = lgrey # — Scatter plot 3 — type = scatter file = circos_data/growthout_control.txt r1 = 0.69r r0 = 0.59r min = 0 max = 200000 glyph = circle glyph_size = 5 color = dred spacing = 50000 color = lgrey # — Scatter plot 4 — type = scatter file = circos_data/LB_culture.txt r1 = 0.59r r0 = 0.49r min = 0 max = 200000 glyph = circle glyph_size = 5 color = dpurple spacing = 50000 color = lgrey # — Scatter plot 5 — type = scatter file = circos_data/initial_mutants.txt r1 = 0.49r r0 = 0.39r min = 0 max = 200000 glyph = circle glyph_size = 5 color = dyellow spacing = 50000 color = lgrey # Gene Locations type = heatmap file = circos_data/merged_genome.txt r1 = 0.35r r0 = 0.32r color = orange #grep “Essential” tn5_gap_inituial_mutant.csv > essential_genes.txt #cut -f1 -d$’\t’ essential_genes.txt > f1 #vim merged_genome.prot_table #” merged_genome.prot_table >> merged_genome.prot_table_essential \ngrep ” #python3 generate_gene_locations.py #replace “\n\t” with “\nchr\t”
    <<include /etc/circos/housekeeping.conf>>

Key Bioconductor and R Packages for Bioinformatics

Top Bioconductor Packages

  • DESeq2: Differential gene expression analysis based on the negative binomial distribution.
  • edgeR: Differential expression analysis of RNA-Seq and other count data.
  • limma: Linear models for microarray data analysis.
  • GenomicRanges: Representation and manipulation of genomic intervals and variables.
  • SummarizedExperiment: Container for storing high-throughput assay data and metadata.
  • SingleCellExperiment: Container for single-cell RNA-Seq data.
  • AnnotationHub: Access to a variety of genome annotation resources.
  • BiocGenerics: S4 generic functions used across Bioconductor packages.
  • Rsamtools: Input/output, manipulation, and analysis of SAM/BAM files.
  • biomaRt: Interface to BioMart databases like Ensembl.
  • tximport: Import and summarize transcript-level estimates for gene-level analysis.
  • VariantAnnotation: Annotation of variants detected by high-throughput sequencing.
  • GenomicFeatures: Representation and manipulation of transcript annotation databases.
  • Biostrings: Efficient manipulation of biological strings.
  • Gviz: Plotting data and annotation information along genomic coordinates.
  • ComplexHeatmap: Making complex, annotated heatmaps.
  • scran: Methods for single-cell RNA-Seq data analysis.
  • scater: Single-cell analysis tools for quality control, normalization, and visualization.
  • IRanges: Infrastructure for representing and manipulating intervals.
  • HDF5Array: HDF5 backend for DelayedArray objects.
  • TCGAbiolinks: Tools for downloading, preparing, and analyzing TCGA data.
  • edgeRun: High-performance differential expression analysis for RNA-Seq data.
  • ChIPseeker: Annotation of peaks in ChIP-Seq data.
  • clusterProfiler: Statistical analysis and visualization of functional profiles for genes and gene clusters.
  • ensembldb: Utilities for working with Ensembl-based annotations.
  • GEOquery: Get data from NCBI Gene Expression Omnibus (GEO).
  • pathview: Pathway-based data integration and visualization.
  • GSEABase: Gene set enrichment data structures and methods.
  • GOstats: Tools for manipulating GO and microarray data.
  • GO.db: A set of annotation maps describing the entire Gene Ontology.
  • WGCNA: Weighted correlation network analysis for gene expression data.
  • maftools: Analysis and visualization of mutation annotation format (MAF) files.
  • DiffBind: Differential binding analysis of ChIP-Seq peak data.
  • BSgenome: Infrastructure for Biostrings-based genome data packages.
  • Rhtslib: High-throughput sequencing library as used by Rsamtools.
  • ShortRead: Import and analyze high-throughput sequencing data.
  • BiocParallel: Bioconductor facilities for parallel evaluation.
  • msigdb: Import Molecular Signatures Database (MSigDB) gene sets.
  • goseq: Gene Ontology analysis for RNA-Seq data.
  • ReactomePA: Pathway enrichment analysis with Reactome Pathway Database.
  • IRanges: Infrastructure for manipulating intervals on sequences.
  • sva: Surrogate variable analysis for removing batch effects and other unwanted variation in high-throughput experiments.
  • AnnotationForge: Tools for building SQLite-based annotation data packages.
  • BioCycData: Access to BioCyc Pathway/Genome Database Collection.
  • biovizBase: Basic graphic utilities for visualization of genomic data.
  • scRNAseq: Single-cell RNA sequencing data package.
  • TxDb.Hsapiens.UCSC.hg38.knownGene: Annotation package for TxDb object(s).
  • SummarizedExperiment: Container for storing high-throughput assay data and metadata.
  • GenomicAlignments: Representation and manipulation of short genomic alignments.
  • rtracklayer: Extensible framework for interacting with multiple genome browsers.

Top General R Packages

  • ggplot2: Data visualization package based on the Grammar of Graphics.
  • dplyr: Data manipulation functions that simplify complex operations on data frames.
  • tidyr: Tools for converting data to tidy format.
  • stringr: Simple, consistent functions to manipulate strings.
  • lubridate: Functions to work with date-times and timespans.
  • shiny: Web application framework for R.
  • caret: Classification and regression training package.
  • rmarkdown: Dynamic documents for R.
  • knitr: A general-purpose literate programming engine.
  • data.table: Extension of data.frame for fast manipulation of large datasets.
  • xtable: Export tables to LaTeX or HTML.
  • forecast: Tools for forecasting and time series analysis.
  • randomForest: Classification and regression based on a forest of trees using random inputs.
  • survival: Survival analysis, including penalized likelihood.
  • glmnet: Lasso and elastic-net regularized generalized linear models.
  • plotly: Interactive, web-based graphs via plotly’s JavaScript graphing library.
  • sf: Simple features for R, for handling vector data.
  • zoo: S3 infrastructure for regular and irregular time series.
  • tm: Framework for text mining applications in R.
  • lme4: Linear mixed-effects models using ‘Eigen’ and S4.
  • httr: Tools for working with URLs and HTTP.
  • Rcpp: Seamless R and C++ integration.
  • sp: Classes and methods for spatial data.
  • leaflet: Create interactive web maps with the JavaScript ‘Leaflet’ library.
  • MASS: Functions and datasets to support Venables and Ripley’s MASS book.
  • readr: Read rectangular data (csv, tsv, fwf).
  • magrittr: Provides a mechanism for chaining commands with a new forward-pipe operator.
  • haven: Import and export ‘SPSS’, ‘Stata’ and ‘SAS’ files.
  • tibble: Modern re-imagining of data frames.
  • purrr: Functional programming tools.
  • janitor: Simple tools for examining and cleaning dirty data.
  • forcats: Tools for working with categorical variables (factors).
  • sparklyr: R interface for Apache Spark.
  • odbc: Connect to ODBC compatible databases.
  • curl: A Modern and Flexible Web Client for R.
  • jsonlite: A Simple and Robust JSON Parser and Generator for R.
  • xml2: A modern XML package.
  • RCurl: General network (HTTP/FTP/…) client interface for R.
  • highcharter: A wrapper for the ‘Highcharts’ library.
  • DT: A wrapper of the DataTables JavaScript library.
  • shiny: Easy interactive web applications with R.
  • flexdashboard: R Markdown Format for Flexible Dashboards.
  • DiagrammeR: Create graph diagrams and flowcharts using R.
  • visNetwork: Network visualization using vis.js library.
  • sf: Simple features for R.
  • tmap: Thematic maps.
  • mapview: Interactive viewing of spatial data.
  • plotly: Create interactive web graphics via ‘plotly.js’.
  • dygraphs: Interface to ‘dygraphs’ JavaScript Charting Library.
  • threejs: Interactive 3D scatter plots and globes.

This extended list provides a comprehensive overview of the most widely used Bioconductor and general R packages essential for bioinformatics, data analysis, and visualization. For more detailed information and exploration of these packages, you can visit the Bioconductor website and CRAN.

Essential Open Source Software for Science (EOSS)

https://chanzuckerberg.com/eoss/proposals/?cycle=1,2,3,4,5,6,d-i,

The Chan Zuckerberg Initiative (CZI) has funded numerous open source software projects through its Essential Open Source Software for Science (EOSS) program. Here is a list of some of the funded projects with brief descriptions:

Cycle 1

  • BioDynaMo: A platform for agent-based simulation of biological systems.
  • Bioconda: A distribution of bioinformatics software using conda.
  • CellProfiler: Software for quantitative analysis of biological images.
  • Cytoscape: An open-source platform for visualizing complex networks.
  • DeepLabCut: A toolbox for markerless pose estimation of animals.
  • Galaxy: An open, web-based platform for data-intensive biomedical research.
  • iSEE: Interactive visualization of single-cell data.
  • Jupyter Notebooks: A web-based interactive computing platform.
  • OpenMM: A high-performance toolkit for molecular simulations.
  • scikit-image: Image processing in Python.

Cycle 2

  • Bioconda: Continued support for the software distribution.
  • CellProfiler Analyst: Extension for machine learning-based image analysis.
  • Napari: Multi-dimensional image viewer for Python.
  • Pandas: Data structures for data analysis in Python.
  • Rao Lab’s HiGlass: Tool for exploring large genomic contact maps.
  • Scanpy: Scalable analysis of single-cell RNA sequencing data.

Cycle 3

  • CWL-Airflow: Workflow engine based on Apache Airflow.
  • Galaxy: Continued support for the platform.
  • Pangeo: Community platform for big data geoscience.
  • Project Jupyter: Further development of Jupyter Notebooks.
  • Sourmash: Scalable search of genomic data.

Cycle 4

  • GEMmaker: RNA-Seq pipeline tool.
  • Jupyter Book: Interactive, computational documents for data science.
  • NCBI BLAST+: Tool for comparing gene and protein sequences.
  • Scanpy: Continued support for single-cell RNA sequencing analysis.

Cycle 5

  • Community and Code: Promoting diversity in open source.
  • DeepLabCut: Continued support for animal pose estimation.
  • OpenMM: Further development for molecular simulations.
  • QHub: Managed JupyterHub deployments for collaborative research.
  • SciPy: Python-based ecosystem for mathematics, science, and engineering.

Cycle 6

  • BioSimSpace: Computational toolset for molecular simulations.
  • MetPy: Meteorological data analysis tools in Python.
  • Open Data Cube: Platform for analyzing large-scale Earth observation data.
  • TOPMed DCC: Data coordination for Trans-Omics for Precision Medicine.

RNA-seq 2024 Ute from raw counts

DEGs_heatmap_WaGa

1, input files (use R 4.3.3 (/home/jhuang/miniconda3/bin/R))

  merged_gene_counts_40samples.txt
  merged_gene_counts_WaGa_virus_rounded.txt
  merged_gene_counts_MKL-1_virus_rounded.txt

  #!! Problem: merge the two files and check if the read number in Dox samples are really lower than that of DMSO samples --> Not really!

2, background knowledge

  #Figure 4. MCPyV sT alters the expression of cell surface proteins in WaGa cells. WaGa shRNA scr and shRNA sT cell lines were induced with 1 µg/ml Dox or DMSO 3 d before the experiment. #Surface proteins upregulated during WaGa sT KD, indicating downregulation by sT in WaGa cells, are depicted in a.
  #Surface markers downregulated during sT KD, indicating an upregulation by sT in WaGa cells, are displayed in b. The threshold of sT-dependent differential regulation was set to 0.1 log2 diff.
  (c) The independent, confirmatory FACS experiments for the surface markers ADAM10, CD44, CD47, and CD95 in WaGa cells described in a and b Dox/DMSO addition.
  (d) CD47 surface marker expression in HEK293, WaGa, or nHDF cells overexpressing sT. All experiments were performed in triplicates,

  grown as described (Czech-Sioli et al., 2020a).
  For knockdown experiments, cells were induced with 1 µg/ml Dox or DMSO.
  Plasmids, shRNAs, and lentiviral transduction
  - MCPyV sT cDNA was cloned into lentiviral expression plasmid LeGo-iC2 through EcoRI and NotI restriction sites.
  - Small interfering RNA transfection
  - Analysis was performed using www.webgestalt.org.
  - Complete data of transcriptome analysis can be found in Supplementary Tables S1 and S2.
  d, day;
  Dox, doxycycline;
  FDR, false discovery rate;
  GO, gene ontology;
  GSEA, gene set enrichment analysis;
  KD, knockdown;
  RNA-Seq, RNA sequencing;
  scr, scrambled;
  #For knockdown experiments, cells were induced with 1 µg/ml Dox or DMSO.
  #shRNA scr, control cell line expressing a scrambled short hairpin RNA;
  #shRNA sT, small T antigen‒specific short hairpin RNA;
  #sT, small T antigen.

  #Figure 1. (b) Immunoblot analysis of total cell lysates from WaGa cells transduced with shRNA scr, shRNA sT, or shRNA sT/LT 3 and 5 d after Dox induction.
  #sT and LTtrunc protein expression was detected using the sT/LT-recognizing antibody 2T2 (first panel) and the LT-specific antibody Cm2B4 antibody (third panel); antibody-recognizing actin was used as a loading control.
  !Only "shRNA sT with Dox induction" can lower the expression of sT; neither the DMSO control nor the scrambled shRNA lowers it.
  #Only "shRNA sT with Dox induction" specifically targets and reduces the expression of small T antigen (sT). The DMSO control and the scrambled shRNA do not have this effect because they do not specifically target the sT gene. The Dox induction is necessary to activate the shRNA sT, leading to the knockdown of sT expression.

  #In the figure, "shRNA scr" not knocked down, "shRNA sT and DMSO" not knocked down!

  #Yes, typically, in systems where doxycycline (Dox) is used to control shRNA expression, all shRNAs under the control of a Dox-inducible promoter would require doxycycline to be induced. Doxycycline is often used in inducible systems like the Tet-On or Tet-Off systems to regulate gene expression, including the expression of short hairpin RNAs (shRNAs). In these systems, without doxycycline, there should be minimal to no expression of the shRNA. When doxycycline is added, it induces the expression of the shRNA, leading to the targeted knockdown of gene expression.

  #shRNA (短发夹RNA, short hairpin RNA)敲减的原理是通过介导RNA干扰(RNA interference, RNAi)过程来降低特定基因的表达。shRNA是一种可折叠成发夹结构的RNA分子,其茎部的一条链与目标基因的mRNA序列互补。当shRNA进入细胞后,它被DICER酶切割成小分子干扰RNA (siRNA)。然后siRNA被纳入到RNA诱导沉默复合体(RISC)中。在RISC中,siRNA的一条链被降解,而另一条与目标mRNA互补的链则被用来寻找相对应的mRNA分子。
  当siRNA找到与其互补的目标mRNA后,RISC会切割这个mRNA,从而阻止它被翻译成蛋白质。这种降解过程减少了目标基因在细胞内的表达量,从而实现了基因表达的敲减。通过选择特定的目标基因进行敲减,研究人员可以研究这些基因的功能,或者在疾病治疗中靶向这些基因。
  #shRNA scrambled(序列打乱、无特定靶标的短发夹RNA)是一种设计为不靶向任何基因的短发夹RNA。它通常用作对照,以确保观察到的效果是特定于靶向的shRNA(如针对特定基因的shRNA)的结果,而不是由于RNA干扰技术本身或细胞对处理的非特异性反应。shRNA scrambled不会靶向或降低任何特定基因的表达,因此在实验中,它用于与特定靶向的shRNA处理的细胞进行比较,以证明任何观察到的变化是由于特定基因表达的降低。
  #DMSO 是二甲基亚砜(Dimethyl sulfoxide)的缩写,这是一种有机溶剂,常用于生物学实验,可以增加细胞膜的渗透性,帮助药物或化合物进入细胞。DMSO 也常作为对照物质使用,在实验中用于对比特定处理的效果。
  #Doxycycline(Dox)是一种抗生素,属于四环素类,通常用于治疗各种感染症,如呼吸道感染、尿路感染、眼睛感染等。在分子生物学研究中,Doxycycline 常用于诱导表达系统,如在doxycycline-inducible shRNA表达系统中,Doxycycline 的添加可以诱导特定基因的沉默或表达。

  #Difference between design=~shRNA+treatment+shRNA:treatment and design=~shRNA+treatment
  #在这两个设计公式中,design=~shRNA+treatment 表示一个没有交互作用项的模型,其中 shRNA 和 treatment 作为独立的因素被考虑。这意味着每个因素的效果是单独评估的,不考虑这两个因素之间可能的相互作用。
  #而 design=~shRNA+treatment+shRNA:treatment 这个设计公式包含了一个交互作用项(shRNA:treatment),这表示除了考虑 shRNA 和 treatment 作为独立因素的效果之外,还要考虑它们之间的相互作用。换句话说,该模型会评估 treatment(处理方式)如何依赖于不同 shRNA 的情况下有所不同。
  #总之,第一个设计只考虑了独立效果,而第二个设计还考虑了这两个因素的相互作用。在进行实验设计和统计分析时,选择哪一个取决于你的研究假设和数据的特点。

3, common processing for the data MKL-1 + WaGa

  #The pipeline finished successfully, but the following samples were skipped,
  #  - 0505_MKL-1_wt_EV,EV.RNA

  #### -------
  setwd("/home/jhuang/DATA/Data_Ute/Data_RNA-Seq_MKL-1_WaGa/results_2024_2/featureCounts")

  # ---- when BiocManager::install("ggtree") doesn't work, use devtools to install it (Successful!) ----
  #update.packages(ask = FALSE, checkBuilt = TRUE)
  #.libPaths()
  #install.packages(c("rlang", "cli"))
  #rm(list = ls())
  #options(repos = BiocManager::repositories())
  #if (!requireNamespace("devtools", quietly = TRUE))
  #    install.packages("devtools")
  #devtools::install_github("YuLab-SMU/ggtree")

  #install.packages("gplots")
  #if (!requireNamespace("BiocManager", quietly = TRUE))
  #    install.packages("BiocManager")
  #BiocManager::install(c("clusterProfiler", "ReactomePA", "DESeq2", "AnnotationDbi", "GenomeInfoDb", "Biostrings"), force=TRUE)

  # R(4.3) works well on the saga server!
  library("AnnotationDbi")
  library("clusterProfiler")
  library("ReactomePA")
  #library("org.Mm.eg.db")
  library(DESeq2)
  library(gplots)

  [1] gene_name
  [2] X042_MKL.1_wt_EV
  [3] MKL.1_RNA_147
  [4] X042_MKL.1_sT_DMSO     #EIGENTLICH _EV
  [5] MKL.1_RNA
  [6] X0505_MKL.1_scr_DMSO_EV
  [7] X0505_MKL.1_sT_DMSO_EV
  [8] MKL.1_EV.RNA_87
  [9] MKL.1_EV.RNA_27
  [10] X042_MKL.1_scr_Dox_EV
  [11] X042_MKL.1_scr_DMSO_EV
  [12] MKL.1_EV.RNA_118
  [13] X0505_MKL.1_scr_Dox_EV
  [14] MKL.1_EV.RNA
  [15] X042_MKL.1_sT_Dox      #EIGENTLICH _EV
  [16] X0505_MKL.1_sT_Dox_EV
  [17] MKL.1_RNA_118
  [18] MKL.1_EV.RNA_2

  [19] Geneid.1
  [20] gene_name.1
  [21] X1605_WaGa_sT_DMSO_EV
  [22] WaGa_EV.RNA_118
  [23] WaGa_EV.RNA
  [24] X2706_WaGa_scr_Dox_EV
  [25] X2706_WaGa_sT_DMSO_EV
  [26] X1107_WaGa_wt_EV
  [27] X1107_WaGa_sT_Dox_EV
  [28] WaGa_RNA
  [29] X1605_WaGa_scr_DMSO_EV
  [30] X2706_WaGa_scr_DMSO_EV
  [31] WaGa_EV.RNA_226
  [32] X2706_WaGa_sT_Dox_EV
  [33] X1605_WaGa_scr_Dox_EV
  [34] X1605_WaGa_wt_EV
  [35] X1605_WaGa_sT_Dox_EV
  [36] X1107_WaGa_scr_DMSO_EV
  [37] WaGa_EV.RNA_2
  [38] WaGa_EV.RNA_147
  [39] WaGa_RNA_118
  [40] X1107_WaGa_scr_Dox_EV
  [41] X1107_WaGa_sT_DMSO_EV
  [42] WaGa_RNA_147
  [43] X2706_WaGa_wt_EV

  #Geneid  gene_name       1605_WaGa_sT_DMSO_EV    WaGa_EV-RNA_118 WaGa_EV-RNA     2706_WaGa_scr_Dox_EV    2706_WaGa_sT_DMSO_EV    1107_WaGa_wt_EV 1107_WaGa_sT_Dox_EVWaGa_RNA        1605_WaGa_scr_DMSO_EV   2706_WaGa_scr_DMSO_EV   WaGa_EV-RNA_226 2706_WaGa_sT_Dox_EV     1605_WaGa_scr_Dox_EV    1605_WaGa_wt_EV 1605_WaGa_sT_Dox_EV1107_WaGa_scr_DMSO_EV   WaGa_EV-RNA_2   WaGa_EV-RNA_147 WaGa_RNA_118    1107_WaGa_scr_Dox_EV    1107_WaGa_sT_DMSO_EV    WaGa_RNA_147    2706_WaGa_wt_EV

  # ---- Load, reorder, merge and filter the raw count tables ----

  # Human gene counts; the first column of the file becomes the row names.
  # read.delim2() parses numbers with "," as the decimal mark.
  d.raw_human <- read.delim2(
    "merged_gene_counts_40samples.txt",
    sep = "\t", header = TRUE, row.names = 1
  )

  # Assign the 43 column names explicitly (file order).
  colnames(d.raw_human) <- c(
    "gene_name",
    "X042_MKL.1_wt_EV", "MKL.1_RNA_147", "X042_MKL.1_sT_DMSO", "MKL.1_RNA",
    "X0505_MKL.1_scr_DMSO_EV", "X0505_MKL.1_sT_DMSO_EV",
    "MKL.1_EV.RNA_87", "MKL.1_EV.RNA_27",
    "X042_MKL.1_scr_Dox_EV", "X042_MKL.1_scr_DMSO_EV",
    "MKL.1_EV.RNA_118", "X0505_MKL.1_scr_Dox_EV", "MKL.1_EV.RNA",
    "X042_MKL.1_sT_Dox", "X0505_MKL.1_sT_Dox_EV",
    "MKL.1_RNA_118", "MKL.1_EV.RNA_2",
    "Geneid.1", "gene_name.1",
    "X1605_WaGa_sT_DMSO_EV", "WaGa_EV.RNA_118", "WaGa_EV.RNA",
    "X2706_WaGa_scr_Dox_EV", "X2706_WaGa_sT_DMSO_EV",
    "X1107_WaGa_wt_EV", "X1107_WaGa_sT_Dox_EV", "WaGa_RNA",
    "X1605_WaGa_scr_DMSO_EV", "X2706_WaGa_scr_DMSO_EV",
    "WaGa_EV.RNA_226", "X2706_WaGa_sT_Dox_EV", "X1605_WaGa_scr_Dox_EV",
    "X1605_WaGa_wt_EV", "X1605_WaGa_sT_Dox_EV", "X1107_WaGa_scr_DMSO_EV",
    "WaGa_EV.RNA_2", "WaGa_EV.RNA_147", "WaGa_RNA_118",
    "X1107_WaGa_scr_Dox_EV", "X1107_WaGa_sT_DMSO_EV",
    "WaGa_RNA_147", "X2706_WaGa_wt_EV"
  )

  # Target column order: MKL-1 samples first, then WaGa, each grouped as
  # parental RNA, EV RNA, sT/scr DMSO, sT/scr Dox.
  col_order <- c(
    "gene_name",
    "MKL.1_RNA", "MKL.1_RNA_118", "MKL.1_RNA_147",
    "MKL.1_EV.RNA", "MKL.1_EV.RNA_2", "MKL.1_EV.RNA_118",
    "MKL.1_EV.RNA_87", "MKL.1_EV.RNA_27", "X042_MKL.1_wt_EV",
    "X042_MKL.1_sT_DMSO", "X0505_MKL.1_sT_DMSO_EV",
    "X042_MKL.1_scr_DMSO_EV", "X0505_MKL.1_scr_DMSO_EV",
    "X042_MKL.1_sT_Dox", "X0505_MKL.1_sT_Dox_EV",
    "X042_MKL.1_scr_Dox_EV", "X0505_MKL.1_scr_Dox_EV",
    "Geneid.1", "gene_name.1",
    "WaGa_RNA", "WaGa_RNA_118", "WaGa_RNA_147",
    "WaGa_EV.RNA", "WaGa_EV.RNA_2", "WaGa_EV.RNA_118",
    "WaGa_EV.RNA_147", "WaGa_EV.RNA_226",
    "X1107_WaGa_wt_EV", "X1605_WaGa_wt_EV", "X2706_WaGa_wt_EV",
    "X1107_WaGa_sT_DMSO_EV", "X1605_WaGa_sT_DMSO_EV", "X2706_WaGa_sT_DMSO_EV",
    "X1107_WaGa_scr_DMSO_EV", "X1605_WaGa_scr_DMSO_EV", "X2706_WaGa_scr_DMSO_EV",
    "X1107_WaGa_sT_Dox_EV", "X1605_WaGa_sT_Dox_EV", "X2706_WaGa_sT_Dox_EV",
    "X1107_WaGa_scr_Dox_EV", "X1605_WaGa_scr_Dox_EV", "X2706_WaGa_scr_Dox_EV"
  )
  reordered.raw_human <- d.raw_human[, col_order]

  # Virus gene counts, brought into the same column order.
  # NOTE(review): the virus table must already carry the col_order names,
  # otherwise the selection below fails -- confirm against the input file.
  d.raw_virus <- read.delim2(
    "merged_gene_counts_virus_rounded.txt",
    sep = "\t", header = TRUE, row.names = 1
  )
  reordered.raw_virus <- d.raw_virus[, col_order]

  # Sanity check (result is printed): both tables expose the same columns.
  identical(colnames(reordered.raw_human), colnames(reordered.raw_virus))

  # Stack human and virus rows into one table.
  reordered.raw <- rbind(reordered.raw_human, reordered.raw_virus)

  # Earlier, shorter label scheme (kept for reference):
  #colnames(reordered.raw) <- c("gene_name", "MKL-1 RNA","MKL-1 RNA 118","MKL-1 RNA 147",    "MKL-1 EV","MKL-1 EV 2","MKL-1 EV 118","MKL-1 EV 87","MKL-1 EV 27","MKL-1 EV 042",    "MKL-1 EV sT DMSO 042","MKL-1 EV sT DMSO 0505","MKL-1 EV scr DMSO 042","MKL-1 EV scr DMSO 0505","MKL-1 EV sT Dox 042","MKL-1 EV sT Dox 0505","MKL-1 EV scr Dox 042","MKL-1 EV scr Dox 0505",   "Geneid.1","gene_name.1",    "WaGa RNA","WaGa RNA 118","WaGa RNA 147",    "WaGa EV","WaGa EV 2","WaGa EV 118","WaGa EV 147","WaGa EV 226","WaGa EV 1107","WaGa EV 1605","WaGa EV 2706",    "WaGa EV sT DMSO 1107","WaGa EV sT DMSO 1605","WaGa EV sT DMSO 2706",     "WaGa EV scr DMSO 1107","WaGa EV scr DMSO 1605","WaGa EV scr DMSO 2706",     "WaGa EV sT Dox 1107","WaGa EV sT Dox 1605","WaGa EV sT Dox 2706",     "WaGa EV scr Dox 1107","WaGa EV scr Dox 1605","WaGa EV scr Dox 2706")

  # Human-readable sample labels (same order as col_order).
  colnames(reordered.raw) <- c(
    "gene_name",
    "MKL-1 parental cell RNA", "MKL-1 parental cell RNA 118",
    "MKL-1 parental cell RNA 147",
    "MKL-1 wt EV RNA", "MKL-1 wt EV RNA 2", "MKL-1 wt EV RNA 118",
    "MKL-1 wt EV RNA 87", "MKL-1 wt EV RNA 27", "MKL-1 wt EV RNA 042",
    "MKL-1 sT DMSO EV RNA 042", "MKL-1 sT DMSO EV RNA 0505",
    "MKL-1 scr DMSO EV RNA 042", "MKL-1 scr DMSO EV RNA 0505",
    "MKL-1 sT Dox EV RNA 042", "MKL-1 sT Dox EV RNA 0505",
    "MKL-1 scr Dox EV RNA 042", "MKL-1 scr Dox EV RNA 0505",
    "Geneid.1", "gene_name.1",
    "WaGa parental cell RNA", "WaGa parental cell RNA 118",
    "WaGa parental cell RNA 147",
    "WaGa wt EV RNA", "WaGa wt EV RNA 2", "WaGa wt EV RNA 118",
    "WaGa wt EV RNA 147", "WaGa wt EV RNA 226",
    "WaGa wt EV RNA 1107", "WaGa wt EV RNA 1605", "WaGa wt EV RNA 2706",
    "WaGa sT DMSO EV RNA 1107", "WaGa sT DMSO EV RNA 1605",
    "WaGa sT DMSO EV RNA 2706",
    "WaGa scr DMSO EV RNA 1107", "WaGa scr DMSO EV RNA 1605",
    "WaGa scr DMSO EV RNA 2706",
    "WaGa sT Dox EV RNA 1107", "WaGa sT Dox EV RNA 1605",
    "WaGa sT Dox EV RNA 2706",
    "WaGa scr Dox EV RNA 1107", "WaGa scr Dox EV RNA 1605",
    "WaGa scr Dox EV RNA 2706"
  )

  # Remove the three annotation columns so that only the 40 count columns stay.
  reordered.raw <- reordered.raw[
    , !(colnames(reordered.raw) %in% c("gene_name", "Geneid.1", "gene_name.1"))
  ]
  write.csv(reordered.raw, file = "counts.txt")

  # IMPORTANT: filter on the counts at this step -- keep a row only if more
  # than 2 samples have a count above 3.
  d <- reordered.raw[rowSums(reordered.raw > 3) > 2, ]

  # ---- Sample annotation for all 40 columns of d ----
  # Order follows the columns of d: 17 MKL-1 samples, then 23 WaGa samples.

  # Condition without the cell-line prefix (only referenced by the
  # commented-out alternative design further down).
  condition_for_pca <- as.factor(c(
    rep(c("RNA", "EV", "sT.DMSO", "scr.DMSO", "sT.Dox", "scr.Dox"),
        times = c(3, 6, 2, 2, 2, 2)),
    rep(c("RNA", "EV", "sT.DMSO", "scr.DMSO", "sT.Dox", "scr.Dox"),
        times = c(3, 8, 3, 3, 3, 3))
  ))

  # Condition including the cell line.
  condition <- as.factor(c(
    rep(c("MKL1.RNA", "MKL1.EV", "MKL1.sT.DMSO", "MKL1.scr.DMSO",
          "MKL1.sT.Dox", "MKL1.scr.Dox"),
        times = c(3, 6, 2, 2, 2, 2)),
    rep(c("WaGa.RNA", "WaGa.EV", "WaGa.sT.DMSO", "WaGa.scr.DMSO",
          "WaGa.sT.Dox", "WaGa.scr.Dox"),
        times = c(3, 8, 3, 3, 3, 3))
  ))

  # Donor / replicate label per sample.
  # Open question from the original notes: it is not certain that replicate 1
  # of the first run is the same as replicate 1 of the second run.
  donor <- as.factor(c(
    "1", "118", "147",
    "1", "2", "118", "87", "27", "042",
    rep(c("042", "0505"), times = 4),
    "1", "118", "147",
    "1", "2", "118", "147", "226", "1107", "1605", "2706",
    rep(c("1107", "1605", "2706"), times = 4)
  ))

  # Batch label per sample.
  batch <- as.factor(c(
    "2021.08", "2021.09", "2021.09",
    "2021.08", "2021.08", "2021.09", "2021.09", "2021.09",
    rep("2022.08", times = 9),
    "2021.08", "2021.09", "2021.09",
    "2021.08", "2021.08", "2021.09", "2021.09", "2021.09",
    rep("2022.11", times = 15)
  ))

  cell.line <- as.factor(rep(c("MKL-1", "WaGa"), times = c(17, 23)))

  # Unique sample identifiers.
  ids <- as.factor(c(
    "MKL1.RNA", "MKL1.RNA.118", "MKL1.RNA.147",
    "MKL1.EV", "MKL1.EV.2", "MKL1.EV.118", "MKL1.EV.87", "MKL1.EV.27",
    "MKL1.EV.042",
    "MKL1.EV.sT.DMSO.042", "MKL1.EV.sT.DMSO.0505",
    "MKL1.EV.scr.DMSO.042", "MKL1.EV.scr.DMSO.0505",
    "MKL1.EV.sT.Dox.042", "MKL1.EV.sT.Dox.0505",
    "MKL1.EV.scr.Dox.042", "MKL1.EV.scr.Dox.0505",
    "WaGa.RNA", "WaGa.RNA.118", "WaGa.RNA.147",
    "WaGa.EV", "WaGa.EV.2", "WaGa.EV.118", "WaGa.EV.147", "WaGa.EV.226",
    "WaGa.EV.1107", "WaGa.EV.1605", "WaGa.EV.2706",
    "WaGa.EV.sT.DMSO.1107", "WaGa.EV.sT.DMSO.1605", "WaGa.EV.sT.DMSO.2706",
    "WaGa.EV.scr.DMSO.1107", "WaGa.EV.scr.DMSO.1605", "WaGa.EV.scr.DMSO.2706",
    "WaGa.EV.sT.Dox.1107", "WaGa.EV.sT.Dox.1605", "WaGa.EV.sT.Dox.2706",
    "WaGa.EV.scr.Dox.1107", "WaGa.EV.scr.Dox.1605", "WaGa.EV.scr.Dox.2706"
  ))

  # IMPORTANT: build the data set from the filtered table d, not reordered.raw.
  # Alternatives that were tried:
  #cData = data.frame(row.names=colnames(d), condition=condition,  batch=batch, ids=ids)
  #cData = data.frame(row.names=colnames(d), condition=condition, ids=ids)
  #dds<-DESeqDataSetFromMatrix(countData=d, colData=cData, design=~condition)
  #dds<-DESeqDataSetFromMatrix(countData=d, colData=cData, design=~batch+condition_for_pca)
  cData <- data.frame(
    row.names = colnames(d),
    condition = condition,
    donor = donor,
    batch = batch,
    cell.line = cell.line,
    ids = ids
  )
  dds <- DESeqDataSetFromMatrix(
    countData = d,
    colData = cData,
    design = ~batch+condition
  )

  # Transformed values used for PCA/heatmaps; rlog is the alternative.
  #rld <- rlogTransformation(dds)
  rld <- vst(dds)
  #--> [OPTION] goto DEG_Heatmap_Drawing!
  #--> [OPTION] goto DEG_Heatmap_Drawing!

4, preparing the data for PCA_MKL1 and PCA_WaGa drawing

  # ---- WaGa EV samples only ----
  # Drop every column whose label contains "parental" or "MKL-1".
  d_WaGa <- d[, !grepl("parental|MKL-1", names(d))]

  # Annotation for the 20 remaining columns, in column order.
  condition <- as.factor(rep(
    c("WaGa.EV", "WaGa.sT.DMSO", "WaGa.scr.DMSO", "WaGa.sT.Dox", "WaGa.scr.Dox"),
    times = c(8, 3, 3, 3, 3)
  ))
  donor <- as.factor(c(
    "1", "2", "118", "147", "226", "1107", "1605", "2706",
    rep(c("1107", "1605", "2706"), times = 4)
  ))
  batch <- as.factor(c(
    "2021.08", "2021.08", "2021.09", "2021.09", "2021.09",
    rep("2022.11", times = 15)
  ))
  cell.line <- as.factor(rep("WaGa", times = 20))
  ids <- as.factor(c(
    "WaGa.EV", "WaGa.EV.2", "WaGa.EV.118", "WaGa.EV.147", "WaGa.EV.226",
    "WaGa.EV.1107", "WaGa.EV.1605", "WaGa.EV.2706",
    "WaGa.EV.sT.DMSO.1107", "WaGa.EV.sT.DMSO.1605", "WaGa.EV.sT.DMSO.2706",
    "WaGa.EV.scr.DMSO.1107", "WaGa.EV.scr.DMSO.1605", "WaGa.EV.scr.DMSO.2706",
    "WaGa.EV.sT.Dox.1107", "WaGa.EV.sT.Dox.1605", "WaGa.EV.sT.Dox.2706",
    "WaGa.EV.scr.Dox.1107", "WaGa.EV.scr.Dox.1605", "WaGa.EV.scr.Dox.2706"
  ))

  # Note: this overwrites the condition/donor/batch/cell.line/ids/cData
  # objects created for the full data set.
  cData <- data.frame(
    row.names = colnames(d_WaGa),
    condition = condition,
    donor = donor,
    batch = batch,
    cell.line = cell.line,
    ids = ids
  )
  dds_WaGa <- DESeqDataSetFromMatrix(
    countData = d_WaGa,
    colData = cData,
    design = ~batch+condition
  )
  rld_WaGa <- vst(dds_WaGa)

  # ---- MKL-1 EV samples only ----
  # Drop every column whose label contains "parental" or "WaGa".
  d_MKL1 <- d[, !grepl("parental|WaGa", names(d))]

  # Annotation for the 14 remaining columns, in column order.
  condition <- as.factor(rep(
    c("MKL1.EV", "MKL1.sT.DMSO", "MKL1.scr.DMSO", "MKL1.sT.Dox", "MKL1.scr.Dox"),
    times = c(6, 2, 2, 2, 2)
  ))
  donor <- as.factor(c(
    "1", "2", "118", "87", "27", "042",
    rep(c("042", "0505"), times = 4)
  ))
  batch <- as.factor(c(
    "2021.08", "2021.08", "2021.09", "2021.09", "2021.09",
    rep("2022.08", times = 9)
  ))
  cell.line <- as.factor(rep("MKL-1", times = 14))
  ids <- as.factor(c(
    "MKL1.EV", "MKL1.EV.2", "MKL1.EV.118", "MKL1.EV.87", "MKL1.EV.27",
    "MKL1.EV.042",
    "MKL1.EV.sT.DMSO.042", "MKL1.EV.sT.DMSO.0505",
    "MKL1.EV.scr.DMSO.042", "MKL1.EV.scr.DMSO.0505",
    "MKL1.EV.sT.Dox.042", "MKL1.EV.sT.Dox.0505",
    "MKL1.EV.scr.Dox.042", "MKL1.EV.scr.Dox.0505"
  ))

  # Note: this overwrites the condition/donor/batch/cell.line/ids/cData
  # objects created earlier.
  cData <- data.frame(
    row.names = colnames(d_MKL1),
    condition = condition,
    donor = donor,
    batch = batch,
    cell.line = cell.line,
    ids = ids
  )
  dds_MKL1 <- DESeqDataSetFromMatrix(
    countData = d_MKL1,
    colData = cData,
    design = ~batch+condition
  )
  rld_MKL1 <- vst(dds_MKL1)

  # -- PCA before batch correction --
  # Writes the PCA of rld (grouped by condition and batch) to a 1200x800 PNG.
  # plotPCA() returns a ggplot object. Such objects are drawn automatically
  # only when evaluated at the interactive top level, so it is print()ed
  # explicitly: otherwise the PNG stays empty when this code is run through
  # source() or from inside a function.
  png("pca_before_batch_correction2.png", 1200, 800)
  # Alternative groupings:
  #plotPCA(rld, intgroup=c("condition"))
  #plotPCA(rld, intgroup = c("condition", "ids"))
  #plotPCA(rld, "batch")
  print(plotPCA(rld, intgroup = c("condition", "batch")))
  dev.off()
  # Variance explained as noted from the original run:
  # 73% (PC1), 11% (PC2), 8% (PC3)

5, drawing PCA MKL1+WaGa

  # -- build the inputs for merged_df: sample annotation plus all PCs --
  library(ggplot2)

  # Per-sample table from plotPCA(): PC1, PC2 and the grouping columns.
  data <- plotPCA(
    rld,
    intgroup = c("condition", "batch", "cell.line"),
    returnData = TRUE
  )
  write.csv(data, file = "plotPCA_data.csv")

  # plotPCA() only returns PC1/PC2, so the PCA is recomputed here to obtain
  # every component (PC3 and beyond): take the ntop rows with the highest
  # variance, transpose so that samples are rows, and run prcomp().
  library(genefilter)
  ntop <- 500
  rv <- rowVars(assay(rld))
  select <- head(order(rv, decreasing = TRUE), ntop)
  mat <- t(assay(rld)[select, ])
  pc <- prcomp(mat)
  summary(pc)
  #Cumulative Proportion   0.7651  0.07746 0.05608   0.01405 0.00984 0.00857 0.00689 (with rld <- rlogTransformation(dds))
  #Proportion of Variance  0.7286  0.1060  0.07657   0.01445 0.01042 0.00705 0.00578 (with rld <- vst(dds))

  # Show the first three components, then keep all of them as a data frame.
  pc$x[, 1:3]
  #df_pc <- data.frame(pc$x[,1:3])
  df_pc <- data.frame(pc$x)

  #> head(data)
  #                     PC1        PC2            group condition   batch
  #MKL-1 RNA     -71.108137  -2.054660 MKL1.RNA:2021.08  MKL1.RNA 2021.08
  #MKL-1 RNA 118 -62.978513  -6.906138 MKL1.RNA:2021.09  MKL1.RNA 2021.09
  #MKL-1 RNA 147 -77.698768  -3.355581 MKL1.RNA:2021.09  MKL1.RNA 2021.09
  #MKL-1 EV      -49.482607 -25.469602  MKL1.EV:2021.08   MKL1.EV 2021.08
  #MKL-1 EV 2    -19.805802 -23.850122  MKL1.EV:2021.08   MKL1.EV 2021.08
  #MKL-1 EV 118    2.264943 -22.114427  MKL1.EV:2021.09   MKL1.EV 2021.09
  #> head(df_pc)
  #                     PC1        PC2       PC3        PC4       PC5        PC6
  #MKL-1 RNA     -71.108137  -2.054660 16.617315 -1.7281595  6.275895 -2.3597341
  #MKL-1 RNA 118 -62.978513  -6.906138  9.973149 -2.4119718  7.778461 -2.3797116
  #MKL-1 RNA 147 -77.698768  -3.355581 17.062958 -2.0865184  6.100746 -0.6303835
  #MKL-1 EV      -49.482607 -25.469602 -2.410902  2.2679166  5.300198 10.5329350
  #MKL-1 EV 2    -19.805802 -23.850122 -1.704857 -2.3895896 -1.742060 -3.7750239
  #MKL-1 EV 118    2.264943 -22.114427 -4.991651 -0.3336808 -2.973397 -4.5959970

  # Note: the numbers differ slightly from earlier runs because the virus gene expression was added to the table.
  # Sanity check: both tables must list the same samples in the same order.
  identical(rownames(data), rownames(df_pc)) #-->TRUE
  # Drop the two components reported by plotPCA(); the full set comes from df_pc.
  data[c("PC1", "PC2")] <- NULL
  merged_df <- merge(data, df_pc, by = "row.names")
  rownames(merged_df) <- merged_df[["Row.names"]]
  # Keep PC1..PC40 followed by the annotation columns; this selection also
  # discards the helper columns "Row.names" and "name".
  merged_df <- merged_df[, c(paste0("PC", 1:40), "group", "condition", "batch", "cell.line")]
  write.csv(merged_df, file = "merged_df_40PCs.csv")

  #using the python script to draw the 3D PCA-plot.
  # need to install plotly with pip ~/.local/bin/pip3 in .local/bin/python3.10/site-packages
  #python3 ~/Scripts/PCA_3D_drawing.py
  python3 ~/Scripts/PCA_3D_drawing.py merged_df_40PCs.csv

6, drawing PCA WaGa

  library(ggplot2)
  library(genefilter)  # provides rowVars()

  # PC1/PC2 plus the sample annotations for the WaGa samples, as returned by plotPCA()
  data <- plotPCA(rld_WaGa, intgroup = c("condition", "batch", "cell.line"), returnData = TRUE)
  write.csv(data, file = "plotPCA_data_WaGa.csv")

  # Repeat the PCA on the ntop most variable rows to obtain all components, not only PC1/PC2
  ntop <- 500
  rv <- rowVars(assay(rld_WaGa))
  select <- head(order(rv, decreasing = TRUE), ntop)
  mat <- t(assay(rld_WaGa)[select, ])
  pc <- prcomp(mat)
  summary(pc)
  #Proportion of Variance  0.6342 0.06491 0.06044 --> 0.63, 0.06, 0.06
  pc$x[, 1:3]
  # keep all components (use pc$x[, 1:3] instead to keep only the first three)
  df_pc <- data.frame(pc$x)

  # Note: the numbers differ slightly from earlier runs because the virus gene expression was added to the table.
  # Sanity check: both tables must list the same samples in the same order.
  identical(rownames(data), rownames(df_pc)) #-->TRUE
  # Drop the two components reported by plotPCA(); the full set comes from df_pc.
  data[c("PC1", "PC2")] <- NULL
  merged_df <- merge(data, df_pc, by = "row.names")
  rownames(merged_df) <- merged_df[["Row.names"]]
  # Keep PC1..PC20 followed by the annotation columns; this selection also
  # discards the helper columns "Row.names" and "name".
  merged_df <- merged_df[, c(paste0("PC", 1:20), "group", "condition", "batch", "cell.line")]
  write.csv(merged_df, file = "merged_df_WaGa.csv")

  python3 ~/Scripts/PCA_3D_drawing_WaGa.py

7, drawing PCA MKL1

  library(ggplot2)
  library(genefilter)  # provides rowVars()

  # PC1/PC2 plus the sample annotations for the MKL-1 samples, as returned by plotPCA()
  data <- plotPCA(rld_MKL1, intgroup = c("condition", "batch", "cell.line"), returnData = TRUE)
  write.csv(data, file = "plotPCA_data_MKL1.csv")

  # Repeat the PCA on the ntop most variable rows to obtain all components, not only PC1/PC2
  ntop <- 500
  rv <- rowVars(assay(rld_MKL1))
  select <- head(order(rv, decreasing = TRUE), ntop)
  mat <- t(assay(rld_MKL1)[select, ])
  pc <- prcomp(mat)
  summary(pc)
  #Proportion of Variance  0.7549  0.05717  0.04503 --> 0.75, 0.06, 0.05
  pc$x[, 1:3]
  # keep all components (use pc$x[, 1:3] instead to keep only the first three)
  df_pc <- data.frame(pc$x)

  # Note: the numbers differ slightly from earlier runs because the virus gene expression was added to the table.
  # Sanity check: both tables must list the same samples in the same order.
  identical(rownames(data), rownames(df_pc)) #-->TRUE
  # Drop the two components reported by plotPCA(); the full set comes from df_pc.
  data[c("PC1", "PC2")] <- NULL
  merged_df <- merge(data, df_pc, by = "row.names")
  rownames(merged_df) <- merged_df[["Row.names"]]
  # Keep PC1..PC14 followed by the annotation columns; this selection also
  # discards the helper columns "Row.names" and "name".
  merged_df <- merged_df[, c(paste0("PC", 1:14), "group", "condition", "batch", "cell.line")]
  write.csv(merged_df, file = "merged_df_MKL1.csv")

  python3 ~/Scripts/PCA_3D_drawing_MKL1.py

  # -- before heatmap --
  ## sample-to-sample distance heatmap of the transformed counts, before batch correction
  library(gplots)
  library("RColorBrewer")
  # Euclidean distances between samples (columns of the assay)
  distsRL <- dist(t(assay(rld)))
  mat <- as.matrix(distsRL)
  # optional relabelling of rows/columns:
  #paste( rld$dex, rld$cell, sep="-" )
  #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,batch, sep=":"))
  #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,ids, sep=":"))
  hc <- hclust(distsRL)
  hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
  png("heatmap_before_donor_correction.png", 1200, 800)
  # `margins` is spelled out in full; the previous `margin=` only worked through
  # partial argument matching.
  heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none",
            col = rev(hmcol), margins = c(13, 13))
  dev.off()

  # -- remove batch effect --
  # Design matrix of the condition effect, passed as `design` to removeBatchEffect().
  mm <- model.matrix(~condition, data = colData(rld))
  # Remove the batch term from the transformed values and store them back in rld,
  # so that every later plot based on rld uses the corrected data.
  mat <- limma::removeBatchEffect(assay(rld), batch = rld$batch, design = mm)
  assay(rld) <- mat

  # -- after pca --
  # Build the plot once and write it to both formats. Each device is closed before
  # the next one is opened: opening png() and then svg() with a single dev.off()
  # leaves the PNG device open and the PNG file without a plot.
  pca_after <- plotPCA(rld, intgroup=c("condition"))
  png("pca_after_batch_correction.png", 1200, 800)
  print(pca_after)
  dev.off()
  svg("pca_after_batch_correction.svg")
  print(pca_after)
  dev.off()

  library(ggplot2)
  # PC1/PC2 of the current rld together with condition and cell line
  data <- plotPCA(rld, intgroup = c("condition", "cell.line"), returnData = TRUE)
  # Positional rename: the 4th and 5th columns become the legend titles "Group" and "Cell.line"
  names(data) <- c("PC1", "PC2", "group2", "Group", "Cell.line", "name")
  percentVar <- round(100 * attr(data, "percentVar"))
  # NOTE(review): the colour keys must equal the levels of Group; levels that carry a
  # cell-line prefix (e.g. "MKL1.RNA") would not match any key -- confirm.
  group_colors <- c("RNA" = "grey",
                    "EV" = "cyan",
                    "scr.DMSO" = "#b2df8a",
                    "sT.DMSO" = "#33a02c",
                    "scr.Dox" = "#fb9a99",
                    "sT.Dox" = "#e31a1c")
  # alternative raster output: png("pca6.png", 1000, 1000)
  svg("pca6.svg", 10, 8)
  ggplot(data, aes(x = PC1, y = PC2, color = Group, shape = Cell.line)) +
    geom_point(size = 3) +
    scale_color_manual(values = group_colors) +
    labs(x = paste0("PC1: ", percentVar[1], "% variance"),
         y = paste0("PC2: ", percentVar[2], "% variance")) +
    coord_fixed()
  dev.off()

  # -- after heatmap --
  ## sample-to-sample distance heatmap of the values currently stored in rld
  distsRL <- dist(t(assay(rld)))
  mat <- as.matrix(distsRL)
  # Label rows/columns as "condition:donor".
  # NOTE(review): labels are taken from colData(dds) while the distances come from rld;
  # this assumes both objects hold the samples in the same order -- confirm.
  rownames(mat) <- colnames(mat) <- with(colData(dds), paste(condition, donor, sep = ":"))
  #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,ids, sep=":"))
  hc <- hclust(distsRL)
  hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
  png("heatmap_after_donor_correction.png", 1200, 800)
  # `margins` is spelled out in full; the previous `margin=` only worked through
  # partial argument matching.
  heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none",
            col = rev(hmcol), margins = c(13, 13))
  dev.off()

  # Convert BAM to bigWig with deepTools, scaling each sample by the inverse of its DESeq2 size factor.
  # The recorded output of this first call is NULL (no size factors stored yet).
  sizeFactors(dds)
  #NULL
  # Estimate the per-sample size factors and store them in dds.
  dds <- estimateSizeFactors(dds)
  > sizeFactors(dds)
      MKL-1 parental cell RNA MKL-1 parental cell RNA 118
                    2.5689118                   1.6270659
  MKL-1 parental cell RNA 147             MKL-1 wt EV RNA
                    1.9241464                   1.5296877
            MKL-1 wt EV RNA 2         MKL-1 wt EV RNA 118
                    1.2829153                   0.6786713
          MKL-1 wt EV RNA 87          MKL-1 wt EV RNA 27
                    0.6741590                   0.4176156
          MKL-1 wt EV RNA 042    MKL-1 sT DMSO EV RNA 042
                    1.3360156                   0.9539241
    MKL-1 sT DMSO EV RNA 0505   MKL-1 scr DMSO EV RNA 042
                    1.7357438                   1.2957555
  MKL-1 scr DMSO EV RNA 0505     MKL-1 sT Dox EV RNA 042
                    1.0908450                   1.3657925
    MKL-1 sT Dox EV RNA 0505    MKL-1 scr Dox EV RNA 042
                    1.1221456                   1.4708191
    MKL-1 scr Dox EV RNA 0505      WaGa parental cell RNA
                    1.1242096                   2.1097851
  WaGa parental cell RNA 118  WaGa parental cell RNA 147
                    1.6925780                   1.6712182
              WaGa wt EV RNA            WaGa wt EV RNA 2
                    0.6021352                   1.2486966
          WaGa wt EV RNA 118          WaGa wt EV RNA 147
                    0.4518733                   0.5695057
          WaGa wt EV RNA 226         WaGa wt EV RNA 1107
                    0.5449802                   1.4885064
          WaGa wt EV RNA 1605         WaGa wt EV RNA 2706
                    1.2631181                   0.7678146
    WaGa sT DMSO EV RNA 1107    WaGa sT DMSO EV RNA 1605
                    0.7981256                   0.9054709
    WaGa sT DMSO EV RNA 2706   WaGa scr DMSO EV RNA 1107
                    1.0603277                   0.7927572
    WaGa scr DMSO EV RNA 1605   WaGa scr DMSO EV RNA 2706
                    0.8115785                   1.0659662
      WaGa sT Dox EV RNA 1107     WaGa sT Dox EV RNA 1605
                    0.8553457                   0.9547341
      WaGa sT Dox EV RNA 2706    WaGa scr Dox EV RNA 1107
                    0.9081678                   0.7440897
    WaGa scr Dox EV RNA 1605    WaGa scr Dox EV RNA 2706
                    0.8735490                   1.0146989

  # Export the raw and the size-factor-normalized count matrices as tab-separated text.
  # col.names = NA writes an empty header cell above the row-name column.
  raw_counts <- counts(dds)
  normalized_counts <- counts(dds, normalized = TRUE)
  # quote = FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
  write.table(raw_counts, file = "raw_counts.txt", sep = "\t", quote = FALSE, col.names = NA)
  write.table(normalized_counts, file = "normalized_counts.txt", sep = "\t", quote = FALSE, col.names = NA)

  # Shell commands (not R): create one bigWig per sample with deepTools bamCoverage,
  # using 10 bp bins and a per-sample --scaleFactor.
  # NOTE(review): the scale factors below are not the exact reciprocals of the size factors
  # printed above (e.g. 1/2.1097851 = 0.474 for "WaGa parental cell RNA", while WaGa_RNA uses
  # 0.4958732) -- presumably they stem from an earlier run; recompute before reuse.
  # Template (fill in the sample name and its scale factor):
  #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
  # WaGa cellular RNA
  bamCoverage --bam ../markDuplicates/WaGa_RNAAligned.sortedByCoord.out.markDups.bam -o WaGa_RNA.bw --binSize 10 --scaleFactor 0.4958732    --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/WaGa_RNA_118Aligned.sortedByCoord.out.markDups.bam -o WaGa_RNA_118.bw --binSize 10 --scaleFactor 0.6013898        --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/WaGa_RNA_147Aligned.sortedByCoord.out.markDups.bam -o WaGa_RNA_147.bw --binSize 10 --scaleFactor 0.6154516      --effectiveGenomeSize 2864785220

  # MKL-1 cellular RNA
  bamCoverage --bam ../markDuplicates/MKL1_RNAAligned.sortedByCoord.out.markDups.bam -o MKL1_RNA.bw --binSize 10 --scaleFactor  0.4078775      --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/MKL1_RNA_118Aligned.sortedByCoord.out.markDups.bam -o MKL1_RNA_118.bw --binSize 10 --scaleFactor 0.6525297       --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/MKL1_RNA_147Aligned.sortedByCoord.out.markDups.bam -o MKL1_RNA_147.bw --binSize 10 --scaleFactor 0.5331748       --effectiveGenomeSize 2864785220

  # WaGa EV RNA
  bamCoverage --bam ../markDuplicates/WaGa_EV_RNAAligned.sortedByCoord.out.markDups.bam -o WaGa_EV_RNA.bw --binSize 10 --scaleFactor 1.733964       --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/WaGa_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o WaGa_EV_RNA_2.bw --binSize 10 --scaleFactor  0.7761222      --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/WaGa_EV_RNA_118Aligned.sortedByCoord.out.markDups.bam -o WaGa_EV_RNA_118.bw --binSize 10 --scaleFactor  2.222971      --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/WaGa_EV_RNA_147Aligned.sortedByCoord.out.markDups.bam -o WaGa_EV_RNA_147.bw --binSize 10 --scaleFactor  1.679485     --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/WaGa_EV_RNA_226Aligned.sortedByCoord.out.markDups.bam -o WaGa_EV_RNA_226.bw --binSize 10 --scaleFactor  1.901282     --effectiveGenomeSize 2864785220

  # MKL-1 EV RNA
  bamCoverage --bam ../markDuplicates/MKL1_EV_RNAAligned.sortedByCoord.out.markDups.bam -o MKL1_EV_RNA.bw --binSize 10 --scaleFactor 0.6469583       --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/MKL1_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o MKL1_EV_RNA_2.bw --binSize 10 --scaleFactor 0.6877478       --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/MKL1_EV_RNA_27Aligned.sortedByCoord.out.markDups.bam -o MKL1_EV_RNA_27.bw --binSize 10 --scaleFactor 2.462424       --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/MKL1_EV_RNA_87Aligned.sortedByCoord.out.markDups.bam -o MKL1_EV_RNA_87.bw --binSize 10 --scaleFactor 1.411154       --effectiveGenomeSize 2864785220
  bamCoverage --bam ../markDuplicates/MKL1_EV_RNA_118Aligned.sortedByCoord.out.markDups.bam -o MKL1_EV_RNA_118.bw --binSize 10 --scaleFactor 1.544239       --effectiveGenomeSize 2864785220

  # All result files of this section are written into the "degenes" directory.
  setwd("degenes")
  # Each stanza below (1) makes one condition the reference level, (2) refits the model,
  # (3) lists the available coefficient names and (4) stores the comparisons of interest in clist.
  # clist is overwritten by the next stanza, so run the export loop `for (i in clist) {...}`
  # (further below) after each stanza before moving on to the next one.
  #---- * to untreated and wildtype ----
  # MKL-1: EV RNA vs. cellular RNA
  dds$condition <- relevel(dds$condition, "MKL1.RNA")
  dds = DESeq(dds, betaPrior=FALSE)
  resultsNames(dds)
  clist <- c("MKL1.EV_vs_MKL1.RNA")

  # MKL-1: every shRNA/treatment condition vs. EV
  dds$condition <- relevel(dds$condition, "MKL1.EV")
  dds = DESeq(dds, betaPrior=FALSE)
  resultsNames(dds)
  clist <- c("MKL1.sT.DMSO_vs_MKL1.EV", "MKL1.sT.Dox_vs_MKL1.EV", "MKL1.scr.DMSO_vs_MKL1.EV", "MKL1.scr.Dox_vs_MKL1.EV")

  #For internal comparisons between sT.DMSO, sT.Dox, scr.DMSO and scr.Dox see the chapter "Do separate shRNA and treatment analysis"

  # WaGa: EV RNA vs. cellular RNA
  dds$condition <- relevel(dds$condition, "WaGa.RNA")
  dds = DESeq(dds, betaPrior=FALSE)
  resultsNames(dds)
  clist <- c("WaGa.EV_vs_WaGa.RNA")

  # WaGa: every shRNA/treatment condition vs. EV
  dds$condition <- relevel(dds$condition, "WaGa.EV")
  dds = DESeq(dds, betaPrior=FALSE)
  resultsNames(dds)
  clist <- c("WaGa.sT.DMSO_vs_WaGa.EV", "WaGa.sT.Dox_vs_WaGa.EV", "WaGa.scr.DMSO_vs_WaGa.EV", "WaGa.scr.Dox_vs_WaGa.EV")

  #For internal comparisons between sT.DMSO, sT.Dox, scr.DMSO and scr.Dox see the chapter "Do separate shRNA and treatment analysis"

  # Between cell lines: WaGa vs. MKL-1 cellular RNA (same reference level as the first stanza)
  dds$condition <- relevel(dds$condition, "MKL1.RNA")
  dds = DESeq(dds, betaPrior=FALSE)
  resultsNames(dds)
  clist <- c("WaGa.RNA_vs_MKL1.RNA")

  # Between cell lines: WaGa vs. MKL-1 EV RNA (same reference level as the second stanza)
  dds$condition <- relevel(dds$condition, "MKL1.EV")
  dds = DESeq(dds, betaPrior=FALSE)
  resultsNames(dds)
  clist <- c("WaGa.EV_vs_MKL1.EV")

  ##https://bioconductor.statistik.tu-dortmund.de/packages/3.7/data/annotation/
  #BiocManager::install("EnsDb.Mmusculus.v79")
  #library(EnsDb.Mmusculus.v79)
  #edb <- EnsDb.Mmusculus.v79
  #https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
  #https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
  # Connect to Ensembl through biomaRt; the resulting `ensembl` mart object is what the
  # export loops pass to getBM() for gene annotation.
  library(biomaRt)
  # Show the available Ensembl marts.
  listEnsembl()
  listMarts()
  # Earlier attempts, kept for reference:
  #ensembl <- useEnsembl(biomart = "genes", mirror="asia")  # default is Mouse strains 104
  #ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", mirror = "www")
  #ensembl = useMart("ensembl_mart_44", dataset="hsapiens_gene_ensembl",archive=TRUE, mysql=TRUE)
  #ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", version="104")
  #ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="86")
  #ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="GRCh37")
  #--> total 69, 27  GRCh38.p7 and 39  GRCm38.p4
  #DEBUG: with R version 4.3.3, Ensembl version 104 could not be loaded (error below); version 112 is used instead.
  #ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
  #Error in bmRequest(request = request, httr_config = httr_config, verbose = verbose) :
  #  Internal Server Error (HTTP 500).
  #--> total 202   80                         GRCh38.p13         107                            GRCm39
  #80           hsapiens_gene_ensembl                                      Human genes (GRCh38.p13)                         GRCh38.p13
  #107         mmusculus_gene_ensembl                                        Mouse genes (GRCm39)                            GRCm39
  # Human gene annotation, pinned to Ensembl release 112.
  ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="112")
  #80                         GRCh38.p14
  #107                            GRCm39
  # Table of the datasets offered by this mart.
  datasets <- listDatasets(ensembl)

  > listEnsemblArchives()
              name     date                                url version
  1  Ensembl GRCh37 Feb 2014          http://grch37.ensembl.org  GRCh37  *
  2     Ensembl 104 May 2021 http://may2021.archive.ensembl.org     104  *
  3     Ensembl 103 Feb 2021 http://feb2021.archive.ensembl.org     103
  4     Ensembl 102 Nov 2020 http://nov2020.archive.ensembl.org     102
  # Table of the attributes that getBM() can return for this mart; show the first 25 rows.
  attributes <- listAttributes(ensembl)
  attributes[seq_len(25), ]

  # Export annotated DESeq2 results for every comparison named in clist.
  # For each comparison <cmp> three comma-separated files are written:
  #   <cmp>-all.txt   all genes with a log2 fold change, ordered by p-value
  #   <cmp>-up.txt    padj <= 0.05 and log2FoldChange >= 2, strongest first
  #   <cmp>-down.txt  padj <= 0.05 and log2FoldChange <= -2, strongest first
  # Requires `dds` (fitted with DESeq()) and the biomaRt mart `ensembl`.
  for (i in clist) {
    #i<-clist[1]
    contrast <- paste("condition", i, sep = "_")
    res <- results(dds, name = contrast)
    res <- res[!is.na(res$log2FoldChange), ]

    # Annotate the Ensembl gene IDs (row names of res) via biomaRt; this replaces the
    # earlier AnnotationDbi::select(edb86, ...) lookup.
    # filters = 'ensembl_gene_id' means only records with a valid Ensembl gene ID are returned.
    geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
        filters = 'ensembl_gene_id',
        values = rownames(res),
        mart = ensembl)
    # Keep the first annotation row per gene. Base R duplicated() is used instead of
    # dplyr::distinct() so the loop does not fail when dplyr is not attached.
    geness_uniq <- geness[!duplicated(geness$ensembl_gene_id), , drop = FALSE]

    # Join annotation and statistics on the Ensembl gene ID.
    res_df <- as.data.frame(res)
    res_df$ENSEMBL <- rownames(res_df)
    geness_res <- merge(geness_uniq, res_df, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
    rownames(geness_res) <- geness_res$ensembl_gene_id
    geness_res$ensembl_gene_id <- NULL

    write.csv(geness_res[order(geness_res$pvalue), ], file = paste(i, "all.txt", sep = "-"))
    # subset() drops rows whose padj is NA
    up <- subset(geness_res, padj <= 0.05 & log2FoldChange >= 2)
    down <- subset(geness_res, padj <= 0.05 & log2FoldChange <= -2)
    write.csv(up[order(up$log2FoldChange, decreasing = TRUE), ], file = paste(i, "up.txt", sep = "-"))
    write.csv(down[order(abs(down$log2FoldChange), decreasing = TRUE), ], file = paste(i, "down.txt", sep = "-"))
  }

8, do separate shRNA and treatment analysis

  #In DESeq2, resultsNames(dds) gives you the names of the coefficients in the linear model that DESeq2 has fit to the count data. Each name corresponds to a comparison between levels of your factors (or the interaction between them) that DESeq2 can test for differential expression. Here's what each of these results names likely means based on typical DESeq2 output:
  #  * "Intercept": This represents the base level of expression estimated by the model. If your factors are shRNA and treatment, and if the reference levels for these factors are scr and DMSO, respectively, then the "Intercept" represents the expression level for the reference group, which in this case is cells treated with scr and DMSO.
  #  * "shRNA_sT_vs_scr": This represents the comparison between the sT and scr levels of the shRNA factor. This means it represents the log2 fold change in expression from the scr condition to the sT condition, while keeping the treatment factor constant (at its reference level, likely DMSO).
  #  * "treatment_Dox_vs_DMSO": This represents the comparison between the Dox and DMSO levels of the treatment factor. This means it represents the log2 fold change in expression from the DMSO condition to the Dox condition, while keeping the shRNA factor constant (at its reference level, likely scr).
  #  * "shRNAsT.treatmentDox": This is the interaction term between the sT level of shRNA and the Dox level of treatment. This term tests whether the effect of the treatment (Dox vs. DMSO) is different in the sT condition compared to the scr condition. In other words, it tests whether the difference in expression due to the treatment (Dox) is different when the cells are treated with sT compared to when they are treated with scr.
  #If you have an interaction term in your design, you should interpret main effects (shRNA_sT_vs_scr and treatment_Dox_vs_DMSO) cautiously because the main effects are evaluated at the reference level of the other factor, and their simple interpretation is complicated by the presence of interaction. Interaction term (shRNAsT.treatmentDox) can indicate if there's a specific combination of shRNA and treatment that has a differential expression different from what would be expected based on the individual effects of shRNA and treatment.
  #当你运行 DESeq2 并使用 resultsNames(dds) 函数时,你将得到一系列的结果名称,这些名称代表了模型中的比较。在你给出的例子中:
  #  * "Intercept":截距,代表模型的基线水平。在生物统计学中,这通常指没有进行任何处理的条件。
  #  * "shRNA_sT_vs_scr":这代表 shRNA 的 sT 条件与 scr 条件的比较。如果你的实验设计中有两种不同的 shRNA 处理(sT 和 scr),这个比较会告诉你,与 scr 相比,sT 处理导致的基因表达量变化。
  #  * "treatment_Dox_vs_DMSO":这代表了处理条件 Dox 与 DMSO 的比较。这会显示在 DMSO(常作为对照组)和 Dox(可能是某种药物或处理)之间的差异。
  #  * "shRNAsT.treatmentDox":这是一个交互项,表示 shRNA 的 sT 处理和 Dox 处理的结合效应。这意味着它比较的是在 sT shRNA 影响下,Dox 对基因表达的影响与在 scr shRNA 影响下,Dox 对基因表达的影响之间的差异。
  #简而言之,这些结果名称代表你的实验中不同条件或处理组合的比较,这有助于你了解不同实验条件下的基因表达如何变化。

  # Two-factor analysis (shRNA x treatment) of the MKL-1 knock-down samples only:
  # drop the parental-cell, wild-type and WaGa columns.
  d_MKL1 <- d[, !(grepl("parental cell|wt|WaGa", names(d)))]
  # Sample annotation, in the column order of d_MKL1.
  cell.line <- factor(rep("MKL-1", 8))
  # Reference levels are set explicitly ("scr" and "DMSO"). With as.factor() the level
  # order follows the locale's sort order (in the C locale "sT" sorts before "scr"),
  # which would rename the coefficients hard-coded in `contrasts` below.
  shRNA <- factor(c("sT","sT","scr","scr","sT","sT","scr","scr"), levels = c("scr", "sT"))
  treatment <- factor(c("DMSO","DMSO","DMSO","DMSO","Dox","Dox","Dox","Dox"), levels = c("DMSO", "Dox"))
  cData <- data.frame(row.names = colnames(d_MKL1), shRNA = shRNA, treatment = treatment, cell.line = cell.line)
  dds_MKL1 <- DESeqDataSetFromMatrix(countData = d_MKL1, colData = cData, design = ~shRNA + treatment + shRNA:treatment)

  #png("pca_shRNA_treatment.png", 1200, 800)
  #plotPCA(rld, intgroup = c("shRNA", "treatment"))
  #dev.off()
  # Factors: shRNA type (scr or sT) and treatment (DMSO or Dox); design = ~ shRNA + treatment + shRNA:treatment
  # Examples of how the fitted coefficients can be combined:
  #DESeq2Res <- results(dds_MKL1, contrast = c("treatment","Dox","DMSO"))  # scr_Dox vs scr_DMSO: effect of Dox in scr
  #DESeq2Res <- results(dds_MKL1, contrast = list(c("treatment_Dox_vs_DMSO","shRNAsT.treatmentDox")))  # sT_Dox vs sT_DMSO: effect of Dox in sT
  #DESeq2Res <- results(dds_MKL1, name = "shRNAsT.treatmentDox")  # interaction: is the effect of Dox different in sT vs scr?

  dds_MKL1 <- DESeq(dds_MKL1, betaPrior = FALSE)
  resultsNames(dds_MKL1)
  contrasts <- c("shRNA_sT_vs_scr", "treatment_Dox_vs_DMSO", "shRNAsT.treatmentDox")

  # Export annotated DESeq2 results for every coefficient named in contrasts (MKL-1 model).
  # For each coefficient <name> three comma-separated files are written:
  #   <name>-all.txt   all genes with a log2 fold change, ordered by p-value
  #   <name>-up.txt    padj <= 0.05 and log2FoldChange >= 2, strongest first
  #   <name>-down.txt  padj <= 0.05 and log2FoldChange <= -2, strongest first
  # Requires `dds_MKL1` (fitted with DESeq()) and the biomaRt mart `ensembl`.
  for (contrast in contrasts) {
    res <- results(dds_MKL1, name = contrast)
    res <- res[!is.na(res$log2FoldChange), ]

    # Annotate the Ensembl gene IDs (row names of res) via biomaRt; this replaces the
    # earlier AnnotationDbi::select(edb86, ...) lookup.
    # filters = 'ensembl_gene_id' means only records with a valid Ensembl gene ID are returned.
    geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
        filters = 'ensembl_gene_id',
        values = rownames(res),
        mart = ensembl)
    # Keep the first annotation row per gene. Base R duplicated() is used instead of
    # dplyr::distinct() so the loop does not fail when dplyr is not attached.
    geness_uniq <- geness[!duplicated(geness$ensembl_gene_id), , drop = FALSE]

    # Join annotation and statistics on the Ensembl gene ID.
    res_df <- as.data.frame(res)
    res_df$ENSEMBL <- rownames(res_df)
    geness_res <- merge(geness_uniq, res_df, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
    rownames(geness_res) <- geness_res$ensembl_gene_id
    geness_res$ensembl_gene_id <- NULL

    write.csv(geness_res[order(geness_res$pvalue), ], file = paste(contrast, "all.txt", sep = "-"))
    # subset() drops rows whose padj is NA
    up <- subset(geness_res, padj <= 0.05 & log2FoldChange >= 2)
    down <- subset(geness_res, padj <= 0.05 & log2FoldChange <= -2)
    write.csv(up[order(up$log2FoldChange, decreasing = TRUE), ], file = paste(contrast, "up.txt", sep = "-"))
    write.csv(down[order(abs(down$log2FoldChange), decreasing = TRUE), ], file = paste(contrast, "down.txt", sep = "-"))
  }

  # Shell: prefix the MKL-1 result files with "MKL1_" (all/up/down for each coefficient).
  for name in shRNA_sT_vs_scr treatment_Dox_vs_DMSO shRNAsT.treatmentDox; do
    for part in all up down; do
      mv "${name}-${part}.txt" "MKL1_${name}-${part}.txt"
    done
  done

  # Two-factor analysis (shRNA x treatment) of the WaGa knock-down samples only:
  # drop the parental-cell, wild-type and MKL columns.
  d_WaGa <- d[, !(grepl("parental cell|wt|MKL", names(d)))]
  # Sample annotation, in the column order of d_WaGa.
  cell.line <- factor(rep("WaGa", 12))
  # Reference levels are set explicitly ("scr" and "DMSO"). With as.factor() the level
  # order follows the locale's sort order (in the C locale "sT" sorts before "scr"),
  # which would rename the coefficients hard-coded in `contrasts` below.
  shRNA <- factor(c("sT","sT","sT","scr","scr","scr","sT","sT","sT","scr","scr","scr"), levels = c("scr", "sT"))
  treatment <- factor(c("DMSO","DMSO","DMSO","DMSO","DMSO","DMSO","Dox","Dox","Dox","Dox","Dox","Dox"), levels = c("DMSO", "Dox"))
  cData <- data.frame(row.names = colnames(d_WaGa), shRNA = shRNA, treatment = treatment, cell.line = cell.line)
  dds_WaGa <- DESeqDataSetFromMatrix(countData = d_WaGa, colData = cData, design = ~shRNA + treatment + shRNA:treatment)

  dds_WaGa <- DESeq(dds_WaGa, betaPrior = FALSE)
  resultsNames(dds_WaGa)
  contrasts <- c("shRNA_sT_vs_scr", "treatment_Dox_vs_DMSO", "shRNAsT.treatmentDox")

  # Export annotated DESeq2 results for every coefficient named in contrasts (WaGa model).
  # For each coefficient <name> three comma-separated files are written:
  #   <name>-all.txt   all genes with a log2 fold change, ordered by p-value
  #   <name>-up.txt    padj <= 0.05 and log2FoldChange >= 2, strongest first
  #   <name>-down.txt  padj <= 0.05 and log2FoldChange <= -2, strongest first
  # Requires `dds_WaGa` (fitted with DESeq()) and the biomaRt mart `ensembl`.
  for (contrast in contrasts) {
    res <- results(dds_WaGa, name = contrast)
    res <- res[!is.na(res$log2FoldChange), ]

    # Annotate the Ensembl gene IDs (row names of res) via biomaRt; this replaces the
    # earlier AnnotationDbi::select(edb86, ...) lookup.
    # filters = 'ensembl_gene_id' means only records with a valid Ensembl gene ID are returned.
    geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
        filters = 'ensembl_gene_id',
        values = rownames(res),
        mart = ensembl)
    # Keep the first annotation row per gene. Base R duplicated() is used instead of
    # dplyr::distinct() so the loop does not fail when dplyr is not attached.
    geness_uniq <- geness[!duplicated(geness$ensembl_gene_id), , drop = FALSE]

    # Join annotation and statistics on the Ensembl gene ID.
    res_df <- as.data.frame(res)
    res_df$ENSEMBL <- rownames(res_df)
    geness_res <- merge(geness_uniq, res_df, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
    rownames(geness_res) <- geness_res$ensembl_gene_id
    geness_res$ensembl_gene_id <- NULL

    write.csv(geness_res[order(geness_res$pvalue), ], file = paste(contrast, "all.txt", sep = "-"))
    # subset() drops rows whose padj is NA
    up <- subset(geness_res, padj <= 0.05 & log2FoldChange >= 2)
    down <- subset(geness_res, padj <= 0.05 & log2FoldChange <= -2)
    write.csv(up[order(up$log2FoldChange, decreasing = TRUE), ], file = paste(contrast, "up.txt", sep = "-"))
    write.csv(down[order(abs(down$log2FoldChange), decreasing = TRUE), ], file = paste(contrast, "down.txt", sep = "-"))
  }

  # Shell: prefix the WaGa result files with "WaGa_" (all/up/down for each coefficient).
  for name in shRNA_sT_vs_scr treatment_Dox_vs_DMSO shRNAsT.treatmentDox; do
    for part in all up down; do
      mv "${name}-${part}.txt" "WaGa_${name}-${part}.txt"
    done
  done

  # Shell: bundle the up/down/all tables of every coefficient into one .xls workbook per cell line.
  for prefix in MKL1 WaGa; do
    for name in shRNA_sT_vs_scr treatment_Dox_vs_DMSO shRNAsT.treatmentDox; do
      ~/Tools/csv2xls-0.4/csv_to_xls.py ${prefix}_${name}-up.txt ${prefix}_${name}-down.txt ${prefix}_${name}-all.txt -d$',' -o ${prefix}_${name}.xls
    done
  done

9, volcano plots with automatically finding top_g

  #A canonical visualization for interpreting differential gene expression results is the volcano plot.
  library(ggrepel)

  # Comparison to plot: "<cmp>-all.txt" is read and "<cmp>.png" is written, so the
  # name only has to be changed in this one place.
  # Names used in an earlier run:
  #"EV.RNA_vs_RNA"  "scr.DMSO_vs_EV.RNA" "sT.DMSO_vs_EV.RNA" "scr.Dox_vs_EV.RNA" "sT.Dox_vs_EV.RNA"  "sT.Dox_vs_scr.Dox"  "sT.DMSO_vs_scr.DMSO"
  cmp <- "WaGa.EV_vs_WaGa.RNA"
  # Number of label candidates taken from each end of the signed ranking.
  n_top <- 200

  geness_res <- read.csv(file = paste(cmp, "all.txt", sep = "-"), row.names = 1)

  # Significance category per gene; later assignments overwrite earlier ones, and genes
  # with |log2FC| < 2 are reset to the baseline category at the end.
  geness_res$Color <- "NS or log2FC < 2.0"
  geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
  geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
  geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
  geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
  geness_res$Color <- factor(geness_res$Color,
                          levels = c("NS or log2FC < 2.0", "P < 0.05",
                                    "P-adj < 0.05", "P-adj < 0.001"))

  # Signed significance: -log10(p) carrying the sign of the fold change. It is kept as a
  # separate vector, so no helper column has to be removed from geness_res afterwards.
  invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
  # head() returns at most n_top indices, so tables with fewer than n_top rows do not
  # produce NA indices (as order(...)[1:200] would).
  top_g <- unique(c(
    geness_res$external_gene_name[head(order(invert_P, decreasing = TRUE), n_top)],
    geness_res$external_gene_name[head(order(invert_P, decreasing = FALSE), n_top)]
  ))

  png(paste0(cmp, ".png"), width = 1400, height = 1000)
  ggplot(geness_res,
        aes(x = log2FoldChange, y = -log10(pvalue),
            color = Color, label = external_gene_name)) +
    geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
    geom_hline(yintercept = -log10(0.05), lty = "dashed") +
    geom_point() +
    labs(x = "log2(FC)",
        y = "Significance, -log10(P)",
        color = "Significance") +
    scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                  `P-adj < 0.05` = "lightblue",
                                  `P < 0.05` = "orange2",
                                  `NS or log2FC < 2.0` = "gray"),
                      guide = guide_legend(override.aes = list(size = 4))) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
    # Label only the top candidates that also pass both thresholds drawn as dashed lines.
    geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & abs(log2FoldChange) >= 2.0),
                    size = 4, point.padding = 0.15, color = "black",
                    min.segment.length = .1, box.padding = .2, lwd = 2) +
    theme_bw(base_size = 16) +
    theme(legend.position = "bottom")
  dev.off()

  #sed -i -e 's/Color/Category/g' *_Category.csv

  #x <- data.frame(k1 = c(NA,3,4,5,2), k2 = c(1,NA,4,5,2), data = 6:10)
  #merge(x, y, by = "k2")

  # Generate (print, not run) one csv_to_xls.py command per comparison. Each
  # command bundles the -all/-up/-down tables of a comparison into one .xls file.
  # Inside the double-quoted echo string, $',' is printed literally; when the
  # printed command is executed, bash expands $',' to a plain comma delimiter.
  for cmp in "EV.RNA_vs_RNA"  "scr.DMSO_vs_EV.RNA" "sT.DMSO_vs_EV.RNA" "scr.Dox_vs_EV.RNA" "sT.Dox_vs_EV.RNA"  "sT.Dox_vs_scr.Dox"  "sT.DMSO_vs_scr.DMSO"; do
    echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${cmp}-all.txt ${cmp}-up.txt ${cmp}-down.txt -d$',' -o ${cmp}.xls"
  done
  # The seven commands below are the pasted output of the loop above.
  ~/Tools/csv2xls-0.4/csv_to_xls.py EV.RNA_vs_RNA-all.txt EV.RNA_vs_RNA-up.txt EV.RNA_vs_RNA-down.txt -d$',' -o EV.RNA_vs_RNA.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py scr.DMSO_vs_EV.RNA-all.txt scr.DMSO_vs_EV.RNA-up.txt scr.DMSO_vs_EV.RNA-down.txt -d$',' -o scr.DMSO_vs_EV.RNA.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py sT.DMSO_vs_EV.RNA-all.txt sT.DMSO_vs_EV.RNA-up.txt sT.DMSO_vs_EV.RNA-down.txt -d$',' -o sT.DMSO_vs_EV.RNA.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py scr.Dox_vs_EV.RNA-all.txt scr.Dox_vs_EV.RNA-up.txt scr.Dox_vs_EV.RNA-down.txt -d$',' -o scr.Dox_vs_EV.RNA.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py sT.Dox_vs_EV.RNA-all.txt sT.Dox_vs_EV.RNA-up.txt sT.Dox_vs_EV.RNA-down.txt -d$',' -o sT.Dox_vs_EV.RNA.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py sT.Dox_vs_scr.Dox-all.txt sT.Dox_vs_scr.Dox-up.txt sT.Dox_vs_scr.Dox-down.txt -d$',' -o sT.Dox_vs_scr.Dox.xls
  ~/Tools/csv2xls-0.4/csv_to_xls.py sT.DMSO_vs_scr.DMSO-all.txt sT.DMSO_vs_scr.DMSO-up.txt sT.DMSO_vs_scr.DMSO-down.txt -d$',' -o sT.DMSO_vs_scr.DMSO.xls
  # This workbook is built from the -up and -down tables only; the -all table
  # named in the next comment line is not passed to the command.
  #WaGa.EV_vs_WaGa.RNA-all.txt
  ~/Tools/csv2xls-0.4/csv_to_xls.py WaGa.EV_vs_WaGa.RNA-up.txt WaGa.EV_vs_WaGa.RNA-down.txt -d$',' -o WaGa.EV_vs_WaGa.RNA.xls

10, cluster the genes and draw heatmaps

  # Install gplots only when it is missing, so that re-running these notes does
  # not re-download and re-install the package every time.
  if (!requireNamespace("gplots", quietly = TRUE)) {
    install.packages("gplots")
  }
  library("gplots")

  #Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
  # Union of the significant genes of the four pairwise comparisons
  # (author's recorded counts: 873 ids in total, 663 after de-duplication).
  all_genes <- unique(c(
    rownames(mock_sT_d8_vs_mock_sT_d3_sig),
    rownames(sT_d3_vs_mock_sT_d3_sig),
    rownames(sT_d8_vs_mock_sT_d8_sig),
    rownames(sT_d8_vs_sT_d3_sig)
  ))
  #all_genes2 <- c(rownames(WAC_vs_mock_sig),rownames(WAP_vs_mock_sig),rownames(WAC_vs_WAP_sig))   #3917
  #all_genes2 <- unique(all_genes2)   #2608
  #intersected_genes <- intersect(all_genes, all_genes2)  # 2608
  #RNASeq.NoCellLine <- read.csv(file ="gene_expression_keeping_condition.txt", row.names=1)

  # Per-sample expression of the selected genes (one column per replicate).
  RNASeq.NoCellLine_ <- RNASeq.NoCellLine[all_genes, ]
  write.csv(as.data.frame(RNASeq.NoCellLine_), file = "gene_expression_keeping_condition.txt")

  # Average the replicate pairs into one column per condition. Columns are
  # addressed by position: 1-2, 3-4, 5-6 and 7-8 are taken as the two replicates
  # of each condition (assumes this column order in RNASeq.NoCellLine; verify).
  condition_means <- cbind(
    mock_sT_d3 = rowMeans(RNASeq.NoCellLine_[, 1:2]),
    mock_sT_d8 = rowMeans(RNASeq.NoCellLine_[, 3:4]),
    sT_d3 = rowMeans(RNASeq.NoCellLine_[, 5:6]),
    sT_d8 = rowMeans(RNASeq.NoCellLine_[, 7:8])
  )
  # Append the means and drop the eight replicate columns they were built from.
  RNASeq.NoCellLine_ <- cbind(RNASeq.NoCellLine_, condition_means)
  RNASeq.NoCellLine_ <- RNASeq.NoCellLine_[, -(1:8)]        #663x4
  #RNASeq.NoCellLine__ <- read.csv(file ="gene_expression_keeping_condition.txt", row.names=1)
  write.csv(as.data.frame(RNASeq.NoCellLine_), file = "gene_expression_merging_condition.txt")

  # Write the first comma-separated field of every up/down DEG table into a
  # matching .id file, then pool the ids per cell line (WaGa.ids, MKL1.ids) and
  # over all comparisons (ids). Requires bash (arrays).
  all_id_files=()
  for line in WaGa MKL1; do
    line_id_files=()
    for cmp in "${line}.EV_vs_${line}.RNA" \
               "${line}.sT.DMSO_vs_${line}.EV" \
               "${line}.sT.Dox_vs_${line}.EV" \
               "${line}.scr.DMSO_vs_${line}.EV" \
               "${line}.scr.Dox_vs_${line}.EV" \
               "${line}_shRNA_sT_vs_scr" \
               "${line}_treatment_Dox_vs_DMSO" \
               "${line}_shRNAsT.treatmentDox"; do
      for direction in up down; do
        cut -d',' -f1 "./${cmp}-${direction}.txt" > "${cmp}-${direction}.id"
        line_id_files+=("${cmp}-${direction}.id")
      done
    done
    # Sorted, de-duplicated ids of the sixteen within-line files of this cell line.
    cat "${line_id_files[@]}" | sort -u > "${line}.ids"
    all_id_files+=("${line_id_files[@]}")
  done

  # Between-line comparisons: these only go into the pooled list, not into
  # WaGa.ids or MKL1.ids.
  for cmp in WaGa.RNA_vs_MKL1.RNA WaGa.EV_vs_MKL1.EV; do
    for direction in up down; do
      cut -d',' -f1 "${cmp}-${direction}.txt" > "${cmp}-${direction}.id"
      all_id_files+=("${cmp}-${direction}.id")
    done
  done

  # 28186 (new) vs 28193
  cat "${all_id_files[@]}" | sort -u > ids

  # ---- Draw DEGs_heatmap for parental_cell_RNA and wt_EV_RNA samples ----
  #-- MKL-1.RNA and MKL-1.EV vs WaGa.RNA and WaGa.EV --
  # Prerequisite: the file "ids" needs a header line "Gene_Id" as its first
  # line (added by hand), because the ids are read through that column name.
  GOI <- read.csv("ids")$Gene_Id
  RNASeq.NoCellLine <- assay(rld)

  # Expression rows of the genes of interest, all sample columns.
  datamat <- RNASeq.NoCellLine[GOI, ]

  #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
  # Genes: 1 - Pearson correlation as distance. Samples: 1 - Spearman
  # correlation (hc is computed but the heatmap below keeps the column order).
  gene_dist <- as.dist(1 - cor(t(datamat), method = "pearson"))
  hr <- hclust(gene_dist, method = "complete")
  sample_dist <- as.dist(1 - cor(datamat, method = "spearman"))
  hc <- hclust(sample_dist, method = "complete")

  # Cut the gene tree just below its top and give every cluster number the
  # colour at the same position of the palette.
  mycl <- cutree(hr, h = max(hr$height) / 1.05)
  cluster_palette <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",
                       "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN",
                       "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN")
  mycol <- cluster_palette[as.vector(mycl)]

  # Column side colours, one group of samples per colour. The labels are
  # attached to the columns by position (assumes datamat's columns come in
  # exactly this order; verify against colnames(datamat)).
  samples_by_colour <- list(
    DARKBLUE = c("WaGa_RNA", "WaGa_RNA_118", "WaGa_RNA_147"),
    DARKRED = c("MKL1_RNA", "MKL1_RNA_118", "MKL1_RNA_147"),
    DARKORANGE = c("WaGa_EV_RNA", "WaGa_EV_RNA_2", "WaGa_EV_RNA_118",
                   "WaGa_EV_RNA_147", "WaGa_EV_RNA_226"),
    DARKGREEN = c("MKL1_EV_RNA", "MKL1_EV_RNA_2", "MKL1_EV_RNA_27",
                  "MKL1_EV_RNA_87", "MKL1_EV_RNA_118")
  )
  sampleCols <- rep('GREY', ncol(datamat))
  names(sampleCols) <- unlist(samples_by_colour, use.names = FALSE)
  #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
  for (group_colour in names(samples_by_colour)) {
    sampleCols[samples_by_colour[[group_colour]]] <- group_colour
  }

  png("DEGs_heatmap.png", width = 1000, height = 1200)
  heatmap.2(
    as.matrix(datamat),
    Rowv = as.dendrogram(hr), Colv = NA, dendrogram = 'row',
    scale = 'row', trace = 'none', col = bluered(75),
    RowSideColors = mycol, ColSideColors = sampleCols,
    labRow = "", margins = c(22, 10), cexRow = 8, cexCol = 2, srtCol = 45,
    lwid = c(1, 7), lhei = c(1, 8)
  )
  legend("top", title = "",
         legend = c("WaGa_RNA", "MKL1_RNA", "WaGa_EV_RNA", "MKL1_EV_RNA"),
         fill = c("DARKBLUE", "DARKRED", "DARKORANGE", "DARKGREEN"),
         cex = 0.8, box.lty = 0)
  dev.off()

  # ---- Draw DEGs_heatmap for the MKL-1 samples ----
  # Prerequisite: "MKL1.ids" needs a header line "Gene_Id" as its first line
  # (added by hand), because the ids are read through that column name.
  setwd("degenes")
  #BiocManager::install("biomaRt")
  #install.packages("gplots")
  #install.packages("writexl")  # Install the writexl package
  library(writexl)             # Load the writexl package
  library(gplots)
  library(biomaRt)
  library(dplyr)
  GOI <- read.csv("MKL1.ids")$Gene_Id
  RNASeq.NoCellLine <- assay(rld)

  # Sample columns, grouped by condition.
  parental_cols <- c("MKL-1 parental cell RNA", "MKL-1 parental cell RNA 118",
                     "MKL-1 parental cell RNA 147")
  wt_ev_cols <- c("MKL-1 wt EV RNA", "MKL-1 wt EV RNA 2", "MKL-1 wt EV RNA 118",
                  "MKL-1 wt EV RNA 87", "MKL-1 wt EV RNA 27", "MKL-1 wt EV RNA 042")
  st_dmso_cols <- c("MKL-1 sT DMSO EV RNA 042", "MKL-1 sT DMSO EV RNA 0505")
  st_dox_cols <- c("MKL-1 sT Dox EV RNA 042", "MKL-1 sT Dox EV RNA 0505")
  scr_dmso_cols <- c("MKL-1 scr DMSO EV RNA 042", "MKL-1 scr DMSO EV RNA 0505")
  scr_dox_cols <- c("MKL-1 scr Dox EV RNA 042", "MKL-1 scr Dox EV RNA 0505")

  #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
  # First select the MKL-1 columns (DMSO groups before Dox groups), then
  # reorder so that both sT groups come before both scr groups in the plot.
  datamat_ <- RNASeq.NoCellLine[GOI, c(parental_cols, wt_ev_cols, st_dmso_cols,
                                       scr_dmso_cols, st_dox_cols, scr_dox_cols)]
  new_order <- c(parental_cols, wt_ev_cols, st_dmso_cols, st_dox_cols,
                 scr_dmso_cols, scr_dox_cols)
  datamat <- datamat_[, new_order]

  # Genes: 1 - Pearson correlation as distance. Samples: 1 - Spearman
  # correlation (hc is computed but the heatmap below keeps the column order).
  gene_dist <- as.dist(1 - cor(t(datamat), method = "pearson"))
  hr <- hclust(gene_dist, method = "complete")
  sample_dist <- as.dist(1 - cor(datamat, method = "spearman"))
  hc <- hclust(sample_dist, method = "complete")

  # Cut the gene tree just below its top and give every cluster number the
  # colour at the same position of the palette.
  mycl <- cutree(hr, h = max(hr$height) / 1.05)
  cluster_palette <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",
                       "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN",
                       "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN")
  mycol <- cluster_palette[as.vector(mycl)]

  # One side colour per column, in new_order; a single colour per condition.
  # Alternatives considered for sT DMSO: "salmon", "lightcoral", or "mistyrose"
  #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
  group_sizes <- lengths(list(parental_cols, wt_ev_cols, st_dmso_cols,
                              st_dox_cols, scr_dmso_cols, scr_dox_cols))
  sampleCols <- rep(c('WHITE', 'GREY', 'mistyrose', 'RED', 'LIGHTGREEN', 'GREEN'),
                    times = group_sizes)
  names(sampleCols) <- new_order

  #legend("right", title = "",legend=c("MKL-1 parental cell RNA","MKL-1 wt EV RNA","MKL-1 sT DMSO EV RNA","MKL-1 sT Dox EV RNA","MKL-1 scr DMSO EV RNA","MKL-1 scr Dox EV RNA"), fill=c("WHITE", "GREY", "mistyrose","RED","LIGHTGREEN","GREEN"), cex=0.8, box.lty=0)
  png("DEGs_heatmap_MKL-1.png", width = 1000, height = 1200)
  heatmap.2(
    as.matrix(datamat),
    Rowv = as.dendrogram(hr), Colv = NA, dendrogram = 'row',
    scale = 'row', trace = 'none', col = bluered(75),
    RowSideColors = mycol, ColSideColors = sampleCols,
    labRow = "", margins = c(22, 10), cexRow = 8, cexCol = 2, srtCol = 30,
    lwid = c(1, 7), lhei = c(1, 8)
  )
  dev.off()
  #rsync -a -P /home/jhuang/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results/featureCounts/degenes/DEGs_heatmap_MKL-1.png ./
  #rsync -a -P /home/jhuang/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results/featureCounts/degenes/DEGs_heatmap_MKL-1.png ./

  # ---- cluster members ----
  # Build the annotated expression table of one gene cluster.
  #   cluster_id: cluster number as returned by cutree()
  #   clusters:   named cluster vector (names are the gene ids), e.g. mycl
  #   expr:       expression matrix with gene ids as row names, e.g. datamat
  #   mart:       biomaRt mart object passed on to getBM()
  # Returns the getBM() annotation (one row per ensembl_gene_id) merged with the
  # expression columns of the cluster's genes.
  annotate_cluster <- function(cluster_id, clusters, expr, mart) {
    gene_ids <- names(clusters)[clusters == cluster_id]
    annotation <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
          filters = 'ensembl_gene_id',
          values = gene_ids,
          mart = mart)
    # getBM() can return several rows per gene; keep the first one of each.
    annotation <- distinct(annotation, ensembl_gene_id, .keep_all = TRUE)
    # drop = FALSE keeps a one-gene cluster as a one-row matrix. Without it the
    # row collapses to a vector, as.data.frame() turns the samples into rows,
    # and the merge below no longer matches any gene id.
    expression <- as.data.frame(expr[gene_ids, , drop = FALSE])
    expression$ENSEMBL <- rownames(expression)
    merge(annotation, expression, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
  }

  cluster1_YELLOW <- annotate_cluster(1, mycl, datamat, ensembl)
  #write.csv(cluster1_YELLOW,file='cluster1_YELLOW.txt')
  cluster2_DARKBLUE <- annotate_cluster(2, mycl, datamat, ensembl)
  #write.csv(cluster2_DARKBLUE,file='cluster2_DARKBLUE.txt')

  # One worksheet per cluster; the sheet names carry the row-side colour that
  # the heatmap palette assigns to cluster 1 and cluster 2.
  write_xlsx(list(
    "Cluster 1 YELLOW" = cluster1_YELLOW,
    "Cluster 2 DARKBLUE" = cluster2_DARKBLUE
  ), "DEGs_heatmap_data_MKL-1.xlsx")

  #write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
  #write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
  #~/Tools/csv2xls-0.4/csv_to_xls.py \
  #cluster1_YELLOW.txt \
  #cluster2_DARKBLUE.txt \
  #-d$',' -o gene_culsters.xls;

  #TODO: save datamat as DEGs_heatmap_data.txt
  #datamat = read.csv(file="DEGs_heatmap_data.txt", sep="\t", row.names=1)
  #~/Tools/csv2xls-0.4/csv_to_xls.py \
  #DEGs_heatmap_data.txt \
  #-d',' -o DEGs_heatmap_data.xls;

  #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters_MKL-1.xls

  # ---- Draw DEGs_heatmap for the WaGa samples ----
  # Prerequisite: "WaGa.ids" needs a header line "Gene_Id" as its first line
  # (added by hand), because the ids are read through that column name.
  GOI <- read.csv("WaGa.ids")$Gene_Id
  RNASeq.NoCellLine <- assay(rld)

  # Sample columns, grouped by condition.
  parental_cols <- c("WaGa parental cell RNA", "WaGa parental cell RNA 118",
                     "WaGa parental cell RNA 147")
  wt_ev_cols <- c("WaGa wt EV RNA", "WaGa wt EV RNA 2", "WaGa wt EV RNA 118",
                  "WaGa wt EV RNA 147", "WaGa wt EV RNA 226", "WaGa wt EV RNA 1107",
                  "WaGa wt EV RNA 1605", "WaGa wt EV RNA 2706")
  st_dmso_cols <- c("WaGa sT DMSO EV RNA 1107", "WaGa sT DMSO EV RNA 1605",
                    "WaGa sT DMSO EV RNA 2706")
  st_dox_cols <- c("WaGa sT Dox EV RNA 1107", "WaGa sT Dox EV RNA 1605",
                   "WaGa sT Dox EV RNA 2706")
  scr_dmso_cols <- c("WaGa scr DMSO EV RNA 1107", "WaGa scr DMSO EV RNA 1605",
                     "WaGa scr DMSO EV RNA 2706")
  scr_dox_cols <- c("WaGa scr Dox EV RNA 1107", "WaGa scr Dox EV RNA 1605",
                    "WaGa scr Dox EV RNA 2706")

  #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
  # First select the WaGa columns (DMSO groups before Dox groups), then reorder
  # so that both sT groups come before both scr groups in the plot.
  datamat_ <- RNASeq.NoCellLine[GOI, c(parental_cols, wt_ev_cols, st_dmso_cols,
                                       scr_dmso_cols, st_dox_cols, scr_dox_cols)]
  new_order <- c(parental_cols, wt_ev_cols, st_dmso_cols, st_dox_cols,
                 scr_dmso_cols, scr_dox_cols)
  datamat <- datamat_[, new_order]

  # Genes: 1 - Pearson correlation as distance. Samples: 1 - Spearman
  # correlation (hc is computed but the heatmap below keeps the column order).
  gene_dist <- as.dist(1 - cor(t(datamat), method = "pearson"))
  hr <- hclust(gene_dist, method = "complete")
  sample_dist <- as.dist(1 - cor(datamat, method = "spearman"))
  hc <- hclust(sample_dist, method = "complete")

  # Cut the gene tree just below its top and give every cluster number the
  # colour at the same position of the palette.
  mycl <- cutree(hr, h = max(hr$height) / 1.05)
  cluster_palette <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",
                       "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN",
                       "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN")
  mycol <- cluster_palette[as.vector(mycl)]

  # One side colour per column, in new_order; a single colour per condition.
  # Alternative considered for sT DMSO: #fb9a99
  #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
  group_sizes <- lengths(list(parental_cols, wt_ev_cols, st_dmso_cols,
                              st_dox_cols, scr_dmso_cols, scr_dox_cols))
  sampleCols <- rep(c('WHITE', '#A9A9A9', 'mistyrose', '#e31a1c', '#b2df8a', '#33a02c'),
                    times = group_sizes)
  names(sampleCols) <- new_order

  #legend("right", title = "",legend=c("WaGa parental cell RNA","WaGa wt EV RNA","WaGa sT DMSO EV RNA","WaGa sT Dox EV RNA","WaGa scr DMSO EV RNA","WaGa scr Dox EV RNA"), fill=c("WHITE", "#A9A9A9", "mistyrose","#e31a1c","#b2df8a","#33a02c"), cex=0.8, box.lty=0)
  png("DEGs_heatmap_WaGa.png", width = 1000, height = 1200)
  heatmap.2(
    as.matrix(datamat),
    Rowv = as.dendrogram(hr), Colv = NA, dendrogram = 'row',
    scale = 'row', trace = 'none', col = bluered(75),
    RowSideColors = mycol, ColSideColors = sampleCols,
    labRow = "", margins = c(22, 10), cexRow = 8, cexCol = 2, srtCol = 45,
    lwid = c(1, 7), lhei = c(1, 8)
  )
  dev.off()

  # ---- cluster members ----
  # Build the annotated expression table of one gene cluster.
  #   cluster_id: cluster number as returned by cutree()
  #   clusters:   named cluster vector (names are the gene ids), e.g. mycl
  #   expr:       expression matrix with gene ids as row names, e.g. datamat
  #   mart:       biomaRt mart object passed on to getBM()
  # Returns the getBM() annotation (one row per ensembl_gene_id) merged with the
  # expression columns of the cluster's genes.
  annotate_cluster <- function(cluster_id, clusters, expr, mart) {
    gene_ids <- names(clusters)[clusters == cluster_id]
    annotation <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
          filters = 'ensembl_gene_id',
          values = gene_ids,
          mart = mart)
    # getBM() can return several rows per gene; keep the first one of each.
    annotation <- distinct(annotation, ensembl_gene_id, .keep_all = TRUE)
    # drop = FALSE keeps a one-gene cluster as a one-row matrix. Without it the
    # row collapses to a vector, as.data.frame() turns the samples into rows,
    # and the merge below no longer matches any gene id.
    expression <- as.data.frame(expr[gene_ids, , drop = FALSE])
    expression$ENSEMBL <- rownames(expression)
    merge(annotation, expression, by.x = "ensembl_gene_id", by.y = "ENSEMBL")
  }

  cluster1_YELLOW <- annotate_cluster(1, mycl, datamat, ensembl)
  #write.csv(cluster1_YELLOW,file='cluster1_YELLOW.txt')
  cluster2_DARKBLUE <- annotate_cluster(2, mycl, datamat, ensembl)
  #write.csv(cluster2_DARKBLUE,file='cluster2_DARKBLUE.txt')

  # One worksheet per cluster; the sheet names carry the row-side colour that
  # the heatmap palette assigns to cluster 1 and cluster 2.
  write_xlsx(list(
    "Cluster 1 YELLOW" = cluster1_YELLOW,
    "Cluster 2 DARKBLUE" = cluster2_DARKBLUE
  ), "DEGs_heatmap_data_WaGa.xlsx")

  #write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
  #write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
  #write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
  #write.csv(names(subset(mycl, mycl == '4')),file='cluster4_DARKMAGENTA.txt')
  #write.csv(names(subset(mycl, mycl == '5')),file='cluster5_DARKCYAN.txt')
  #~/Tools/csv2xls-0.4/csv_to_xls.py \
  #cluster1_YELLOW.txt \
  #cluster2_DARKBLUE.txt \
  #-d$',' -o gene_culsters.xls;
  #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters_WaGa.xls

  # ---- Draw DEGs_heatmap for all samples ----
  # Prerequisite: "all.ids" needs a header line "Gene_Id" as its first line.
  # NOTE(review): the shell commands in this section write the pooled list to
  # "ids", not "all.ids" - confirm which file is meant here.
  GOI <- read.csv("all.ids")$Gene_Id
  RNASeq.NoCellLine <- assay(rld)

  # Sample groups in plotting order, each with its column side colour. DMSO and
  # Dox treated EV samples share one colour per cell line, regardless of sT/scr.
  sample_groups <- list(
    list(colour = 'DARKBLUE',
         samples = c("MKL-1 RNA", "MKL-1 RNA 118", "MKL-1 RNA 147")),
    list(colour = 'DARKRED',
         samples = c("MKL-1 EV", "MKL-1 EV 2", "MKL-1 EV 118", "MKL-1 EV 87",
                     "MKL-1 EV 27", "MKL-1 EV 042")),
    list(colour = 'DARKORANGE',
         samples = c("MKL-1 EV sT DMSO 042", "MKL-1 EV sT DMSO 0505",
                     "MKL-1 EV scr DMSO 042", "MKL-1 EV scr DMSO 0505")),
    list(colour = 'DARKGREEN',
         samples = c("MKL-1 EV sT Dox 042", "MKL-1 EV sT Dox 0505",
                     "MKL-1 EV scr Dox 042", "MKL-1 EV scr Dox 0505")),
    list(colour = 'BLUE',
         samples = c("WaGa RNA", "WaGa RNA 118", "WaGa RNA 147")),
    list(colour = 'RED',
         samples = c("WaGa EV", "WaGa EV 2", "WaGa EV 118", "WaGa EV 147",
                     "WaGa EV 226", "WaGa EV 1107", "WaGa EV 1605", "WaGa EV 2706")),
    list(colour = 'ORANGE',
         samples = c("WaGa EV sT DMSO 1107", "WaGa EV sT DMSO 1605", "WaGa EV sT DMSO 2706",
                     "WaGa EV scr DMSO 1107", "WaGa EV scr DMSO 1605", "WaGa EV scr DMSO 2706")),
    list(colour = 'GREEN',
         samples = c("WaGa EV sT Dox 1107", "WaGa EV sT Dox 1605", "WaGa EV sT Dox 2706",
                     "WaGa EV scr Dox 1107", "WaGa EV scr Dox 1605", "WaGa EV scr Dox 2706"))
  )
  sample_names <- unlist(lapply(sample_groups, function(g) g$samples))

  #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
  datamat <- RNASeq.NoCellLine[GOI, sample_names]

  # Genes: 1 - Pearson correlation as distance. Samples: 1 - Spearman
  # correlation (hc is computed but the heatmap below keeps the column order).
  gene_dist <- as.dist(1 - cor(t(datamat), method = "pearson"))
  hr <- hclust(gene_dist, method = "complete")
  sample_dist <- as.dist(1 - cor(datamat, method = "spearman"))
  hc <- hclust(sample_dist, method = "complete")

  # Cut the gene tree just below its top and give every cluster number the
  # colour at the same position of the palette.
  mycl <- cutree(hr, h = max(hr$height) / 1.05)
  cluster_palette <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",
                       "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN",
                       "LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN")
  mycol <- cluster_palette[as.vector(mycl)]

  # Named colour vector: one entry per column of datamat, in column order.
  #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
  sampleCols <- unlist(lapply(sample_groups,
                              function(g) rep(g$colour, length(g$samples))))
  names(sampleCols) <- sample_names

  png("DEGs_heatmap_all.png", width = 1000, height = 1200)
  heatmap.2(
    as.matrix(datamat),
    Rowv = as.dendrogram(hr), Colv = NA, dendrogram = 'row',
    scale = 'row', trace = 'none', col = bluered(75),
    RowSideColors = mycol, ColSideColors = sampleCols,
    labRow = "", margins = c(22, 10), cexRow = 8, cexCol = 2, srtCol = 45,
    lwid = c(1, 7), lhei = c(1, 8)
  )
  dev.off()

  # ---- cluster members ----
  # Write the gene ids of clusters 1-5 to one file each. The file name carries
  # the cluster number and the colour at the same position of the heatmap
  # palette:
  #c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
  exported_colours <- c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN")
  for (cluster_no in seq_along(exported_colours)) {
    cluster_file <- paste0("cluster", cluster_no, "_", exported_colours[cluster_no], ".txt")
    write.csv(names(mycl)[mycl == cluster_no], file = cluster_file)
  }
  #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters_all.xls

11, pathways

  mkdir pathways

  #--continue from BREAK POINT--
  ##
  #source("https://bioconductor.org/biocLite.R")
  #biocLite("AnnotationDbi")
  # Attach the packages for the pathway section.
  library("clusterProfiler")
  library("ReactomePA")
  # All relative paths used below ('../degenes/...' inputs, '*_KEGG.txt' outputs)
  # resolve against this directory.
  # NOTE(review): this path names a different project directory than the rsync
  # comment in the heatmap section (Data_Ute/Data_RNA-Seq_MKL-1+WaGa) - confirm
  # it is the intended one.
  setwd("~/DATA/Data_Anastasia_RNASeq/results/featureCounts/pathways")

  # Shell loop that prints (does not run) the R statements of the KEGG analysis
  # for each comparison; the printed text is then pasted into the R session.
  # For every comparison it emits nine statements: read the -up table, run
  # enrichKEGG on its entrezgene_id column and write the result as a
  # tab-separated file; the same for the -down table; then rbind both tables
  # into *_sig and repeat the enrichment on the combined set.
  # \$ keeps the R "$" operator from being expanded as a shell variable.
  for sample in WaGa_RNA_vs_MKL1_RNA MKL1_EV_RNA_vs_MKL1_RNA WaGa_EV_RNA_vs_MKL1_EV_RNA WaGa_EV_RNA_vs_WaGa_RNA; do \
  echo "${sample}_up <- read.csv('../degenes/${sample}-up.txt', row.names=1)"
  echo "${sample}_up_KEGG <- enrichKEGG(${sample}_up\$entrezgene_id)"
  echo "write.table(as.data.frame(${sample}_up_KEGG), file = '${sample}_up_KEGG.txt', sep = '\t', row.names = FALSE)"
  echo "${sample}_down <- read.csv('../degenes/${sample}-down.txt', row.names=1)"
  echo "${sample}_down_KEGG <- enrichKEGG(${sample}_down\$entrezgene_id)"
  echo "write.table(as.data.frame(${sample}_down_KEGG), file = '${sample}_down_KEGG.txt', sep = '\t', row.names = FALSE)"
  echo "${sample}_sig <- rbind(${sample}_up, ${sample}_down)"
  echo "${sample}_sig_KEGG <- enrichKEGG(${sample}_sig\$entrezgene_id)"
  echo "write.table(as.data.frame(${sample}_sig_KEGG), file = '${sample}_sig_KEGG.txt', sep = '\t', row.names = FALSE)"
  done

  # These are the same statements the shell loop above prints, one group per comparison.
  # Each group reads the up- and down-regulated gene tables from ../degenes, calls
  # enrichKEGG() on their entrezgene_id column (every other argument is left at its
  # default), writes each result as a tab-separated file, and then repeats the enrichment
  # on the row-bound union of both tables ("_sig").
  # Keep this text in sync with the loop if either one is edited.

  # --- WaGa_RNA vs MKL1_RNA ---
  WaGa_RNA_vs_MKL1_RNA_up <- read.csv('../degenes/WaGa_RNA_vs_MKL1_RNA-up.txt', row.names=1)
  WaGa_RNA_vs_MKL1_RNA_up_KEGG <- enrichKEGG(WaGa_RNA_vs_MKL1_RNA_up$entrezgene_id)
  write.table(as.data.frame(WaGa_RNA_vs_MKL1_RNA_up_KEGG), file = 'WaGa_RNA_vs_MKL1_RNA_up_KEGG.txt', sep = '\t', row.names = FALSE)
  WaGa_RNA_vs_MKL1_RNA_down <- read.csv('../degenes/WaGa_RNA_vs_MKL1_RNA-down.txt', row.names=1)
  WaGa_RNA_vs_MKL1_RNA_down_KEGG <- enrichKEGG(WaGa_RNA_vs_MKL1_RNA_down$entrezgene_id)
  write.table(as.data.frame(WaGa_RNA_vs_MKL1_RNA_down_KEGG), file = 'WaGa_RNA_vs_MKL1_RNA_down_KEGG.txt', sep = '\t', row.names = FALSE)
  WaGa_RNA_vs_MKL1_RNA_sig <- rbind(WaGa_RNA_vs_MKL1_RNA_up, WaGa_RNA_vs_MKL1_RNA_down)
  WaGa_RNA_vs_MKL1_RNA_sig_KEGG <- enrichKEGG(WaGa_RNA_vs_MKL1_RNA_sig$entrezgene_id)
  write.table(as.data.frame(WaGa_RNA_vs_MKL1_RNA_sig_KEGG), file = 'WaGa_RNA_vs_MKL1_RNA_sig_KEGG.txt', sep = '\t', row.names = FALSE)
  # --- MKL1_EV_RNA vs MKL1_RNA ---
  MKL1_EV_RNA_vs_MKL1_RNA_up <- read.csv('../degenes/MKL1_EV_RNA_vs_MKL1_RNA-up.txt', row.names=1)
  MKL1_EV_RNA_vs_MKL1_RNA_up_KEGG <- enrichKEGG(MKL1_EV_RNA_vs_MKL1_RNA_up$entrezgene_id)
  write.table(as.data.frame(MKL1_EV_RNA_vs_MKL1_RNA_up_KEGG), file = 'MKL1_EV_RNA_vs_MKL1_RNA_up_KEGG.txt', sep = '\t', row.names = FALSE)
  MKL1_EV_RNA_vs_MKL1_RNA_down <- read.csv('../degenes/MKL1_EV_RNA_vs_MKL1_RNA-down.txt', row.names=1)
  MKL1_EV_RNA_vs_MKL1_RNA_down_KEGG <- enrichKEGG(MKL1_EV_RNA_vs_MKL1_RNA_down$entrezgene_id)
  write.table(as.data.frame(MKL1_EV_RNA_vs_MKL1_RNA_down_KEGG), file = 'MKL1_EV_RNA_vs_MKL1_RNA_down_KEGG.txt', sep = '\t', row.names = FALSE)
  MKL1_EV_RNA_vs_MKL1_RNA_sig <- rbind(MKL1_EV_RNA_vs_MKL1_RNA_up, MKL1_EV_RNA_vs_MKL1_RNA_down)
  MKL1_EV_RNA_vs_MKL1_RNA_sig_KEGG <- enrichKEGG(MKL1_EV_RNA_vs_MKL1_RNA_sig$entrezgene_id)
  write.table(as.data.frame(MKL1_EV_RNA_vs_MKL1_RNA_sig_KEGG), file = 'MKL1_EV_RNA_vs_MKL1_RNA_sig_KEGG.txt', sep = '\t', row.names = FALSE)
  # --- WaGa_EV_RNA vs MKL1_EV_RNA ---
  WaGa_EV_RNA_vs_MKL1_EV_RNA_up <- read.csv('../degenes/WaGa_EV_RNA_vs_MKL1_EV_RNA-up.txt', row.names=1)
  WaGa_EV_RNA_vs_MKL1_EV_RNA_up_KEGG <- enrichKEGG(WaGa_EV_RNA_vs_MKL1_EV_RNA_up$entrezgene_id)
  write.table(as.data.frame(WaGa_EV_RNA_vs_MKL1_EV_RNA_up_KEGG), file = 'WaGa_EV_RNA_vs_MKL1_EV_RNA_up_KEGG.txt', sep = '\t', row.names = FALSE)
  WaGa_EV_RNA_vs_MKL1_EV_RNA_down <- read.csv('../degenes/WaGa_EV_RNA_vs_MKL1_EV_RNA-down.txt', row.names=1)
  WaGa_EV_RNA_vs_MKL1_EV_RNA_down_KEGG <- enrichKEGG(WaGa_EV_RNA_vs_MKL1_EV_RNA_down$entrezgene_id)
  write.table(as.data.frame(WaGa_EV_RNA_vs_MKL1_EV_RNA_down_KEGG), file = 'WaGa_EV_RNA_vs_MKL1_EV_RNA_down_KEGG.txt', sep = '\t', row.names = FALSE)
  WaGa_EV_RNA_vs_MKL1_EV_RNA_sig <- rbind(WaGa_EV_RNA_vs_MKL1_EV_RNA_up, WaGa_EV_RNA_vs_MKL1_EV_RNA_down)
  WaGa_EV_RNA_vs_MKL1_EV_RNA_sig_KEGG <- enrichKEGG(WaGa_EV_RNA_vs_MKL1_EV_RNA_sig$entrezgene_id)
  write.table(as.data.frame(WaGa_EV_RNA_vs_MKL1_EV_RNA_sig_KEGG), file = 'WaGa_EV_RNA_vs_MKL1_EV_RNA_sig_KEGG.txt', sep = '\t', row.names = FALSE)
  # --- WaGa_EV_RNA vs WaGa_RNA ---
  WaGa_EV_RNA_vs_WaGa_RNA_up <- read.csv('../degenes/WaGa_EV_RNA_vs_WaGa_RNA-up.txt', row.names=1)
  WaGa_EV_RNA_vs_WaGa_RNA_up_KEGG <- enrichKEGG(WaGa_EV_RNA_vs_WaGa_RNA_up$entrezgene_id)
  write.table(as.data.frame(WaGa_EV_RNA_vs_WaGa_RNA_up_KEGG), file = 'WaGa_EV_RNA_vs_WaGa_RNA_up_KEGG.txt', sep = '\t', row.names = FALSE)
  WaGa_EV_RNA_vs_WaGa_RNA_down <- read.csv('../degenes/WaGa_EV_RNA_vs_WaGa_RNA-down.txt', row.names=1)
  WaGa_EV_RNA_vs_WaGa_RNA_down_KEGG <- enrichKEGG(WaGa_EV_RNA_vs_WaGa_RNA_down$entrezgene_id)
  write.table(as.data.frame(WaGa_EV_RNA_vs_WaGa_RNA_down_KEGG), file = 'WaGa_EV_RNA_vs_WaGa_RNA_down_KEGG.txt', sep = '\t', row.names = FALSE)
  WaGa_EV_RNA_vs_WaGa_RNA_sig <- rbind(WaGa_EV_RNA_vs_WaGa_RNA_up, WaGa_EV_RNA_vs_WaGa_RNA_down)
  WaGa_EV_RNA_vs_WaGa_RNA_sig_KEGG <- enrichKEGG(WaGa_EV_RNA_vs_WaGa_RNA_sig$entrezgene_id)
  write.table(as.data.frame(WaGa_EV_RNA_vs_WaGa_RNA_sig_KEGG), file = 'WaGa_EV_RNA_vs_WaGa_RNA_sig_KEGG.txt', sep = '\t', row.names = FALSE)

  # Combine the four "_sig" KEGG results into one object and draw them as a single
  # comparison dot plot in pathways_KEGG.png.
  # The merge runs before the device is opened, so an error in merge_result()
  # cannot leave an open (and empty) png device behind.
  merged_list <- merge_result(list(
    'WaGa_RNA vs MKL1_RNA' = WaGa_RNA_vs_MKL1_RNA_sig_KEGG,
    'MKL1_EV_RNA vs MKL1_RNA' = MKL1_EV_RNA_vs_MKL1_RNA_sig_KEGG,
    'WaGa_EV_RNA vs MKL1_EV_RNA' = WaGa_EV_RNA_vs_MKL1_EV_RNA_sig_KEGG,
    'WaGa_EV_RNA vs WaGa_RNA' = WaGa_EV_RNA_vs_WaGa_RNA_sig_KEGG
  ))
  png("pathways_KEGG.png", width=1260, height=1000)
  # Explicit print(): the plot object is only drawn automatically at the interactive
  # top level; under source() or inside a function/loop the PNG would otherwise be blank.
  print(dotplot(merged_list, showCategory=1000))    #, font.size=6, srt = 35
  dev.off()

  # under CONSOLE
  # The *_sig_KEGG.txt tables were written by R into the "pathways" directory
  # (created with mkdir and entered with setwd above), so convert them from there.
  cd pathways
  ~/Tools/csv2xls-0.4/csv_to_xls.py WaGa_RNA_vs_MKL1_RNA_sig_KEGG.txt MKL1_EV_RNA_vs_MKL1_RNA_sig_KEGG.txt WaGa_EV_RNA_vs_MKL1_EV_RNA_sig_KEGG.txt WaGa_EV_RNA_vs_WaGa_RNA_sig_KEGG.txt -d$'\t' -o pathways_KEGG.xls

12, GOs

  # Output directory for the GO results. Unlike the pathways step, no mkdir is shown for
  # it, so create it if needed first: setwd() stops with an error when the directory
  # does not exist.
  dir.create("~/DATA/Data_Anastasia_RNASeq/results/featureCounts/GOs", showWarnings = FALSE, recursive = TRUE)
  setwd("~/DATA/Data_Anastasia_RNASeq/results/featureCounts/GOs")
  # Print (to stdout) the R statements that run the GO enrichment for every comparison.
  # For each gene set (up, down, and their row-bound union "sig") the output is:
  #   1. the statement that builds the gene table (read.csv, or rbind for "sig"),
  #   2. one enrichGO() call per ontology,
  #   3. one write.table() call per ontology (tab-separated, no row names).
  # A combined ont='ALL' run is disabled; add ALL to both ontology lists to enable it.
  # NOTE(review): the sample names (IKK1/IKK2/Nemo/NIK vs Cre) and the 'org.Mm.eg.db'
  # annotation differ from the WaGa/MKL1 comparisons used in the pathways step -
  # confirm they match the dataset being analysed before running.
  for sample in IKK1_vs_Cre IKK2_vs_Cre Nemo_vs_Cre NIK_vs_Cre; do
    for geneset in up down sig; do
      if [ "${geneset}" = "sig" ]; then
        printf '%s\n' "${sample}_sig <- rbind(${sample}_up, ${sample}_down)"
      else
        printf '%s\n' "${sample}_${geneset} <- read.csv('../degenes/${sample}-${geneset}.txt', row.names=1)"
      fi
      for ont in BP MF CC; do
        printf '%s\n' "${sample}_${geneset}_GO_${ont} <- enrichGO(${sample}_${geneset}\$entrezgene_id, 'org.Mm.eg.db', ont='${ont}')"
      done
      for ont in BP MF CC; do
        printf '%s\n' "write.table(as.data.frame(${sample}_${geneset}_GO_${ont}), file = '${sample}_${geneset}_GO_${ont}.txt', sep = '\t', row.names = FALSE)"
      done
    done
  done

  # Combine the up/down/sig GO-BP results of all four comparisons into one object and
  # draw them as a single comparison dot plot in GOs_BP.png.
  # The merge runs before the device is opened, so an error in merge_result()
  # cannot leave an open (and empty) png device behind.
  merged_list <- merge_result(list(
    'IKK1_vs_Cre_up' = IKK1_vs_Cre_up_GO_BP,
    'IKK1_vs_Cre_down' = IKK1_vs_Cre_down_GO_BP,
    'IKK1_vs_Cre_sig' = IKK1_vs_Cre_sig_GO_BP,
    'IKK2_vs_Cre_up' = IKK2_vs_Cre_up_GO_BP,
    'IKK2_vs_Cre_down' = IKK2_vs_Cre_down_GO_BP,
    'IKK2_vs_Cre_sig' = IKK2_vs_Cre_sig_GO_BP,
    'Nemo_vs_Cre_up' = Nemo_vs_Cre_up_GO_BP,
    'Nemo_vs_Cre_down' = Nemo_vs_Cre_down_GO_BP,
    'Nemo_vs_Cre_sig' = Nemo_vs_Cre_sig_GO_BP,
    'NIK_vs_Cre_up' = NIK_vs_Cre_up_GO_BP,
    'NIK_vs_Cre_down' = NIK_vs_Cre_down_GO_BP,
    'NIK_vs_Cre_sig' = NIK_vs_Cre_sig_GO_BP
  ))
  png("GOs_BP.png", width=3000, height=24000)
  # Explicit print(): the plot object is only drawn automatically at the interactive
  # top level; under source() or inside a function/loop the PNG would otherwise be blank.
  print(dotplot(merged_list, showCategory=10000))
  dev.off()

  # under CONSOLE
  # Bundle the GO tables into one workbook per ontology (GOs_BP.xls, GOs_MF.xls, GOs_CC.xls).
  # The input files are passed in the order sample -> up, down, sig.
  cd GOs
  for ont in BP MF CC; do
    tables=()
    for sample in IKK1_vs_Cre IKK2_vs_Cre Nemo_vs_Cre NIK_vs_Cre; do
      for geneset in up down sig; do
        tables+=("${sample}_${geneset}_GO_${ont}.txt")
      done
    done
    ~/Tools/csv2xls-0.4/csv_to_xls.py "${tables[@]}" -d$'\t' -o "GOs_${ont}.xls"
  done