Small RNA sequencing processing in the example of smallRNA_7

gene_x 0 like s 398 view s

Tags: python, R, bash, pipeline

  1. adapter sequence

    Lexogen small RNA-Seq kit
    
    some common adapter sequences from different kits for reference:
    
        - TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
        - Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
        - Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
        - NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
        - LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
    
    [Header],,,,,
    IEMFileVersion,4,,,,
    InvestigatorName,ag96,,,,
    ExperimentName,ag96,,,,
    Date,16.10.2023,,,,
    Workflow,GenerateFASTQ,,,,
    Application,NextSeqFASTQOnly,,,,
    Assay,TruSeq HT,,,,
    Description,pcr,,,,
    Chemistry,Amplicon,,,,
    ,,,,,
    [Reads],,,,,
    82,,,,,
    ,,,,,
    ,,,,,
    [Settings],,,,,
    Adapter,TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC,,,,
    ,,,,,
    ,,,,,
    [Data],,,,,
    Sample_ID,Sample_Name,I7_Index_ID,index,Sample_Project,Description
    nf930,01_0505_WaGa_wt_EV_RNA,SRi7001,CAGCGT,2023_064_nf_ute,smallRNA-Seq
    nf931,02_0505_WaGa_sT_DMSO_EV_RNA,SRi7002,GATCAC,2023_064_nf_ute,smallRNA-Seq
    nf932,03_0505_WaGa_sT_Dox_EV_RNA,SRi7003,ACCAGT,2023_064_nf_ute,smallRNA-Seq
    nf933,04_0505_WaGa_scr_DMSO_EV_RNA,SRi7004,TGCACG,2023_064_nf_ute,smallRNA-Seq
    nf934,05_0505_WaGa_scr_Dox_EV_RNA,SRi7005,ACATTA,2023_064_nf_ute,smallRNA-Seq
    nf935,06_1905_WaGa_wt_EV_RNA,SRi7006,GTGTAG,2023_064_nf_ute,smallRNA-Seq
    nf936,07_1905_WaGa_sT_DMSO_EV_RNA,SRi7007,CTAGTC,2023_064_nf_ute,smallRNA-Seq
    nf937,08_1905_WaGa_sT_Dox_EV_RNA,SRi7008,TGTGCA,2023_064_nf_ute,smallRNA-Seq
    nf938,09_1905_WaGa_scr_DMSO_EV_RNA,SRi7009,TCAGGA,2023_064_nf_ute,smallRNA-Seq
    nf939,10_1905_WaGa_scr_Dox_EV_RNA,SRi7010,CGGTTA,2023_064_nf_ute,smallRNA-Seq
    nf940,11_control_MKL1,SRi7011,TTAACT,2023_064_nf_ute,smallRNA-Seq
    nf941,12_control_WaGa,SRi7012,ATGAAC,2023_064_nf_ute,smallRNA-Seq
    
  2. input data

    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz         0505_WaGa_wt.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz    0505_WaGa_sT_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz     0505_WaGa_sT_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz   0505_WaGa_scr_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz    0505_WaGa_scr_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz         1905_WaGa_wt.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz    1905_WaGa_sT_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz     1905_WaGa_sT_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz   1905_WaGa_scr_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz   1905_WaGa_scr_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf940/11_control_MKL1_S11_R1_001.fastq.gz               control_MKL1.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf941/12_control_WaGa_S12_R1_001.fastq.gz               control_WaGa.fastq.gz
    
  3. run cutadapt

    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 -o ${sample}_cutadapted.fastq.gz --minimum-length 5 --trim-n ${sample}.fastq.gz >> LOG
    done
    #jhuang@hamburg:~/DATA/Data_Ute/Data_Ute_smallRNA_7$ fastp -i 0505_WaGa_wt.fastq.gz -o 0505_WaGa_wt3.fastq.gz -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
    
  4. run COMPSRA

    ln -s ../Data_Ute_smallRNA_3/bundle_v1 .
    #change the reference of 'hg38 -> hg38_orig' to 'hg38 -> hg38_miRNA_JN707599' (other options are hg38_WaGa_KJ128379 and hg38_MKL1_FJ173815)
    cd bundle_v1/db/stars; rm hg38; ln -s hg38_miRNA_JN707599 hg38;
    
    # DEBUG_1: Make sure the file COMPSRA.jar under Data_Ute_smallRNA_7
    # DEBUG_2: "-qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4" does not work! Using cutadapt -a xxxx -q 20 replace those parameters!
    
    mkdir out_out
    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      echo "mkdir our_out/${sample}_cutadapted/"
    done
    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      echo "java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in ${sample}_cutadapted.fastq.gz -out ./our_out/"
    done
    
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 0505_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in 1905_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in control_MKL1_cutadapted.fastq.gz -out ./our_out/
    java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in control_WaGa_cutadapted.fastq.gz -out ./our_out/
    #END
    
    #4.2.3 -rb/-rm_bias n
    #To remove n random bases in both 5’ (5-prime) and 3’ (3-prime) ends after removing the adapter sequence.
    #4.2.4 -rh/-rm_low_quality_head score
    #To remove the low quality bases with the score less than score from 5’ (5-prime) end.
    #4.2.5 -rt/-rm_low_quality_tail score
    #To remove the low quality bases with the score less than score from 3’ (3-prime) end.
    #4.2.6 -rr/-rm_low_quality_read score
    #To remove the low quality reads with the average score less than score.
    #4.6.3 -fdclass/-fun_diff_class A1,A2,...,An
    #To set the small RNAs that will be performed the differential expression analysis. The format is the same as the parameter -ac/-ann_class A1,A2,...,An.
    #4.6.4 -fdcase/-fun_diff_case ID1,ID2,...,IDn
    #To set the IDs of case samples.
    #4.6.5 -fdctrl/-fun_diff_control ID1,ID2,...,IDn
    #To set the IDs of control samples.
    #4.4.2 -ac/-ann_class A1,A2,...,An
    #To set the small RNA categories that will be annotated. The index of small RNA is listed:
    #    1 miRNA
    #    2 piRNA
    #    3 tRNA
    #    4 snoRNA
    #    5 snRNA
    #    6 circRNA
    
    java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-5 -fdclass 1,2,3,4,5 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
    java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5 -fdcase 1-2 -fdctrl 3-6 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
    
  5. get the read number of virus genome (see also ~/DATA/Data_Ute/Data_Ute_smallRNA_3/README_virusgenome)

    #rsync -a -P jhuang@newton.ibvt.uni-stuttgart.de:~/COMPSRA_V1.0.2/our_out/pfsk1_mcvsyn_TSQ ./
    #!!NOTE that the used COMPSRA.jar --> Data_Ute_smallRNA_3/COMPSRA_V1.0.2, the database (changable) --> Data_Ute_smallRNA_3/bundle_v1!
    
    #sample.list
    #0505_WaGa_sT_DMSO
    #1905_WaGa_sT_DMSO
    #0505_WaGa_sT_Dox
    #1905_WaGa_sT_Dox
    #0505_WaGa_scr_DMSO
    #1905_WaGa_scr_DMSO
    #0505_WaGa_scr_Dox
    #1905_WaGa_scr_Dox
    #0505_WaGa_wt
    #1905_WaGa_wt
    #control_MKL1
    #control_WaGa
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      mv ${sample}_cutadapted ${sample}
    done
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
        cd ${sample}
        #samtools sort ${sample}_cutadapted_STAR_Aligned.out.bam > sorted.bam
        #samtools index sorted.bam
        #samtools view -h sorted.bam "JN707599:1-5387" > JN707599.sam
        #samtools view -Sb JN707599.sam > JN707599.bam
        #samtools view -h sorted.bam "JN707599:1-5387" | samtools view -Sb - > JN707599.bam
        samtools view -h sorted.bam | grep -E 'JN707599' | samtools view -Sb - > JN707599.bam
        #samtools view -h sorted.bam | grep -E '^@|chr' | samtools view -b > hg38.bam
        cd ..
    done
    
    # #https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
    # #https://www.researchgate.net/post/How_to_find_number_of_reads_aligned_to_a_particular_region_of_a_chromosome_from_sam_file
    # echo -e 'JN707599\t4347\t4368' > region_5p.bed
    # intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_5p.bed -bed | wc -l
    # #382487    -->  382487
    # echo -e 'JN707599\t4381\t4402' > region_3p.bed
    # intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_3p.bed -bed | wc -l
    # #172659
    # #Total = 382487+172659=555146   
    # echo -e 'JN707599\t4332\t4415' > region_primary.bed
    # intersectBed -abam JN707599.bam -b region_primary.bed -bed | wc -l
    # #NOTE that 556752 > 555146
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_JN707599_M1
      cd ${sample}
      intersectBed -abam JN707599.bam -b ../region_all.bed -bed | wc -l >> ../LOG_JN707599_M1
      intersectBed -abam JN707599.bam -b ../region_primary.bed -bed | wc -l >> ../LOG_JN707599_M1
      cd ..
    done
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_read_number
      zcat ${sample}2.fastq.gz | echo $((`wc -l`/4)) >> LOG_read_number
    done
    
    cd our_out_on_hg38+JN707599
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_hg38
      echo "---- ${sample} ----" >> LOG_JN707599
      echo "---- ${sample} ----" >> LOG_hg38_alignments
      echo "---- ${sample} ----" >> LOG_JN707599_alignments
      cd ${sample}
      #-F 260: Excludes secondary alignments (flag 256) and unmapped reads (flag 4).
      samtools view -c -F 260 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38
      samtools view -c -F 260 sorted.bam JN707599 >> ../LOG_JN707599
      samtools view -c -F 4 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38_alignments
      samtools view -c -F 4 sorted.bam JN707599 >> ../LOG_JN707599_alignments
      cd ..
    done
    
    #for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
    #  echo "---- ${sample} ----" >> LOG_hg38
    #  cd ${sample}
    #  samtools flagstat ${sample}_cutadapted_STAR_Aligned.out.bam >> ../LOG_hg38
    #  cd ..
    #done
    #for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
    #  echo "---- ${sample} ----" >> LOG_JN707599
    #  cd ${sample}
    #  samtools flagstat JN707599.bam >> ../LOG_JN707599
    #  cd ..
    #done
    
    for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
      echo "---- ${sample} ----" >> LOG_alignments
      cd ${sample}
      samtools flagstat ${sample}2_STAR_Aligned.out.bam >> ../LOG_alignments
      cd ..
    done
    grep "mapped (" LOG_alignments
    
    #The following results calculate raw counts (Note: If you only want to merge the count files, you can use -fm -fms.)
    java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-12 -fdclass 1,2,3,4,5,6 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
    ##The following command calculate statistical test after defining case and control.
    #java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5,6 -fdcase 1-10 -fdctrl 11-12 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
    
    # sed -i -e 's/_August/-08/g' COMPSRA_MERGE_0_miRNA.txt
    # sed -i -e 's/_November/-11/g' COMPSRA_MERGE_0_miRNA.txt
    # sed -i -e 's/_June/-06/g' COMPSRA_MERGE_0_miRNA.txt
    sed -i -e 's/_cutadapted_STAR_Aligned_miRNA.txt//g' COMPSRA_MERGE_0_miRNA.txt
    # #sed -i -e 's/_piRNA.txt//g' COMPSRA_MERGE_0_piRNA.txt
    # #sed -i -e 's/_tRNA.txt//g' COMPSRA_MERGE_0_tRNA.txt
    # #sed -i -e 's/_snoRNA.txt//g' COMPSRA_MERGE_0_snoRNA.txt
    # #sed -i -e 's/_snRNA.txt//g' COMPSRA_DEG_0_snRNA.txt
    # sed -i -e 's/_August/-08/g' COMPSRA_DEG_0_miRNA.txt
    # sed -i -e 's/_November/-11/g' COMPSRA_DEG_0_miRNA.txt
    # sed -i -e 's/_June/-06/g' COMPSRA_DEG_0_miRNA.txt
    # sed -i -e 's/_STAR_Aligned_miRNA.txt//g' COMPSRA_DEG_0_miRNA.txt
    
    #python3
    import pandas as pd
    df = pd.read_csv('COMPSRA_MERGE_0_miRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    # Assuming df is your DataFrame
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_miRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_piRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_piRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_snoRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_snoRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_snRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_snRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_tRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 13'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_tRNA_.txt', sep='\t')
    #samtools flagstat **.bam
    #47217410 + 0 in total (QC-passed reads + QC-failed reads)
    #45166321 + 0 mapped (95.66% : N/A)
    #2051089 + 0 in total (QC-passed reads + QC-failed reads)
    #TODO: check the microRNA in the paper mentioned in which position?
    #Single publications on EVs as transport vehicles for specific miRNAs in the pathogenesis of Merkel cell carcinoma have also been reported, such as miR-375 and its function in proliferation
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_MERGE_0_miRNA_.txt \
    COMPSRA_MERGE_0_piRNA_.txt \
    COMPSRA_MERGE_0_tRNA_.txt \
    COMPSRA_MERGE_0_snoRNA_.txt \
    COMPSRA_MERGE_0_snRNA_.txt \
    -d$'\t' -o raw_counts_hg38.xls;
    
    # # sorting the DEG table, change the sheet labels to 'miRNA_between_columns_B-C_and_columns_D-G'
    # ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_DEG_0_miRNA.txt -d$'\t' -o normalized_and_significance_test_miRNA.xls;
    
    ##merging the row counts and statical values
    #cut -f1-1 COMPSRA_MERGE_0_snoRNA.txt > f1_MERGE
    #cut -f1-1 COMPSRA_DEG_0_snoRNA.txt > f1_DEG
    #cut -f1-1 COMPSRA_MERGE_0_miRNA.txt > f1_MERGE
    #cut -f1-1 COMPSRA_DEG_0_miRNA.txt > f1_DEG
    #diff f1_MERGE f1_DEG
    
    sT: This abbreviation could stand for various things depending on the context of the research. In some studies, it might refer to "short-term," "signal transducer," "small T antigen," or other terms relevant to the specific area of study. The exact meaning would depend on the context in which these samples are being used.
    
    scr: This is likely short for "scramble" or "scrambled." In the context of genetic research, it often refers to a control group where cells have been treated with a scrambled sequence of RNA or DNA. This serves as a baseline to compare against other groups where specific genes are targeted.
    
    Dox: This usually stands for "Doxorubicin," which is a type of chemotherapy medication used in cancer treatment. It's also commonly used in research to study cellular responses to chemotherapy.
    
    DMSO: Short for "Dimethyl Sulfoxide," DMSO is an organic solvent that is often used as a vehicle in biological experiments. It's used to dissolve other substances and carry them into cells. In control experiments, DMSO is used without the active drug/compound to differentiate the effects of the solvent from the effects of the drug/compound being tested.
    
    WaGa: This could be a specific cell line or experimental condition used in your research. Cell lines often have specific names that are abbreviated in this way.
    
    wt: This typically stands for "wild type." In genetic research, "wild type" refers to organisms or cells that have not undergone any genetic modification and are thus considered to be the standard or normal phenotype.
    
    Control_MKL1 and Control_WaGa (parental cell): These seem to be control samples for the experiment. "Control" indicates that these samples are used as standards or baselines for comparison. "MKL1" and "WaGa" are likely specific identifiers for the controls used.
    
    sT vs scr (condition): This comparison could be named based on the nature of the genetic or molecular treatment in the samples. If "sT" and "scr" refer to different genetic constructs or treatments (such as a specific gene modification versus a scrambled sequence), you could name this comparison something like "Genetic Modification Comparison" or "Treatment Type Comparison." The exact name would depend on what "sT" and "scr" specifically represent in your study.
    
    Dox vs DMSO (treatment): Since "Dox" likely stands for Doxorubicin (a chemotherapy drug) and "DMSO" is a solvent used as a control, this comparison is between a drug treatment and a control treatment. You could name this comparison "Drug Treatment vs Control" or more specifically "Doxorubicin Treatment vs Solvent Control."
    
    WaGa vs MKL-1 (cell line): As you've identified these as cell lines, this comparison is straightforward. It's a comparison between two different cell lines. You might name it "Cell Line Comparison: WaGa vs MKL-1."
    
    0505 vs 1905 (donor): ****
    
  6. draw plots with R using DESeq2 (~/DATA/Data_Ute/Data_Ute_smallRNA_3_nextflow/README_draw_miRNA_plots)

    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    setwd("/home/jhuang/DATA/Data_Ute/Data_Ute_smallRNA_7/our_out_on_hg38+JN707599/")
    
    d.raw<- read.delim2("COMPSRA_MERGE_0_miRNA.txt",sep="\t", header=TRUE, row.names=1)
    d.raw$X <- NULL
    #colnames(d.raw)<- c("WaGa_EV", "MKL1_EV", "WaGa_wt", "MKL1_wt")
    d.raw[] <- lapply(d.raw, as.numeric)
    
    #MCPyV-M1   0   0   29  0   23  0   30  8   0   0   202 196
    # New row to add
    new_row <- data.frame(
      X0505_WaGa_sT_DMSO = 0,
      X1905_WaGa_sT_DMSO = 0,
      X0505_WaGa_sT_Dox = 29,
      X1905_WaGa_sT_Dox = 0,
      X0505_WaGa_scr_DMSO = 23,
      X1905_WaGa_scr_DMSO = 0,
      X0505_WaGa_scr_Dox = 30,
      X1905_WaGa_scr_Dox = 8,
      X0505_WaGa_wt = 0,
      X1905_WaGa_wt = 0,
      control_MKL1 = 202,
      control_WaGa = 196,
      row.names = "MCPyV-M1"
    )
    
    # Add the new row to the data frame
    d.raw <- rbind(d.raw, new_row)
    
    #cell_line = as.factor(c("WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "MKL1","WaGa"))
    #condition = as.factor(c("sT","sT", "sT","sT", "scr","scr", "scr","scr", "wt","wt", "control","control"))
    #treatment = as.factor(c("DMSO","DMSO", "Dox","Dox", "DMSO","DMSO", "Dox","Dox", "wt","wt", "control","control"))
    
    EV_or_parental = as.factor(c("EV","EV", "EV","EV", "EV","EV", "EV","EV", "EV","EV", "parental","parental"))
    donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
    replicates = as.factor(c("sT_DMSO","sT_DMSO", "sT_Dox","sT_Dox", "scr_DMSO","scr_DMSO", "scr_Dox","scr_Dox", "wt","wt", "control","control"))
    ids = as.factor(c("0505_WaGa_sT_DMSO","1905_WaGa_sT_DMSO","0505_WaGa_sT_Dox","1905_WaGa_sT_Dox","0505_WaGa_scr_DMSO","1905_WaGa_scr_DMSO","0505_WaGa_scr_Dox","1905_WaGa_scr_Dox","0505_WaGa_wt","1905_WaGa_wt","control_MKL1","control_WaGa"))
    
    cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, donor=donor, EV_or_parental=EV_or_parental)
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
    
    rld <- rlogTransformation(dds)
    
    #The normalized counts are calculated from the 'median of ratios method' (see the calulcation process at the paragraph 'DESeq2-normalized counts: Median of ratios method' at https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html).
    
    # -- before pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("replicates"))
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    #plotPCA(rld, "batch")
    dev.off()
    
    png("pca2.png", 1200, 800)
    plotPCA(rld, intgroup=c("donor"))
    dev.off()
    
    # -- before heatmap --
    ## generate the pairwise comparison between samples
    library(gplots) 
    library("RColorBrewer")
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    #paste( rld$dex, rld$cell, sep="-" )
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,batch, sep=":"))
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
    # -- remove batch effect --
    # #show the results which delete the batches effect
    # #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-after-vst-are-there-still-batches-in-the-pca-plot
    # mat <- assay(rld)
    # mat <- limma::removeBatchEffect(mat, rld$donor)
    # assay(rld) <- mat
    
    #TODO: next time using rld
    #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-do-i-use-vst-or-rlog-data-for-differential-testing
    #It uses the design formula to calculate the within-group variability (if blind=FALSE) or the across-all-samples variability (if blind=TRUE).
    #- It is possible to visualize the transformed data with batch variation removed, using the removeBatchEffect function from limma.
    #- This simply removes any shifts in the log2-scale expression data that can be explained by batch.
    #IMPROVE: ~batch+replicates
    #https://support.bioconductor.org/p/116375/
    #Have a look at the manual pages of these functions. The first sentence of that for varianceStabilizingTransformation says "This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) and then transforms the count data (normalized by division by the size factors or normalization factors)." For rlog, it says "This function transforms the count data to the log2 scale in a way which minimizes differences between samples for rows with small counts, and which normalizes with respect to library size."
    #Do try to read the documentation and a little bit about the underlying methods, you'll find that you'll be more efficient and have much more fun with the software.
    mat <- assay(rld)
    mm <- model.matrix(~replicates, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
    assay(rld) <- mat
    #plotPCA(rld)
    
    ##https://www.biostars.org/p/403053/
    ##?? DESeq() function should work after removeBatchEffect ??
    #dds <- DESeqDataSetFromMatrix(countData=counts, colData=factors, design = ~ Batch + Covariate)
    #dds <- DESeq(dds)
    #rld <- vst(dds, blind=FALSE)
    #mat <- assay(rld)
    #mat <- limma::removeBatchEffect(mat, rld$Batch)
    #assay(rld) <- mat
    #counts_batch_corrected <- assay(rld)
    
    # -- after pca --
    png("pca_after_donor_correction.png", 1200, 800)
    #plotPCA(rld, intgroup = c("replicates", "batch"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    plotPCA(rld, intgroup=c("replicates"))
    dev.off()
    
    png("pca_after_donor_correction2.png", 1200, 800)
    plotPCA(rld, intgroup = c("donor"))
    #plotPCA(rld, intgroup = c("replicates", "ids"))
    #plotPCA(rld, intgroup=c("replicates"))
    dev.off()
    
    # -- after heatmap --
    ## generate the pairwise comparison between samples
    png("heatmap_after_donor_correction.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,donor, sep=":"))
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
    #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
    sizeFactors(dds)
    #NULL
    dds <- estimateSizeFactors(dds)
    sizeFactors(dds)
    normalized_counts <- counts(dds, normalized=TRUE)
    write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    #cp COMPSRA_MERGE_0_miRNA.txt raw_counts.txt
    #~/Tools/csv2xls-0.4/csv_to_xls.py raw_counts.txt normalized_counts.txt -d$'\t' -o raw_and_normalized_counts.xls;
    ~/Tools/csv2xls-0.4/csv_to_xls.py normalized_counts.txt -d$'\t' -o normalized_counts_miRNA.xls;
    
    #unter konsole
    #      WaGa WaGa_EV_r1 WaGa_EV_r2       MKL1 MKL1_EV_r1 MKL1_EV_r2 
    # 1/1,2948728  1/0,4790155  1/0,9996732  1/1,5667017  1/1,0727251  1/1,0755253
    
    > sizeFactors(dds)
    X0505_WaGa_sT_DMSO  X1905_WaGa_sT_DMSO   X0505_WaGa_sT_Dox   X1905_WaGa_sT_Dox 
              0.7042043           0.9986761           1.3277555           1.4950649 
    X0505_WaGa_scr_DMSO X1905_WaGa_scr_DMSO  X0505_WaGa_scr_Dox  X1905_WaGa_scr_Dox 
              0.5406249           1.5657442           0.4216588           1.1627786 
          X0505_WaGa_wt       X1905_WaGa_wt        control_MKL1        control_WaGa 
              0.3560245           0.6982057           3.2447398           3.2147966
    
    1/1,2616592=0,792607069
    1/0,5302187=1,886014205
    1/1,0476709=0,954498211
    1/1,5243221=0,656029326
    1/1,0000000=1,0
    1/1,0856832=0,921079004
    
    #bamCoverage --bam ${sample}Aligned.sortedByCoord.out.bam -o ../bigWigs/${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
    
    bamCoverage --bam WaGa_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_norm.bw --binSize 10 --scaleFactor 0.792607069         --effectiveGenomeSize 2864785220
    bamCoverage --bam WaGa_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r1_norm.bw --binSize 10 --scaleFactor 1.886014205 --effectiveGenomeSize 2864785220
    bamCoverage --bam WaGa_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r2_norm.bw --binSize 10 --scaleFactor 0.954498211 --effectiveGenomeSize 2864785220
    bamCoverage --bam MKL_1_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_norm.bw --binSize 10 --scaleFactor 0.656029326        --effectiveGenomeSize 2864785220
    bamCoverage --bam MKL_1_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r1_norm.bw --binSize 10 --scaleFactor 1.0 --effectiveGenomeSize 2864785220
    bamCoverage --bam MKL_1_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r2_norm.bw --binSize 10 --scaleFactor 0.921079004 --effectiveGenomeSize 2864785220
    
    #setwd("/home/jhuang/DATA/Data_Ute_RNASeq/results/featureCounts/degenes")
    #---- * to untreated ----
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~EV_or_parental+donor)
    dds$EV_or_parental <- relevel(dds$EV_or_parental, "parental")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("EV_vs_parental")
    
    for (i in clist) {
      contrast = paste("EV_or_parental", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
      res$padj <- ifelse(is.na(res$padj), 1, res$padj)
    
      res_df <- as.data.frame(res)
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    
      up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    EV_vs_parental-all.txt \
    EV_vs_parental-up.txt \
    EV_vs_parental-down.txt \
    -d$',' -o EV_vs_parental.xls;
    
    dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
    dds$replicates <- relevel(dds$replicates, "sT_DMSO")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("sT_Dox_vs_sT_DMSO")
    
    dds$replicates <- relevel(dds$replicates, "scr_Dox")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("sT_Dox_vs_scr_Dox")
    
    dds$replicates <- relevel(dds$replicates, "scr_DMSO")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("scr_Dox_vs_scr_DMSO", "sT_DMSO_vs_scr_DMSO")
    
    for (i in clist) {
      contrast = paste("replicates", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      #https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
      res$padj <- ifelse(is.na(res$padj), 1, res$padj)
    
      res_df <- as.data.frame(res)
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
    
      up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    sT_Dox_vs_sT_DMSO-all.txt \
    sT_Dox_vs_sT_DMSO-up.txt \
    sT_Dox_vs_sT_DMSO-down.txt \
    -d$',' -o sT_Dox_vs_sT_DMSO.xls;
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    sT_Dox_vs_scr_Dox-all.txt \
    sT_Dox_vs_scr_Dox-up.txt \
    sT_Dox_vs_scr_Dox-down.txt \
    -d$',' -o sT_Dox_vs_scr_Dox.xls;
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    scr_Dox_vs_scr_DMSO-all.txt \
    scr_Dox_vs_scr_DMSO-up.txt \
    scr_Dox_vs_scr_DMSO-down.txt \
    -d$',' -o scr_Dox_vs_scr_DMSO.xls;
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    sT_DMSO_vs_scr_DMSO-all.txt \
    sT_DMSO_vs_scr_DMSO-up.txt \
    sT_DMSO_vs_scr_DMSO-down.txt \
    -d$',' -o sT_DMSO_vs_scr_DMSO.xls;
    
    for i in EV_vs_wt; do
      # read files to geness_res
      echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"
    
      echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
      echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
      echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
      echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
      echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
      echo "geness_res\$Color[geness_res\$external_gene_name %in% intersect_891_471\$Gene] <- \"lysosomal & TFEB\""
      echo "geness_res\$Color[geness_res\$external_gene_name %in% G891_only\$Gene] <- \"lysosomal\""
      echo "geness_res\$Color[geness_res\$external_gene_name %in% G471_only\$Gene] <- \"TFEB\""
      echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P-adj < 0.05\", \"lysosomal & TFEB\", \"lysosomal\", \"TFEB\"))"
      echo "write.csv(geness_res, \"${i}_with_Category.csv\")"
    
      # pick top genes for either side of volcano to label
      # order genes for convenience:
      echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
      echo "top_g <- c()"
      echo "top_g <- c(top_g, \
                geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
                geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
      echo "top_g <- unique(top_g)"
      echo "geness_res <- geness_res[, -1*ncol(geness_res)]"  # remove invert_P from matrix
    
      # Graph results
      echo "png(\"${i}.png\",width=1200, height=2000)"
      echo "ggplot(geness_res, \
          aes(x = log2FoldChange, y = -log10(pvalue), \
              color = Color, label = external_gene_name)) + \
          geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
          geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
          geom_point() + \
          labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
          scale_color_manual(values = c(\"P < 0.05\"=\"darkgray\",\"P-adj < 0.05\"=\"red\",\"lysosomal\"=\"lightblue\",\"TFEB\"=\"green\",\"lysosomal & TFEB\"=\"cyan\",\"NS or log2FC < 2.0\"=\"gray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
          geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
          theme_bw(base_size = 16) + \
          theme(legend.position = \"bottom\")"
      echo "dev.off()"
    done
    
    library(ggplot2)
    library(ggrepel)
    
    geness_res <- read.csv(file = paste("EV_vs_wt", "all.txt", sep="-"), row.names=1)
    
    external_gene_name <- rownames(geness_res)
    geness_res <- cbind(geness_res, external_gene_name)
    #top_g <- c("hsa-miR-10b-5p","hsa-miR-1226-5p","hsa-miR-30c-5p","hsa-mir-182","hsa-miR-182-5p",  "hsa-mir-1291","hsa-miR-1291","hsa-mir-3607","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-1244","hsa-mir-1248")
    top_g <- c("hsa-mir-3607","hsa-mir-1291","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-30c-5p","hsa-miR-10b-5p","hsa-miR-182-5p","hsa-miR-1291","hsa-mir-182","hsa-miR-1244","hsa-mir-1248","hsa-miR-1226-5p","hsa-mir-1224","hsa-miR-423-5p","hsa-miR-3653-3p","hsa-miR-375","hsa-miR-15a-3p","hsa-mir-3651","hsa-miR-1246","hsa-miR-4521","hsa-miR-30a-5p","hsa-miR-425-5p","hsa-miR-9-5p","hsa-miR-664a-5p","hsa-let-7f-5p","hsa-miR-146b-5p","hsa-miR-320a","hsa-miR-3607-5p","hsa-mir-3687-1","hsa-mir-3687-2","hsa-let-7a-3","hsa-mir-6723","hsa-miR-342-3p")
    
    subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
    geness_res$Color <- "NS or log2FC < 2.0"
    geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
    geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
    geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
    
    #geness_res$Color[geness_res$external_gene_name %in% intersect_891_471$Gene] <- "lysosomal & TFEB"
    #geness_res$Color[geness_res$external_gene_name %in% G891_only$Gene] <- "lysosomal"
    #geness_res$Color[geness_res$external_gene_name %in% G471_only$Gene] <- "TFEB"
    #geness_res$Color <- factor(geness_res$Color, levels = c("NS or log2FC < 2.0", "P-adj < 0.05", "lysosomal & TFEB", "lysosomal", "TFEB"))
    write.csv(geness_res, "EV_vs_wt_with_Category.csv")
    geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
    #top_g <- c()
    #top_g <- c(top_g,              geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]],              geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])
    #top_g <- unique(top_g)
    
    geness_res <- geness_res[, -1*ncol(geness_res)]
    
    #png("EV_vs_wt.png",width=1200, height=2000)
    svg("EV_vs_wt.svg",width=12, height=14)
    ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
    dev.off()
    
    #print all top_g-gene labels including 
    svg("EV_vs_wt.svg",width=12, height=14)
    ggplot(geness_res,       aes(x = log2FoldChange, y = -log10(pvalue),           color = Color, label = external_gene_name)) +       geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +       geom_hline(yintercept = -log10(0.05), lty = "dashed") +       geom_point() +       labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") +       scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) +       geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) +       theme_bw(base_size = 16) +       theme(legend.position = "bottom")
    dev.off()
    
    ###################################################################
    ##### STEP3: prepare all_genes #####
    rld <- rlogTransformation(dds)
    mat <- assay(rld)
    mm <- model.matrix(~replicates, colData(rld))
    mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
    assay(rld) <- mat
    RNASeq.NoCellLine <- assay(rld)
    # reorder the columns
    colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
    col.order <-c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
    RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]
    
    #Option1: filter to genes with overall lfc > 2       
    datamat  <- RNASeq.NoCellLine[apply(RNASeq.NoCellLine,1,function(x){max(x)-min(x)})>=2,]
    
    #Option2: not using the automatical comparing between max and min, rather than using manually selected from DESeq2 between conditions.
    RNASeq.NoCellLine_  <- RNASeq.NoCellLine[top_g,]
    colnames(RNASeq.NoCellLine_) <- c("MKL-1 Cell", "WaGa Cell", "MKL-1 EVs", "WaGa EVs") 
    datamat <- RNASeq.NoCellLine_
    
    #Option3: take all_genes        
    datamat <- RNASeq.NoCellLine
    
    #Option4: manully defining
    for i in EV_vs_parental sT_Dox_vs_sT_DMSO sT_Dox_vs_scr_Dox scr_Dox_vs_scr_DMSO sT_DMSO_vs_scr_DMSO; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    datamat = RNASeq.NoCellLine[GOI, ]
    
    ######################################################################
    ##### STEP4: clustering the genes and draw heatmap #####
    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.05)
    mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
    
    mycol = mycol[as.vector(mycl)]
    #sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
    #names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
    #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
    png("miRNA_heatmap.png", width=800, height=1000)
    #svg("DEGs_heatmap.svg", width=6, height=8)
    heatmap.2(as.matrix(datamat), 
      Rowv=as.dendrogram(hr), 
      Colv=NA, 
      dendrogram='row', 
      labRow="", 
      scale='row', 
      trace='none', 
      col=bluered(75), 
      RowSideColors=mycol, 
      srtCol=20, 
      lhei=c(1,8), 
      #cexRow=1.2,   # Increase row label font size
      cexCol=1.7,    # Increase column label font size
      margin=c(7,1)
     )
    dev.off()
    #ColSideColors = sampleCols,
    heatmap.2(datamat,Rowv=as.dendrogram(hr),Colv=as.dendrogram(hc),
                scale='row',trace='none',col=bluered(75),
                RowSideColors = mycol, labRow="",
                main=sprintf('%s', "RNA-Seq for all DEGs"), srtCol=30)
    dev.off()
    #margin=c(5,5),
    heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none",, sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.4, lhei=c(0.25,5))
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none')
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.2, lhei=c(0.1,5))
    dev.off()
    
    #### cluster members #####
    write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
    write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters.xls
    
  7. display viral transcripts found in mRNA-seq (or small RNA) MKL-1, WaGa EVs compared to parental cells (http://xgenes.com/article/article-content/87/display-viral-transcripts-found-in-mrna-seq-mkl-1-waga-evs-compared-to-cells/)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum