gene_x 0 like s 398 view s
Tags: python, R, bash, pipeline
adapter sequence
Lexogen small RNA-Seq kit
some common adapter sequences from different kits for reference:
- TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
- Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
- Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
- NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
- LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
[Header],,,,,
IEMFileVersion,4,,,,
InvestigatorName,ag96,,,,
ExperimentName,ag96,,,,
Date,16.10.2023,,,,
Workflow,GenerateFASTQ,,,,
Application,NextSeqFASTQOnly,,,,
Assay,TruSeq HT,,,,
Description,pcr,,,,
Chemistry,Amplicon,,,,
,,,,,
[Reads],,,,,
82,,,,,
,,,,,
,,,,,
[Settings],,,,,
Adapter,TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC,,,,
,,,,,
,,,,,
[Data],,,,,
Sample_ID,Sample_Name,I7_Index_ID,index,Sample_Project,Description
nf930,01_0505_WaGa_wt_EV_RNA,SRi7001,CAGCGT,2023_064_nf_ute,smallRNA-Seq
nf931,02_0505_WaGa_sT_DMSO_EV_RNA,SRi7002,GATCAC,2023_064_nf_ute,smallRNA-Seq
nf932,03_0505_WaGa_sT_Dox_EV_RNA,SRi7003,ACCAGT,2023_064_nf_ute,smallRNA-Seq
nf933,04_0505_WaGa_scr_DMSO_EV_RNA,SRi7004,TGCACG,2023_064_nf_ute,smallRNA-Seq
nf934,05_0505_WaGa_scr_Dox_EV_RNA,SRi7005,ACATTA,2023_064_nf_ute,smallRNA-Seq
nf935,06_1905_WaGa_wt_EV_RNA,SRi7006,GTGTAG,2023_064_nf_ute,smallRNA-Seq
nf936,07_1905_WaGa_sT_DMSO_EV_RNA,SRi7007,CTAGTC,2023_064_nf_ute,smallRNA-Seq
nf937,08_1905_WaGa_sT_Dox_EV_RNA,SRi7008,TGTGCA,2023_064_nf_ute,smallRNA-Seq
nf938,09_1905_WaGa_scr_DMSO_EV_RNA,SRi7009,TCAGGA,2023_064_nf_ute,smallRNA-Seq
nf939,10_1905_WaGa_scr_Dox_EV_RNA,SRi7010,CGGTTA,2023_064_nf_ute,smallRNA-Seq
nf940,11_control_MKL1,SRi7011,TTAACT,2023_064_nf_ute,smallRNA-Seq
nf941,12_control_WaGa,SRi7012,ATGAAC,2023_064_nf_ute,smallRNA-Seq
input data
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz 0505_WaGa_wt.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz 0505_WaGa_sT_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz 0505_WaGa_sT_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz 0505_WaGa_scr_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz 0505_WaGa_scr_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz 1905_WaGa_wt.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz 1905_WaGa_sT_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz 1905_WaGa_sT_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz 1905_WaGa_scr_DMSO.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz 1905_WaGa_scr_Dox.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf940/11_control_MKL1_S11_R1_001.fastq.gz control_MKL1.fastq.gz
ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf941/12_control_WaGa_S12_R1_001.fastq.gz control_WaGa.fastq.gz
run cutadapt
for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox control_MKL1 control_WaGa; do
cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 -o ${sample}_cutadapted.fastq.gz --minimum-length 5 --trim-n ${sample}.fastq.gz >> LOG
done
#jhuang@hamburg:~/DATA/Data_Ute/Data_Ute_smallRNA_7$ fastp -i 0505_WaGa_wt.fastq.gz -o 0505_WaGa_wt3.fastq.gz -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
run COMPSRA
ln -s ../Data_Ute_smallRNA_3/bundle_v1 .
#change the reference of 'hg38 -> hg38_orig' to 'hg38 -> hg38_miRNA_JN707599' (other options are hg38_WaGa_KJ128379 and hg38_MKL1_FJ173815)
cd bundle_v1/db/stars; rm hg38; ln -s hg38_miRNA_JN707599 hg38;
# DEBUG_1: Make sure the file COMPSRA.jar under Data_Ute_smallRNA_7
# DEBUG_2: "-qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4" does not work! Using cutadapt -a xxxx -q 20 replace those parameters!
mkdir out_out
for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox control_MKL1 control_WaGa; do
echo "mkdir our_out/${sample}_cutadapted/"
done
for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox control_MKL1 control_WaGa; do
echo "java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in ${sample}_cutadapted.fastq.gz -out ./our_out/"
done
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 0505_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 0505_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 0505_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 0505_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 0505_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 1905_WaGa_wt_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 1905_WaGa_sT_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 1905_WaGa_sT_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 1905_WaGa_scr_DMSO_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in 1905_WaGa_scr_Dox_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in control_MKL1_cutadapted.fastq.gz -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -in control_WaGa_cutadapted.fastq.gz -out ./our_out/
#END
#4.2.3 -rb/-rm_bias n
#To remove n random bases in both 5’ (5-prime) and 3’ (3-prime) ends after removing the adapter sequence.
#4.2.4 -rh/-rm_low_quality_head score
#To remove the low quality bases with the score less than score from 5’ (5-prime) end.
#4.2.5 -rt/-rm_low_quality_tail score
#To remove the low quality bases with the score less than score from 3’ (3-prime) end.
#4.2.6 -rr/-rm_low_quality_read score
#To remove the low quality reads with the average score less than score.
#4.6.3 -fdclass/-fun_diff_class A1,A2,...,An
#To set the small RNAs that will be performed the differential expression analysis. The format is the same as the parameter -ac/-ann_class A1,A2,...,An.
#4.6.4 -fdcase/-fun_diff_case ID1,ID2,...,IDn
#To set the IDs of case samples.
#4.6.5 -fdctrl/-fun_diff_control ID1,ID2,...,IDn
#To set the IDs of control samples.
#4.4.2 -ac/-ann_class A1,A2,...,An
#To set the small RNA categories that will be annotated. The index of small RNA is listed:
# 1 miRNA
# 2 piRNA
# 3 tRNA
# 4 snoRNA
# 5 snRNA
# 6 circRNA
java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-5 -fdclass 1,2,3,4,5 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5 -fdcase 1-2 -fdctrl 3-6 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
get the read number of virus genome (see also ~/DATA/Data_Ute/Data_Ute_smallRNA_3/README_virusgenome)
#rsync -a -P jhuang@newton.ibvt.uni-stuttgart.de:~/COMPSRA_V1.0.2/our_out/pfsk1_mcvsyn_TSQ ./
#!!NOTE that the used COMPSRA.jar --> Data_Ute_smallRNA_3/COMPSRA_V1.0.2, the database (changable) --> Data_Ute_smallRNA_3/bundle_v1!
#sample.list
#0505_WaGa_sT_DMSO
#1905_WaGa_sT_DMSO
#0505_WaGa_sT_Dox
#1905_WaGa_sT_Dox
#0505_WaGa_scr_DMSO
#1905_WaGa_scr_DMSO
#0505_WaGa_scr_Dox
#1905_WaGa_scr_Dox
#0505_WaGa_wt
#1905_WaGa_wt
#control_MKL1
#control_WaGa
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
mv ${sample}_cutadapted ${sample}
done
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
cd ${sample}
#samtools sort ${sample}_cutadapted_STAR_Aligned.out.bam > sorted.bam
#samtools index sorted.bam
#samtools view -h sorted.bam "JN707599:1-5387" > JN707599.sam
#samtools view -Sb JN707599.sam > JN707599.bam
#samtools view -h sorted.bam "JN707599:1-5387" | samtools view -Sb - > JN707599.bam
samtools view -h sorted.bam | grep -E 'JN707599' | samtools view -Sb - > JN707599.bam
#samtools view -h sorted.bam | grep -E '^@|chr' | samtools view -b > hg38.bam
cd ..
done
# #https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
# #https://www.researchgate.net/post/How_to_find_number_of_reads_aligned_to_a_particular_region_of_a_chromosome_from_sam_file
# echo -e 'JN707599\t4347\t4368' > region_5p.bed
# intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_5p.bed -bed | wc -l
# #382487 --> 382487
# echo -e 'JN707599\t4381\t4402' > region_3p.bed
# intersectBed -abam 0505_WaGa_sT_Dox_cutadapted_STAR_Aligned.out.bam -b region_3p.bed -bed | wc -l
# #172659
# #Total = 382487+172659=555146
# echo -e 'JN707599\t4332\t4415' > region_primary.bed
# intersectBed -abam JN707599.bam -b region_primary.bed -bed | wc -l
# #NOTE that 556752 > 555146
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
echo "---- ${sample} ----" >> LOG_JN707599_M1
cd ${sample}
intersectBed -abam JN707599.bam -b ../region_all.bed -bed | wc -l >> ../LOG_JN707599_M1
intersectBed -abam JN707599.bam -b ../region_primary.bed -bed | wc -l >> ../LOG_JN707599_M1
cd ..
done
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
echo "---- ${sample} ----" >> LOG_read_number
zcat ${sample}2.fastq.gz | echo $((`wc -l`/4)) >> LOG_read_number
done
cd our_out_on_hg38+JN707599
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
echo "---- ${sample} ----" >> LOG_hg38
echo "---- ${sample} ----" >> LOG_JN707599
echo "---- ${sample} ----" >> LOG_hg38_alignments
echo "---- ${sample} ----" >> LOG_JN707599_alignments
cd ${sample}
#-F 260: Excludes secondary alignments (flag 256) and unmapped reads (flag 4).
samtools view -c -F 260 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38
samtools view -c -F 260 sorted.bam JN707599 >> ../LOG_JN707599
samtools view -c -F 4 sorted.bam `seq -f "chr%g" 1 22` chrX chrY chrM `samtools view -H sorted.bam | grep '@SQ' | cut -f 2 | cut -d ':' -f 2 | grep 'chrUn\|chrEBV'` | awk '{sum+=$1} END {print sum}' >> ../LOG_hg38_alignments
samtools view -c -F 4 sorted.bam JN707599 >> ../LOG_JN707599_alignments
cd ..
done
#for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
# echo "---- ${sample} ----" >> LOG_hg38
# cd ${sample}
# samtools flagstat ${sample}_cutadapted_STAR_Aligned.out.bam >> ../LOG_hg38
# cd ..
#done
#for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt 1905_WaGa_wt control_MKL1 control_WaGa; do
# echo "---- ${sample} ----" >> LOG_JN707599
# cd ${sample}
# samtools flagstat JN707599.bam >> ../LOG_JN707599
# cd ..
#done
for sample in 0505_WaGa_sT_DMSO 1905_WaGa_sT_DMSO 0505_WaGa_sT_Dox 1905_WaGa_sT_Dox 0505_WaGa_scr_DMSO 1905_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_scr_Dox 0505_WaGa_wt #1905_WaGa_wt control_MKL1 control_WaGa; do
echo "---- ${sample} ----" >> LOG_alignments
cd ${sample}
samtools flagstat ${sample}2_STAR_Aligned.out.bam >> ../LOG_alignments
cd ..
done
grep "mapped (" LOG_alignments
#The following results calculate raw counts (Note: If you only want to merge the count files, you can use -fm -fms.)
java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-12 -fdclass 1,2,3,4,5,6 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
##The following command calculate statistical test after defining case and control.
#java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5,6 -fdcase 1-10 -fdctrl 11-12 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
# sed -i -e 's/_August/-08/g' COMPSRA_MERGE_0_miRNA.txt
# sed -i -e 's/_November/-11/g' COMPSRA_MERGE_0_miRNA.txt
# sed -i -e 's/_June/-06/g' COMPSRA_MERGE_0_miRNA.txt
sed -i -e 's/_cutadapted_STAR_Aligned_miRNA.txt//g' COMPSRA_MERGE_0_miRNA.txt
# #sed -i -e 's/_piRNA.txt//g' COMPSRA_MERGE_0_piRNA.txt
# #sed -i -e 's/_tRNA.txt//g' COMPSRA_MERGE_0_tRNA.txt
# #sed -i -e 's/_snoRNA.txt//g' COMPSRA_MERGE_0_snoRNA.txt
# #sed -i -e 's/_snRNA.txt//g' COMPSRA_DEG_0_snRNA.txt
# sed -i -e 's/_August/-08/g' COMPSRA_DEG_0_miRNA.txt
# sed -i -e 's/_November/-11/g' COMPSRA_DEG_0_miRNA.txt
# sed -i -e 's/_June/-06/g' COMPSRA_DEG_0_miRNA.txt
# sed -i -e 's/_STAR_Aligned_miRNA.txt//g' COMPSRA_DEG_0_miRNA.txt
#python3
import pandas as pd
df = pd.read_csv('COMPSRA_MERGE_0_miRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
# Assuming df is your DataFrame
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_miRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_piRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_piRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_snoRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_snoRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_snRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_snRNA_.txt', sep='\t')
df = pd.read_csv('COMPSRA_MERGE_0_tRNA.txt', sep='\t', index_col=0)
df = df.drop(columns=['Unnamed: 13'])
df.loc['Sum'] = df.sum(numeric_only=True)
df.to_csv('COMPSRA_MERGE_0_tRNA_.txt', sep='\t')
#samtools flagstat **.bam
#47217410 + 0 in total (QC-passed reads + QC-failed reads)
#45166321 + 0 mapped (95.66% : N/A)
#2051089 + 0 in total (QC-passed reads + QC-failed reads)
#TODO: check the microRNA in the paper mentioned in which position?
#Single publications on EVs as transport vehicles for specific miRNAs in the pathogenesis of Merkel cell carcinoma have also been reported, such as miR-375 and its function in proliferation
~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_MERGE_0_miRNA_.txt \
COMPSRA_MERGE_0_piRNA_.txt \
COMPSRA_MERGE_0_tRNA_.txt \
COMPSRA_MERGE_0_snoRNA_.txt \
COMPSRA_MERGE_0_snRNA_.txt \
-d$'\t' -o raw_counts_hg38.xls;
# # sorting the DEG table, change the sheet labels to 'miRNA_between_columns_B-C_and_columns_D-G'
# ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_DEG_0_miRNA.txt -d$'\t' -o normalized_and_significance_test_miRNA.xls;
##merging the row counts and statical values
#cut -f1-1 COMPSRA_MERGE_0_snoRNA.txt > f1_MERGE
#cut -f1-1 COMPSRA_DEG_0_snoRNA.txt > f1_DEG
#cut -f1-1 COMPSRA_MERGE_0_miRNA.txt > f1_MERGE
#cut -f1-1 COMPSRA_DEG_0_miRNA.txt > f1_DEG
#diff f1_MERGE f1_DEG
sT: This abbreviation could stand for various things depending on the context of the research. In some studies, it might refer to "short-term," "signal transducer," "small T antigen," or other terms relevant to the specific area of study. The exact meaning would depend on the context in which these samples are being used.
scr: This is likely short for "scramble" or "scrambled." In the context of genetic research, it often refers to a control group where cells have been treated with a scrambled sequence of RNA or DNA. This serves as a baseline to compare against other groups where specific genes are targeted.
Dox: This usually stands for "Doxorubicin," which is a type of chemotherapy medication used in cancer treatment. It's also commonly used in research to study cellular responses to chemotherapy.
DMSO: Short for "Dimethyl Sulfoxide," DMSO is an organic solvent that is often used as a vehicle in biological experiments. It's used to dissolve other substances and carry them into cells. In control experiments, DMSO is used without the active drug/compound to differentiate the effects of the solvent from the effects of the drug/compound being tested.
WaGa: This could be a specific cell line or experimental condition used in your research. Cell lines often have specific names that are abbreviated in this way.
wt: This typically stands for "wild type." In genetic research, "wild type" refers to organisms or cells that have not undergone any genetic modification and are thus considered to be the standard or normal phenotype.
Control_MKL1 and Control_WaGa (parental cell): These seem to be control samples for the experiment. "Control" indicates that these samples are used as standards or baselines for comparison. "MKL1" and "WaGa" are likely specific identifiers for the controls used.
sT vs scr (condition): This comparison could be named based on the nature of the genetic or molecular treatment in the samples. If "sT" and "scr" refer to different genetic constructs or treatments (such as a specific gene modification versus a scrambled sequence), you could name this comparison something like "Genetic Modification Comparison" or "Treatment Type Comparison." The exact name would depend on what "sT" and "scr" specifically represent in your study.
Dox vs DMSO (treatment): Since "Dox" likely stands for Doxorubicin (a chemotherapy drug) and "DMSO" is a solvent used as a control, this comparison is between a drug treatment and a control treatment. You could name this comparison "Drug Treatment vs Control" or more specifically "Doxorubicin Treatment vs Solvent Control."
WaGa vs MKL-1 (cell line): As you've identified these as cell lines, this comparison is straightforward. It's a comparison between two different cell lines. You might name it "Cell Line Comparison: WaGa vs MKL-1."
0505 vs 1905 (donor): ****
draw plots with R using DESeq2 (~/DATA/Data_Ute/Data_Ute_smallRNA_3_nextflow/README_draw_miRNA_plots)
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("/home/jhuang/DATA/Data_Ute/Data_Ute_smallRNA_7/our_out_on_hg38+JN707599/")
d.raw<- read.delim2("COMPSRA_MERGE_0_miRNA.txt",sep="\t", header=TRUE, row.names=1)
d.raw$X <- NULL
#colnames(d.raw)<- c("WaGa_EV", "MKL1_EV", "WaGa_wt", "MKL1_wt")
d.raw[] <- lapply(d.raw, as.numeric)
#MCPyV-M1 0 0 29 0 23 0 30 8 0 0 202 196
# New row to add
new_row <- data.frame(
X0505_WaGa_sT_DMSO = 0,
X1905_WaGa_sT_DMSO = 0,
X0505_WaGa_sT_Dox = 29,
X1905_WaGa_sT_Dox = 0,
X0505_WaGa_scr_DMSO = 23,
X1905_WaGa_scr_DMSO = 0,
X0505_WaGa_scr_Dox = 30,
X1905_WaGa_scr_Dox = 8,
X0505_WaGa_wt = 0,
X1905_WaGa_wt = 0,
control_MKL1 = 202,
control_WaGa = 196,
row.names = "MCPyV-M1"
)
# Add the new row to the data frame
d.raw <- rbind(d.raw, new_row)
#cell_line = as.factor(c("WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "MKL1","WaGa"))
#condition = as.factor(c("sT","sT", "sT","sT", "scr","scr", "scr","scr", "wt","wt", "control","control"))
#treatment = as.factor(c("DMSO","DMSO", "Dox","Dox", "DMSO","DMSO", "Dox","Dox", "wt","wt", "control","control"))
EV_or_parental = as.factor(c("EV","EV", "EV","EV", "EV","EV", "EV","EV", "EV","EV", "parental","parental"))
donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
replicates = as.factor(c("sT_DMSO","sT_DMSO", "sT_Dox","sT_Dox", "scr_DMSO","scr_DMSO", "scr_Dox","scr_Dox", "wt","wt", "control","control"))
ids = as.factor(c("0505_WaGa_sT_DMSO","1905_WaGa_sT_DMSO","0505_WaGa_sT_Dox","1905_WaGa_sT_Dox","0505_WaGa_scr_DMSO","1905_WaGa_scr_DMSO","0505_WaGa_scr_Dox","1905_WaGa_scr_Dox","0505_WaGa_wt","1905_WaGa_wt","control_MKL1","control_WaGa"))
cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, donor=donor, EV_or_parental=EV_or_parental)
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
rld <- rlogTransformation(dds)
#The normalized counts are calculated from the 'median of ratios method' (see the calulcation process at the paragraph 'DESeq2-normalized counts: Median of ratios method' at https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html).
# -- before pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("replicates"))
#plotPCA(rld, intgroup = c("replicates", "batch"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, "batch")
dev.off()
png("pca2.png", 1200, 800)
plotPCA(rld, intgroup=c("donor"))
dev.off()
# -- before heatmap --
## generate the pairwise comparison between samples
library(gplots)
library("RColorBrewer")
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
#paste( rld$dex, rld$cell, sep="-" )
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,batch, sep=":"))
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()
# -- remove batch effect --
# #show the results which delete the batches effect
# #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-after-vst-are-there-still-batches-in-the-pca-plot
# mat <- assay(rld)
# mat <- limma::removeBatchEffect(mat, rld$donor)
# assay(rld) <- mat
#TODO: next time using rld
#http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-do-i-use-vst-or-rlog-data-for-differential-testing
#It uses the design formula to calculate the within-group variability (if blind=FALSE) or the across-all-samples variability (if blind=TRUE).
#- It is possible to visualize the transformed data with batch variation removed, using the removeBatchEffect function from limma.
#- This simply removes any shifts in the log2-scale expression data that can be explained by batch.
#IMPROVE: ~batch+replicates
#https://support.bioconductor.org/p/116375/
#Have a look at the manual pages of these functions. The first sentence of that for varianceStabilizingTransformation says "This function calculates a variance stabilizing transformation (VST) from the fitted dispersion-mean relation(s) and then transforms the count data (normalized by division by the size factors or normalization factors)." For rlog, it says "This function transforms the count data to the log2 scale in a way which minimizes differences between samples for rows with small counts, and which normalizes with respect to library size."
#Do try to read the documentation and a little bit about the underlying methods, you'll find that you'll be more efficient and have much more fun with the software.
mat <- assay(rld)
mm <- model.matrix(~replicates, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
assay(rld) <- mat
#plotPCA(rld)
##https://www.biostars.org/p/403053/
##?? DESeq() function should work after removeBatchEffect ??
#dds <- DESeqDataSetFromMatrix(countData=counts, colData=factors, design = ~ Batch + Covariate)
#dds <- DESeq(dds)
#rld <- vst(dds, blind=FALSE)
#mat <- assay(rld)
#mat <- limma::removeBatchEffect(mat, rld$Batch)
#assay(rld) <- mat
#counts_batch_corrected <- assay(rld)
# -- after pca --
png("pca_after_donor_correction.png", 1200, 800)
#plotPCA(rld, intgroup = c("replicates", "batch"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
plotPCA(rld, intgroup=c("replicates"))
dev.off()
png("pca_after_donor_correction2.png", 1200, 800)
plotPCA(rld, intgroup = c("donor"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, intgroup=c("replicates"))
dev.off()
# -- after heatmap --
## generate the pairwise comparison between samples
png("heatmap_after_donor_correction.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,donor, sep=":"))
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(replicates,ids, sep=":"))
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()
#convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
normalized_counts <- counts(dds, normalized=TRUE)
write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
#cp COMPSRA_MERGE_0_miRNA.txt raw_counts.txt
#~/Tools/csv2xls-0.4/csv_to_xls.py raw_counts.txt normalized_counts.txt -d$'\t' -o raw_and_normalized_counts.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py normalized_counts.txt -d$'\t' -o normalized_counts_miRNA.xls;
#unter konsole
# WaGa WaGa_EV_r1 WaGa_EV_r2 MKL1 MKL1_EV_r1 MKL1_EV_r2
# 1/1,2948728 1/0,4790155 1/0,9996732 1/1,5667017 1/1,0727251 1/1,0755253
> sizeFactors(dds)
X0505_WaGa_sT_DMSO X1905_WaGa_sT_DMSO X0505_WaGa_sT_Dox X1905_WaGa_sT_Dox
0.7042043 0.9986761 1.3277555 1.4950649
X0505_WaGa_scr_DMSO X1905_WaGa_scr_DMSO X0505_WaGa_scr_Dox X1905_WaGa_scr_Dox
0.5406249 1.5657442 0.4216588 1.1627786
X0505_WaGa_wt X1905_WaGa_wt control_MKL1 control_WaGa
0.3560245 0.6982057 3.2447398 3.2147966
1/1,2616592=0,792607069
1/0,5302187=1,886014205
1/1,0476709=0,954498211
1/1,5243221=0,656029326
1/1,0000000=1,0
1/1,0856832=0,921079004
#bamCoverage --bam ${sample}Aligned.sortedByCoord.out.bam -o ../bigWigs/${sample}_norm.bw --binSize 10 --scaleFactor --effectiveGenomeSize 2864785220
bamCoverage --bam WaGa_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_norm.bw --binSize 10 --scaleFactor 0.792607069 --effectiveGenomeSize 2864785220
bamCoverage --bam WaGa_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r1_norm.bw --binSize 10 --scaleFactor 1.886014205 --effectiveGenomeSize 2864785220
bamCoverage --bam WaGa_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/WaGa_EV_r2_norm.bw --binSize 10 --scaleFactor 0.954498211 --effectiveGenomeSize 2864785220
bamCoverage --bam MKL_1_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_norm.bw --binSize 10 --scaleFactor 0.656029326 --effectiveGenomeSize 2864785220
bamCoverage --bam MKL_1_derived_EV_RNAAligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r1_norm.bw --binSize 10 --scaleFactor 1.0 --effectiveGenomeSize 2864785220
bamCoverage --bam MKL_1_derived_EV_RNA_2Aligned.sortedByCoord.out.markDups.bam -o ../bigWigs/MKL1_EV_r2_norm.bw --binSize 10 --scaleFactor 0.921079004 --effectiveGenomeSize 2864785220
#setwd("/home/jhuang/DATA/Data_Ute_RNASeq/results/featureCounts/degenes")
#---- * to untreated ----
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~EV_or_parental+donor)
dds$EV_or_parental <- relevel(dds$EV_or_parental, "parental")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("EV_vs_parental")
for (i in clist) {
contrast = paste("EV_or_parental", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
#https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
res$padj <- ifelse(is.na(res$padj), 1, res$padj)
res_df <- as.data.frame(res)
write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
~/Tools/csv2xls-0.4/csv_to_xls.py \
EV_vs_parental-all.txt \
EV_vs_parental-up.txt \
EV_vs_parental-down.txt \
-d$',' -o EV_vs_parental.xls;
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
dds$replicates <- relevel(dds$replicates, "sT_DMSO")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_Dox_vs_sT_DMSO")
dds$replicates <- relevel(dds$replicates, "scr_Dox")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_Dox_vs_scr_Dox")
dds$replicates <- relevel(dds$replicates, "scr_DMSO")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("scr_Dox_vs_scr_DMSO", "sT_DMSO_vs_scr_DMSO")
for (i in clist) {
contrast = paste("replicates", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
#https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
res$padj <- ifelse(is.na(res$padj), 1, res$padj)
res_df <- as.data.frame(res)
write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_Dox_vs_sT_DMSO-all.txt \
sT_Dox_vs_sT_DMSO-up.txt \
sT_Dox_vs_sT_DMSO-down.txt \
-d$',' -o sT_Dox_vs_sT_DMSO.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_Dox_vs_scr_Dox-all.txt \
sT_Dox_vs_scr_Dox-up.txt \
sT_Dox_vs_scr_Dox-down.txt \
-d$',' -o sT_Dox_vs_scr_Dox.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
scr_Dox_vs_scr_DMSO-all.txt \
scr_Dox_vs_scr_DMSO-up.txt \
scr_Dox_vs_scr_DMSO-down.txt \
-d$',' -o scr_Dox_vs_scr_DMSO.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_DMSO_vs_scr_DMSO-all.txt \
sT_DMSO_vs_scr_DMSO-up.txt \
sT_DMSO_vs_scr_DMSO-down.txt \
-d$',' -o sT_DMSO_vs_scr_DMSO.xls;
for i in EV_vs_wt; do
# read files to geness_res
echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"
echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
echo "geness_res\$Color[geness_res\$external_gene_name %in% intersect_891_471\$Gene] <- \"lysosomal & TFEB\""
echo "geness_res\$Color[geness_res\$external_gene_name %in% G891_only\$Gene] <- \"lysosomal\""
echo "geness_res\$Color[geness_res\$external_gene_name %in% G471_only\$Gene] <- \"TFEB\""
echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P-adj < 0.05\", \"lysosomal & TFEB\", \"lysosomal\", \"TFEB\"))"
echo "write.csv(geness_res, \"${i}_with_Category.csv\")"
# pick top genes for either side of volcano to label
# order genes for convenience:
echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
echo "top_g <- c()"
echo "top_g <- c(top_g, \
geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
echo "top_g <- unique(top_g)"
echo "geness_res <- geness_res[, -1*ncol(geness_res)]" # remove invert_P from matrix
# Graph results
echo "png(\"${i}.png\",width=1200, height=2000)"
echo "ggplot(geness_res, \
aes(x = log2FoldChange, y = -log10(pvalue), \
color = Color, label = external_gene_name)) + \
geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
geom_point() + \
labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
scale_color_manual(values = c(\"P < 0.05\"=\"darkgray\",\"P-adj < 0.05\"=\"red\",\"lysosomal\"=\"lightblue\",\"TFEB\"=\"green\",\"lysosomal & TFEB\"=\"cyan\",\"NS or log2FC < 2.0\"=\"gray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
theme_bw(base_size = 16) + \
theme(legend.position = \"bottom\")"
echo "dev.off()"
done
library(ggplot2)
library(ggrepel)
geness_res <- read.csv(file = paste("EV_vs_wt", "all.txt", sep="-"), row.names=1)
external_gene_name <- rownames(geness_res)
geness_res <- cbind(geness_res, external_gene_name)
#top_g <- c("hsa-miR-10b-5p","hsa-miR-1226-5p","hsa-miR-30c-5p","hsa-mir-182","hsa-miR-182-5p", "hsa-mir-1291","hsa-miR-1291","hsa-mir-3607","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-1244","hsa-mir-1248")
top_g <- c("hsa-mir-3607","hsa-mir-1291","hsa-mir-3653","hsa-miR-3607-3p","hsa-miR-30c-5p","hsa-miR-10b-5p","hsa-miR-182-5p","hsa-miR-1291","hsa-mir-182","hsa-miR-1244","hsa-mir-1248","hsa-miR-1226-5p","hsa-mir-1224","hsa-miR-423-5p","hsa-miR-3653-3p","hsa-miR-375","hsa-miR-15a-3p","hsa-mir-3651","hsa-miR-1246","hsa-miR-4521","hsa-miR-30a-5p","hsa-miR-425-5p","hsa-miR-9-5p","hsa-miR-664a-5p","hsa-let-7f-5p","hsa-miR-146b-5p","hsa-miR-320a","hsa-miR-3607-5p","hsa-mir-3687-1","hsa-mir-3687-2","hsa-let-7a-3","hsa-mir-6723","hsa-miR-342-3p")
subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
#geness_res$Color[geness_res$external_gene_name %in% intersect_891_471$Gene] <- "lysosomal & TFEB"
#geness_res$Color[geness_res$external_gene_name %in% G891_only$Gene] <- "lysosomal"
#geness_res$Color[geness_res$external_gene_name %in% G471_only$Gene] <- "TFEB"
#geness_res$Color <- factor(geness_res$Color, levels = c("NS or log2FC < 2.0", "P-adj < 0.05", "lysosomal & TFEB", "lysosomal", "TFEB"))
write.csv(geness_res, "EV_vs_wt_with_Category.csv")
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
#top_g <- c()
#top_g <- c(top_g, geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])
#top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]
#png("EV_vs_wt.png",width=1200, height=2000)
svg("EV_vs_wt.svg",width=12, height=14)
ggplot(geness_res, aes(x = log2FoldChange, y = -log10(pvalue), color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") + geom_hline(yintercept = -log10(0.05), lty = "dashed") + geom_point() + labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") + scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 16) + theme(legend.position = "bottom")
dev.off()
#print all top_g-gene labels including
svg("EV_vs_wt.svg",width=12, height=14)
ggplot(geness_res, aes(x = log2FoldChange, y = -log10(pvalue), color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") + geom_hline(yintercept = -log10(0.05), lty = "dashed") + geom_point() + labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") + scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 16) + theme(legend.position = "bottom")
dev.off()
###################################################################
##### STEP3: prepare all_genes #####
rld <- rlogTransformation(dds)
mat <- assay(rld)
mm <- model.matrix(~replicates, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
assay(rld) <- mat
RNASeq.NoCellLine <- assay(rld)
# reorder the columns
colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
col.order <-c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]
#Option1: filter to genes with overall lfc > 2
datamat <- RNASeq.NoCellLine[apply(RNASeq.NoCellLine,1,function(x){max(x)-min(x)})>=2,]
#Option2: not using the automatical comparing between max and min, rather than using manually selected from DESeq2 between conditions.
RNASeq.NoCellLine_ <- RNASeq.NoCellLine[top_g,]
colnames(RNASeq.NoCellLine_) <- c("MKL-1 Cell", "WaGa Cell", "MKL-1 EVs", "WaGa EVs")
datamat <- RNASeq.NoCellLine_
#Option3: take all_genes
datamat <- RNASeq.NoCellLine
#Option4: manully defining
for i in EV_vs_parental sT_Dox_vs_sT_DMSO sT_Dox_vs_scr_Dox scr_Dox_vs_scr_DMSO sT_DMSO_vs_scr_DMSO; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
datamat = RNASeq.NoCellLine[GOI, ]
######################################################################
##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.05)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2", "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("miRNA_heatmap.png", width=800, height=1000)
#svg("DEGs_heatmap.svg", width=6, height=8)
heatmap.2(as.matrix(datamat),
Rowv=as.dendrogram(hr),
Colv=NA,
dendrogram='row',
labRow="",
scale='row',
trace='none',
col=bluered(75),
RowSideColors=mycol,
srtCol=20,
lhei=c(1,8),
#cexRow=1.2, # Increase row label font size
cexCol=1.7, # Increase column label font size
margin=c(7,1)
)
dev.off()
#ColSideColors = sampleCols,
heatmap.2(datamat,Rowv=as.dendrogram(hr),Colv=as.dendrogram(hc),
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="",
main=sprintf('%s', "RNA-Seq for all DEGs"), srtCol=30)
dev.off()
#margin=c(5,5),
heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none",, sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.4, lhei=c(0.25,5))
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none')
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(10,10), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', cexRow=1.2, lhei=c(0.1,5))
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o genelist_clusters.xls
display viral transcripts found in mRNA-seq (or small RNA) MKL-1, WaGa EVs compared to parental cells (http://xgenes.com/article/article-content/87/display-viral-transcripts-found-in-mrna-seq-mkl-1-waga-evs-compared-to-cells/)
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq skin organoids on GRCh38+chrHsv1 (final)
© 2023 XGenes.com Impressum