Author Archives: gene_x

ChIP-seq using HOMER (-stype histone, macs2 or SICER + custom getDifferentialPeaksReplicates_broad[narrow].pl)

  1. nextflow processing data

    (chipseq) jhuang@hamm:/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/Histone-ChIP_hg38/H3K27ac_H3K4me1_public$ nextflow run NGI-ChIPseq/main.nf --reads '/mnt/h1/jhuang/DATA/Data_Denise_LT_DNA_Binding/Histone-ChIP_hg38/H3K27ac_H3K4me1_public/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --saveReference --saveAlignedIntermediates --singleEnd --blacklist_filtering -profile standard --project NHDF_enhancer_analysis_hg38 -resume
    
    ./picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam
    ./picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam
    ./picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam
    ./picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam
    
    ./picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam
    ./picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam
    ./picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam
    ./picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam
    
    ./picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bam
    ./picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bam
    ./picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bam
    ./picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bam
  2. make homer directories and findPeaks with HOMER under (conda homer, but note that homer is installed under Tools manually!)

    #HOMER will be installed in the same directory that you place the configureHomer.pl program.  configureHomer.pl will attempt to check for required utilities and alert you to missing programs.
    1. mkdir ~/Tools/homer; cd ~/Tools/homer
    2. wget http://homer.ucsd.edu/homer/configureHomer.pl
    3. Run the configureHomer.pl script to install homer: perl configureHomer.pl -install
    4. Add the homer/bin directory to your executable path.  For example, edit your ~/.bashrc file to include the line:
    PATH=$PATH:/home/jhuang/Tools/homer/bin/
    5. Reset your terminal so that the changes to the PATH variable take effect: source ~/.bashrc
    
    #jhuang@hamm:~/Tools/homer/bin$ wget http://homer.ucsd.edu/homer/configureHomer.pl
    #configureHomer.pl -install
    #chmod +x /home/jhuang/miniconda3/envs/homer/bin/configureHomer.pl
    #ls ~/miniconda3/envs/homer/bin
    #echo $PATH
    #If /home/jhuang/miniconda3/envs/homer/bin is not in the output, you'll need to add it to your $PATH.
    
    chmod +x ~/Tools/homer/configureHomer.pl
    ./configureHomer.pl -install hg38
    ./configureHomer.pl -list
    
    #conda update -n base -c defaults conda
    conda create -n homer
    conda activate homer
    #(NOT_USE!) conda install -c bioconda homer
    mamba install r-essentials bioconductor-deseq2 
    mamba install samtools
    #bioconductor-edger wget 
    #http://homer.ucsd.edu/homer/introduction/update.html
    #http://homer.ucsd.edu/homer/introduction/install.html
    conda install -c bioconda ucsc-bedgraphtobigwig # install bedGraphToBigWig on @homer
    conda install -c bioconda macs2
    conda install -c anaconda pandas
    
    #under (myperl) on computer @hamburg
    mkdir homer bigWigs macs2 sicer
    cd homer
    
    #Why do I need give "-genome hg38" in makeTagDirectory? If you don't provide a genome with the -genome option, HOMER will only count the number of tags in each region without any genomic context or sequence information. #So, it is essential to include this information when creating a tag directory if you plan to perform any genome-based analysis.
    
    makeTagDirectory p600_601_d12_D1_input ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p600_601_d9_D2_input ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D1_input ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D2_input ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam -genome hg38
    
    makeTagDirectory p600_601_d12_D1_H3K4me3 ../results/picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p600_601_d9_D2_H3K4me3 ../results/picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D1_H3K4me3 ../results/picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D2_H3K4me3 ../results/picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam -genome hg38
    
    makeTagDirectory p600_601_d12_D1_H3K27me3 ../results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p600_601_d9_D2_H3K27me3 ../results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D1_H3K27me3 ../results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bam -genome hg38
    makeTagDirectory p602_d8_D2_H3K27me3 ../results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bam -genome hg38
    
    for sample in p600_601_d12_D1_input p600_601_d9_D2_input p602_d8_D1_input p602_d8_D2_input p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3; do
        makeUCSCfile ${sample} -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq    #-pseudo 1 -norm 1e7 -normLength 100 -fsize 1
        mv ${sample}/${sample}.ucsc.bigWig ../bigWigs/
    done
    #Note that normLength should possibly read the fragmentation length from phantompeakqualtools. Easier to set all libraries to the default value: 100.
    #To normalize to a fragment length of 150, you would use -normLength 150.
    #To turn off normalization, you would use -normLength 0.
    
    # -- not necessary any more: using MACS2 and SICER instead of using findPeaks ------------------------> go STEPs 4.1 and 5.1.
    # #factor (transcription factor ChIP-Seq, uses -center, output: peaks.txt,  default)
    # #histone (histone modification ChIP-Seq, region based, uses -region -size 500 -L 0, regions.txt)
    # for sample in p601_d8_D1 p601_d8_D2 p604_d8_D1 p604_d8_D2; do
    #   #Finding peaks of size 1000, no closer than 2000
    #   findPeaks ${sample}_H3K4me3 -style factor -size 1000  -o auto -i ${sample}_input
    #   #-minDist <#> (minimum distance between peaks, default: peak size x2)
    #   #findPeaks ${sample}_H3K27me3 -style histone -region -size 3000 -minDist 5000 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K27ac -style factor -size 200 -minDist 200 -o auto -i ${sample}_input
    #   #findPeaks ${sample}_H3K4me1 -style histone -region -size 1000 -minDist 2500 -o auto -i ${sample}_input
    # done
    
    ./p601_d8_D1_H3K4me3/peaks.txt
    ./p601_d8_D2_H3K4me3/peaks.txt
    ./p604_d8_D1_H3K4me3/peaks.txt
    ./p604_d8_D2_H3K4me3/peaks.txt
    
    ./p601_d8_D1_H3K27me3/regions.txt
    ./p601_d8_D2_H3K27me3/regions.txt
    ./p604_d8_D1_H3K27me3/regions.txt
    ./p604_d8_D2_H3K27me3/regions.txt
    
    for dir in p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/peaks.txt > ${dir}_peaks.bed
    grep -v "#" ${dir}_peaks.bed | sort -k1,1 -k2,2n > ${dir}_sorted_peaks.bed
    done
    
    for dir in p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3; do
    awk -v OFS='\t' '{print $2, $3, $4, $1, $6}' ./${dir}/regions.txt > ${dir}_regions.bed
    grep -v "#" ${dir}_regions.bed | sort -k1,1 -k2,2n > ${dir}_sorted_regions.bed
    done
    
    #DEBUG: why the bam files so small?
    makeTagDirectory NHDF-Ad_Control_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_Control_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_Control_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K27ac_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K27ac_r2.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r1 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r1.dedup.sorted.bam -genome hg38
    makeTagDirectory NHDF-Ad_H3K4me1_r2 /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/results_ChIPseq_histone_hg38/H3K27ac_H3K4me1_public/results/picard/NHDF_H3K4me1_r2.dedup.sorted.bam -genome hg38
    
    NHDF-Ad_Control_r1 NHDF-Ad_Control_r2 NHDF-Ad_H3K27ac_r1 NHDF-Ad_H3K27ac_r2 NHDF-Ad_H3K4me1_r1 NHDF-Ad_H3K4me1_r2
    
    > (myperl) environments for HOMER, ~/Tools/diffreps/bin/diffReps.pl, MACS2, ~/Tools/SICER1.1/SICER/SICER.sh

2.5. diffbind (NOT_USED) https://hbctraining.github.io/Intro-to-ChIPseq/lessons/08_diffbind_differential_peaks.html #–>identifying statistically significantly differentially bound sites https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02686-y

 #samplesheet.csv
 #SampleID, Condition, Peaks, Counts
 #Sample1, Control, sample1_peaks.bed, sample1.bam
 #Sample2, Treatment, sample2_peaks.bed, sample2.bam
 library(DiffBind)
 samples <- dba(sampleSheet="samplesheet.csv")

3.0. combine the diffReps.pl and HOMER annotatePeaks.pl, so we can use for hg38 and mm10 and so on (IMPORTANT, DON’T NEED –gname hg38, run diffReps.pl –> bed_file –> annotatePeaks.pl it with HOMER)

    #Dynamic regions were defined as MACS (H3K4me3, H3K27ac) or SICER (H3K4me1, H3K27me3) peaks overlapping significantly (≥ 2-fold change, adjusted P-value ≤ 0.05) up- or down-regulated differentially enriched regions from diffReps in the three pairwise comparisons WAC vs mock, WA314 vs mock and WAC vs WA314.

    #STEP1
    #--> not given "--gname hg38"
    ## Step4: Annotate differential sites.
    #unless($noanno or $gname eq ''){
    #        `region_analysis.pl -i $report -r -d refseq -g $gname`;
    #}
    ## Step5: Look for hotspots.
    #unless($nohs){
    #        my $hotspot = $report . '.hotspot';
    #        `findHotspots.pl -d $report -o $hotspot`;
    #}
    ~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed  --report output_results  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp --noanno

    ~/Tools/diffreps/bin/diffReps.pl -tr ../results/picard/V_8_1_6_p601_d8_D1_H3K4me3.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_H3K4me3.dedup.sorted.bed -co ../results/picard/V_8_1_6_p601_d8_D1_input.dedup.sorted.bed ../results/picard/V_8_1_5_p601_d8_D2_input.dedup.sorted.bed  --report p602_vs_p600_601_H3K4me3_diff_out  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd sharp --noanno
    ~/Tools/diffreps/bin/diffReps.pl --treatment ./results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed ./results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed --btr ./results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bed ./results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bed --control ./results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed ./results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed --bco ./results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed ./results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed  --report p602_vs_p600_601_H3K27me3_diff_out  --chrlen /home/jhuang/REFs/hg38.chromSizes --nsd broad --mode block --window 10000 --gap 30000 --noanno

    #I have utilized 'diffReps' for this analysis because the methods I previously mentioned in my last email were not effective in identifying sites with statistically significant differential binding. 'diffReps' appears to be a more suitable tool for our needs. Below is the command I used. It's important to note that 'diffReps' typically requires replicates as input and outputs a set of peaks/regions.

    #STEP2
    #replace Chr to '#Chr'
    grep -v "#" output_results | sort -k1,1 -k2,2n > output_results_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' output_results_ > H3K4me3.bed
    #grep -v "#" H3K4me3.bed | sort -k1,1 -k2,2n > H3K4me3_sorted_peaks.bed

    grep -v "#" p602_vs_p600_601_H3K4me3_diff_out | sort -k1,1 -k2,2n > p602_vs_p600_601_H3K4me3_diff_out_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' p602_vs_p600_601_H3K4me3_diff_out_ > p602_vs_p600_601_H3K4me3.bed
    grep -v "#" p602_vs_p600_601_H3K27me3_diff_out | sort -k1,1 -k2,2n > p602_vs_p600_601_H3K27me3_diff_out_
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, "diffreps_peak_"NR, $12}' p602_vs_p600_601_H3K27me3_diff_out_ > p602_vs_p600_601_H3K27me3.bed

    #STEP3 (under myperl) peak calling macs2 for narrow peaks, CISER for broad peaks!
    #process the output of diffReps.pl to BED file.
    annotatePeaks.pl H3K4me3.bed hg38 > H3K4me3_annotated_peaks.txt

    annotatePeaks.pl p602_vs_p600_601_H3K4me3.bed hg38 > p602_vs_p600_601_H3K4me3_annotated_peaks.txt
    annotatePeaks.pl p602_vs_p600_601_H3K27me3.bed hg38 > p602_vs_p600_601_H3K27me3_annotated_peaks.txt
    (head -n 1 p602_vs_p600_601_H3K4me3_annotated_peaks.txt && tail -n +2 p602_vs_p600_601_H3K4me3_annotated_peaks.txt | sort -t "_" -k 3 -n) > p602_vs_p600_601_H3K4me3_annotated_peaks_.txt
    (head -n 1 p602_vs_p600_601_H3K27me3_annotated_peaks.txt && tail -n +2 p602_vs_p600_601_H3K27me3_annotated_peaks.txt | sort -t "_" -k 3 -n) > p602_vs_p600_601_H3K27me3_annotated_peaks_.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py p602_vs_p600_601_H3K4me3_annotated_peaks_.txt -d$'\t' -o p602_vs_p600_601_H3K4me3_annotated_peaks.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py p602_vs_p600_601_H3K27me3_annotated_peaks_.txt -d$'\t' -o p602_vs_p600_601_H3K27me3_annotated_peaks.xls

4.0. combine macs2 to getDifferentialPeaksReplicates.pl

    replace the initial peak identification by using your MACS2 output.

    #http://homer.ucsd.edu/homer/ngs/diffExpression.html
    #getDifferentialPeaksReplicates.pl = findPeaks + annotatePeaks.pl + getDiffExpression.pl 
    #annotatePeaks.pl tss hg38 -raw -d H3K4me3-Mock-rep1/ H3K4me3-Mock-rep2/ H3K4me3-WNT-rep1/ H3K4me3-WNT-rep3/ > countTable.peaks.txt

    Here's an outline of how we might be able to replace the initial peak identification by using your MACS2 output.

    #TODO: using MACS call peaks of the data H3K27ac.

4.1. MACS2 peak calling

  # -- macs2 --> bed --> annotatePeaks.pl
  conda activate homer
  cd macs2
  macs2 callpeak -t ../results/picard/V_8_3_1_p600_601_d12_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bam -f BAM -g hs -n p600_601_d12_D1 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_3_2_p600_601_d9_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bam -f BAM -g hs -n p600_601_d9_D2 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_4_2_p602_d8_D1_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bam -f BAM -g hs -n p602_d8_D1 -q 0.05
  macs2 callpeak -t ../results/picard/V_8_4_1_p602_d8_D2_H3K4me3.dedup.sorted.bam -c ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bam -f BAM -g hs -n p602_d8_D2 -q 0.05

  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p600_601_d12_D1_peaks.narrowPeak > p600_601_d12_D1_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p600_601_d9_D2_peaks.narrowPeak > p600_601_d9_D2_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p602_d8_D1_peaks.narrowPeak > p602_d8_D1_peaks.bed
  awk 'OFS="\t" {print $1, $2, $3, $4, $5}' p602_d8_D2_peaks.narrowPeak > p602_d8_D2_peaks.bed

  #annotatePeaks.pl p601_d8_D1_peaks.bed hg38 > p601_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p601_d8_D2_peaks.bed hg38 > p601_d8_D2_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D1_peaks.bed hg38 > p604_d8_D1_annotated_peaks.txt
  #annotatePeaks.pl p604_d8_D2_peaks.bed hg38 > p604_d8_D2_annotated_peaks.txt

4.2. Convert your MACS2 peaks to HOMER-compatible format. You can do this manually or with a script. For example:

It’s possible to use more information from the MACS2 output file to create a more informative peaks.txt file for HOMER. However, it’s important to note that some information that HOMER needs for its differential peak analysis is not available in the MACS2 output (such as Normalized Tag Count, Control Tags, and others). But we can certainly map more of the available MACS2 columns to the corresponding HOMER columns.

  #The following awk command can be used to convert more MACS2 information into the HOMER format:

  cd macs2
  #awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"Peak_"NR,$5,$6,$7,$8,$9,$10}' macs2_peaks.bed > macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p600_601_d12_D1_peaks.xls > p600_601_d12_D1_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p600_601_d9_D2_peaks.xls > p600_601_d9_D2_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p602_d8_D1_peaks.xls > p602_d8_D1_macs2_peaks.txt
  awk 'BEGIN{OFS="\t"} NR > 1 {print $10, $1, $2, $3, "+", "0", "0", $6, $6, "0", $8, $5, $9, "0", "0"}' p602_d8_D2_peaks.xls > p602_d8_D2_macs2_peaks.txt

  This command will:

      * Skip the header line (NR > 1)
      * Map the MACS2 peak name ($10) to the HOMER PeakID
      * Map the MACS2 chromosome, start, and end ($1, $2, $3) to the HOMER chr, start, end
      * Use a placeholder "+" for the HOMER strand
      * Use a placeholder "0" for the HOMER Normalized Tag Count and Focus Ratio
      * Map the MACS2 pileup ($6) to the HOMER findPeaks Score and Total Tags
      * Use a placeholder "0" for the HOMER Control Tags
      * Map the MACS2 fold_enrichment ($8) to the HOMER Fold Change vs Control
      * Map the MACS2 abs_summit ($5) to the HOMER p-value vs Control
      * Map the MACS2 -log10(qvalue) ($9) to the HOMER Fold Change vs Local
      * Use a placeholder "0" for the HOMER p-value vs Local and Clonal Fold Change

  This script is limited by the differences in the information provided by MACS2 and HOMER. While it makes use of as much information as possible from the MACS2 output, some columns in the HOMER format still have to be filled with placeholder values.

4.3. Associate the converted peak files with their respective tag directories. In HOMER, peak files can be associated with a tag directory by placing them in the tag directory with the filename “peaks.txt”.

  #mv homer/p600_601_d12_D1_H3K4me3/peaks.txt homer/p600_601_d12_D1_H3K4me3/peaks_raw.txt
  #mv homer/p600_601_d9_D2_H3K4me3/peaks.txt homer/p600_601_d9_D2_H3K4me3/peaks_raw.txt
  #mv homer/p602_d8_D1_H3K4me3/peaks.txt homer/p602_d8_D1_H3K4me3/peaks_raw.txt
  #mv homer/p602_d8_D2_H3K4me3/peaks.txt homer/p602_d8_D2_H3K4me3/peaks_raw.txt
  cp macs2/p600_601_d12_D1_macs2_peaks.txt homer/p600_601_d12_D1_H3K4me3/peaks.txt  
  cp macs2/p600_601_d9_D2_macs2_peaks.txt homer/p600_601_d9_D2_H3K4me3/peaks.txt    
  cp macs2/p602_d8_D1_macs2_peaks.txt homer/p602_d8_D1_H3K4me3/peaks.txt
  cp macs2/p602_d8_D2_macs2_peaks.txt homer/p602_d8_D2_H3K4me3/peaks.txt
  #33896 p600_601_d12_D1_peaks.bed
  #47977 p600_601_d9_D2_peaks.bed
  #59049 p602_d8_D1_peaks.bed
  #33730 p602_d8_D2_peaks.bed
  #CHECK and DELETE headers including "name ..." in the new peaks.txt files!

  #Repeat this for each of your tag directories.

4.4. The program getDifferentialPeaksReplicates will essentially perform 3 steps, in the step 2 was modified.

First, it will pool the target tag directories and input directories separately into pooled experiments and perform an initial peak identification (using findPeaks). Pooling the experiments is generally more sensitive than trying to merge the individual peak files coming from each experiment (although this can be done using the “-use ” option if each directory already has a peak file associated with it). Next, it will quantify the reads at the initial putative peaks across each of the target and input tag directories using annotatePeaks.pl. Finally, it calls getDiffExpression.pl and ultimately passes these values to the R/Bioconductor package DESeq2 to calculate enrichment values for each peak, returning only those peaks that pass a given fold enrichment (default: 2-fold) and FDR cutoff (default 5%). We can run getDifferentialPeaksReplicates.pl with the -use option to specify that the provided peaks should be used instead of calling findPeaks:

        #-- Successful modification of the script getDifferentialPeaksReplicates.pl (Explain how to archieve getDifferentialPeaksReplicates.pl, only for DEBUG, for calculation directly run getDifferentialPeaksReplicates.pl!) --
        #The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers
        #change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl
        #mergePeaks -d 500 [100,1000] temp_sorted | sort

        #conda list homer  #4.11
        mergePeaks -d 2000 p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt > mergePeaks_res1.txt  # get 37569 records 
        mergePeaks -d 2000 p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt > mergePeaks_res2.txt  # get 44076 records 

  #(homer) jhuang@hamburg:~/DATA/Data_Denise_LT_DNA_Bindung/results_chipseq_histone_hg38/H3K4me3_H3K27ac__H3K27me3_H3K9me3/homer$
  #getDifferentialPeaksReplicates.pl -use 
/peaks.txt,/peaks.txt … #NOTE that the H3K4me3 has zu viele peaks, we can use the slightly modified getDifferentialPeaksReplicates_narrow.pl, “-d 2000” ./getDifferentialPeaksReplicates_narrow.pl -t p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 -i p600_601_d12_D1_input p600_601_d9_D2_input -genome hg38 -use peaks.txt -style histone > p600_601_d9d12_H3K4me3_macs2_peaks.txt #p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt Total Name # X 11530 p600_601_d9_D2_H3K4me3/peaks.txt #X 2158 p600_601_d12_D1_H3K4me3/peaks.txt #X X 23881 p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt # Using DESeq2 to calculate differential expression/enrichment… # Output Stats input vs. target: # Total Genes: 37569 # Total Up-regulated in target vs. input: 26340 (70.111%) [log2fold>1, FDR<0.05] # Total Dn-regulated in target vs. input: 1 (0.003%) [log2fold<-1, FDR<0.05] ./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 -i p602_d8_D1_input p602_d8_D2_input -genome hg38 -use peaks.txt -style histone > p602_d8_H3K4me3_macs2_peaks.txt #p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt Total Name # X 270 p602_d8_D2_H3K4me3/peaks.txt #X 24939 p602_d8_D1_H3K4me3/peaks.txt #X X 18867 p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt # Using DESeq2 to calculate differential expression/enrichment… # Output Stats input vs. target: # Total Genes: 44076 # Total Up-regulated in target vs. input: 24203 (54.912%) [log2fold>1, FDR<0.05] # Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] #"-d 1000" #p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt Total Name # X 14165 p600_601_d9_D2_H3K4me3/peaks.txt #X 3910 p600_601_d12_D1_H3K4me3/peaks.txt #X X 26162 p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt #Using DESeq2 to calculate differential expression/enrichment... # Output Stats input vs. target: # Total Genes: 44237 # Total Up-regulated in target vs. input: 32706 (73.934%) [log2fold>1, FDR<0.05] # Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] #p602_d8_D1_H3K4me3/peaks.txt p602_d8_D2_H3K4me3/peaks.txt Total Name # X 2825 p602_d8_D2_H3K4me3/peaks.txt #X 29258 p602_d8_D1_H3K4me3/peaks.txt #X X 20829 p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt #Using DESeq2 to calculate differential expression/enrichment... #Output Stats input vs. target: # Total Genes: 52912 # Total Up-regulated in target vs. input: 32484 (61.393%) [log2fold>1, FDR<0.05] # Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] 4.5. Draw plots #grep "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | wc -l #grep -v "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | grep "p600_601_d12_D1_H3K4me3/peaks.txt" | wc -l #grep -v "p600_601_d12_D1_H3K4me3/peaks.txt|p600_601_d9_D2_H3K4me3/peaks.txt" p600_601_d9d12_H3K4me3_macs2_peaks.txt | grep "p600_601_d9_D2_H3K4me3/peaks.txt" | wc -l #grep "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | wc -l #grep -v "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | grep "p602_d8_D1_H3K4me3/peaks.txt" | wc -l #grep -v "p602_d8_D1_H3K4me3/peaks.txt|p602_d8_D2_H3K4me3/peaks.txt" p602_d8_H3K4me3_macs2_peaks.txt | grep "p602_d8_D2_H3K4me3/peaks.txt" | wc -l import matplotlib.pyplot as plt from matplotlib_venn import venn2 venn2(subsets=(1808, 5774, 25124), set_labels=('Donor 1', 'Donor 2')) plt.title('Peaks between p600+p601 d9 or d12 H3K4me3 Donors') plt.xlabel('Number of Elements') plt.ylabel('') plt.savefig('Peak_Consistency_Between_p600+p601_d9d12_H3K4me3_Donors.png', dpi=300, bbox_inches='tight') import matplotlib.pyplot as plt from matplotlib_venn import venn2 venn2(subsets=(9100, 2761, 20623), set_labels=('Donor 1', 'Donor 2')) plt.title('Peaks between p602 d8 H3K4me3 Donors') plt.xlabel('Number of Elements') plt.ylabel('') plt.savefig('Peak_Consistency_Between_p602_d8_H3K4me3_Donors.png', dpi=300, bbox_inches='tight') awk 'NR>1 {print $2 “\t” $3 “\t” $4 “\t” $1}’ p600_601_d9d12_H3K4me3_macs2_peaks.txt > p600+p601_d9d12_H3K4me3_macs2_peaks.bed awk ‘NR>1 {print $2 “\t” $3 “\t” $4 “\t” $1}’ p602_d8_H3K4me3_macs2_peaks.txt > p602_d8_H3K4me3_macs2_peaks.bed ~/Tools/csv2xls-0.4/csv_to_xls.py p600_601_d9d12_H3K4me3_macs2_peaks.txt -d$’\t’ -o p600+p601_d9d12_H3K4me3_macs2_peaks.xls ~/Tools/csv2xls-0.4/csv_to_xls.py p602_d8_H3K4me3_macs2_peaks.txt -d$’\t’ -o p602_d8_H3K4me3_macs2_peaks.xls 4.6. identifying statistically significantly differentially bound sites based on evidence of binding affinity (measured by differences in read densities) ##http://homer.ucsd.edu/homer/ngs/peaksReplicates.html #getDifferentialPeaksReplicates.pl -t Oct4-r1/ Oct4-r2/ -b Sox2-r1/ Sox2-r2/ -i Input-r1/ Input-r2/ > outputPeaks.txt ##first get TSS regions using annotatePeaks.pl #annotatePeaks.pl tss hg38 > tss.txt #getDifferentialPeaksReplicates.pl -t H3K27ac-WNT-r1/ H3K27ac-WNT-r2/ -b H3K27ac-notx-r1 H3K27ac-notx-r2/ -p tss.txt > outputPeaks.txt #- Donor I p602 (d8) vs p600+601 (d12) H3K4me3 #- Donor II p602 (d8) vs p600+601 (d9) H3K4me3 ./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 p602_d8_D2_H3K4me3 -b p600_601_d12_D1_H3K4me3 p600_601_d9_D2_H3K4me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d9d12_H3K4me3_diff_peaks.txt #./getDifferentialPeaksReplicates_narrow.pl -t p602_d8_D1_H3K4me3 -b p600_601_d12_D1_H3K4me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d12_D1_H3K4me3_diff_peaks.txt 5.1. SICER peak calling (under env myperl) # — SICER –> bed –> annotatePeaks.pl mkdir sicer; cd sicer; ln -s ../results/picard/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed . ln -s ../results/picard/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed . ln -s ../results/picard/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed . ln -s ../results/picard/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed . ln -s ../results/picard/V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed . ln -s ../results/picard/V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed . ln -s ../results/picard/V_8_4_2_p602_d8_D1_input.dedup.sorted.bed . ln -s ../results/picard/V_8_4_1_p602_d8_D2_input.dedup.sorted.bed . #/home/jhuang/Tools/SICER1.1/SICER/SICER.sh [InputDir] [bed file] [control file] [OutputDir] [Species] [redundancy threshold] [window size (bp)] [fragment size] [effective genome fraction] [gap size (bp)] [FDR] # 10000 is window size, 30000 is the gap size, 160 is the fragment size mkdir p600_601_d12_D1 p600_601_d9_D2 p602_d8_D1 p602_d8_D2 ~/Tools/SICER1.1/SICER/SICER.sh . V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted.bed V_8_3_1_p600_601_d12_D1_input.dedup.sorted.bed p600_601_d12_D1 hg38 1 10000 160 0.74 30000 0.01; ~/Tools/SICER1.1/SICER/SICER.sh . V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted.bed V_8_3_2_p600_601_d9_D2_input.dedup.sorted.bed p600_601_d9_D2 hg38 1 10000 160 0.74 30000 0.01; ~/Tools/SICER1.1/SICER/SICER.sh . V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted.bed V_8_4_2_p602_d8_D1_input.dedup.sorted.bed p602_d8_D1 hg38 1 10000 160 0.74 30000 0.01; ~/Tools/SICER1.1/SICER/SICER.sh . V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted.bed V_8_4_1_p602_d8_D2_input.dedup.sorted.bed p602_d8_D2 hg38 1 10000 160 0.74 30000 0.01; #sicer -t treatment.bed -c control.bed -s hg38 -w 200 -rt 1 -f 150 -egf 0.74 -fdr 0.01 -g 600 -e 1000 #———– to STEP 5.2-3. ———–> #TODO: – check if the peak calling works well in IGV! # – call peaks for H3K27me1, adjust the SICER-parameters! # – transform them peaks.txt of HOMER, and call getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K27me3 p601_d8_D2_H3K27me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K27me3_sicer_peaks.txt getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K27me3 p604_d8_D2_H3K27me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K27me3_sicer_peaks.txt #Note that histone using ‘cat file1.bed file2.bed | sort -k1,1 -k2,2n | bedtools merge > merged.bed’ #Note that factor using HOMER getDifferentialPeaksReplicates from begining: “mergePeaks.pl –> DESeq recheck”! mergePeaks sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed -d 10000 > mergedPeaks1.txt cat sicer/p601_d8_D1/V_8_1_6_p601_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed sicer/p601_d8_D2/V_8_1_5_p601_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-FDR0.01-island.bed | sort -k1,1 -k2,2n | bedtools merge > merged_p601_d8.bed #4957 (4388, 4822) awk ‘BEGIN{OFS=”\t”; print “PeakID\tchr\tstart\tend\tstrand”} {print “Peak-“NR, $1, $2, $3, “.”}’ merged_p601_d8.bed > homer_peaks.bed #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!!!!!!: The bedtools merge command does not generate output in the HOMER peak format directly. However, you can use a combination of Unix command line tools (e.g., awk, sort) to convert bedtools output into HOMER format. #bedtools merge -i input.bed > merged.bed #awk ‘BEGIN{OFS=”\t”; print “PeakID\tchr\tstart\tend\tstrand”} {print “Peak-“NR, $1, $2, $3, “.”}’ merged.bed > homer_peaks.bed 5.2-3. Convert your SICER peaks to HOMER-compatible format, and replace them in homer/${sample}/peaks.txt. You can do this manually or with a script. – The following awk command can be used to convert more MACS2 information into the HOMER format: #mv homer/p601_d8_D1_H3K27me3/regions.txt homer/p601_d8_D1_H3K27me3/regions_raw.txt #mv homer/p601_d8_D2_H3K27me3/regions.txt homer/p601_d8_D2_H3K27me3/regions_raw.txt #mv homer/p604_d8_D1_H3K27me3/regions.txt homer/p604_d8_D1_H3K27me3/regions_raw.txt #mv homer/p604_d8_D2_H3K27me3/regions.txt homer/p604_d8_D2_H3K27me3/regions_raw.txt #NOTE that we should name the file as peaks.txt, since the Input-samples contain only peaks.txt. awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p600_601_d12_D1/V_8_3_1_p600_601_d12_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p600_601_d12_D1_H3K27me3/peaks.txt awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p600_601_d9_D2/V_8_3_2_p600_601_d9_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p600_601_d9_D2_H3K27me3/peaks.txt awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p602_d8_D1/V_8_4_2_p602_d8_D1_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p602_d8_D1_H3K27me3/peaks.txt awk ‘BEGIN{OFS=”\t”} {print $1″-“$2”-“$3,$1,$2,$3,”+”,$4,0,0,$4,$5,0,$6,0}’ sicer/p602_d8_D2/V_8_4_1_p602_d8_D2_H3K27me3.dedup.sorted-W10000-G30000-islands-summary-FDR0.01 > homer/p602_d8_D2_H3K27me3/peaks.txt #The format is similar to the following. #chr1-83000-89999 chr1 83000 89999 + 21 0 0 21 27 0 5.623763510806874e-10 0 0 0 #chr1-859000-865999 chr1 859000 865999 + 23 0 0 23 28 0 3.501056563888787e-11 0 0 0 This command will: * PeakID: Format it as “chr-start-end”. * chr: Same as SICER. * start: Same as SICER. * end: Same as SICER. * strand: SICER doesn’t provide strand information, we could fill with a default value ‘+’. * Normalized Tag Count: Same as the normalized read count column in SICER. * focus ratio: SICER doesn’t provide this, we could fill with a default value ‘0’. * findPeaks Score: Corresponds to Fold Change or FDR column in SICER. * Total Tags: SICER doesn’t provide this, we could fill with a default value ‘0’. * Control Tags (normalized to IP Experiment): Same as Clonal reads or local background column in SICER. * Fold Change vs Control: SICER doesn’t provide this, we could fill with a default value ‘0’. * p-value vs Control: Same as the p-value column in SICER. * Fold Change vs Local: SICER doesn’t provide this, we could fill with a default value ‘0’. * p-value vs Local: SICER doesn’t provide this, we could fill with a default value ‘0’. * Clonal Fold Change: SICER doesn’t provide this, we could fill with a default value ‘0’. 5.4. The program getDifferentialPeaksReplicates_broad.pl will essentially perform 3 steps, in the step 2 was modified. conda activate myperl cd homer #diff /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl /home/jhuang/homer/bin/getDifferentialPeaksReplicates.pl #vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl #max_distance_to_merge at line 203 to 133000, “-d <#>” #– Successful modification of the script getDifferentialPeaksReplicates.pl (Explain how to archieve getDifferentialPeaksReplicates.pl, only for DEBUG, for calculation directly run getDifferentialPeaksReplicates.pl!) — #The -d parameter in the mergePeaks function in HOMER is used to specify the maximum distance between peak centers #change Max distance to merge to 30000 bp in getDifferentialPeaksReplicates.pl #mergePeaks -d 30000 [] temp_sorted | sort #conda list homer #4.11 mergePeaks p600_601_d12_D1_H3K27me3/peaks.txt p600_601_d9_D2_H3K27me3/peaks.txt > mergePeaks_res.txt python3 update_header.py cat p600_601_d12_D1_H3K4me3/peaks.txt p600_601_d9_D2_H3K4me3/peaks.txt > merged_peaks.txt awk ‘{print $2 “\t” $3 “\t” $4 “\t” $1}’ merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 1000 > bedtools_res.txt # get 36579 records using ‘bedtools merge’ python3 adjust_mergePeaks_res.py # limiting the start- and end-positons of mergePeaks_res.txt to the ranges in bedtools_res.txt, last very long! #check if the results are correct cut -d$’\t’ -f2-4 filtered_mergePeaks_res.txt > control1 diff control1 bedtools_res.txt #NOTE that the script getDifferentialPeaksReplicates_broad.pl is modified in the step2; the script used “update_header.py” and “adjust_mergedPeaks_res.py” #-d 35000 ./getDifferentialPeaksReplicates_broad.pl -t p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 -i p600_601_d12_D1_input p600_601_d9_D2_input -genome hg38 -use peaks.txt -style histone > p600_601_d9d12_H3K27me3_sicer_regions.txt ./getDifferentialPeaksReplicates_broad.pl -t p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3 -i p602_d8_D1_input p602_d8_D2_input -genome hg38 -use peaks.txt -style histone > p602_d8_H3K27me3_sicer_regions.txt #p600_601_d12_D1_H3K27me3/peaks.txt p600_601_d9_D2_H3K27me3/peaks.txt Total Name # X 1821 p600_601_d9_D2_H3K27me3/peaks.txt #X 1306 p600_601_d12_D1_H3K27me3/peaks.txt #X X 3981 p600_601_d12_D1_H3K27me3/peaks.txt|p600_601_d9_D2_H3K27me3/peaks.txt #Using DESeq2 to calculate differential expression/enrichment… #Output Stats input vs. target: # Total Genes: 4431 # Total Up-regulated in target vs. input: 2502 (56.466%) [log2fold>1, FDR<0.05] # Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] # #p602_d8_D1_H3K27me3/peaks.txt p602_d8_D2_H3K27me3/peaks.txt Total Name # X 1101 p602_d8_D2_H3K27me3/peaks.txt #X 1602 p602_d8_D1_H3K27me3/peaks.txt #X X 3668 p602_d8_D1_H3K27me3/peaks.txt|p602_d8_D2_H3K27me3/peaks.txt #Using DESeq2 to calculate differential expression/enrichment... #Output Stats input vs. target: # Total Genes: 4216 # Total Up-regulated in target vs. input: 2113 (50.119%) [log2fold>1, FDR<0.05] # Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] 5.5. Draw plots grep "p600_601_d12_D1_H3K27me3/peaks.txt|p600_601_d9_D2_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l # 2128 grep "p600_601_d12_D1_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l # 2162 - 2128 = 34 grep "p600_601_d9_D2_H3K27me3/peaks.txt" p600_601_d9d12_H3K27me3_sicer_regions.txt | wc -l # 2468 - 2128 = 340 # 340+34+2128 = 2502 grep "p602_d8_D1_H3K27me3/peaks.txt|p602_d8_D2_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l # 1840 grep "p602_d8_D1_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l # 1940 - 1840 = 100 grep "p602_d8_D2_H3K27me3/peaks.txt" p602_d8_H3K27me3_sicer_regions.txt | wc -l # 2013 - 1840 = 173 # 1840 + 100 + 173 = 2113 import matplotlib.pyplot as plt from matplotlib_venn import venn2 venn2(subsets=(34, 340, 2128), set_labels=('Donor 1', 'Donor 2')) plt.title('Regions between p602 d8 H3K27me3 Donors') plt.xlabel('Number of Elements') plt.ylabel('') plt.savefig('Region_Consistency_Between_p600+p601_d9d12_H3K27me3_Donors.png', dpi=300, bbox_inches='tight') import matplotlib.pyplot as plt from matplotlib_venn import venn2 venn2(subsets=(100, 173, 1840), set_labels=('Donor 1', 'Donor 2')) plt.title('Regions between p602 d8 H3K27me3 Donors') plt.xlabel('Number of Elements') plt.ylabel('') plt.savefig('Region_Consistency_Between_p602_d8_H3K27me3_Donors.png', dpi=300, bbox_inches='tight') awk 'NR>1 {print $2 “\t” $3 “\t” $4 “\t” $1}’ p600_601_d9d12_H3K27me3_sicer_regions.txt > p600+p601_d9d12_H3K27me3_sicer_regions.bed awk ‘NR>1 {print $2 “\t” $3 “\t” $4 “\t” $1}’ p602_d8_H3K27me3_sicer_regions.txt > p602_d8_H3K27me3_sicer_regions.bed ~/Tools/csv2xls-0.4/csv_to_xls.py p600_601_d9d12_H3K27me3_sicer_regions.txt -d$’\t’ -o p600+p601_d9d12_H3K27me3_sicer_regions.xls ~/Tools/csv2xls-0.4/csv_to_xls.py p602_d8_H3K27me3_sicer_regions.txt -d$’\t’ -o p602_d8_H3K27me3_sicer_regions.xls 5.6. identifying statistically significantly differentially bound sites based on evidence of binding affinity (measured by differences in read densities) #- Donor I p602 (d8) vs p600+601 (d12) H3K27me3 #- Donor II p602 (d8) vs p600+601 (d9) H3K27me3 ./getDifferentialPeaksReplicates_broad.pl -t p602_d8_D1_H3K27me3 p602_d8_D2_H3K27me3 -b p600_601_d12_D1_H3K27me3 p600_601_d9_D2_H3K27me3 -genome hg38 -use peaks.txt -style histone > p602_d8_vs_p600_601_d9d12_H3K27me3_regions.txt 7.0. LOGs: result examples using different -d values # — Max distance to merge: 35000 bp — getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_using_mergePeaks_d35000.txt Merging peaks… Comparing p601_d8_D1_H3K4me3/peaks.txt (32453 total) and p601_d8_D1_H3K4me3/peaks.txt (32453 total) Comparing p601_d8_D1_H3K4me3/peaks.txt (32453 total) and p601_d8_D2_H3K4me3/peaks.txt (37269 total) Comparing p601_d8_D2_H3K4me3/peaks.txt (37269 total) and p601_d8_D1_H3K4me3/peaks.txt (32453 total) Comparing p601_d8_D2_H3K4me3/peaks.txt (37269 total) and p601_d8_D2_H3K4me3/peaks.txt (37269 total) p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name X 2830 p601_d8_D2_H3K4me3/peaks.txt X 714 p601_d8_D1_H3K4me3/peaks.txt X X 12786 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt Step2: Quantifying reads across target/background/input tag directories Peak File Statistics: Total Peaks: 16324 Step3: Calling R for differential enrichment statistics (-DESeq2) Autodetecting input file format… Autodetected annotatePeaks.pl file Performing variance stabalization (rlog)… Using DESeq2 to calculate differential expression/enrichment… Output Stats input vs. target: Total Genes: 16324 Total Up-regulated in target vs. input: 7193 (44.064%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 1319 (8.080%) [log2fold<-1, FDR<0.05] # -- Max distance to merge: 1000 bp -- getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d1000.txt getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d1000.txt Step1: Defining Putative Peak Set p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name X 10726 p601_d8_D2_H3K4me3/peaks.txt X 4883 p601_d8_D1_H3K4me3/peaks.txt X X 23478 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt Step3: Calling R for differential enrichment statistics (-DESeq2) Using DESeq2 to calculate differential expression/enrichment… Output Stats input vs. target: Total Genes: 39087 Total Up-regulated in target vs. input: 28762 (73.585%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] p604_d8_D1_H3K4me3/peaks.txt p604_d8_D2_H3K4me3/peaks.txt Total Name X 9806 p604_d8_D2_H3K4me3/peaks.txt X 4611 p604_d8_D1_H3K4me3/peaks.txt X X 19586 p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt Using DESeq2 to calculate differential expression/enrichment... Output Stats input vs. target: Total Genes: 34003 Total Up-regulated in target vs. input: 25135 (73.920%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] # -- Max distance to merge: 500 bp -- vim /home/jhuang/anaconda3/envs/myperl/bin/getDifferentialPeaksReplicates.pl getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d500.txt getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d500.txt p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name X 13537 p601_d8_D2_H3K4me3/peaks.txt X 8455 p601_d8_D1_H3K4me3/peaks.txt X X 23009 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt Using DESeq2 to calculate differential expression/enrichment… Output Stats input vs. target: Total Genes: 45001 Total Up-regulated in target vs. input: 31206 (69.345%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] p604_d8_D1_H3K4me3/peaks.txt p604_d8_D2_H3K4me3/peaks.txt Total Name X 12775 p604_d8_D2_H3K4me3/peaks.txt X 8797 p604_d8_D1_H3K4me3/peaks.txt X X 18654 p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt Using DESeq2 to calculate differential expression/enrichment... Output Stats input vs. target: Total Genes: 40226 Total Up-regulated in target vs. input: 27939 (69.455%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] cp p601_d8_H3K4me3_macs2_peaks_d500.txt p601_d8_H3K4me3_macs2_peaks.txt cp p604_d8_H3K4me3_macs2_peaks_d500.txt p604_d8_H3K4me3_macs2_peaks.txt awk 'NR>1 {print $2 “\t” $3 “\t” $4 “\t” $1}’ p601_d8_H3K4me3_macs2_peaks.txt > p601_d8_H3K4me3_macs2_peaks.bed awk ‘NR>1 {print $2 “\t” $3 “\t” $4 “\t” $1}’ p604_d8_H3K4me3_macs2_peaks.txt > p604_d8_H3K4me3_macs2_peaks.bed ~/Tools/csv2xls-0.4/csv_to_xls.py p601_d8_H3K4me3_macs2_peaks.txt -d$’\t’ -o p601_d8_H3K4me3_macs2_peaks.xls ~/Tools/csv2xls-0.4/csv_to_xls.py p604_d8_H3K4me3_macs2_peaks.txt -d$’\t’ -o p604_d8_H3K4me3_macs2_peaks.xls #—-> Send to Denise! # — Max distance to merge: 100 bp — getDifferentialPeaksReplicates.pl -t p601_d8_D1_H3K4me3 p601_d8_D2_H3K4me3 -i p601_d8_D1_input p601_d8_D2_input -genome hg38 -use peaks.txt > p601_d8_H3K4me3_macs2_peaks_d100.txt getDifferentialPeaksReplicates.pl -t p604_d8_D1_H3K4me3 p604_d8_D2_H3K4me3 -i p604_d8_D1_input p604_d8_D2_input -genome hg38 -use peaks.txt > p604_d8_H3K4me3_macs2_peaks_d100.txt p601_d8_D1_H3K4me3/peaks.txt p601_d8_D2_H3K4me3/peaks.txt Total Name X 25160 p601_d8_D2_H3K4me3/peaks.txt X 20344 p601_d8_D1_H3K4me3/peaks.txt X X 12109 p601_d8_D1_H3K4me3/peaks.txt|p601_d8_D2_H3K4me3/peaks.txt Using DESeq2 to calculate differential expression/enrichment… Output Stats input vs. target: Total Genes: 57613 Total Up-regulated in target vs. input: 31179 (54.118%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] p604_d8_D1_H3K4me3/peaks.txt p604_d8_D2_H3K4me3/peaks.txt Total Name X 23330 p604_d8_D2_H3K4me3/peaks.txt X 19855 p604_d8_D1_H3K4me3/peaks.txt X X 8512 p604_d8_D1_H3K4me3/peaks.txt|p604_d8_D2_H3K4me3/peaks.txt Using DESeq2 to calculate differential expression/enrichment... Output Stats input vs. target: Total Genes: 51697 Total Up-regulated in target vs. input: 28436 (55.005%) [log2fold>1, FDR<0.05] Total Dn-regulated in target vs. input: 0 (0.000%) [log2fold<-1, FDR<0.05] cp ./homer/p601_d8_D1_input/p601_d8_D1_input.ucsc.bigWig bigWigs cp ./homer/p601_d8_D2_input/p601_d8_D2_input.ucsc.bigWig bigWigs cp ./homer/p604_d8_D1_input/p604_d8_D1_input.ucsc.bigWig bigWigs cp ./homer/p604_d8_D2_input/p604_d8_D2_input.ucsc.bigWig bigWigs cp ./homer/p601_d8_D1_H3K4me3/p601_d8_D1_H3K4me3.ucsc.bigWig bigWigs cp ./homer/p601_d8_D2_H3K4me3/p601_d8_D2_H3K4me3.ucsc.bigWig bigWigs cp ./homer/p604_d8_D1_H3K4me3/p604_d8_D1_H3K4me3.ucsc.bigWig bigWigs cp ./homer/p604_d8_D2_H3K4me3/p604_d8_D2_H3K4me3.ucsc.bigWig bigWigs cp ./homer/p601_d8_D1_H3K27me3/p601_d8_D1_H3K27me3.ucsc.bigWig bigWigs cp ./homer/p601_d8_D2_H3K27me3/p601_d8_D2_H3K27me3.ucsc.bigWig bigWigs cp ./homer/p604_d8_D1_H3K27me3/p604_d8_D1_H3K27me3.ucsc.bigWig bigWigs cp ./homer/p604_d8_D2_H3K27me3/p604_d8_D2_H3K27me3.ucsc.bigWig bigWigs #----> Send to Denise! cp ./homer/p601_d8_D1_H3K27ac/p601_d8_D1_H3K27ac.ucsc.bigWig bigWigs cp ./homer/p601_d8_D2_H3K27ac/p601_d8_D2_H3K27ac.ucsc.bigWig bigWigs cp ./homer/p604_d8_D1_H3K27ac/p604_d8_D1_H3K27ac.ucsc.bigWig bigWigs cp ./homer/p604_d8_D2_H3K27ac/p604_d8_D2_H3K27ac.ucsc.bigWig bigWigs 8. getDifferentialPeaksReplicates_broad.pl (code) #!/usr/bin/env perl use warnings; use lib “/home/jhuang/Tools/homer/bin”; my $homeDir = “/home/jhuang/Tools/homer/”; my $foldThresh = 2; my $fdrThresh = 0.05; my $peakFoldInput = 2; my $peakFdrInput = 0.001; my $style = “”; sub printCMD { print STDERR “\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t [IP tagdir2] …\n”; print STDERR “\t -b [background tagdir2] …\n”; print STDERR “\t -i [Input tagdir2] …\n”; print STDERR “\t\tNote: if input is provided, peaks will be called.\n”; print STDERR “\n\tOptions:\n”; #print STDERR “\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n”; #print STDERR “\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n”; print STDERR “\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n”; print STDERR “\t\t-q <#> (FDR over bg, default: $fdrThresh)\n”; print STDERR “\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n”; print STDERR “\t\t-genome (genome version to use for annotation)\n”; print STDERR “\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n”; print STDERR “\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n”; print STDERR “\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n”; print STDERR “\t\t-all (report all peaks, not just differentially regulated)\n”; print STDERR “\n\tPeak finding directives:\n”; print STDERR “\t\t-style (findPeaks style to use for finding peaks)\n”; print STDERR “\t\t-use (use existing peaks in tag directories)\n”; print STDERR “\t\t-p (use specific peak file instead of tagDir/peaks.txt or finding new one)\n”; print STDERR “\t\tOther options will be passed to findPeaks\n”; print STDERR “\n”; exit; } my @targets = (); my @background = (); my @inputs = (); my $findPeaksOpts = “”; my $use = “”; my $givenPeakFile = ”; my $norm2total = “-norm2total”; my $diffAlg = “-DESeq2”; my $genome = ‘none’; my $annOptions = “”; my $fragLength = ”; my $allFlag = 0; my $ogCmd = “getDifferentialPeaksReplicates.pl”; for (my $i=0;$i<@ARGV;$i++) { $ogCmd .= " " . $ARGV[$i]; } for (my $i=0;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-t') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@targets, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-i') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@inputs, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-b') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@background, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-f') { $foldThresh = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-genome') { $genome = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-q') { $fdrThresh = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-use') { $use = $ARGV[++$i]; if ($use eq 'tss.txt') { $annOptions .= " -fragLength 1 -strand +"; $fragLength = " -fragLength 1" if ($fragLength eq ''); } } elsif ($ARGV[$i] eq '-p') { $givenPeakFile = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-style') { $style = $ARGV[++$i]; if ($style eq 'tss') { $annOptions .= " -fragLength 1 -strand +"; $fragLength = " -fragLength 1" if ($fragLength eq ''); } } elsif ($ARGV[$i] eq '-edgeR') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-fragLength') { $fragLength = " -fragLength $ARGV[++$i]"; } elsif ($ARGV[$i] eq '-DESeq2') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-all') { $allFlag = 1; } elsif ($ARGV[$i] eq '-DESeq') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-balanced') { $norm2total = ""; } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') { printCMD(); } else { $findPeaksOpts .= " " . $ARGV[$i]; #print STDERR "!!! \"$ARGV[$i]\" not recognized\n"; #printCMD(); } } my $rand = rand(); my %toDelete = (); if ($diffAlg eq '-edgeR' && $norm2total ne '') { print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n"; exit; } $log2Thresh = log($foldThresh)/log(2); if (@targets < 1) { print STDERR "!!! Error, need at least one target directory!!!\n"; printCMD(); } if (scalar(@inputs) + scalar(@background) < 1) { print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n"; exit; } my $targetDirs = ''; my $targetStr = ""; foreach(@targets) { $targetDirs .= " \"$_\""; $targetStr .= " target"; } my $inputDirs = ''; my $inputStr = ""; foreach(@inputs) { $inputDirs .= " \"$_\""; $inputStr .= " input"; } my $bgDirs = ''; my $bgStr = ""; foreach(@background) { $bgDirs .= " \"$_\""; $bgStr .= " bg"; } if ($givenPeakFile eq '') { $peakFile = $rand . ".peaks"; $toDelete{$peakFile}=1; } else { $peakFile = $givenPeakFile; } if ($findPeaksOpts ne '') { print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n"; } print STDERR "\tStep1: Defining Putative Peak Set\n"; if ($use eq '' && $givenPeakFile eq '') { print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n"; my $targetDir = $rand . ".targetTagDir"; `makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`; $toDelete{$targetDir}=1; my $inputDir = $rand . ".inputTagDir"; my $cmd = "findPeaks \"$targetDir\""; if ($style eq '') { $style = 'factor'; print STDERR "\tUsing -style $style...\n"; } $cmd .= " -style $style"; $cmd .= $fragLength . " " . $findPeaksOpts; if (@inputs > 0) { `makeTagDirectory \”$inputDir\” -d $inputDirs $fragLength`; $toDelete{$inputDir}=1; $cmd .= ” -i \”$inputDir\””; } #print STDERR “`$cmd > \”$peakFile\”`\n”; `$cmd > \”$peakFile\”`; } elsif ($givenPeakFile eq ” && $use ne ”) { my $files = “”; my @allDirs = (); push(@allDirs, @targets, @inputs, @background); print STDERR “\t\tUsing existing peak files for features:\n”; foreach(@allDirs) { my $p = $_ . “/” . $use; if (-e $p) { print STDERR “\t\t\t$p\n”; $files .= ” \”$p\””; } } #MODIFIED #`mergePeaks $files > \”$peakFile\”`; #print “$files\n”; #print “\”$peakFile\”\n”; `mergePeaks $files -d 35000 > mergePeaks_res.txt`; `python3 update_header.py`; `cat $files > merged_peaks.txt`; ## Split the list of files into an array #my @files = split / /, $files; # Open the output file #open(my $out, ‘>’, ‘merged_peaks.txt’) or die “Cannot open merged_peaks.txt: $!”; #foreach my $file (@files) { # print(“xxxxx$file\n”); # open(my $in, ‘<', $file) or die "Cannot open $file: $!"; # while (<$in>) { # s/\t+$//; # removes trailing tabs # print $out “$_\n”; # } # close $in; #} #close $out; system(“awk ‘{print \$2 \”\\t\” \$3 \”\\t\” \$4 \”\\t\” \$1}’ merged_peaks.txt | sort -k1,1 -k2,2n | bedtools merge -d 35000 > bedtools_res.txt”); `python3 adjust_mergePeaks_res.py`; `mv filtered_mergePeaks_res.txt \”$peakFile\”`; } $rawFile= $rand . “.raw.txt”; $diffFile= $rand . “.diff.txt”; $upFile = $rand . “.Up_target_vs_bg.txt”; $downFile = $rand . “.Down_target_vs_bg.txt”; if ($bgDirs eq ”) { $bgDirs = $inputDirs; $bgStr = $inputStr; @background = @inputs; $upFile = $rand . “.Up_target_vs_input.txt”; $downFile = $rand . “.Down_target_vs_input.txt”; } print STDERR “\n\tStep2: Quantifying reads across target/background/input tag directories\n\n”; #print STDERR “`annotatePeaks.pl \”$peakFile\” none -d $bgDirs $targetDirs -raw > \”$rawFile\”`;\n”; `annotatePeaks.pl \”$peakFile\” $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \”$rawFile\”`; #print STDERR “`getDiffExpression.pl \”$rawFile\” $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n”; # print STDERR “\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n”; `getDiffExpression.pl \”$rawFile\” $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`; $toDelete{$rawFile}=1; $toDelete{$diffFile}=1; $toDelete{$upFile}=1; $toDelete{$downFile}=1; print “#cmd=$ogCmd|”; my $ofile = $upFile; $ofile = $diffFile if ($allFlag); open IN, $ofile; while ( ) { print $_; } close IN; foreach(keys %toDelete) { next if ($_ eq ‘/’); #print STDERR “\trm -r \”$_\”\n”; `rm -r “$_”`; } 9. getDifferentialPeaksReplicates_narrow.pl (code) #!/usr/bin/env perl use warnings; use lib “/home/jhuang/anaconda3/envs/myperl/share/homer/.//bin”; my $homeDir = “/home/jhuang/anaconda3/envs/myperl/share/homer/./”; my $foldThresh = 2; my $fdrThresh = 0.05; my $peakFoldInput = 2; my $peakFdrInput = 0.001; my $style = “”; sub printCMD { print STDERR “\n\tUsage: getDifferentialPeaksReplicates.pl [options] -t [IP tagdir2] …\n”; print STDERR “\t -b [background tagdir2] …\n”; print STDERR “\t -i [Input tagdir2] …\n”; print STDERR “\t\tNote: if input is provided, peaks will be called.\n”; print STDERR “\n\tOptions:\n”; #print STDERR “\t\t-F <#> (fold enrichment over bg, default: $foldThresh)\n”; #print STDERR “\t\t-fdr <#> (FDR over input, default: $fdrThresh)\n”; print STDERR “\t\t-f <#> (fold enrichment over bg, default: $foldThresh)\n”; print STDERR “\t\t-q <#> (FDR over bg, default: $fdrThresh)\n”; print STDERR “\t\t-fdr <#>, -F <#>, -L <#> (parameters for findPeaks)\n”; print STDERR “\t\t-genome (genome version to use for annotation)\n”; print STDERR “\t\t-DESeq2 | -DESeq | -edgeR (differential stats algorithm, default: DESeq2)\n”; print STDERR “\t\t-balanced (normalize signal across peaks, default: normalize to read totals)\n”; print STDERR “\t\t-fragLength <#> (standardize estimated fragment length across analysis)\n”; print STDERR “\t\t-all (report all peaks, not just differentially regulated)\n”; print STDERR “\n\tPeak finding directives:\n”; print STDERR “\t\t-style (findPeaks style to use for finding peaks)\n”; print STDERR “\t\t-use (use existing peaks in tag directories)\n”; print STDERR “\t\t-p (use specific peak file instead of tagDir/peaks.txt or finding new one)\n”; print STDERR “\t\tOther options will be passed to findPeaks\n”; print STDERR “\n”; exit; } my @targets = (); my @background = (); my @inputs = (); my $findPeaksOpts = “”; my $use = “”; my $givenPeakFile = ”; my $norm2total = “-norm2total”; my $diffAlg = “-DESeq2”; my $genome = ‘none’; my $annOptions = “”; my $fragLength = ”; my $allFlag = 0; my $ogCmd = “getDifferentialPeaksReplicates.pl”; for (my $i=0;$i<@ARGV;$i++) { $ogCmd .= " " . $ARGV[$i]; } for (my $i=0;$i<@ARGV;$i++) { if ($ARGV[$i] eq '-t') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@targets, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-i') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@inputs, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-b') { $i++; while ($i < @ARGV) { if ($ARGV[$i] =~ /^-/) { $i--; last; } push(@background, $ARGV[$i++]); } } elsif ($ARGV[$i] eq '-f') { $foldThresh = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-genome') { $genome = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-q') { $fdrThresh = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-use') { $use = $ARGV[++$i]; if ($use eq 'tss.txt') { $annOptions .= " -fragLength 1 -strand +"; $fragLength = " -fragLength 1" if ($fragLength eq ''); } } elsif ($ARGV[$i] eq '-p') { $givenPeakFile = $ARGV[++$i]; } elsif ($ARGV[$i] eq '-style') { $style = $ARGV[++$i]; if ($style eq 'tss') { $annOptions .= " -fragLength 1 -strand +"; $fragLength = " -fragLength 1" if ($fragLength eq ''); } } elsif ($ARGV[$i] eq '-edgeR') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-fragLength') { $fragLength = " -fragLength $ARGV[++$i]"; } elsif ($ARGV[$i] eq '-DESeq2') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-all') { $allFlag = 1; } elsif ($ARGV[$i] eq '-DESeq') { $diffAlg = $ARGV[$i]; } elsif ($ARGV[$i] eq '-balanced') { $norm2total = ""; } elsif ($ARGV[$i] eq '-h' || $ARGV[$i] eq '--help' || $ARGV[$i] eq '--') { printCMD(); } else { $findPeaksOpts .= " " . $ARGV[$i]; #print STDERR "!!! \"$ARGV[$i]\" not recognized\n"; #printCMD(); } } my $rand = rand(); my %toDelete = (); if ($diffAlg eq '-edgeR' && $norm2total ne '') { print STDERR "!!! Error, -edgeR requires \"-balanced\" to work correctly!!!\n"; exit; } $log2Thresh = log($foldThresh)/log(2); if (@targets < 1) { print STDERR "!!! Error, need at least one target directory!!!\n"; printCMD(); } if (scalar(@inputs) + scalar(@background) < 1) { print STDERR "\t!!! Error: program requires either input or background experiments to perform differential calculations!\n"; exit; } my $targetDirs = ''; my $targetStr = ""; foreach(@targets) { $targetDirs .= " \"$_\""; $targetStr .= " target"; } my $inputDirs = ''; my $inputStr = ""; foreach(@inputs) { $inputDirs .= " \"$_\""; $inputStr .= " input"; } my $bgDirs = ''; my $bgStr = ""; foreach(@background) { $bgDirs .= " \"$_\""; $bgStr .= " bg"; } if ($givenPeakFile eq '') { $peakFile = $rand . ".peaks"; $toDelete{$peakFile}=1; } else { $peakFile = $givenPeakFile; } if ($findPeaksOpts ne '') { print STDERR "\tUsing the following extra parameters for findPeaks:\n\t\t$findPeaksOpts\n"; } print STDERR "\tStep1: Defining Putative Peak Set\n"; if ($use eq '' && $givenPeakFile eq '') { print STDERR "\t\tFinding peaks in merged meta-experiment from target tag directories\n"; my $targetDir = $rand . ".targetTagDir"; `makeTagDirectory \"$targetDir\" -d $targetDirs $fragLength`; $toDelete{$targetDir}=1; my $inputDir = $rand . ".inputTagDir"; my $cmd = "findPeaks \"$targetDir\""; if ($style eq '') { $style = 'factor'; print STDERR "\tUsing -style $style...\n"; } $cmd .= " -style $style"; $cmd .= $fragLength . " " . $findPeaksOpts; if (@inputs > 0) { `makeTagDirectory \”$inputDir\” -d $inputDirs $fragLength`; $toDelete{$inputDir}=1; $cmd .= ” -i \”$inputDir\””; } #print STDERR “`$cmd > \”$peakFile\”`\n”; `$cmd > \”$peakFile\”`; } elsif ($givenPeakFile eq ” && $use ne ”) { my $files = “”; my @allDirs = (); push(@allDirs, @targets, @inputs, @background); print STDERR “\t\tUsing existing peak files for features:\n”; foreach(@allDirs) { my $p = $_ . “/” . $use; if (-e $p) { print STDERR “\t\t\t$p\n”; $files .= ” \”$p\””; } } `mergePeaks -d 2000 $files > \”$peakFile\”`; #`cat $files | sort -k1,1 -k2,2n | bedtools merge > \”$peakFile\”`; } $rawFile= $rand . “.raw.txt”; $diffFile= $rand . “.diff.txt”; $upFile = $rand . “.Up_target_vs_bg.txt”; $downFile = $rand . “.Down_target_vs_bg.txt”; if ($bgDirs eq ”) { $bgDirs = $inputDirs; $bgStr = $inputStr; @background = @inputs; $upFile = $rand . “.Up_target_vs_input.txt”; $downFile = $rand . “.Down_target_vs_input.txt”; } print STDERR “\n\tStep2: Quantifying reads across target/background/input tag directories\n\n”; #print STDERR “`annotatePeaks.pl \”$peakFile\” none -d $bgDirs $targetDirs -raw > \”$rawFile\”`;\n”; `annotatePeaks.pl \”$peakFile\” $genome -d $bgDirs $targetDirs -raw $annOptions $fragLength > \”$rawFile\”`; #print STDERR “`getDiffExpression.pl \”$rawFile\” $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`;\n”; # print STDERR “\n\tStep3: Calling R for differential enrichment statistics ($diffAlg)\n\n”; `getDiffExpression.pl \”$rawFile\” $bgStr $targetStr $norm2total $diffAlg -fdr $fdrThresh -log2fold $log2Thresh -export $rand > $diffFile`; $toDelete{$rawFile}=1; $toDelete{$diffFile}=1; $toDelete{$upFile}=1; $toDelete{$downFile}=1; print “#cmd=$ogCmd|”; my $ofile = $upFile; $ofile = $diffFile if ($allFlag); open IN, $ofile; while ( ) { print $_; } close IN; #foreach(keys %toDelete) { # next if ($_ eq ‘/’); # #print STDERR “\trm -r \”$_\”\n”; # `rm -r “$_”`; #} 10. update_header.py (code) import os # File path file_path = ‘mergePeaks_res.txt’ # Read in the file with open(file_path, ‘r’) as file: lines = file.readlines() # Split the first line into components components = lines[0].split(“\t”) # Change the first component to “name” components[0] = “name” # Join the components back into a single string lines[0] = “\t”.join(components) # Write the file back out with open(file_path, ‘w’) as file: file.writelines(lines) 11. adjust_mergePeaks_res.py (code) import pandas as pd # Read the mergePeaks result file and bedtools result file mergePeaks_df = pd.read_csv(‘mergePeaks_res.txt’, sep=’\t’, header=0) bedtools_df = pd.read_csv(‘bedtools_res.txt’, sep=’\t’, header=None, names=[‘chr’, ‘start’, ‘end’]) # Function to check if a peak is within bedtools ranges # def is_in_bedtools_range(row): filtered_mergePeaks_df = [] for _, row in mergePeaks_df.iterrows(): for _, bed_row in bedtools_df.iterrows(): if row[‘chr’] == bed_row[‘chr’] and row[‘start’] >= bed_row[‘start’] and row[‘end’] <= bed_row['end']: row['start'] = bed_row['start'] row['end'] = bed_row['end'] filtered_mergePeaks_df.append(row) # Convert the filtered results to a DataFrame filtered_mergePeaks_df = pd.DataFrame(filtered_mergePeaks_df) ## Write the filtered results to a file #filtered_mergePeaks_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False) ## Filter the mergePeaks rows #filtered_mergePeaks_df = mergePeaks_df[mergePeaks_df.apply(is_in_bedtools_range, axis=1)] ## Sort and drop duplicates sorted_df = filtered_mergePeaks_df.sort_values(by=['chr', 'start', 'end']) deduplicated_df = sorted_df.drop_duplicates(subset=['chr', 'start', 'end']) ## Save to file deduplicated_df.to_csv('filtered_mergePeaks_res.txt', sep='\t', index=False)

How to Create a New User on Ubuntu Server?

  1. Restricting User ‘malawi’ from Installing System-wide Programs and Verifying Permissions

    chmod o-rx /home/jhuang
    ls -ld /home/jhuang
    cd /home/jhuang
    
    groups malawi
    sudo deluser malawi sudo
    
    jhuang@hamm:~$ groups malawi
    #malawi : malawi
    jhuang@hamm:~$ groups jhuang
    #jhuang : jhuang adm cdrom sudo dip plugdev lpadmin sambashare docker
  2. Ensuring Full Permissions for User ‘malawi’ in Their Home Directory

    #chmod -R u=rwx,go= /home/malawi
    ls -ld /home/malawi
    #drwxr-xr-x
    sudo chown malawi:malawi /home/malawi
    #sudo chmod 700 /home/malawi
    #sudo chmod -R 700 /home/malawi
    sudo chmod 755 /home/malawi
    sudo chmod -R 755 /home/malawi
    ls -ld /home/malawi
  3. Protecting Other Users’ Directories from Access by ‘malawi’

    sudo chmod o-rx /home/jhuang
    sudo chmod o-rx /mnt/h1/jhuang
    
    #for dir in /home/*; do
    #  if [ -d "$dir" ]; then
    #    sudo chmod o-rx "$dir"
    #  fi
    #done
    
    ls -ld /home/jhuang
  4. Changing User Passwords

    sudo passwd malawi
    passwd
  5. Changing User Passwords

    if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
    BiocManager::install("SeuratObject")
    install.packages("Seurat")
  6. #(NOT GOOD) from 775 to o-rx 
    #sudo chmod o-rx /mnt/h1/jhuang
    #sudo chmod -R o-rx /mnt/h1/jhuang
    #sudo chmod o-rx /home/jhuang
    #sudo chmod -R o-rx /home/jhuang
    
    #from 775 to 750
    sudo chmod 750 /mnt/h1/jhuang
    sudo chmod -R 750 /mnt/h1/jhuang
    sudo chmod 750 /home/jhuang
    sudo chmod -R 750 /home/jhuang
    #END
  7. install r and r-seurat with miniconda3

    mkdir -p ~/miniconda3
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
    bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
    rm -rf ~/miniconda3/miniconda.sh
    ~/miniconda3/bin/conda init bash
    
    conda create -n r -c bioconda r r-seurat
    conda activate r
    #> .libPaths()
    #[1] "/home/malawi/miniconda3/envs/r/lib/R/library"
    if (!requireNamespace("remotes", quietly = TRUE))
      install.packages("remotes")
    remotes::install_github("satijalab/seurat-data")
    conda deactivate
    
    conda activate r
    R
    library(Seurat)
    library(SeuratData)
    library(ggplot2)
    library(patchwork)
    library(dplyr)

Processing Spatial Transcriptomics Data Using Space Ranger

Using spaceranger to process spatial transcriptomics data involves several steps, from preparing the necessary input files to running the analysis and interpreting the results. Below, I’ll provide a comprehensive example of how to use spaceranger for processing a hypothetical dataset.

Prerequisites:

Install Space Ranger: Make sure spaceranger is installed on your system. You can download it from the 10x Genomics website.
Required Data: You need three key pieces of data:
   - Spatial Gene Expression FASTQ files: These are generated by the sequencing instrument.
   - Spatial Gene Expression Image: An image of the tissue section.
   - Spatial Gene Expression Slide and Capture Area: This information is usually provided by the manufacturer.

Example Workflow:

  1. Prepare Input Files

    Ensure you have the following files:

    • FASTQ files: Usually named something like SampleName_S1_L001_R1_001.fastq.gz, SampleName_S1_L001_R2_001.fastq.gz.
    • A tissue image file, like tissue_hires_image.png.
    • A slide and capture area file, often provided by the manufacturer.
  2. Create a Reference Dataset

    If you don’t already have a reference dataset for your species of interest, you can create one using the spaceranger mkref command. For example:

    spaceranger mkref --genome=GRCh38 --fasta=GRCh38.fasta --genes=genes.gtf --nthreads=8 --memgb=64

    Replace GRCh38.fasta and genes.gtf with the paths to your genome FASTA and gene annotation GTF files, respectively.

  3. Running Space Ranger

    The core of the spaceranger workflow is the count command, which aligns reads, generates feature-barcode matrices, and performs spatial analysis. The command looks something like this:

    spaceranger count --id=sample_output \
                      --transcriptome=/path/to/refdata-cellranger-GRCh38-3.0.0 \
                      --fastqs=/path/to/fastqs \
                      --sample=SampleName \
                      --image=/path/to/tissue_hires_image.png \
                      --slide=V19J01-123 \
                      --area=A1 \
                      --nthreads=16 \
                      --memgb=64
    
        --id: The name of the output folder.
        --transcriptome: Path to the reference dataset.
        --fastqs: Path to the folder containing FASTQ files.
        --sample: Name of the sample.
        --image: Path to the high-resolution tissue image.
        --slide and --area: Slide and capture area information.
        --nthreads and --memgb: Specify computational resources.
  4. Analyze Output

    Once the spaceranger count command is complete, it will generate an output directory (sample_output in this case) containing several files, including:

    • Feature-barcode matrices
    • Analysis files (clustering, dimensionality reduction, etc.)
    • Images showing gene expression overlaid on the tissue image
  5. Further Analysis

    The resulting data can be further analyzed using tools like Seurat (R), Scanpy (Python), or Loupe Browser (from 10x Genomics).

Additional Notes:

Always refer to the specific version of the spaceranger documentation you are using, as commands and options might vary slightly between versions.
Ensure your computational environment has enough resources (CPU, memory) to handle the dataset size.
This workflow is a basic example. Depending on your specific experiment and data, additional steps or modifications might be necessary.

Prepare the databases for vrap

  1. I used an strategy, at first annotate the contigs using the virus-speicific data and bacteria-speicific data, then using more general databases nt and nr. The results are as attached. For some samples, for examples S5, which we can detected several contigs as gammaherpesvius. For the bacteria, it is more conversed.

    # -- txid10239 (Virus) and Taxonomy ID: 2 (Bacteria) --
    # -- Virus --
    #TODO: from 1,100,000 --> 1,288,629 (up to 2020/07/01); bacteria we can use refseq (up to 2020/07/01)!
    #--virus bacteria-refseq-fasta, then virus sequences, virus protein as default database, then nt and nr!
    #TODO!: download bact_nt_db and use in '--virus bact_nt_db'!
    #  pip install ncbi-genome-download
    #  ncbi-genome-download -F fasta bacteria
    #  ncbi-genome-download -F fasta virus
    #  https://www.ncbi.nlm.nih.gov/genome/microbes/
    #  https://www.biostars.org/p/9503245/
  2. download bacteria refseq with datasets

    #https://www.ncbi.nlm.nih.gov/datasets/docs/v1/download-and-install/
    The NCBI Datasets datasets command line tools are datasets and dataformat .
    
    #datasets download genome bacteria --assembly-source refseq --dehydrated --filename bacteria_refseq.zip
    ~/Tools/datasets download genome bacteria --assembly-source refseq --dehydrated --exclude-protein --exclude-genomic-cds --exclude-rna --exclude-gff3 --filename bacteria_refseq_fasta.zip
    ~/Tools/datasets download genome taxon bacteria #2,231,190
    ~/Tools/datasets download genome taxon bacteria --assembly-source refseq --dehydrated --exclude-protein --exclude-genomic-cds --exclude-rna --exclude-gff3 --filename bacteria_refseq.zip  #325,471
    #~/Tools/datasets download genome taxon virus #97,281 records
    #~/Tools/datasets download genome taxon virus --assembly-source refseq --dehydrated --exclude-protein --exclude-genomic-cds --exclude-rna --exclude-gff3 --filename virus_refseq.zip #14,992
    
    #Unzip the file
    unzip bacteria_refseq.zip -d bacteria_refseq
    unzip virus_refseq.zip -d virus_refseq
    
    #Rehydrate the file: I'm recommending the dehydrated option because it's actually faster and more reliable, despite the additional steps. By default, the data package includes genomic, transcript, protein and cds sequences, in addition to gff3. If you only need the genomic fasta sequences, you can use this command instead:
    ~/Tools/datasets rehydrate --directory bacteria_refseq/
    ~/Tools/datasets rehydrate --directory virus_refseq/  #29984
  3. run vrap.py with –host genome.fa –virus bacteria_refseq [default viral_db up to 2020_07_01] -n nt -r nr

    # -- Virus --
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20430/635290002_CMV_S4_R2_001.fastq.gz -o CMV_S4_unbiased2 --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20431/635850623_EBV_S5_R2_001.fastq.gz -o EBV_S5_unbiased2 --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    
    # -- Control --
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20428/neg_control_S2_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20428/neg_control_S2_R2_001.fastq.gz -o neg_control_S2_unbiased2 --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200   
    
    # -- Bacteria --
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20427/635031018_E_faecium_S1_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20427/635031018_E_faecium_S1_R2_001.fastq.gz -o E_faecium_S1_unbiased2 --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    vrap/vrap.py -1 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20429/635724976_S_aureus_epidermidis_S3_R1_001.fastq.gz -2 ../231114_VH00358_62_AACYCYWM5_cfDNA/p20429/635724976_S_aureus_epidermidis_S3_R2_001.fastq.gz -o S_aureus_epidermidis_S3_unbiased2 --host /home/jhuang/REFs/genome.fa -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 40 -l 200
    #END

ChIP-seq using HOMER (-style factor, findPeaks + default getDifferentialPeaksReplicates.pl)

  1. nextflow ChIP-seq run for NHDF_p783

    #under Raw_Data for ChIP-seq 
    ln -s ./230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf859/3_NHDF_Donor_1_p783_input_S5_R1_001.fastq.gz p783_input_DonorI.fastq.gz
    ln -s ./230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf860/4_NHDF_Donor_2_p783_input_S6_R1_001.fastq.gz p783_input_DonorII.fastq.gz
    ln -s ./230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf861/5_NHDF_Donor_1_p783_ChIP_S7_R1_001.fastq.gz p783_ChIP_DonorI.fastq.gz
    ln -s ./230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf862/6_NHDF_Donor_2_p783_ChIP_S8_R1_001.fastq.gz p783_ChIP_DonorII.fastq.gz
    
    #'hg38'      { bwa = "/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/"
    #          blacklist = "/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/blacklists/hg38-blacklist.bed"
    #          gtf = "/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf"
    #        }
    ln -s /home/jhuang/Tools/NGI-ChIPseq/ .
    (chipseq) nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --singleEnd --blacklist_filtering -profile standard --project Denise_LT_DNA_Bindung --outdir results_LT_DNA_Bindung_hg38 -resume
    
    #By the way: nextflow RNA-seq run for NHDF_p783 (NOT the topics of the post).
    #under Raw_Data for RNA-seq
    cp ~/DATA/Data_Denise_tx_epi_MCPyV_PUBLISHING/Data_Denise_RNASeq/Raw_Data/V_8_2_4_p600_d8_DonorI.fastq.gz ./
    cp ~/DATA/Data_Denise_tx_epi_MCPyV_PUBLISHING/Data_Denise_RNASeq/Raw_Data/V_8_2_3_p600_d8_DonorII.fastq.gz ./
    #under Raw_Data_p783_RNAseq for RNA-seq
    ln -s ../Raw_Data/V_8_2_4_p600_d8_DonorI.fastq.gz  ctrl_DonorI.fastq.gz   
    ln -s ../Raw_Data/V_8_2_3_p600_d8_DonorII.fastq.gz ctrl_DonorII.fastq.gz
    ln -s ../Raw_Data/230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf857/1_NHDF_Donor_1_p783_S1_R1_001.fastq.gz p783_DonorI.fastq.gz
    ln -s ../Raw_Data/230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf858/2_NHDF_Donor_2_p783_S2_R1_001.fastq.gz p783_DonorII.fastq.gz
    #Note that we need to regenerate MultiQC.html after ignoring 'Biotype Counts', since --fcGroupFeaturesType gene_name cannot generate the real biotype counts!
    (rnaseq_2021) nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/Raw_Data_p783/RNA_seq/*.fastq.gz'  --fasta "/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" --gtf "/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf"  --bed12 "/home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
  2. nextflow ChIP-seq run for data of truncated LT-Ag + sT expression of WaGa and HEK293

    #160719_SN7001212_0156_AC8K76ACXX
    
    cat ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_input_1_10_p197_1/293_input_1_10_p197_1_GTAGAG_L002_R1_001.fastq.gz ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_input_1_10_p197_1/293_input_1_10_p197_1_GTAGAG_L003_R1_001.fastq.gz > HEK293_Input_p197_r1.fastq.gz
    cat ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_input_1_10_p197_2/293_input_1_10_p197_2_GTCCGC_L002_R1_001.fastq.gz ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_input_1_10_p197_2/293_input_1_10_p197_2_GTCCGC_L003_R1_001.fastq.gz > HEK293_Input_p197_r2.fastq.gz
    cat ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_input_1_10_p197_3/293_input_1_10_p197_3_GTGAAA_L002_R1_001.fastq.gz ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_input_1_10_p197_3/293_input_1_10_p197_3_GTGAAA_L003_R1_001.fastq.gz > HEK293_Input_p197_r3.fastq.gz
    
    cat ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_lt_p197_1/293_lt_p197_1_TAGCTT_L002_R1_001.fastq.gz ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_lt_p197_1/293_lt_p197_1_TAGCTT_L003_R1_001.fastq.gz > HEK293_LT_p197_r1.fastq.gz
    cat ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_lt_p197_2/293_lt_p197_2_GGCTAC_L002_R1_001.fastq.gz ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_lt_p197_2/293_lt_p197_2_GGCTAC_L003_R1_001.fastq.gz > HEK293_LT_p197_r2.fastq.gz
    cat ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_lt_p197_3/293_lt_p197_3_AGTCAA_L002_R1_001.fastq.gz ../160719_SN7001212_0156_AC8K76ACXX/Sample_293_lt_p197_3/293_lt_p197_3_AGTCAA_L003_R1_001.fastq.gz > HEK293_LT_p197_r3.fastq.gz
    
    #140117_SN7001212_0097_AC3ECBACXX
    
    cat ../140117_SN7001212_0097_AC3ECBACXX/Sample_waga_igg/waga_igg_TAGCTT_L003_R1_001.fastq.gz ../140117_SN7001212_0097_AC3ECBACXX/Sample_waga_igg/waga_igg_TAGCTT_L004_R1_001.fastq.gz > WaGa_IgG.fastq.gz
    
    cat ../140117_SN7001212_0097_AC3ECBACXX/Sample_waga_lt/waga_lt_GGTAGC_L003_R1_001.fastq.gz ../140117_SN7001212_0097_AC3ECBACXX/Sample_waga_lt/waga_lt_GGTAGC_L004_R1_001.fastq.gz > WaGa_LT.fastq.gz
    
    ln -s /home/jhuang/Tools/NGI-ChIPseq/ .
    (chipseq) nextflow run NGI-ChIPseq/main.nf --reads '/home/jhuang/DATA/Data_Denise_LT_DNA_Binding/LTtr-ChIP/Raw_Data/*.fastq.gz' --genome hg38 --macsconfig macs.config --singleEnd --blacklist_filtering -profile standard --project Denise_LTtr_DNA_Bindung --outdir results_LTtr_DNA_Bindung_hg38 -resume
  3. makeTagDirectory

    conda activate myperl
    mkdir results_ChIPseq_K331A_hg38/homer; cd results_ChIPseq_K331A_hg38/homer
    
    #makeTagDirectory <output directory> <input file> -genome hg38
    for sample in p783_ChIP_DonorI p783_ChIP_DonorII p783_input_DonorI p783_input_DonorII; do
      makeTagDirectory ${sample} ../picard/${sample}.dedup.sorted.bam -genome hg38
    done
  4. generate bigwigs

    #makeUCSCfile peaks.txt -f peaks.bed -o auto -noadj -bigWig sample.bw -genome hg38
    for sample in p783_ChIP_DonorI p783_ChIP_DonorII p783_input_DonorI p783_input_DonorII; do
    makeUCSCfile ${sample} -pseudo 1 -bigWig /home/jhuang/REFs/hg38.chromSizes -o auto -style chipseq    -norm 1e7 -normLength 100 -fsize 1
    done
    mv ./p783_ChIP_DonorI/p783_ChIP_DonorI.ucsc.bigWig     ./p783_ChIP_DonorI/LT_K331A_DI.bigWig
    mv ./p783_ChIP_DonorII/p783_ChIP_DonorII.ucsc.bigWig   ./p783_ChIP_DonorII/LT_K331A_DII.bigWig
    mv ./p783_input_DonorI/p783_input_DonorI.ucsc.bigWig   ./p783_input_DonorI/LT_K331A_DI_input.bigWig
    mv ./p783_input_DonorII/p783_input_DonorII.ucsc.bigWig ./p783_input_DonorII/LT_K331A_DII_input.bigWig
  5. peak calling, get peaks.txt

      #findPeaks <tag directory> -i <input file> -o <output file> -genome hg38
      findPeaks p783_ChIP_DonorI  -style factor    -o auto -i p783_input_DonorI
      findPeaks p783_ChIP_DonorII -style factor    -o auto -i p783_input_DonorII
      cp ../reproduce_2023/tagDirectories/ ./
      cd homer
      ln -s ../tagDirectories/NHDF_LT_Donor1 ./
      ln -s ../tagDirectories/NHDF_LT_Donor2 ./
      ln -s ../tagDirectories/NHDF_LT_Donor1_Input ./
      ln -s ../tagDirectories/NHDF_LT_Donor2_Input ./
      ln -s ../tagDirectories/Pfsk-1B_LT+sT_r1 ./
      ln -s ../tagDirectories/Pfsk-1B_LT+sT_r2 ./
      ln -s ../tagDirectories/Pfsk-1B_LT+sT_r1_Input ./
      ln -s ../tagDirectories/Pfsk-1B_LT+sT_r2_Input ./
      ln -s ../tagDirectories/HEK293_LT+sT_r2 ./
      ln -s ../tagDirectories/HEK293_LT+sT_r3 ./
      ln -s ../tagDirectories/HEK293_LT+sT_r2_Input ./
      ln -s ../tagDirectories/HEK293_LT+sT_r3_Input ./
    
      findPeaks NHDF_LT_Donor1  -style factor      -o auto -i NHDF_LT_Donor1_Input
      findPeaks NHDF_LT_Donor2  -style factor      -o auto -i NHDF_LT_Donor2_Input
    
      findPeaks Pfsk-1B_LT+sT_r1  -style factor    -o auto -i Pfsk-1B_LT+sT_r1_Input
      findPeaks Pfsk-1B_LT+sT_r2  -style factor    -o auto -i Pfsk-1B_LT+sT_r2_Input
    
      findPeaks HEK293_LT+sT_r2 -style factor      -o auto -i HEK293_LT+sT_r2_Input
      findPeaks HEK293_LT+sT_r3 -style factor      -o auto -i HEK293_LT+sT_r3_Input
  6. peak calling using getDifferentialPeaksReplicates.pl

    cp -r ../../reproduce_2023/tagDirectories/NHDF_LT_Donor1_Input ./
    cp -r ../../reproduce_2023/tagDirectories/NHDF_LT_Donor2_Input ./
    cp -r ../../reproduce_2023/tagDirectories/NHDF_LT_Donor1 ./
    cp -r ../../reproduce_2023/tagDirectories/NHDF_LT_Donor2 ./
    #-annStats annStats.txt 
    conda activate myperl
    getDifferentialPeaksReplicates.pl -t p783_ChIP_DonorI p783_ChIP_DonorII -i p783_input_DonorI p783_input_DonorII      -genome hg38 -use peaks.txt > peaks_K331A_LT.txt
    mv peaks_K331A_LT.txt peaks_NHDF_K331A_LT.txt
    getDifferentialPeaksReplicates.pl -t NHDF_LT_Donor1 NHDF_LT_Donor2      -i NHDF_LT_Donor1_Input NHDF_LT_Donor2_Input -genome hg38 -use peaks.txt > peaks_NHDF_LT.txt
    getDifferentialPeaksReplicates.pl -t Pfsk-1B_LT+sT_r1 Pfsk-1B_LT+sT_r2  -i Pfsk-1B_LT+sT_r1_Input Pfsk-1B_LT+sT_r2_Input -genome hg38 -use peaks.txt > peaks_PFSK-1_LT+sT.txt
    getDifferentialPeaksReplicates.pl -t HEK293_LT+sT_r2 HEK293_LT+sT_r3  -i HEK293_LT+sT_r2_Input HEK293_LT+sT_r3_Input -genome hg38 -use peaks.txt > peaks_HEK293_LT+sT.txt
  7. merge peaks: tried 0, 200, 500, 1000, 2000

    #http://homer.ucsd.edu/homer/ngs/mergePeaks.html
    mergePeaks -d 1000 peaks_PFSK-1_LT+sT.txt peaks_HEK293_LT+sT.txt peaks_NHDF_LT.txt -prefix celllines -venn celllines.txt -matrix celllines
    
    #-- generate bed files --
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' peaks_NHDF_LT.txt > peaks_NHDF.bed;        
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' peaks_HEK293_LT+sT.txt > peaks_HEK293.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' peaks_PFSK-1_LT+sT.txt > peaks_PFSK-1.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_HEK293_LT+sT.txt > peaks_HEK293_only.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_HEK293_LT+sT.txt_peaks_NHDF_LT.txt > peaks_HEK293_NHDF.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_NHDF_LT.txt > peaks_NHDF_only.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_PFSK-1_LT+sT.txt > peaks_PFSK-1_only.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_PFSK-1_LT+sT.txt_peaks_HEK293_LT+sT.txt > peaks_PFSK-1_HEK293.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_PFSK-1_LT+sT.txt_peaks_HEK293_LT+sT.txt_peaks_NHDF_LT.txt > peaks_PFSK-1_HEK293_NHDF.bed;
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$1,$6}' celllines_peaks_PFSK-1_LT+sT.txt_peaks_NHDF_LT.txt > peaks_PFSK-1_NHDF.bed;
    
    #-- annotate the peaks --
    annotatePeaks.pl peaks_NHDF_LT.txt hg38 > annotatedPeaks_NHDF.txt
    annotatePeaks.pl peaks_HEK293_LT+sT.txt hg38 > annotatedPeaks_HEK293.txt
    annotatePeaks.pl peaks_PFSK-1_LT+sT.txt hg38 > annotatedPeaks_PFSK-1.txt
    annotatePeaks.pl celllines_peaks_HEK293_LT+sT.txt hg38 > annotatedPeaks_HEK293_only.txt
    annotatePeaks.pl celllines_peaks_HEK293_LT+sT.txt_peaks_NHDF_LT.txt hg38 > annotatedPeaks_HEK293_NHDF.txt
    annotatePeaks.pl celllines_peaks_NHDF_LT.txt hg38 > annotatedPeaks_NHDF_only.txt
    annotatePeaks.pl celllines_peaks_PFSK-1_LT+sT.txt hg38 > annotatedPeaks_PFSK-1_only.txt
    annotatePeaks.pl celllines_peaks_PFSK-1_LT+sT.txt_peaks_HEK293_LT+sT.txt hg38 > annotatedPeaks_PFSK-1_HEK293.txt
    annotatePeaks.pl celllines_peaks_PFSK-1_LT+sT.txt_peaks_HEK293_LT+sT.txt_peaks_NHDF_LT.txt hg38 > annotatedPeaks_PFSK-1_HEK293_NHDF.txt
    annotatePeaks.pl celllines_peaks_PFSK-1_LT+sT.txt_peaks_NHDF_LT.txt hg38 > annotatedPeaks_PFSK-1_NHDF.txt
    
    mkdir ../beds_PFSK-1_HEK293_NHDF;
    for sample in peaks_HEK293_only peaks_PFSK-1_only peaks_NHDF_only    peaks_HEK293 peaks_PFSK-1 peaks_NHDF    peaks_PFSK-1_HEK293 peaks_PFSK-1_NHDF peaks_HEK293_NHDF     peaks_PFSK-1_HEK293_NHDF; do
      grep -v "cmd" ${sample}.bed > ../beds_PFSK-1_HEK293_NHDF/${sample}_.bed
    done
    
    #Chr     Start   End     PeakID (cmd=annotatePeaks.pl common_peaks_NHDF.txt hg38)        Peak Score      Strand
    ~/Tools/csv2xls-0.4/csv_to_xls.py celllines.txt annotatedPeaks_HEK293_only.txt annotatedPeaks_PFSK-1_only.txt annotatedPeaks_NHDF_only.txt    annotatedPeaks_HEK293.txt annotatedPeaks_PFSK-1.txt annotatedPeaks_NHDF.txt    annotatedPeaks_PFSK-1_HEK293.txt annotatedPeaks_PFSK-1_NHDF.txt annotatedPeaks_HEK293_NHDF.txt     annotatedPeaks_PFSK-1_HEK293_NHDF.txt  -d$'\t' -o  annotatedPeaks_PFSK-1_HEK293_NHDF.xls
    
    #IMPORTANT: DELETE the column 'Strand' marked with '+' in the merged Excel file!

Small RNA processing

Small RNA sequencing is a type of RNA-sequencing (RNA-seq) that specifically targets and sequences small RNA molecules in a sample.

RNA-seq is a technique that uses next-generation sequencing (NGS) to reveal the presence and quantity of RNA in a biological sample at a given moment, capturing a snapshot of the transcriptome.

Small RNAs, including microRNAs (miRNAs), small interfering RNAs (siRNAs), and piwi-interacting RNAs (piRNAs), play crucial roles in gene regulation. They typically range from 20 to 30 nucleotides in length.

  1. prepare raw data

    #mv Data_Ute_smallRNA_3/bundle_v1 Data_Ute_smallRNA_5
    ln -s ../Data_Ute_smallRNA_3/bundle_v1 .
    
    #OLD    MKL_1_wt_1_221216.fastq.gz -> ../221216_NB501882_0404_AHLVNMBGXM/ute/nf796/MKL_1_wt_1_S16_R1_001.fastq.gz
    #OLD    MKL_1_wt_2_221216.fastq.gz -> ../221216_NB501882_0404_AHLVNMBGXM/ute/nf797/MKL_1_wt_2_S17_R1_001.fastq.gz
    
    ln -s ./230623_newDemulti_smallRNAs/210817_NB501882_0294_AHW5Y2BGXJ_smallRNA_Ute_newDemulti/2021_nf_ute_smallRNA/nf655/MKL_1_derived_EV_miRNA_S1_R1_001.fastq.gz 2021_August_nf655_MKL-1_EV-miRNA.fastq.gz
    ln -s ./230623_newDemulti_smallRNAs/210817_NB501882_0294_AHW5Y2BGXJ_smallRNA_Ute_newDemulti/2021_nf_ute_smallRNA/nf657/WaGa_derived_EV_miRNA_S2_R1_001.fastq.gz 2021_August_nf657_WaGa_EV-miRNA.fastq.gz
    
    ln -s ./230623_newDemulti_smallRNAs/220617_NB501882_0371_AH7572BGXM_smallRNA_Ute_newDemulti/2022_nf_ute_smallRNA/nf774/0403_WaGa_wt_S1_R1_001.fastq.gz 2022_August_nf774_0403_WaGa_wt.fastq.gz
    ln -s ./230623_newDemulti_smallRNAs/220617_NB501882_0371_AH7572BGXM_smallRNA_Ute_newDemulti/2022_nf_ute_smallRNA/nf780/0505_MKL1_wt_S2_R1_001.fastq.gz 2022_August_nf780_0505_MKL-1_wt.fastq.gz
    
    ln -s ./230623_newDemulti_smallRNAs/221216_NB501882_0404_AHLVNMBGXM_smallRNA_Ute_newDemulti/2022_nf_ute_smallRNA/nf796/MKL-1_wt_1_S1_R1_001.fastq.gz 2022_November_nf796_MKL-1_wt_1.fastq.gz
    ln -s ./230623_newDemulti_smallRNAs/221216_NB501882_0404_AHLVNMBGXM_smallRNA_Ute_newDemulti/2022_nf_ute_smallRNA/nf797/MKL-1_wt_2_S2_R1_001.fastq.gz 2022_November_nf797_MKL-1_wt_2.fastq.gz
    
    ln -s ./230602_NB501882_0428_AHKG53BGXT/demulti_new/nf876/1002_WaGa_sT_Dox_S1_R1_001.fastq.gz    2023_June_nf876_1002_WaGa_sT_Dox.fastq.gz
    ln -s ./230602_NB501882_0428_AHKG53BGXT/demulti_new/nf887/2312_MKL_1_scr_DMSO_S2_R1_001.fastq.gz 2023_June_nf887_2312_MKL-1_scr_DMSO.fastq.gz
  2. main run

    mkdir our_out
    # -qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4 #NOT_USED
    # -mic -mtool Blast -mdb viruses #IGNORING Microbe Module since it is very time-consuming!
    
    #jhuang@hamburg:~/DATA/Data_Ute/Data_Ute_smallRNA_5$ java -jar COMPSRA.jar -ref hg38 -qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4 -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6 -mic -mtool Blast -mdb viruses -in 2021_August_nf655_MKL-1_EV-miRNA.fastq.gz -out ./our_out/
    for sample in 2021_August_nf655_MKL-1_EV-miRNA 2021_August_nf657_WaGa_EV-miRNA    2022_August_nf774_0403_WaGa_wt 2022_August_nf780_0505_MKL-1_wt    2022_November_nf796_MKL-1_wt_1 2022_November_nf797_MKL-1_wt_2    2023_June_nf876_1002_WaGa_sT_Dox 2023_June_nf887_2312_MKL-1_scr_DMSO; do
      mkdir our_out/${sample}/
      java -jar COMPSRA.jar -ref hg38   -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in ${sample}.fastq.gz -out ./our_out/
    done
  3. prepare Data_Ute/Data_Ute_smallRNA_4/sample.list

    2021_August_nf655_MKL-1_EV-miRNA
    2021_August_nf657_WaGa_EV-miRNA
    2022_August_nf774_0403_WaGa_wt
    2022_August_nf780_0505_MKL-1_wt
    2022_November_nf796_MKL-1_wt_1
    2022_November_nf797_MKL-1_wt_2
    2023_June_nf876_1002_WaGa_sT_Dox
    2023_June_nf887_2312_MKL-1_scr_DMSO
  4. extract the raw counts and perform statistical test on pre-defined groups.

    #The following results calculate raw counts (Note: If you only want to merge the count files, you can use -fm -fms.)
    java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-8 -fdclass 1,2,3,4,5 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
    #The following command calculate statistical test after defining case and control.
    java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5,6 -fdcase 1-2 -fdctrl 3-6 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
    
    sed -i -e 's/_August/-08/g' COMPSRA_MERGE_0_miRNA.txt
    sed -i -e 's/_November/-11/g' COMPSRA_MERGE_0_miRNA.txt
    sed -i -e 's/_June/-06/g' COMPSRA_MERGE_0_miRNA.txt
    sed -i -e 's/_STAR_Aligned_miRNA.txt//g' COMPSRA_MERGE_0_miRNA.txt
    #sed -i -e 's/_piRNA.txt//g' COMPSRA_MERGE_0_piRNA.txt
    #sed -i -e 's/_tRNA.txt//g' COMPSRA_MERGE_0_tRNA.txt
    #sed -i -e 's/_snoRNA.txt//g' COMPSRA_MERGE_0_snoRNA.txt
    #sed -i -e 's/_snRNA.txt//g' COMPSRA_DEG_0_snRNA.txt
    sed -i -e 's/_August/-08/g' COMPSRA_DEG_0_miRNA.txt
    sed -i -e 's/_November/-11/g' COMPSRA_DEG_0_miRNA.txt
    sed -i -e 's/_June/-06/g' COMPSRA_DEG_0_miRNA.txt
    sed -i -e 's/_STAR_Aligned_miRNA.txt//g' COMPSRA_DEG_0_miRNA.txt
    
    import pandas as pd
    df = pd.read_csv('COMPSRA_MERGE_0_miRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 9'])
    # Assuming df is your DataFrame
    df.loc['Sum'] = df.sum(numeric_only=True)
    
    df.to_csv('COMPSRA_MERGE_0_miRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_piRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 9'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_piRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_snoRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 9'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_snoRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_snRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 9'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_snRNA_.txt', sep='\t')
    df = pd.read_csv('COMPSRA_MERGE_0_tRNA.txt', sep='\t', index_col=0)
    df = df.drop(columns=['Unnamed: 9'])
    df.loc['Sum'] = df.sum(numeric_only=True)
    df.to_csv('COMPSRA_MERGE_0_tRNA_.txt', sep='\t')
    
    #samtools flagstat **.bam
    #47217410 + 0 in total (QC-passed reads + QC-failed reads)
    #45166321 + 0 mapped (95.66% : N/A)
    #2051089 + 0 in total (QC-passed reads + QC-failed reads)
    
    #TODO: check the microRNA in the paper mentioned in which position?
    #Single publications on EVs as transport vehicles for specific miRNAs in the pathogenesis of Merkel cell carcinoma have also been reported, such as miR-375 and its function in proliferation
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_MERGE_0_miRNA_.txt \
    COMPSRA_MERGE_0_piRNA_.txt \
    COMPSRA_MERGE_0_tRNA_.txt \
    COMPSRA_MERGE_0_snoRNA_.txt \
    COMPSRA_MERGE_0_snRNA_.txt \
    -d$'\t' -o raw_counts.xls;
    
    # sorting the DEG table, change the sheet labels to 'miRNA_between_columns_B-C_and_columns_D-G'
    ~/Tools/csv2xls-0.4/csv_to_xls.py COMPSRA_DEG_0_miRNA.txt -d$'\t' -o normalized_and_significance_test_miRNA.xls;
    
    ##merging the row counts and statical values
    #cut -f1-1 COMPSRA_MERGE_0_snoRNA.txt > f1_MERGE
    #cut -f1-1 COMPSRA_DEG_0_snoRNA.txt > f1_DEG
    #cut -f1-1 COMPSRA_MERGE_0_miRNA.txt > f1_MERGE
    #cut -f1-1 COMPSRA_DEG_0_miRNA.txt > f1_DEG
    #diff f1_MERGE f1_DEG
  5. calculate the number of total reads, total human reads and total non-human reads.

    for sample in 2021_August_nf655_MKL-1_EV-miRNA 2021_August_nf657_WaGa_EV-miRNA    2022_August_nf774_0403_WaGa_wt 2022_August_nf780_0505_MKL-1_wt    2022_November_nf796_MKL-1_wt_1 2022_November_nf797_MKL-1_wt_2    2023_June_nf876_1002_WaGa_sT_Dox 2023_June_nf887_2312_MKL-1_scr_DMSO; do
      echo "--------------- ${sample} ---------------"
      samtools flagstat ./${sample}/${sample}_STAR_Aligned.out.bam
      samtools flagstat ./${sample}/${sample}_STAR_Aligned_UnMapped.bam
    done

COMPSRA was composed of five functional modules: Quality Control, Alignment, Annotation, Microbe and Function. They are integrated into a pipeline and each module can also process independently.

Small_RNA_processing

  • Quality Control: To deal with fastq files and filter out the adapter sequences and reads with low quality.

    • FASTQ files from the small RNA sequencing of biological samples are the default input.
    • First, the adapter portions of a read are trimmed along with any randomized bases at ligation junctions that are produced by some small RNA-seq kits (e.g., NEXTflexT M Small RNA-Seq kit). The adapter sequences, typically situated at the 3′ (3-prime) end, vary across different kits. Some commonly employed adapter sequences are listed below:
      • TruSeq Small RNA (Illumina) TGGAATTCTCGGGTGCCAAGG
      • Small RNA Kits V1 (Illumina) TCGTATGCCGTCTTCTGCTTGT
      • Small RNA Kits V1.5 (Illumina) ATCTCGTATGCCGTCTTCTGCTTG
      • NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific) TGGAATTCTCGGGTGCCAAGG
      • LEXOGEN Small RNA-Seq Library Prep Kit (Illumina) TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
    • The read quality of the remaining sequence is evaluated using its corresponding Phred score.
    • Poor quality reads are removed according to quality control parameters set in the command line (-rh 20 –rt 20 –rr 20).
    • Users can specify qualified reads of specific length intervals for input into subse-quent modules.
  • Alignment:

    • To align the clean reads to the reference genome. COMPSRA uses STAR as its default RNA sequence aligner with default parameters which are customizable on the command line.
    • Qualified reads from the QC module output are first mapped to the human genome hg19/hg38, and then aligned reads are quantified and annotated in the Annotation Module.
    • Reads that could not be mapped to the human genome are saved into a FASTA file for input into the Microbe Module.
  • Annotation:

    • To annotate different kinds of circulating RNAs based on the alignment result.
    • COMPSRA currently uses several different small RNA databases for annotating human genome mapped reads and provides all the possible annotations:
      • miRBase (Kozomara and Griffiths-Jones, 2011) for miRNA;
      • piRNABank (Sai Lakshmi and Agrawal, 2008);
      • piRBase (Zhang, et al., 2014) and piRNACluster (Rosenkranz, 2016) for piRNA;
      • gtRNAdb (Chan and Lowe, 2016) for tRNA;
      • GENCODE release 27 (Harrow, et al., 2012) for snRNA and snoRNA;
      • circBase (Glazar, et al., 2014) for circular RNA.
    • To conform the different reference human genome versions in these databases, we use an automatic LiftOver created by the UCSC Genome Browser Group.
    • All the databases used are already pre-built, enabling speedy annotation.
  • Microbe:

    • To predict the possible species of microbes existed in the samples.
    • The qualified reads that could not be mapped to the human genome in the Alignment Module are aligned to the nucleotide (nt) database (Coordinators, 2013) from UCSC using BLAST.
    • The four major microbial taxons archaea, bacteria, fungi and viruses are supported.
  • Function:

    • To perform differential expression analysis and other functional studies to be extended.
    • The read count of each RNA molecule that is identified in the Annotation Module is outputted as a tab-delimited text file according to RNA type.
    • With more than one sample FASTQ file inputs, the output are further aggregated into a data matrix of RNA molecules as rows and samples as columns showing the read counts of an RNA molecule across different samples.
    • The user can mark each sample FASTQ file column as either a case or a control in the command line, and perform a case versus control differential expression analysis for each RNA molecule using the Mann-Whitney rank sum test (Wilcoxon Rank Sum Test) as the default statistical test.

Density of motif plots and its statistical tests

Density_of_GRGGC_Motifs

Density_of_GCCYC_Motifs

Histogram_of_GRGGC_Motifs

Histogram_of_GCCYC_Motifs

  1. plot peaks

    @Input: the peak bed-files 
    @Output: peak_tss_distances_NHDF_.xlsx 
              peak_tss_distances_NHDF_.txt 
              peak_distribution_NHDF_.csv
              peak_distribution_NHDF_.png
    @RUN: python3 plot_peaks.py peaks_NHDF_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    @RUN: python3 plot_peaks.py peaks_HEK293_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    @RUN: python3 plot_peaks.py peaks_PFSK-1_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    @RUN: python3 plot_peaks.py peaks_K331A_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
  2. filtering the cloeset genes when the absolute values of Distance to TSS is smaller than or equal to 3000 using python.

    @Input: peak_tss_distances_NHDF_.txt
    @RUN: python3 filter_genes_with_peaks.py peak_tss_distances_NHDF_.txt
    
      import pandas as pd
      import argparse
    
      def filter_and_save_genes(input_file):
          # Read the dataframe
          df = pd.read_csv(input_file, sep='\t')
    
          # Convert Distance to TSS to absolute values
          df['Distance to TSS'] = df['Distance to TSS'].abs()
    
          # Filter dataframe where Distance to TSS <= 3000
          filtered_df = df[df['Distance to TSS'] <= 3000]
    
          # Get the list of gene names
          gene_names = filtered_df['Closest Gene (Gene name)'].tolist()
    
          # Join the gene names with a comma
          gene_names_string = ','.join(gene_names)
    
          # Print the gene names
          print(gene_names_string)
    
          # Save the gene names to a text file
          #with open('gene_names.txt', 'w') as file:
          #    file.write(gene_names_string)
    
          # Convert the list of gene names to a DataFrame
          gene_names_df = pd.DataFrame(gene_names, columns=['Gene Name'])
    
          # Save the DataFrame to an Excel file
          gene_names_df.to_excel('gene_with_peaks_in_promoter_NHDF.xlsx', index=False)
    
      if __name__ == '__main__':
          parser = argparse.ArgumentParser(description='Process some integers.')
          parser.add_argument('input_file', help='The path to the input file')
          args = parser.parse_args()
          filter_and_save_genes(args.input_file)
  3. draw venn diagrams based on the gene names

      import pandas as pd
      import matplotlib.pyplot as plt
      from matplotlib_venn import venn2, venn3
    
      # Read the gene names from the Excel files
      df1 = pd.read_excel('gene_with_peaks_in_promoter_NHDF.xlsx')  # generated in the STEP 2
      df2 = pd.read_excel('gene_with_peaks_in_promoter_K331A.xlsx')
      #df2 = pd.read_excel('gene_with_peaks_in_promoter_HEK293.xlsx')
      df3 = pd.read_excel('gene_with_peaks_in_promoter_PFSK-1.xlsx')
    
      # Create sets from the gene names
      set1 = set(df1['Gene Name'])
      set2 = set(df2['Gene Name'])
      set3 = set(df3['Gene Name'])
    
      venn2([set1, set2], set_labels=('NHDF LT', 'NHDF LT K331A')) #, set_colors=('skyblue', 'lightgreen')
      #venn3([set1, set2, set3], set_labels=('NHDF LT', 'HEK293 LT+sT', 'PFSK-1 LT+sT'))
      plt.title('Genes with Peaks in Promoter', fontsize=16)
      plt.xticks(fontsize=12)
      plt.yticks(fontsize=12)
    
      plt.savefig('Venn_Diagram_of_Genes_with_Peaks_in_Promoter_NHDF_vs_K331A.png', dpi=300, bbox_inches='tight')  # Save the Venn diagram as an image file
      plt.show()  # Display the Venn diagram
  4. extract the promoter seqeuences, generating promoter_sequences_3000_3000.fasta

      #!/usr/bin/env python3
      # ./1_generate_promoter_sequences.py /home/jhuang/REFs/gencode.v43.annotation.gtf.db
      #improve the code to exact alsogene_type except for ensemble_ids and gene_symbol, and save them in the heads.
    
      # # Create the database from the GTF file
      # gtf_file = 'gencode.v43.annotation.gtf'
      # db_file = 'gencode.v43.annotation.gtf.db'
      # db = gffutils.create_db(gtf_file, dbfn=db_file, force=True, merge_strategy='merge', verbose=True)
    
      import gffutils
      from pyfaidx import Fasta
      import argparse
      from Bio import SeqIO
      from Bio.SeqRecord import SeqRecord
      from Bio.Seq import Seq
    
      def extract_promoter_sequences(gencode_db, genome_records, upstream_length=3000, downstream_length=3000):
          db = gffutils.FeatureDB(gencode_db, keep_order=True)
          promoters = []
          for gene in db.features_of_type('gene'): #CDS
              gene_type = gene.attributes.get('gene_type', [''])[0]
              #if gene_type == 'protein_coding':
              if True:
                  if gene.strand == '+':
                      promoter_start = max(gene.start - upstream_length, 1)
                      promoter_end = gene.start + downstream_length - 1
                  else: # gene.strand == '-'
                      promoter_start = gene.end - downstream_length + 1
                      promoter_end = min(gene.end + upstream_length, len(genome_records[gene.chrom]))
                  promoter_seq = str(genome_records[gene.chrom][promoter_start-1:promoter_end])
                  if gene.strand == '-':
                      promoter_seq = str(Seq(promoter_seq).reverse_complement())
    
                  gene_symbol = gene.attributes.get('gene_name', [''])[0]
                  gene_type = gene.attributes.get('gene_type', [''])[0]
    
                  #20042 in which gene_type == 'protein_coding', while 62703 all types.
                  header = f"{gene.id} | Symbol: {gene_symbol} | Type: {gene_type}"
                  #header = f"{gene_symbol}"
                  #record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description=gene_symbol)
                  record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description=header)
                  promoters.append(record)
    
          return promoters
    
      if __name__ == "__main__":
          parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
          parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.gtf.db file')
          args = parser.parse_args()
    
          genome_records = Fasta('/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa')
          promoter_sequences = extract_promoter_sequences(args.gencode_db, genome_records)
          SeqIO.write(promoter_sequences, "promoter_sequences_3000_3000.fasta", "fasta")
  5. find motifs in promoters, generating the file

      #!/usr/bin/env python3
      #./2_find_motifs_in_promoters.py
      import re
      from Bio import SeqIO
    
      def find_motif_frequency_and_distribution(fasta_file, output_file, motif_1="GRGGC", motif_2="GCCYC"):
          motif_1 = motif_1.replace("R", "[AG]").replace("Y", "[CT]")
          motif_2 = motif_2.replace("R", "[AG]").replace("Y", "[CT]")
          promoter_sequences = SeqIO.parse(fasta_file, "fasta")
    
          with open(output_file, 'w') as f:
              f.write("PromoterID\tGene_Symbol\tGene_Type\tMotif_GRGGC_Count\tMotif_GCCYC_Count\tMotif_GRGGC_Center_Positions\tMotif_GCCYC_Center_Positions\n")
    
              for promoter in promoter_sequences:
                  header_parts = promoter.description.split("|")
                  promoter_id = header_parts[0].strip()
                  gene_symbol = header_parts[1].strip()
                  gene_type = header_parts[2].strip()
    
                  motif1_positions = [(m.start() + len(motif_1) // 2 - 3000) for m in re.finditer(motif_1, str(promoter.seq))]
                  motif2_positions = [(m.start() + len(motif_2) // 2 - 3000) for m in re.finditer(motif_2, str(promoter.seq))]
    
                  line = f"{promoter_id}\t{gene_symbol}\t{gene_type}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
                  f.write(line)
    
      if __name__ == "__main__":
          fasta_file = "promoter_sequences_3000_3000.fasta"
          output_file = "motif_counts_positions.txt"
          find_motif_frequency_and_distribution(fasta_file, output_file)
  6. prepare three files from motif_counts_positions.txt and gene_with_peaks_in_promoter_NHDF.xlsx.

       library(readxl)
       #sed -i 's/[][]//g' motif_counts_positions.txt  # generated in STEP 5
       motif_counts_positions <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
       # Read the gene names from the Excel files
       df1 <- read_excel("gene_with_peaks_in_promoter_NHDF.xlsx")  # generated in STEP 2
       df2 <- read_excel("gene_with_peaks_in_promoter_K331A.xlsx")
       # Create a new data frame for rows in 'data' where the PromoterID is in NHDF_LT's Closest Gene --> total 1706 genes
       gene_and_its_motifs_with_NHDF_LT_peak <- motif_counts_positions[motif_counts_positions$PromoterID %in% unique(df1$`Gene Name`), ]
       gene_and_its_motifs_with_NHDF_LT_peak$Motif_GRGGC_Center_Positions <- sapply(gene_and_its_motifs_with_NHDF_LT_peak$Motif_GRGGC_Center_Positions, toString)
       gene_and_its_motifs_with_NHDF_LT_peak$Motif_GCCYC_Center_Positions <- sapply(gene_and_its_motifs_with_NHDF_LT_peak$Motif_GCCYC_Center_Positions, toString)
       write.table(gene_and_its_motifs_with_NHDF_LT_peak, file="motifs_on_NHDF_LT_promoters.txt", sep="\t", row.names = FALSE)
       #sed -i 's/Symbol: //g' motifs_on_NHDF_LT_promoters.txt
       #sed -i 's/Type: //g' motifs_on_NHDF_LT_promoters.txt
       #sed -i 's/"//g' motifs_on_NHDF_LT_promoters.txt
       data1 <- read.table("motifs_on_NHDF_LT_promoters.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
       data1$Motif_GRGGC_Count <- as.numeric(data1$Motif_GRGGC_Count)
       data1$Motif_GCCYC_Count <- as.numeric(data1$Motif_GCCYC_Count)
       data1$Motif_GRGGC_Center_Positions <- strsplit(as.character(data1$Motif_GRGGC_Center_Positions), ",")
       data1$Motif_GCCYC_Center_Positions <- strsplit(as.character(data1$Motif_GCCYC_Center_Positions), ",")
       # Create data1 frames for each motif
       data1_motif1 <- data.frame(PromoterID = rep(data1$PromoterID, data1$Motif_GRGGC_Count), Motif = "Motif GRGGC", Position = unlist(data1$Motif_GRGGC_Center_Positions))
       data1_motif2 <- data.frame(PromoterID = rep(data1$PromoterID, data1$Motif_GCCYC_Count), Motif = "Motif GCCYC", Position = unlist(data1$Motif_GCCYC_Center_Positions))
       data1_positions <- rbind(data1_motif1, data1_motif2)
       # Get positions as numeric values, excluding NAs
       data1_motif1_positions <- as.numeric(data1_positions$Position[data1_positions$Motif == "Motif GRGGC"])
       data1_motif2_positions <- as.numeric(data1_positions$Position[data1_positions$Motif == "Motif GCCYC"])
       # Create another data frame for the remaining rows --> total 60997 genes
       gene_and_its_motifs_without_NHDF_LT_peak <- motif_counts_positions[!motif_counts_positions$PromoterID %in% unique(df1$`Gene Name`), ]
       gene_and_its_motifs_without_NHDF_LT_peak$Motif_GRGGC_Center_Positions <- sapply(gene_and_its_motifs_without_NHDF_LT_peak$Motif_GRGGC_Center_Positions, toString)
       gene_and_its_motifs_without_NHDF_LT_peak$Motif_GCCYC_Center_Positions <- sapply(gene_and_its_motifs_without_NHDF_LT_peak$Motif_GCCYC_Center_Positions, toString)
       write.table(gene_and_its_motifs_without_NHDF_LT_peak, file="motifs_on_NOT_NHDF_LT_promoters.txt", sep="\t", row.names = FALSE)
       #sed -i 's/Symbol: //g' motifs_on_NOT_NHDF_LT_promoters.txt
       #sed -i 's/Type: //g' motifs_on_NOT_NHDF_LT_promoters.txt
       #sed -i 's/"//g' motifs_on_NOT_NHDF_LT_promoters.txt
       data2 <- read.table("motifs_on_NOT_NHDF_LT_promoters.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
       data2$Motif_GRGGC_Count <- as.numeric(data2$Motif_GRGGC_Count)
       data2$Motif_GCCYC_Count <- as.numeric(data2$Motif_GCCYC_Count)
       data2$Motif_GRGGC_Center_Positions <- strsplit(as.character(data2$Motif_GRGGC_Center_Positions), ",")
       data2$Motif_GCCYC_Center_Positions <- strsplit(as.character(data2$Motif_GCCYC_Center_Positions), ",")
       # Create data2 frames for each motif
       data2_motif1 <- data.frame(PromoterID = rep(data2$PromoterID, data2$Motif_GRGGC_Count), Motif = "Motif GRGGC", Position = unlist(data2$Motif_GRGGC_Center_Positions))
       data2_motif2 <- data.frame(PromoterID = rep(data2$PromoterID, data2$Motif_GCCYC_Count), Motif = "Motif GCCYC", Position = unlist(data2$Motif_GCCYC_Center_Positions))
       data2_positions <- rbind(data2_motif1, data2_motif2)
       # Get positions as numeric values, excluding NAs
       data2_motif1_positions <- as.numeric(data2_positions$Position[data2_positions$Motif == "Motif GRGGC"])
       data2_motif2_positions <- as.numeric(data2_positions$Position[data2_positions$Motif == "Motif GCCYC"])
       # Create a new data frame for rows in 'data' where the PromoterID is in NHDF_LT_K331A's Closest Gene --> total 310 genes
       gene_and_its_motifs_with_NHDF_LT_K331A_peak <- motif_counts_positions[motif_counts_positions$PromoterID %in% unique(df2$`Gene Name`), ]
       gene_and_its_motifs_with_NHDF_LT_K331A_peak$Motif_GRGGC_Center_Positions <- sapply(gene_and_its_motifs_with_NHDF_LT_K331A_peak$Motif_GRGGC_Center_Positions, toString)
       gene_and_its_motifs_with_NHDF_LT_K331A_peak$Motif_GCCYC_Center_Positions <- sapply(gene_and_its_motifs_with_NHDF_LT_K331A_peak$Motif_GCCYC_Center_Positions, toString)
       write.table(gene_and_its_motifs_with_NHDF_LT_K331A_peak, file="motifs_on_NHDF_LT_K331A_promoters.txt", sep="\t", row.names = FALSE)
       #sed -i 's/Symbol: //g' motifs_on_NHDF_LT_K331A_promoters.txt
       #sed -i 's/Type: //g' motifs_on_NHDF_LT_K331A_promoters.txt
       #sed -i 's/"//g' motifs_on_NHDF_LT_K331A_promoters.txt
       data3 <- read.table("motifs_on_NHDF_LT_K331A_promoters.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
       data3$Motif_GRGGC_Count <- as.numeric(data3$Motif_GRGGC_Count)
       data3$Motif_GCCYC_Count <- as.numeric(data3$Motif_GCCYC_Count)
       data3$Motif_GRGGC_Center_Positions <- strsplit(as.character(data3$Motif_GRGGC_Center_Positions), ",")
       data3$Motif_GCCYC_Center_Positions <- strsplit(as.character(data3$Motif_GCCYC_Center_Positions), ",")
       # Create data3 frames for each motif
       data3_motif1 <- data.frame(PromoterID = rep(data3$PromoterID, data3$Motif_GRGGC_Count), Motif = "Motif GRGGC", Position = unlist(data3$Motif_GRGGC_Center_Positions))
       data3_motif2 <- data.frame(PromoterID = rep(data3$PromoterID, data3$Motif_GCCYC_Count), Motif = "Motif GCCYC", Position = unlist(data3$Motif_GCCYC_Center_Positions))
       data3_positions <- rbind(data3_motif1, data3_motif2)
       # Get positions as numeric values, excluding NAs
       data3_motif1_positions <- as.numeric(data3_positions$Position[data3_positions$Motif == "Motif GRGGC"])
       data3_motif2_positions <- as.numeric(data3_positions$Position[data3_positions$Motif == "Motif GCCYC"])
  7. Plot Histogram of counts and density of positions

      # Adjust the number of breaks (nbins) to control the width of histograms
      x_limit <- max(max(data1$Motif_GRGGC_Count), max(data1$Motif_GCCYC_Count), max(data2$Motif_GRGGC_Count), max(data2$Motif_GCCYC_Count), max(data3$Motif_GRGGC_Count), max(data3$Motif_GCCYC_Count))
      y_limit1 <- 300
      y_limit2 <- 12000
      y_limit3 <- 60
      # Set up the layout for the plots
      png("Histogram_of_GRGGC_Motifs.png", width = 1600, height = 800)
      par(mfrow = c(1, 3))
      hist(data1$Motif_GRGGC_Count, main = "Motif Counts in NHDF_LT promoters", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = 20, cex.main = 2)
      hist(data2$Motif_GRGGC_Count, main = "Motif Counts in NOT_NHDF_LT promoters", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = 30, cex.main = 2)
      hist(data3$Motif_GRGGC_Count, main = "Motif Counts in NHDF_LT_K331A promoters", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit3), breaks = 10, cex.main = 2)
      mtext("Histograms of GRGGC Motifs", outer = TRUE, line = -5, cex = 1.5)
      dev.off()
      png("Histogram_of_GCCYC_Motifs.png", width = 1600, height = 800)
      par(mfrow = c(1, 3))
      hist(data1$Motif_GCCYC_Count, main = "Motif Counts in NHDF_LT promoters", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = 20, cex.main = 2)
      hist(data2$Motif_GCCYC_Count, main = "Motif Counts in NOT_NHDF_LT promoters", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = 30, cex.main = 2)
      hist(data3$Motif_GCCYC_Count, main = "Motif Counts in NHDF_LT_K331A promoters", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit3), breaks = 10, cex.main = 2)
      mtext("Histograms of GCCYC Motifs", outer = TRUE, line = -5, cex = 1.5)
      dev.off()
      # Set the ylim value
      ylim <- c(0.0000, 0.0003)
      png("Density_of_GRGGC_Motifs.png", width = 1600, height = 800)
      par(mfrow = c(1, 3))
      plot(density(data1_motif1_positions, na.rm = TRUE), main = "Motif Density in NHDF_LT promoters", xlab = "Position", ylim = ylim, cex.main = 2)
      plot(density(data2_motif1_positions, na.rm = TRUE), main = "Motif Density in NOT_NHDF_LT promoters", xlab = "Position", ylim = ylim, cex.main = 2)
      plot(density(data3_motif1_positions, na.rm = TRUE), main = "Motif Density in NHDF_LT_K331A promoters", xlab = "Position", ylim = ylim, cex.main = 2)
      #mtext("Density of GRGGC Motifs", outer = TRUE, line = -5, cex = 1.5)
      dev.off()
      png("Density_of_GCCYC_Motifs.png", width = 1600, height = 800)
      par(mfrow = c(1, 3))
      plot(density(data1_motif2_positions, na.rm = TRUE), main = "Motif Density in NHDF_LT promoters", xlab = "Position", ylim = ylim, cex.main = 2)
      plot(density(data2_motif2_positions, na.rm = TRUE), main = "Motif Density in NOT_NHDF_LT promoters", xlab = "Position", ylim = ylim, cex.main = 2)
      plot(density(data3_motif2_positions, na.rm = TRUE), main = "Motif Density in NHDF_LT_K331A promoters", xlab = "Position", ylim = ylim, cex.main = 2)
      #mtext("Density of GCCYC Motifs", outer = TRUE, line = -5, cex = 1.5)
      dev.off()
  8. Statistical tests

    How to perform a statistical test from density plots? There is in the center positions a much higher possibility than background? This means, in a condition, the peaks occurs more often in the central positions? How to perform a statistical test to see if they are significant?

    If you are interested in testing whether the density of peaks in the central positions is significantly different from the background, a suitable statistical test might be a permutation test, or perhaps a Kolmogorov-Smirnov (KS) test. The KS test, in particular, is a non-parametric test that compares two continuous one-dimensional probability distributions.

    Here is an example of how you might do this using the KS test in R:

    # Suppose `data1_motif1_positions` and `data2_motif2_positions` are your two datasets
    ks.test(data1_motif1_positions, data2_motif1_positions)
    #D = 0.034868, p-value < 2.2e-16
    ks.test(data1_motif1_positions, data3_motif1_positions)
    #D = 0.048238, p-value = 4.959e-11
    ks.test(data2_motif1_positions, data3_motif1_positions)
    #D = 0.016348, p-value = 0.07909
    ks.test(data1_motif2_positions, data2_motif2_positions)
    #D = 0.022212, p-value = 4.441e-16
    ks.test(data1_motif2_positions, data3_motif2_positions)
    #D = 0.038822, p-value = 4.049e-07
    ks.test(data2_motif2_positions, data3_motif2_positions)
    #D = 0.018951, p-value = 0.02927
    
    pvalues <- c(2.2e-16, 4.959e-11, 0.07909)
    bonferroni <- p.adjust(pvalues, method = "bonferroni") #"bonferroni", "BH", "holm"
    print(bonferroni)
    #6.6000e-16 1.4877e-10 2.3727e-01
    pvalues <- c(4.441e-16, 4.049e-07, 0.02927)
    bonferroni <- p.adjust(pvalues, method = "bonferroni")
    print(bonferroni)
    #1.3323e-15 1.2147e-06 8.7810e-02

    The ks.test function will return a test statistic and a p-value. The null hypothesis of the KS test is that the two samples are drawn from the same distribution. Therefore, a small p-value (typically, p < 0.05) would lead you to reject the null hypothesis and conclude that there is a significant difference between the two distributions.

    Note that the KS test, like all statistical tests, has assumptions and limitations. It is sensitive to differences in both location and shape of the empirical cumulative distribution functions of the two samples. It also assumes that the samples are independent and identically distributed, which may or may not be appropriate depending on your specific data and research question. Therefore, the test results should be interpreted in the context of these assumptions.

    Keep in mind that while this test can indicate whether the densities are different, it does not specify where these differences occur (i.e., in the tails, the center, etc.). If you are specifically interested in the center, you may want to define what you mean by “center” and perform a test (e.g., t-test or KS test) on that specific subset of your data.

    A bidirectional promoter is a DNA sequence located between two genes, each of which is transcribed from this shared promoter region but in opposite directions. These type of promoters are particularly common in viruses, especially in those with compact genomes, such as many DNA viruses, where genomic space is at a premium. In viruses, bidirectional promoters can help regulate the simultaneous expression of two genes, which can be critical for the viral lifecycle. Some examples of viruses using bidirectional promoters include Adenovirus, Herpes simplex virus, and the Epstein-Barr virus. For example, in the Epstein-Barr virus, the C promoter (Cp) is a bidirectional promoter that regulates the expression of the EBNA (Epstein-Barr Nuclear Antigen) genes. These genes are essential for the establishment of latent infection.

  9. assign each promoter to a prefined class according to Motifs_Count and Avg_Distance_To_TSS_of_Motifs

    library(dplyr)
    data <- read.table("motifs_on_NHDF_LT_K331A_promoters.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
    data$Motif_GCCYC_Count <- as.numeric(data$Motif_GCCYC_Count)
    # Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYC_Count
    data <- data %>%
      mutate(Motifs_Count = Motif_GRGGC_Count + Motif_GCCYC_Count)
    # Calculate absolute average Motif_GRGGC distance to TSS
    data$Avg_Distance_To_TSS_of_Motif_GRGGC <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), 
                                            function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    # Calculate absolute average Motif_GCCYC distance to TSS
    data$Avg_Distance_To_TSS_of_Motif_GCCYC <- sapply(strsplit(data$Motif_GCCYC_Center_Positions, ", "), 
                                            function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    # Revised function to calculate the mean of two strings of comma-separated numbers
    mean_of_strings <- function(str1, str2) {
      # Convert strings of numbers to numeric vectors
      vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
      vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
      # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
      if (is.null(vec1)) {
        vec1 <- numeric(0)
      }
      if (is.null(vec2)) {
        vec2 <- numeric(0)
      }
      # Compute the mean of the two vectors
      return(round(mean(c(vec1, vec2), na.rm = TRUE)))
    }
    # Apply the function to each row of the dataframe
    data$Avg_Distance_To_TSS_of_Motifs <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYC_Center_Positions)
    # Assign classes based on the given rules
    data <- data %>%
      mutate(
        Class_based_Motif_GRGGC = case_when(
          Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 1",
          Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 2",
          Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 3",
          Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 4",
          Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 5",
          Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 6",
          Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 7",
          Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 8",
          Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 9"
        )
      )
    data <- data %>%
      mutate(
        Class_based_Motif_GCCYC = case_when(
          Motif_GCCYC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) <= 500 ~ "Class 1",
          Motif_GCCYC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) <= 1000 ~ "Class 2",
          Motif_GCCYC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) > 1000 ~ "Class 3",
          Motif_GCCYC_Count >= 50 & Motif_GCCYC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) <= 500 ~ "Class 4",
          Motif_GCCYC_Count >= 50 & Motif_GCCYC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) <= 1000 ~ "Class 5",
          Motif_GCCYC_Count >= 50 & Motif_GCCYC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) > 1000 ~ "Class 6",
          Motif_GCCYC_Count >= 100 & Motif_GCCYC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) <= 500 ~ "Class 7",
          Motif_GCCYC_Count >= 100 & Motif_GCCYC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) <= 1000 ~ "Class 8",
          Motif_GCCYC_Count >= 100 & Motif_GCCYC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYC) > 1000 ~ "Class 9"
        )
      )
    data <- data %>%
      mutate(
        Class_based_Motifs = case_when(
          Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 1",
          Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 2",
          Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 3",
          Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 4",
          Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 5",
          Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 6",
          Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 7",
          Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 8",
          Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 9"
        )
      )
    # Define the desired column order
    desired_columns <- c("PromoterID","Gene_Symbol","Gene_Type","Motif_GRGGC_Count","Motif_GRGGC_Center_Positions","Avg_Distance_To_TSS_of_Motif_GRGGC","Class_based_Motif_GRGGC",  "Motif_GCCYC_Count","Motif_GCCYC_Center_Positions","Avg_Distance_To_TSS_of_Motif_GCCYC","Class_based_Motif_GCCYC",  "Motifs_Count","Avg_Distance_To_TSS_of_Motifs","Class_based_Motifs")
    # Reorder the columns
    data <- select(data, all_of(desired_columns))
    #sed -i 's/Motifs/Motifs_GRGGC+GCCYC/g' motifs_in_promoters_with_LT_peak_Category.txt
    write.table(data, file="motifs_on_NHDF_LT_K331A_promoters_with_Category.txt", sep="\t", row.names = FALSE)
    #~/Tools/csv2xls-0.4/csv_to_xls.py motifs_on_NHDF_LT_promoters_with_Category.txt -d$'\t' -o motifs_on_NHDF_LT_promoters_with_Category.xls;
    #~/Tools/csv2xls-0.4/csv_to_xls.py motifs_on_NOT_NHDF_LT_promoters_with_Category.txt -d$'\t' -o motifs_on_NOT_NHDF_LT_promoters_with_Category.xls;
    #~/Tools/csv2xls-0.4/csv_to_xls.py motifs_on_NHDF_LT_K331A_promoters_with_Category.txt -d$'\t' -o motifs_on_NHDF_LT_K331A_promoters_with_Category.xls;
  10. calculate the average positions and counts aggregated by PromoterID from the data data3_positions.

    library(dplyr)
    data1_motif2 <- data1_motif2 %>%
      mutate(Position = as.numeric(Position))
    avg_positions <- data1_motif2 %>%
      group_by(PromoterID) %>%
      summarise(avg_position = mean(Position, na.rm = TRUE))
    print(avg_positions)
    average_of_avg_positions <- mean(avg_positions$avg_position, na.rm = TRUE)
    print(average_of_avg_positions)
    #67.82571
    #21.56232
    
    library(dplyr)
    data2_motif2 <- data2_motif2 %>%
      mutate(Position = as.numeric(Position))
    avg_positions <- data2_motif2 %>%
      group_by(PromoterID) %>%
      summarise(avg_position = mean(Position, na.rm = TRUE))
    print(avg_positions)
    average_of_avg_positions <- mean(avg_positions$avg_position, na.rm = TRUE)
    print(average_of_avg_positions)
    #20.62934
    #13.99134
    
    library(dplyr)
    data3_motif2 <- data3_motif2 %>%
      mutate(Position = as.numeric(Position))
    avg_positions <- data3_motif2 %>%
      group_by(PromoterID) %>%
      summarise(avg_position = mean(Position, na.rm = TRUE))
    print(avg_positions)
    average_of_avg_positions <- mean(avg_positions$avg_position, na.rm = TRUE)
    print(average_of_avg_positions)
    #15.3197
    #-44.15695
    
    library(dplyr)
    avg_counts <- data3_motif2 %>%
      group_by(PromoterID) %>%
      summarise(count = n()) %>%
      summarise(avg_count = mean(count))
    print(avg_counts)
    #22.6
    #18.8
    #19.6
    #22.2
    #18.5
    #19.1
  11. Analysis Summary of “GRGGC” and “CYCCG” Motifs in Promoters. I summarized the process of how I analyzed the ‘GRGGC’ and ‘CYCCG’ motifs in promoters. In this analysis, our primary focus was to study the distribution of these two specific motifs.

    1. The first stage of the analysis was to extract all promoter sequences. A range of 3000 nt upstream to 3000 nt downstream of the Transcription Start Sites (TSS) was selected for this purpose. In total, we were able to identify 62703 promoters, each spanning a length of 6000 nt.

    2. We classified these 62703 promoters into three distinct sets:

      • ‘NHDF_LT’: Comprises promoters with at least one NHDF LT peak. This set contains a total of 1706 promoters.
      • ‘NOT_NHDF_LT’: Consists of promoters without any NHDF LT peaks, totaling 60997 promoters.
      • ‘NHDF_LT_K331A’: Contains promoters with NHDF LT K331A peaks, amounting to 310 promoters.
    3. Subsequently, we screened the extracted promoter sequences from these three promoter sets for occurrences of the “GRGGC” and “CYCCG” motifs. The results are in three separate files: motifs_on_NHDF_LT_promoters_with_Category.xls, motifs_on_NOT_NHDF_LT_promoters_with_Category.xls.zip, and motifs_on_NHDF_LT_K331A_promoters_with_Category.xls.

    4. The next step was to calculate the average counts of both motifs across the three promoter datasets. Interestingly, the average counts revealed that the ‘NHDF_LT’ set had slightly higher counts for both “GRGGC” and “CYCCG” motifs as compared to the ‘NOT_NHDF_LT’ and ‘NHDF_LT_K331A’ sets. The specific averages were as follows:

      • GRGGC in ‘NHDF_LT’ promoters: 22.6
      • GRGGC in ‘NOT_NHDF_LT’ promoters: 18.8
      • GRGGC in ‘NHDF_LT_K331A’ promoters: 19.6
      • CYCCG in ‘NHDF_LT’ promoters: 22.2
      • CYCCG in ‘NOT_NHDF_LT’ promoters: 18.5
      • CYCCG in ‘NHDF_LT_K331A’ promoters: 19.1
    5. In order to gain deeper insights into the distribution of motifs within the promoters, we generated density plots for all the motifs across the three datasets. A density plot visualizes the distribution of motifs across the promoter regions.

      Our analysis of the density plots revealed that the motifs within the ‘NHDF_LT’ promoter set are more concentrated near the Transcription Start Sites (TSS) compared to the other sets.

      In the file titled ‘Density_of_GRGGC_Motifs.png’, three statistical tests were conducted

      • The adjusted p-value between ‘NHDF_LT’ (left) and ‘NOT_NHDF_LT’ (middle) was found to be 2.2e-16, indicating statistical significance.
      • The adjusted p-value between ‘NHDF_LT’ (left) and ‘NHDF_LT_K331A’ (right) was 1.4877e-10, also denoting statistical significance.
      • The adjusted p-value between ‘NOT_NHDF_LT’ (middle) and ‘NHDF_LT_K331A’ (right) was 0.237, which is not statistically significant.

      In a similar manner, three statistical tests were performed for the ‘Density_of_GCCYC_Motifs.png’ file:

      • The adjusted p-value between ‘NHDF_LT’ (left) and ‘NOT_NHDF_LT’ (middle) was 1.3323e-15, which is statistically significant.
      • The adjusted p-value between ‘NHDF_LT’ (left) and ‘NHDF_LT_K331A’ (right) was 1.2147e-06, indicating statistical significance.
      • The adjusted p-value between ‘NOT_NHDF_LT’ (middle) and ‘NHDF_LT_K331A’ (right) was 0.088, which is not statistically significant.

Peak and Motif analyses in Promoters

  • ./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db

    1_generate_promoter_sequences.py

  • ./2_find_motifs_in_promoters.py

    2_find_motifs_in_promoters.py

  • generate peaks_and_motifs_in_promoters_with_LT_peak.xls

    Input file

    plot_peaks.py

    add_peaks_to_motifs.py

    peaks_and_motifs_in_promoters_with_LT_peak.xls

    peak_distribution

    #--3.1. generate peak_tss_distances.csv and peak_tss_distances.png
    python3 plot_peaks.py peaks_NHDF_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    python3 plot_peaks.py peaks_HEK293_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    python3 plot_peaks.py peaks_PFSK-1_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    python3 plot_peaks.py peaks_K331A_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
    
    #--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_peaks$Closest.Gene..Ensemble.ID., another is the remaining
    #sed -i 's/[][]//g' motif_counts_positions.txt
    data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
    # Read the data
    peaks <- read.table("peak_tss_distances.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    # Filter the rows
    filtered_peaks <- subset(peaks, abs(`Distance.to.TSS`) <= 3000)
    # Print the filtered peaks
    write.table(filtered_peaks, file="filtered_peak_tss_distances.txt", sep="\t", row.names = FALSE)
    
    # Create a new data frame for rows in 'data' where the PromoterID is in filtered_peaks's Closest Gene (Ensemble ID)
    data_in_filtered <- data[data$PromoterID %in% filtered_peaks$Closest.Gene..Ensemble.ID., ]
    data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString)
    data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString)
    write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE)
    # Create another data frame for the remaining rows
    data_not_in_filtered <- data[!data$PromoterID %in% filtered_peaks$Closest.Gene..Ensemble.ID., ]
    data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString)
    data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString)
    write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE)
    #780 vs 19262
    #1707 vs ca. 60000
    #sed -i 's/Symbol: //g' motifs_in_promoters_with_LT_peak.txt
    #sed -i 's/Type: //g' motifs_in_promoters_with_LT_peak.txt
    #sed -i 's/Symbol: //g' motifs_in_promoters_without_LT_peak.txt
    #sed -i 's/Type: //g' motifs_in_promoters_without_LT_peak.txt
    #sed -i 's/"//g' motifs_in_promoters_with_LT_peak.txt
    #sed -i 's/"//g' motifs_in_promoters_without_LT_peak.txt
    
    #--3.3. prepare data for motifs_in_promoters_with_LT_peak.txt
    data1 <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
    data1$Motif_GRGGC_Count <- as.numeric(data1$Motif_GRGGC_Count)
    data1$Motif_GCCYR_Count <- as.numeric(data1$Motif_GCCYR_Count)
    data1$Motif_GRGGC_Center_Positions <- strsplit(as.character(data1$Motif_GRGGC_Center_Positions), ",")
    data1$Motif_GCCYR_Center_Positions <- strsplit(as.character(data1$Motif_GCCYR_Center_Positions), ",")
    # Create data1 frames for each motif
    data1_motif1 <- data.frame(PromoterID = rep(data1$PromoterID, data1$Motif_GRGGC_Count), 
                              Motif = "Motif GRGGC", 
                              Position = unlist(data1$Motif_GRGGC_Center_Positions))
    data1_motif2 <- data.frame(PromoterID = rep(data1$PromoterID, data1$Motif_GCCYR_Count), 
                              Motif = "Motif GCCYR", 
                              Position = unlist(data1$Motif_GCCYR_Center_Positions))
    # Combine motif data1 frames
    data1_positions <- rbind(data1_motif1, data1_motif2)
    # Get positions as numeric values, excluding NAs
    motif1_positions_with_LT_peak <- as.numeric(data1_positions$Position[data1_positions$Motif == "Motif GRGGC"])
    motif2_positions_with_LT_peak <- as.numeric(data1_positions$Position[data1_positions$Motif == "Motif GCCYR"])
    
    #--3.4. prepare data for motifs_in_promoters_without_LT_peak.txt
    data2 <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    data2$Motif_GRGGC_Count <- as.numeric(data2$Motif_GRGGC_Count)
    data2$Motif_GCCYR_Count <- as.numeric(data2$Motif_GCCYR_Count)
    data2$Motif_GRGGC_Center_Positions <- strsplit(as.character(data2$Motif_GRGGC_Center_Positions), ",")
    data2$Motif_GCCYR_Center_Positions <- strsplit(as.character(data2$Motif_GCCYR_Center_Positions), ",")
    # Create data2 frames for each motif
    data2_motif1 <- data.frame(PromoterID = rep(data2$PromoterID, data2$Motif_GRGGC_Count), 
                              Motif = "Motif GRGGC", 
                              Position = unlist(data2$Motif_GRGGC_Center_Positions))
    data2_motif2 <- data.frame(PromoterID = rep(data2$PromoterID, data2$Motif_GCCYR_Count), 
                              Motif = "Motif GCCYR", 
                              Position = unlist(data2$Motif_GCCYR_Center_Positions))
    # Combine motif data2 frames
    data2_positions <- rbind(data2_motif1, data2_motif2)
    
    # Get positions as numeric values, excluding NAs
    motif1_positions_without_LT_peak <- as.numeric(data2_positions$Position[data2_positions$Motif == "Motif GRGGC"])
    motif2_positions_without_LT_peak <- as.numeric(data2_positions$Position[data2_positions$Motif == "Motif GCCYR"])
    
    #--3.5. Plot Histogram of counts and density of positions
    # Define a common x-axis limit for all plots
    x_limit <- max(max(data1$Motif_GRGGC_Count), max(data1$Motif_GCCYR_Count), max(data2$Motif_GRGGC_Count), max(data2$Motif_GCCYR_Count))
    # Adjust the number of breaks (nbins) to control the width of histograms
    nbins <- 30
    
    y_limit1 <- 300
    y_limit2 <- 12000
    # Plot Histogram of counts with the same scale for x-axis
    png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
    hist(data1$Motif_GRGGC_Count, main = "Histogram of Motif GRGGC in promoters with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = 20)
    dev.off()
    png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
    hist(data2$Motif_GRGGC_Count, main = "Histogram of Motif GRGGC in promoters without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
    dev.off()
    png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
    hist(data1$Motif_GCCYR_Count, main = "Histogram of Motif GCCYR in promoters with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = nbins)
    dev.off()
    png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
    hist(data2$Motif_GCCYR_Count, main = "Histogram of Motif GCCYR in promoters without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
    dev.off()
    
    # Set up the layout for the plots
    png("Histogram_of_Motifs.png", width = 1000, height = 1000)
    par(mfrow = c(2, 2))
    hist(data1$Motif_GRGGC_Count, main = "Motif GRGGC Counts with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = 20)
    hist(data2$Motif_GRGGC_Count, main = "Motif GRGGC Counts without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
    hist(data1$Motif_GCCYR_Count, main = "Motif GCCYR Counts with LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit1), breaks = nbins)
    hist(data2$Motif_GCCYR_Count, main = "Motif GCCYR Counts without LT peak", xlab = "Counts", xlim = c(0, x_limit), ylim = c(0, y_limit2), breaks = nbins)
    dev.off()
    
    # Set the ylim value
    ylim <- c(0.0001, 0.0003)
    png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
    plot(density(motif1_positions_with_LT_peak, na.rm = TRUE), main="Density of GRGGC in promoters with LT peak", xlab="Position", ylim = ylim)
    dev.off()
    png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
    plot(density(motif1_positions_without_LT_peak, na.rm = TRUE), main="Density of GRGGC in promoters without LT peak", xlab="Position", ylim = ylim)
    dev.off()
    png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
    plot(density(motif2_positions_with_LT_peak, na.rm = TRUE), main="Density of GCCYR in promoters with LT peak", xlab="Position", ylim = ylim)
    dev.off()
    png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
    plot(density(motif2_positions_without_LT_peak, na.rm = TRUE), main="Density of GCCYR in promoters without LT peak", xlab="Position", ylim = ylim)
    dev.off()
    
    png("Density_of_Motifs.png", width = 1000, height = 1000)
    par(mfrow = c(2, 2))
    plot(density(motif1_positions_with_LT_peak, na.rm = TRUE), main = "Density of GRGGC in promoters with LT peak", xlab = "Position", ylim = ylim)
    plot(density(motif1_positions_without_LT_peak, na.rm = TRUE), main = "Density of GRGGC in promoters without LT peak", xlab = "Position", ylim = ylim)
    plot(density(motif2_positions_with_LT_peak, na.rm = TRUE), main = "Density of GCCYR in promoters with LT peak", xlab = "Position", ylim = ylim)
    plot(density(motif2_positions_without_LT_peak, na.rm = TRUE), main = "Density of GCCYR in promoters without LT peak", xlab = "Position", ylim = ylim)
    dev.off()
    
    #--3.6. assign each promoter to a prefined class according to Motifs_Count and Avg_Distance_To_TSS_of_Motifs
    library(dplyr)
    data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
    data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    # Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count
    data <- data %>%
      mutate(Motifs_Count = Motif_GRGGC_Count + Motif_GCCYR_Count)
    # Calculate absolute average Motif_GRGGC distance to TSS
    data$Avg_Distance_To_TSS_of_Motif_GRGGC <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), 
                                             function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    # Calculate absolute average Motif_GCCYR distance to TSS
    data$Avg_Distance_To_TSS_of_Motif_GCCYR <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "), 
                                             function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    # Revised function to calculate the mean of two strings of comma-separated numbers
    mean_of_strings <- function(str1, str2) {
      # Convert strings of numbers to numeric vectors
      vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
      vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
      # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
      if (is.null(vec1)) {
        vec1 <- numeric(0)
      }
      if (is.null(vec2)) {
        vec2 <- numeric(0)
      }
      # Compute the mean of the two vectors
      return(round(mean(c(vec1, vec2), na.rm = TRUE)))
    }
    # Apply the function to each row of the dataframe
    data$Avg_Distance_To_TSS_of_Motifs <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions)
    # Assign classes based on the given rules
    data <- data %>%
      mutate(
        Class_based_Motif_GRGGC = case_when(
          Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 1",
          Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 2",
          Motif_GRGGC_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 3",
          Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 4",
          Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 5",
          Motif_GRGGC_Count >= 50 & Motif_GRGGC_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 6",
          Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 500 ~ "Class 7",
          Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) <= 1000 ~ "Class 8",
          Motif_GRGGC_Count >= 100 & Motif_GRGGC_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GRGGC) > 1000 ~ "Class 9"
        )
      )
    data <- data %>%
      mutate(
        Class_based_Motif_GCCYR = case_when(
          Motif_GCCYR_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 500 ~ "Class 1",
          Motif_GCCYR_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 1000 ~ "Class 2",
          Motif_GCCYR_Count < 50 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 1000 ~ "Class 3",
          Motif_GCCYR_Count >= 50 & Motif_GCCYR_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 500 ~ "Class 4",
          Motif_GCCYR_Count >= 50 & Motif_GCCYR_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 1000 ~ "Class 5",
          Motif_GCCYR_Count >= 50 & Motif_GCCYR_Count < 100 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 1000 ~ "Class 6",
          Motif_GCCYR_Count >= 100 & Motif_GCCYR_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 500 ~ "Class 7",
          Motif_GCCYR_Count >= 100 & Motif_GCCYR_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 500 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) <= 1000 ~ "Class 8",
          Motif_GCCYR_Count >= 100 & Motif_GCCYR_Count < 150 & abs(Avg_Distance_To_TSS_of_Motif_GCCYR) > 1000 ~ "Class 9"
        )
      )
    data <- data %>%
      mutate(
        Class_based_Motifs = case_when(
          Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 1",
          Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 2",
          Motifs_Count < 50 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 3",
          Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 4",
          Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 5",
          Motifs_Count >= 50 & Motifs_Count < 100 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 6",
          Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) <= 500 ~ "Class 7",
          Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) > 500 & abs(Avg_Distance_To_TSS_of_Motifs) <= 1000 ~ "Class 8",
          Motifs_Count >= 100 & Motifs_Count < 150 & abs(Avg_Distance_To_TSS_of_Motifs) > 1000 ~ "Class 9"
        )
      )
    # Define the desired column order
    desired_columns <- c("PromoterID","Gene_Symbol","Gene_Type","Motif_GRGGC_Count","Motif_GRGGC_Center_Positions","Avg_Distance_To_TSS_of_Motif_GRGGC","Class_based_Motif_GRGGC",  "Motif_GCCYR_Count","Motif_GCCYR_Center_Positions","Avg_Distance_To_TSS_of_Motif_GCCYR","Class_based_Motif_GCCYR",  "Motifs_Count","Avg_Distance_To_TSS_of_Motifs","Class_based_Motifs")
    # Reorder the columns
    data <- select(data, all_of(desired_columns))
    #sed -i 's/Motifs/Motifs_GRGGC+GCCYR/g' motifs_in_promoters_with_LT_peak_Category.txt
    write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE)
    
    #--3.7. add peak information into motif table
    ./add_peaks_to_motifs.py
    #~/Tools/csv2xls-0.4/csv_to_xls.py _peaks_and_motifs_in_promoters_with_LT_peak.txt peak_tss_distances.txt -d$'\t' -o peaks_and_motifs_in_promoters_with_LT_peak.xls;
    #rename the name of sheet1 to "peaks_and_motifs_in_promoters_with_LT_peak" and "all_LT_peaks_NHDF"
  1. plot_peaks.py (updated code)

  2. plot_peaks.py (updated code)

    #python3 plot_peaks.py peaks_NHDF_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db 
    #mv peak_tss_distances.xlsx peak_tss_distances_NHDF.xlsx
    #mv peak_tss_distances.txt peak_tss_distances_NHDF.txt
    #mv peak_distribution.csv peak_distribution_NHDF.csv
    #mv peak_distribution.png peak_distribution_NHDF.png
    #python3 plot_peaks.py peaks_HEK293_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db 
    #mv peak_tss_distances.xlsx peak_tss_distances_HEK293.xlsx
    #mv peak_tss_distances.txt peak_tss_distances_HEK293.txt
    #mv peak_distribution.csv peak_distribution_HEK293.csv
    #mv peak_distribution.png peak_distribution_HEK293.png
    #python3 plot_peaks.py peaks_PFSK-1_.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db 
    #mv peak_tss_distances.xlsx peak_tss_distances_PFSK-1.xlsx
    #mv peak_tss_distances.txt peak_tss_distances_PFSK-1.txt
    #mv peak_distribution.csv peak_distribution_PFSK-1.csv
    #mv peak_distribution.png peak_distribution_PFSK-1.png
    
    import pprint
    import argparse
    import matplotlib.pyplot as plt
    import pandas as pd
    import gffutils
    import numpy as np
    
    def plot_peak_distribution(peaks_file, gencode_db):
        db = gffutils.FeatureDB(gencode_db, keep_order=True)
        peaks = pd.read_table(peaks_file, header=None)
        peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
        tss_positions = []
        tss_genes = []
        for gene in db.features_of_type('gene'):
            if gene.strand == '+':
                tss_positions.append(gene.start)
            else:
                tss_positions.append(gene.end)
            tss_genes.append(gene)
        peak_tss_distances = []
        peak_names = []
        closest_genes = []
        closest_gene_names = []
        closest_strands = []
        closest_gene_types = []  # New list for gene_type
    
        for _, peak in peaks.iterrows():
            peak_center = (peak['start'] + peak['end']) // 2
            closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
            distance_to_tss = peak_center - tss_positions[closest_tss_index]
            peak_tss_distances.append(distance_to_tss)
            peak_names.append(peak['name'])
            closest_genes.append(tss_genes[closest_tss_index].id)
            if 'gene_name' in tss_genes[closest_tss_index].attributes:
                closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
            else:
                closest_gene_name = 'N/A'  # Set a default value if 'gene_name' attribute is not present
            closest_gene_names.append(closest_gene_name)
            closest_strands.append(tss_genes[closest_tss_index].strand)
    
            if 'gene_type' in tss_genes[closest_tss_index].attributes:
                closest_gene_type = tss_genes[closest_tss_index].attributes['gene_type'][0]
            else:
                closest_gene_type = 'N/A'  # Set a default value if 'gene_type' attribute is not present
            closest_gene_types.append(closest_gene_type)
    
        df = pd.DataFrame({
            'Peak Name': peak_names,
            'Distance to TSS': peak_tss_distances,
            'Closest Gene (Ensemble ID)': closest_genes,
            'Closest Gene (Gene name)': closest_gene_names,
            'Gene Strand': closest_strands,
            'Gene Type': closest_gene_types  # Add new column for gene_type
        })
    
        df.to_excel('peak_tss_distances.xlsx', index=False)
        df.to_csv('peak_tss_distances.txt', sep='\t', index=False)  # Save as tab-separated text file
        bins_no=600   #1000 for nHDF, 600 for HEK293 and PFSK-1
        counts, bin_edges = np.histogram(peak_tss_distances, bins=bins_no)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
        hist_df.to_csv('peak_distribution.csv', index=False)
        total_peaks = hist_df['Count'].sum()
        with open('peak_distribution.csv', 'a') as f:
            f.write(f'\nTotal number of peaks: {total_peaks}')
    
        # Set the figure background to be transparent
        plt.gcf().set_facecolor('none')
        plt.gcf().patch.set_alpha(0.0)
    
        plt.hist(peak_tss_distances, bins=bins_no)
    
        plt.xlabel('Distance to TSS', fontsize=17)
        plt.ylabel('Number of peaks', fontsize=17)
        plt.title('')
        #plt.title('Distribution of peaks around TSS', fontsize=12)
    
        # Set tick label font sizes
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
    
        # Adjust the space on the left of the figure
        plt.gcf().subplots_adjust(left=0.15)
    
        # Set x-axis range
        plt.xlim(-8000, 8000)
    
        plt.savefig('peak_distribution.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
        parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
        parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
        args = parser.parse_args()
        plot_peak_distribution(args.peaks_file, args.gencode_db)

Regulating Gene Expression in Diploid Organisms with Different Haplotypes

If two haplotypes in a diploid organism are different, then the genome sequences for those haplotypes are also different. These genetic differences can indeed impact gene expression, and there are various mechanisms in place to regulate and coordinate the expression of genes from both haplotypes. This regulation helps ensure that the organism functions properly despite having two different sets of genetic information.

Here are some key mechanisms involved in regulating gene expression from different haplotypes:

  • Allelic Specific Expression (ASE): In many cases, both alleles (one from each haplotype) of a gene can be expressed, but the extent of expression may differ between the two alleles. This phenomenon is known as allelic-specific expression. It can be influenced by factors such as DNA methylation, histone modifications, and the binding of transcription factors.

  • Epigenetic Modifications: Epigenetic modifications, such as DNA methylation and histone modifications, play a crucial role in gene regulation. Differences in epigenetic marks between the two haplotypes can lead to differences in gene expression. For example, if one haplotype has more methylated DNA in a particular gene promoter region, that gene may be less active on that haplotype.

  • Imprinting: Some genes are imprinted, which means that one allele is preferentially expressed over the other, depending on its parental origin. This imprinting is established during gametogenesis and can lead to differential expression between the two haplotypes.

  • Regulatory Elements: The presence of specific regulatory elements, such as enhancers and silencers, can also influence gene expression. Differences in the sequences of these regulatory elements between haplotypes can lead to differential gene expression.

  • Transcription Factors: Transcription factors are proteins that bind to specific DNA sequences and regulate gene expression. Differences in transcription factor binding sites between haplotypes can affect how genes are turned on or off.

  • Post-Transcriptional Regulation: After transcription, RNA molecules are subject to post-transcriptional regulation, including alternative splicing, RNA editing, and microRNA-mediated regulation. Differences in these processes can further modulate gene expression from different haplotypes.

  • Environmental Factors: Environmental factors and signaling pathways can also influence gene expression. The response to external stimuli may vary between haplotypes due to differences in gene regulation.

Overall, the regulation of gene expression from two different haplotypes is a complex interplay of genetic, epigenetic, and environmental factors. This regulation helps maintain the balance of gene expression required for normal cellular and organismal function, even when there are genetic differences between the two haplotypes.

如果一个二倍体生物的两个单倍型不同,那么这两个单倍型的基因组序列也是不同的。这些遗传差异确实会影响基因表达,有各种机制来调节和协调来自两个单倍型的基因表达。这种调控有助于确保在拥有两套不同遗传信息的情况下,生物体能正常运作。

以下是调节来自不同单倍型的基因表达的一些关键机制:

  • 等位特异性表达(ASE):在许多情况下,一个基因的两个等位基因(分别来自两个单倍型)都可以被表达,但两个等位基因的表达程度可能不同。这种现象被称为等位特异性表达。它可以受到DNA甲基化、组蛋白修饰和转录因子结合等因素的影响。

  • 表观遗传修饰:表观遗传修饰,如DNA甲基化和组蛋白修饰,在基因调控中起着关键作用。两个单倍型之间的表观标记差异可能导致基因表达差异。例如,如果一个单倍型在某个基因启动子区域中有更多的甲基化DNA,那么该基因在该单倍型上可能不太活跃。

  • 印迹:一些基因具有印迹,这意味着其中一个等位基因会优先表达,这取决于它的亲本来源。这种印迹是在生殖细胞形成期间建立的,可以导致两个单倍型之间的差异表达。

  • 调控元件:特定调控元件(如增强子和抑制子)的存在也可以影响基因表达。两个单倍型之间的调控元件序列差异可能导致基因差异表达。

  • 转录因子:转录因子是能够结合到特定DNA序列并调控基因表达的蛋白质。两个单倍型之间的转录因子结合位点差异可以影响基因的启动或关闭。

  • 转录后调控:在转录后,RNA分子会经历转录后调控,包括可选剪接、RNA编辑和microRNA介导的调控。这些过程的差异可以进一步调节来自不同单倍型的基因表达。

  • 环境因素:环境因素和信号通路也可以影响基因表达。对外部刺激的反应可能因为基因调控的差异而有所不同。

总的来说,调节两个不同单倍型基因表达的机制是基因、表观遗传和环境因素之间复杂的相互作用。这种调控有助于维持细胞和生物体正常功能所需的基因表达平衡,即使两个单倍型之间存在遗传差异。

Small RNA sequencing processing in the example of smallRNA_7

  1. adapter sequence

    Lexogen small RNA-Seq kit
    
    some common adapter sequences from different kits for reference:
    
        - TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
        - Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
        - Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
        - NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
        - LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
    
    [Header],,,,,
    IEMFileVersion,4,,,,
    InvestigatorName,ag96,,,,
    ExperimentName,ag96,,,,
    Date,16.10.2023,,,,
    Workflow,GenerateFASTQ,,,,
    Application,NextSeqFASTQOnly,,,,
    Assay,TruSeq HT,,,,
    Description,pcr,,,,
    Chemistry,Amplicon,,,,
    ,,,,,
    [Reads],,,,,
    82,,,,,
    ,,,,,
    ,,,,,
    [Settings],,,,,
    Adapter,TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC,,,,
    ,,,,,
    ,,,,,
    [Data],,,,,
    Sample_ID,Sample_Name,I7_Index_ID,index,Sample_Project,Description
    nf930,01_0505_WaGa_wt_EV_RNA,SRi7001,CAGCGT,2023_064_nf_ute,smallRNA-Seq
    nf931,02_0505_WaGa_sT_DMSO_EV_RNA,SRi7002,GATCAC,2023_064_nf_ute,smallRNA-Seq
    nf932,03_0505_WaGa_sT_Dox_EV_RNA,SRi7003,ACCAGT,2023_064_nf_ute,smallRNA-Seq
    nf933,04_0505_WaGa_scr_DMSO_EV_RNA,SRi7004,TGCACG,2023_064_nf_ute,smallRNA-Seq
    nf934,05_0505_WaGa_scr_Dox_EV_RNA,SRi7005,ACATTA,2023_064_nf_ute,smallRNA-Seq
    nf935,06_1905_WaGa_wt_EV_RNA,SRi7006,GTGTAG,2023_064_nf_ute,smallRNA-Seq
    nf936,07_1905_WaGa_sT_DMSO_EV_RNA,SRi7007,CTAGTC,2023_064_nf_ute,smallRNA-Seq
    nf937,08_1905_WaGa_sT_Dox_EV_RNA,SRi7008,TGTGCA,2023_064_nf_ute,smallRNA-Seq
    nf938,09_1905_WaGa_scr_DMSO_EV_RNA,SRi7009,TCAGGA,2023_064_nf_ute,smallRNA-Seq
    nf939,10_1905_WaGa_scr_Dox_EV_RNA,SRi7010,CGGTTA,2023_064_nf_ute,smallRNA-Seq
    nf940,11_control_MKL1,SRi7011,TTAACT,2023_064_nf_ute,smallRNA-Seq
    nf941,12_control_WaGa,SRi7012,ATGAAC,2023_064_nf_ute,smallRNA-Seq
  2. input data

    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz         0505_WaGa_wt.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz    0505_WaGa_sT_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz     0505_WaGa_sT_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz   0505_WaGa_scr_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz    0505_WaGa_scr_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz         1905_WaGa_wt.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz    1905_WaGa_sT_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz     1905_WaGa_sT_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz   1905_WaGa_scr_DMSO.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz   1905_WaGa_scr_Dox.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf940/11_control_MKL1_S11_R1_001.fastq.gz               control_MKL1.fastq.gz
    ln -s ./231016_NB501882_0435_AHG7HMBGXV/nf941/12_control_WaGa_S12_R1_001.fastq.gz               control_WaGa.fastq.gz
  3. run cutadapt

    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 -o ${sample}2.fastq.gz --minimum-length 5 --trim-n ${sample}.fastq.gz >> LOG
    done
    #jhuang@hamburg:~/DATA/Data_Ute/Data_Ute_smallRNA_7$ fastp -i 0505_WaGa_wt.fastq.gz -o 0505_WaGa_wt3.fastq.gz -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC
  4. run COMPSRA

    ln -s ../Data_Ute_smallRNA_3/bundle_v1 .
    
    # DEBUG_1: Make sure the file COMPSRA.jar under Data_Ute_smallRNA_7
    # DEBUG_2: "-qc -ra TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -rb 4" does not work! Using cutadapt -a xxxx -q 20 replace those parameters!
    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      mkdir our_out/${sample}2/
      java -jar COMPSRA.jar -ref hg38       -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in ${sample}2.fastq.gz -out ./our_out/
    done
    
    #4.2.3 -rb/-rm_bias n
    #To remove n random bases in both 5’ (5-prime) and 3’ (3-prime) ends after removing the adapter sequence.
    #4.2.4 -rh/-rm_low_quality_head score
    #To remove the low quality bases with the score less than score from 5’ (5-prime) end.
    #4.2.5 -rt/-rm_low_quality_tail score
    #To remove the low quality bases with the score less than score from 3’ (3-prime) end.
    #4.2.6 -rr/-rm_low_quality_read score
    #To remove the low quality reads with the average score less than score.
    #4.6.3 -fdclass/-fun_diff_class A1,A2,...,An
    #To set the small RNAs that will be performed the differential expression analysis. The format is the same as the parameter -ac/-ann_class A1,A2,...,An.
    #4.6.4 -fdcase/-fun_diff_case ID1,ID2,...,IDn
    #To set the IDs of case samples.
    #4.6.5 -fdctrl/-fun_diff_control ID1,ID2,...,IDn
    #To set the IDs of control samples.
    #4.4.2 -ac/-ann_class A1,A2,...,An
    #To set the small RNA categories that will be annotated. The index of small RNA is listed:
    #    1 miRNA
    #    2 piRNA
    #    3 tRNA
    #    4 snoRNA
    #    5 snRNA
    #    6 circRNA
    
    java -jar COMPSRA.jar -ref hg38 -fun -fm -fms 1-5 -fdclass 1,2,3,4,5 -fdann -pro COMPSRA_MERGE -inf ./sample.list -out ./our_out/
    java -jar COMPSRA.jar -ref hg38 -fun -fd -fdclass 1,2,3,4,5 -fdcase 1-2 -fdctrl 3-6 -fdnorm cpm -fdtest mwu -fdann -pro COMPSRA_DEG -inf ./sample.list -out ./our_out/
  5. The results without using cutadapt for comparison

    mkdir our_out
    for sample in 0505_WaGa_wt 0505_WaGa_sT_DMSO 0505_WaGa_sT_Dox 0505_WaGa_scr_DMSO 0505_WaGa_scr_Dox 1905_WaGa_wt 1905_WaGa_sT_DMSO 1905_WaGa_sT_Dox 1905_WaGa_scr_DMSO 1905_WaGa_scr_Dox  control_MKL1 control_WaGa; do
      mkdir our_out/${sample}/
      java -jar COMPSRA.jar -ref hg38   -rh 20 -rt 20 -rr 20 -rlh 8,17 -aln -mt star -ann -ac 1,2,3,4,5,6  -in ${sample}2.fastq.gz -out ./our_out/
    done
    
    {miRNA}:
    [miRBase]
    Total Annotation Items: 4697
    Annotated Items (covered by least one read): 587
    Unannotated Items: 4110
    Reads Support the Annotation: 791636
    
    {piRNA}:
    [piRNABank]
    Total Annotation Items: 665175
    Annotated Items (covered by least one read): 480
    Unannotated Items: 664695
    Reads Support the Annotation: 6363051
    [piRBase]
    Total Annotation Items: 804849
    Annotated Items (covered by least one read): 1220
    Unannotated Items: 803629
    Reads Support the Annotation: 41374788
    
    {tRNA}:
    [GtRNAdb]
    Total Annotation Items: 601
    Annotated Items (covered by least one read): 440
    Unannotated Items: 161
    Reads Support the Annotation: 18690795
    
    {snoRNA}:
    [GEN_snoRNA]
    Total Annotation Items: 1006
    Annotated Items (covered by least one read): 250
    Unannotated Items: 756
    Reads Support the Annotation: 416228
    
    {snRNA}:
    [GEN_snRNA]
    Total Annotation Items: 2053
    Annotated Items (covered by least one read): 267
    Unannotated Items: 1786
    Reads Support the Annotation: 793559
    
    {circRNA}:
    [circRNA]
    Total Annotation Items: 140195
    Annotated Items (covered by least one read): 51488
    Unannotated Items: 88707
    Reads Support the Annotation: 14238651