miRNA 2024 Ute from raw counts

  1. Generate the following files according to STEPS 1-4 from http://xgenes.com/article/article-content/239/small-rna-sequencing-processing-in-the-example-of-smallrna-7/, http://xgenes.com/article/article-content/232/small-rna-sequencing-processing-in-the-example-of-smallrna-7/, and http://xgenes.com/article/article-content/156/small-rna-processing/. For COMPSRA_MERGE_0_miRNA.txt, we also need STEP 5 to add the read numbers of MCPyV-M1.

     COMPSRA_MERGE_0_miRNA.txt *
     COMPSRA_MERGE_0_piRNA.txt *
     COMPSRA_MERGE_0_snRNA.txt *
     COMPSRA_MERGE_0_tRNA.txt
     COMPSRA_MERGE_0_snoRNA.txt
     COMPSRA_MERGE_0_circRNA.txt
  2. Input files for miRNA are two files: COMPSRA_MERGE_0_miRNA.txt and ids under “/media/jhuang/Elements/Data_Ute/Data_Ute_smallRNA_7/miRNA_WaGa_re”

    • COMPSRA_MERGE_0_miRNA.txt

      #./our_out_without_cutadapt/COMPSRA_MERGE_0_miRNA.txt
      #./our_out_on_hg38/COMPSRA_MERGE_0_miRNA.txt
      #./our_out_on_hg38+JN707599_miRNA_merging_replicates/COMPSRA_MERGE_0_miRNA.txt
      diff ./our_out_on_hg38/COMPSRA_MERGE_0_miRNA.txt ./our_out_on_hg38+JN707599_miRNA_merging_replicates/COMPSRA_MERGE_0_miRNA.txt #--> different!
      #BUG: why using the same code resulting in different miRNA results! A little different!!
      #--> using ./our_out_on_hg38+JN707599_miRNA_merging_replicates/COMPSRA_MERGE_0_miRNA.txt
      
      cp miRNA_WaGa_on_hg38+JN707599_merging_replicates/COMPSRA_MERGE_0_miRNA.txt miRNA_WaGa_2024
      cd miRNA_WaGa_2024
    • prepare the file ids

      #cp ./miRNA_WaGa_on_hg38+JN707599_merging_replicates/ids miRNA_WaGa_2024
      for i in EV_vs_parental sT_Dox_vs_sT_DMSO sT_Dox_vs_scr_Dox scr_Dox_vs_scr_DMSO sT_DMSO_vs_scr_DMSO; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
      cat *.id | sort -u > ids
      #add Gene_Id in the first line, delete the ""
  3. Draw plots with R using DESeq2 (under the ‘base’ env)

     #BiocManager::install("AnnotationDbi")
     #BiocManager::install("clusterProfiler")
     #BiocManager::install(c("ReactomePA","org.Hs.eg.db"))
     #BiocManager::install("limma")
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library("org.Hs.eg.db")
     library(DESeq2)
     library(gplots)
     library(limma)
     # Check the current library paths
     .libPaths()
     #setwd("/home/jhuang/DATA/Data_Ute/Data_Ute_smallRNA_7/our_out_on_hg38+JN707599_2024_corrected/")
     setwd("/home/jhuang/DATA/Data_Ute/Data_Ute_smallRNA_7/miRNA_WaGa_2024")
    
     d.raw<- read.delim2("COMPSRA_MERGE_0_miRNA.txt",sep="\t", header=TRUE, row.names=1)
     d.raw$X <- NULL
     #colnames(d.raw)<- c("WaGa_EV", "MKL1_EV", "WaGa_wt", "MKL1_wt")
     d.raw[] <- lapply(d.raw, as.numeric)
    
     #MCPyV-M1   0   0   29  0   23  0   30  8   0   0   202 196
     # New row to add
     new_row <- data.frame(
       X0505_WaGa_sT_DMSO = 0,
       X1905_WaGa_sT_DMSO = 0,
       X0505_WaGa_sT_Dox = 29,
       X1905_WaGa_sT_Dox = 0,
       X0505_WaGa_scr_DMSO = 23,
       X1905_WaGa_scr_DMSO = 0,
       X0505_WaGa_scr_Dox = 30,
       X1905_WaGa_scr_Dox = 8,
       X0505_WaGa_wt = 0,
       X1905_WaGa_wt = 0,
       control_MKL1 = 202,
       control_WaGa = 196,
       row.names = "MCPyV-M1"
     )
    
     # Add the new row to the data frame
     d.raw <- rbind(d.raw, new_row)
    
     #cell_line = as.factor(c("WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "WaGa","WaGa", "MKL1","WaGa"))
     #condition = as.factor(c("sT","sT", "sT","sT", "scr","scr", "scr","scr", "wt","wt", "control","control"))
     #treatment = as.factor(c("DMSO","DMSO", "Dox","Dox", "DMSO","DMSO", "Dox","Dox", "wt","wt", "control","control"))
    
     EV_or_parental = as.factor(c("EV","EV", "EV","EV", "EV","EV", "EV","EV", "EV","EV", "parental","parental"))
     donor = as.factor(c("0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905", "0505","1905"))
     replicates = as.factor(c("sT_DMSO","sT_DMSO", "sT_Dox","sT_Dox", "scr_DMSO","scr_DMSO", "scr_Dox","scr_Dox", "wt","wt", "control","control"))
     ids = as.factor(c("0505_WaGa_sT_DMSO","1905_WaGa_sT_DMSO","0505_WaGa_sT_Dox","1905_WaGa_sT_Dox","0505_WaGa_scr_DMSO","1905_WaGa_scr_DMSO","0505_WaGa_scr_Dox","1905_WaGa_scr_Dox","0505_WaGa_wt","1905_WaGa_wt","control_MKL1","control_WaGa"))
    
     cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, donor=donor, EV_or_parental=EV_or_parental)
     dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates+donor)
    
     rld <- rlogTransformation(dds)
    
     # -- before pca --
     png("pca.png", 1200, 800)
     plotPCA(rld, intgroup=c("replicates"))
     #plotPCA(rld, intgroup = c("replicates", "batch"))
     #plotPCA(rld, intgroup = c("replicates", "ids"))
     #plotPCA(rld, "batch")
     dev.off()
    
     png("pca2.png", 1200, 800)
     plotPCA(rld, intgroup=c("donor"))
     dev.off()
    
     ##### STEP3: prepare all_genes #####
     rld <- rlogTransformation(dds)
     mat <- assay(rld)
     mm <- model.matrix(~replicates, colData(rld))
     mat <- limma::removeBatchEffect(mat, batch=rld$donor, design=mm)
     assay(rld) <- mat
     RNASeq.NoCellLine <- assay(rld)
     # reorder the columns
     colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
     col.order <-c("control MKL1",  "control WaGa","0505 WaGa wt","1905 WaGa wt","0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox")
     RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]
    
     #Option4: manully defining
     #for i in EV_vs_parental sT_Dox_vs_sT_DMSO sT_Dox_vs_scr_Dox scr_Dox_vs_scr_DMSO sT_DMSO_vs_scr_DMSO; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
     #cat *.id | sort -u > ids
     ##add Gene_Id in the first line, delete the ""
     GOI <- read.csv("ids")$Gene_Id
     datamat = RNASeq.NoCellLine[GOI, ]
    
     ##### STEP4: clustering the genes and draw heatmap #####
     datamat <- datamat[,-1]  #delete the sample "control MKL1"
     colnames(datamat)[1] <- "WaGa control"  #rename the isolate names according to the style of RNA-seq as follows?
     colnames(datamat)[2] <- "WaGa wildtype 0505"
     colnames(datamat)[3] <- "WaGa wildtype 1905"
     colnames(datamat)[4] <- "WaGa sT DMSO 0505"
     colnames(datamat)[5] <- "WaGa sT DMSO 1905"
     colnames(datamat)[6] <- "WaGa sT Dox 0505"
     colnames(datamat)[7] <- "WaGa sT Dox 1905"
     colnames(datamat)[8] <- "WaGa scr DMSO 0505"
     colnames(datamat)[9] <- "WaGa scr DMSO 1905"
     colnames(datamat)[10] <- "WaGa scr Dox 0505"
     colnames(datamat)[11] <- "WaGa scr Dox 1905"
     write.csv(datamat, file ="gene_expression_keeping_replicates.txt")
     # In order to 100% reproduce the plot, load the data from the txt-file as datamat1.
     datamat1 <- read.csv("/media/jhuang/Elements/Data_Ute/Data_Ute_smallRNA_7/miRNA_WaGa_on_hg38+JN707599_keeping_replicates/gene_expression_keeping_replicates.txt")
     colnames(datamat1) <- c("Gene", "WaGa control", "WaGa wildtype 0505", "WaGa wildtype 1905", "WaGa sT DMSO 0505", "WaGa sT DMSO 1905", "WaGa sT Dox 0505", "WaGa sT Dox 1905", "WaGa scr DMSO 0505", "WaGa scr DMSO 1905", "WaGa scr Dox 0505", "WaGa scr Dox 1905")
     rownames(datamat1) <- datamat1$Gene
     datamat1 <- datamat1[ , -1]
    
     #"ward.D"’, ‘"ward.D2"’,‘"single"’, ‘"complete"’, ‘"average"’ (= UPGMA), ‘"mcquitty"’(= WPGMA), ‘"median"’ (= WPGMC) or ‘"centroid"’ (= UPGMC)
     hr <- hclust(as.dist(1-cor(t(datamat1), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat1, method="spearman")), method="complete")
     mycl = cutree(hr, h=max(hr$height)/2.0)
     mycol = c("YELLOW", "BLUE", "ORANGE", "CYAN", "GREEN", "MAGENTA", "GREY", "LIGHTCYAN", "RED",     "PINK", "DARKORANGE", "MAROON",  "LIGHTGREEN", "DARKBLUE",  "DARKRED",   "LIGHTBLUE", "DARKCYAN",  "DARKGREEN", "DARKMAGENTA");
    
     mycol = mycol[as.vector(mycl)]
     png("miRNA_heatmap_keeping_replicates.png", width=800, height=1000)
     #svg("DEGs_heatmap_keeping_replicates.svg", width=6, height=8)
     heatmap.2(as.matrix(datamat1),
       Rowv=as.dendrogram(hr),
       Colv=NA,
       dendrogram='row',
       labRow="",
       scale='row',
       trace='none',
       col=bluered(75),
       RowSideColors=mycol,
       srtCol=20,
       lhei=c(1,8),
       #cexRow=1.2,   # Increase row label font size
       cexCol=1.7,    # Increase column label font size
       margin=c(7,1)
      )
     dev.off()
    
     #### cluster members #####
     write.csv(names(subset(mycl, mycl == '1')),file='YELLOW.txt')
     write.csv(names(subset(mycl, mycl == '2')),file='BLUE.txt')
     write.csv(names(subset(mycl, mycl == '3')),file='ORANGE.txt')
     write.csv(names(subset(mycl, mycl == '4')),file='CYAN.txt')
     write.csv(names(subset(mycl, mycl == '5')),file='GREEN.txt')
     write.csv(names(subset(mycl, mycl == '6')),file='MAGENTA.txt')
     write.csv(names(subset(mycl, mycl == '7')),file='GREY.txt')
     write.csv(names(subset(mycl, mycl == '8')),file='LIGHTCYAN.txt')
     write.csv(names(subset(mycl, mycl == '9')),file='RED.txt')
     #~/Tools/csv2xls-0.4/csv_to_xls.py gene_expression_keeping_replicates.txt YELLOW.txt ORANGE.txt BLUE.txt GREEN.txt CYAN.txt MAGENTA.txt LIGHTCYAN.txt GREY.txt RED.txt -d',' -o miRNA_heatmap_keeping_replicates.xls
    
     # merging replicates
     datamat <- cbind(datamat, "WaGa wildtype" = rowMeans(datamat[, 2:3]))
     datamat <- cbind(datamat, "WaGa sT DMSO" = rowMeans(datamat[, 4:5]))
     datamat <- cbind(datamat, "WaGa sT Dox" = rowMeans(datamat[, 6:7]))
     datamat <- cbind(datamat, "WaGa scr DMSO" = rowMeans(datamat[, 8:9]))
     datamat <- cbind(datamat, "WaGa scr Dox" = rowMeans(datamat[, 10:11]))
     datamat <- datamat[,c(-2:-11)]
     write.csv(datamat, file ="gene_expression_merging_replicates.txt")
     # In order to 100% reproduce the plot, load the data from the txt-file as datamat2.
     datamat2 <- read.csv("/media/jhuang/Elements/Data_Ute/Data_Ute_smallRNA_7/miRNA_WaGa_on_hg38+JN707599_merging_replicates/gene_expression_merging_replicates.txt")
     colnames(datamat2) <- c("Gene", "WaGa control", "WaGa wildtype", "WaGa sT DMSO", "WaGa sT Dox", "WaGa scr DMSO", "WaGa scr Dox")
     rownames(datamat2) <- datamat2$Gene
     datamat2 <- datamat2[ , -1]
    
     # Now map your clusters to colors, making sure that there's one color for each row:
     #ERROR: actualColors <- mycol[mycl]  # Assign colors based on cluster assignment
    
     #"ward.D"’, ‘"ward.D2"’,‘"single"’, ‘"complete"’, ‘"average"’ (= UPGMA), ‘"mcquitty"’(= WPGMA), ‘"median"’ (= WPGMC) or ‘"centroid"’ (= UPGMC)
     hr <- hclust(as.dist(1-cor(t(datamat2), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat2, method="spearman")), method="complete")
     mycl <- cutree(hr, h=max(hr$height)/2.2)
     #mycl <- cutree(hr, h=max(hr$height)/5.0)
     mycol = c("YELLOW", "BLUE", "ORANGE", "CYAN", "GREEN", "MAGENTA", "GREY", "LIGHTCYAN", "RED",     "PINK", "DARKORANGE", "MAROON",  "LIGHTGREEN", "DARKBLUE",  "DARKRED",   "LIGHTBLUE", "DARKCYAN",  "DARKGREEN", "DARKMAGENTA");
     mycol = mycol[as.vector(mycl)]
     # ERROR: Then use these 'actualColors' in your heatmap
     #png("miRNA_heatmap_merging_replicates.png", width=800, height=1000)
     #heatmap.2(as.matrix(datamat2),
     #          Rowv=as.dendrogram(hr),
     #          Colv=NA,
     #          dendrogram='row',
     #          labRow=rownames(datamat2),
     #          scale='row',
     #          trace='none',
     #          col=bluered(75),
     #          RowSideColors=mycol, #ERROR: RowSideColors=actualColors,
     #          srtCol=20,
     #          lhei=c(1,8),
     #          cexCol=1.7,    # Increase column label font size
     #          margin=c(7,1)
     #        )
     # Save the heatmap as a PNG file
     png("miRNA_heatmap_merging_replicates.png", width=1400, height=4000)
     # Generate the heatmap
     heatmap.2(as.matrix(datamat2),
               Rowv=as.dendrogram(hr),
               Colv=NA,
               dendrogram='row',
               labRow=rownames(datamat2), # Add row names (gene names) to the heatmap
               scale='row',
               trace='none',
               col=bluered(75),
               RowSideColors=mycol, # Assign row colors based on clusters
               srtCol=20, # Adjust column label orientation for better visibility
               lhei=c(1, 16),
               cexCol=2.0, # Increase column label font size
               cexRow=1.6, # Adjust row label font size if needed
               margin=c(10, 15), # Adjust margins to prevent the error
               labCol=colnames(datamat2) # Add column names to the heatmap
             )
     dev.off()
    
     #### cluster members #####
     write.csv(names(subset(mycl, mycl == '1')),file='YELLOW.txt')
     write.csv(names(subset(mycl, mycl == '2')),file='BLUE.txt')
     write.csv(names(subset(mycl, mycl == '3')),file='ORANGE.txt')
     write.csv(names(subset(mycl, mycl == '4')),file='CYAN.txt')
     write.csv(names(subset(mycl, mycl == '5')),file='GREEN.txt')
     write.csv(names(subset(mycl, mycl == '6')),file='MAGENTA.txt')
     write.csv(names(subset(mycl, mycl == '7')),file='GREY.txt')
     write.csv(names(subset(mycl, mycl == '8')),file='LIGHTCYAN.txt')
     #write.csv(names(subset(mycl, mycl == '9')),file='RED.txt')
     #~/Tools/csv2xls-0.4/csv_to_xls.py gene_expression_merging_replicates.txt YELLOW.txt BLUE.txt ORANGE.txt CYAN.txt MAGENTA.txt GREEN.txt LIGHTCYAN.txt GREY.txt -d',' -o miRNA_heatmap_merging_replicates.xls
     #100+11+4+7+2+5+14+63=206

4, do separate shRNA and treatment analysis (input d.raw)

    d_WaGa <- d.raw[, !(grepl("control|wt|MKL1", names(d.raw)))]
    cell.line = as.factor(c("WaGa","WaGa","WaGa","WaGa","WaGa","WaGa","WaGa","WaGa"))
    shRNA = as.factor(c("sT","sT","sT","sT","scr","scr","scr","scr"))
    treatment = as.factor(c("DMSO","DMSO","Dox","Dox","DMSO","DMSO","Dox","Dox"))
    cData = data.frame(row.names=colnames(d_WaGa), shRNA=shRNA, treatment=treatment, cell.line=cell.line)
    dds_WaGa<-DESeqDataSetFromMatrix(countData=d_WaGa, colData=cData, design=~shRNA+treatment+shRNA:treatment)
    dds_WaGa = DESeq(dds_WaGa, betaPrior=FALSE)
    resultsNames(dds_WaGa)
    contrasts <- c("shRNA_sT_vs_scr", "treatment_Dox_vs_DMSO", "shRNAsT.treatmentDox")

    for (i in contrasts) {
      res = results(dds_WaGa, name=i)
      res <- res[!is.na(res$log2FoldChange),]
      res$padj <- ifelse(is.na(res$padj), 1, res$padj)

      res_df <- as.data.frame(res)
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))

      up <- subset(res_df, padj<=0.1 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.1 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }

    mv shRNA_sT_vs_scr-all.txt WaGa_shRNA_sT_vs_scr-all.txt
    mv shRNA_sT_vs_scr-up.txt WaGa_shRNA_sT_vs_scr-up.txt
    mv shRNA_sT_vs_scr-down.txt WaGa_shRNA_sT_vs_scr-down.txt
    mv treatment_Dox_vs_DMSO-all.txt WaGa_treatment_Dox_vs_DMSO-all.txt
    mv treatment_Dox_vs_DMSO-up.txt WaGa_treatment_Dox_vs_DMSO-up.txt
    mv treatment_Dox_vs_DMSO-down.txt WaGa_treatment_Dox_vs_DMSO-down.txt
    mv shRNAsT.treatmentDox-all.txt WaGa_shRNAsT.treatmentDox-all.txt
    mv shRNAsT.treatmentDox-up.txt WaGa_shRNAsT.treatmentDox-up.txt
    mv shRNAsT.treatmentDox-down.txt WaGa_shRNAsT.treatmentDox-down.txt

    ~/Tools/csv2xls-0.4/csv_to_xls.py WaGa_shRNA_sT_vs_scr-up.txt WaGa_shRNA_sT_vs_scr-down.txt WaGa_shRNA_sT_vs_scr-all.txt -d$',' -o WaGa_shRNA_sT_vs_scr.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py WaGa_treatment_Dox_vs_DMSO-up.txt WaGa_treatment_Dox_vs_DMSO-down.txt WaGa_treatment_Dox_vs_DMSO-all.txt -d$',' -o WaGa_treatment_Dox_vs_DMSO.xls
    ~/Tools/csv2xls-0.4/csv_to_xls.py WaGa_shRNAsT.treatmentDox-up.txt WaGa_shRNAsT.treatmentDox-down.txt WaGa_shRNAsT.treatmentDox-all.txt -d$',' -o WaGa_shRNAsT.treatmentDox.xls

5, display viral transcripts found in mRNA-seq (or small RNA) MKL-1, WaGa EVs compared to parental cells

    http://xgenes.com/article/article-content/87/display-viral-transcripts-found-in-mrna-seq-mkl-1-waga-evs-compared-to-cells/

Leave a Reply

Your email address will not be published. Required fields are marked *