Author Archives: gene_x

Visualization and Export of miRNA Expression Profiles Using Manhattan Plots in R

This R script processes raw miRNA read count data to visualize expression profiles across different samples using Manhattan plots. The main steps include:

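  • Normalization: Raw counts are converted to Reads Per Million (RPM) to account for sequencing depth differences across samples. Only the first five sample columns of the input table (three parental-cell and two untreated samples) are analyzed.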

  • Transformation & Reshaping: The RPM table is reshaped into long format and then log-transformed as log10(RPM + 1) for plotting; the +1 pseudocount keeps miRNAs with zero reads defined.

  • Highlighting Key miRNAs:

    • The top 5 miRNAs with the highest mean RPM across the analyzed samples are highlighted in Plot 1.
    • A manually selected set of biologically relevant miRNAs is highlighted in Plot 2.
  • Plotting: Two Manhattan plots are generated using ggplot2 and ggrepel, one for each set of highlighted miRNAs.

  • Export: The processed data, including RPM values, log-transformed values, and highlight flags, is saved to an Excel file (manhattan_data.xlsx) for further analysis.

      # Load required libraries
      library(ggplot2)
      library(dplyr)
      library(tidyr)
      library(ggrepel)
      library(openxlsx)
    
      # Load data
      # Raw read counts: one row per miRNA (names taken from the first column
      # and used as row names), one column per sample. The file is
      # comma-separated, so read.csv() is the matching reader; read.delim2()
      # would set dec = ",", which coincides with the field separator.
      d.raw <- read.csv("d_raw.csv", header = TRUE, row.names = 1)

      # Step 1: Compute RPM
      # Only the first five sample columns are analysed.
      if (ncol(d.raw) < 5) {
        stop("d_raw.csv must contain at least five sample columns.",
             call. = FALSE)
      }
      d.raw_5 <- d.raw[, 1:5]
      total_counts <- colSums(d.raw_5)
      # A sample without reads would turn every RPM value into NaN; fail early.
      if (any(is.na(total_counts) | total_counts <= 0)) {
        stop("Every analysed sample needs a positive total read count.",
             call. = FALSE)
      }
      # Divide each column by its library size and scale to one million reads.
      RPM <- sweep(d.raw_5, 2, total_counts, FUN = "/") * 1e6
      RPM$miRNA <- rownames(RPM)

      # Step 2: Long format (one row per miRNA/sample pair)
      df <- pivot_longer(
        RPM,
        cols = -miRNA,
        names_to = "sample",
        values_to = "RPM"
      )

      # Step 3: Log transform (+1 keeps zero counts defined)
      df <- df %>%
        mutate(logRPM = log10(RPM + 1))

      # Step 4: miRNA position
      # Position is the rank of the miRNA name within each sample, so every
      # miRNA gets the same x coordinate in all facets.
      df <- df %>%
        arrange(miRNA) %>%
        group_by(sample) %>%
        mutate(Position = row_number()) %>%
        ungroup()  # do not leak the per-sample grouping into later steps

      # Step 5: Define top miRNAs
      # (a) the five miRNAs with the highest mean RPM over the analysed samples
      top_mirnas_mean <- df %>%
        group_by(miRNA) %>%
        summarise(mean_RPM = mean(RPM)) %>%
        arrange(desc(mean_RPM)) %>%
        slice_head(n = 5) %>%
        pull(miRNA)

      # (b) a manually chosen set of miRNAs
      top_mirnas_selected <- c("hsa-miR-20a-5p", "hsa-miR-93-5p", "hsa-let-7g-5p",
                              "hsa-miR-30a-5p", "hsa-miR-423-5p", "hsa-let-7i-5p")

      # A misspelled name would otherwise just silently lose its highlight.
      missing_selected <- setdiff(top_mirnas_selected, df$miRNA)
      if (length(missing_selected) > 0) {
        warning("Selected miRNAs not found in the data: ",
                paste(missing_selected, collapse = ", "), call. = FALSE)
      }

      # Step 6: Annotate highlights
      df <- df %>%
        mutate(
          highlight_meanRPM = miRNA %in% top_mirnas_mean,
          highlight_selected = miRNA %in% top_mirnas_selected
        )

      # Step 7: Export data to Excel
      write.xlsx(df, "manhattan_data.xlsx", asTable = TRUE)
    
      # Sample labels
      # Maps the sample column names of the input table to facet strip titles.
      sample_labels <- c(
        "parental_cells_1" = "Parental cell 1",
        "parental_cells_2" = "Parental cell 2",
        "parental_cells_3" = "Parental cell 3",
        "untreated_1"      = "Untreated 1",
        "untreated_2"      = "Untreated 2"
      )

      # Draw a faceted Manhattan plot (one panel per sample) and save it as PNG.
      #
      # data           long-format table with the columns Position, logRPM,
      #                miRNA and sample, plus the two columns named below
      # highlight_col  name of the logical column flagging the miRNAs to label
      # color_col      name of the column with each point's color
      #                ("red" or "darkblue")
      # file           path of the PNG file to write
      # labels         named character vector: sample name -> facet title
      #
      # Returns `file` invisibly.
      plot_manhattan <- function(data, highlight_col, color_col, file,
                                 labels = sample_labels) {
        # Work on a plain data frame so base subsetting behaves the same
        # whether or not `data` is a (grouped) tibble.
        plot_data <- as.data.frame(data)
        plot_data$point_color <- plot_data[[color_col]]
        label_data <- plot_data[plot_data[[highlight_col]], , drop = FALSE]

        p <- ggplot(plot_data,
                    aes(x = Position, y = logRPM, color = point_color)) +
          scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
          geom_jitter(width = 0.4) +
          geom_text_repel(
            data = label_data,
            aes(label = miRNA),
            box.padding = 0.5,
            point.padding = 0.5,
            segment.color = 'gray50',
            size = 5,
            max.overlaps = 8,
            color = "black"
          ) +
          labs(x = "", y = "log10(Read Per Million) (RPM)") +
          facet_wrap(~sample, scales = "free_x", ncol = 5,
                     labeller = labeller(sample = labels)) +
          theme_minimal() +
          theme(
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            legend.position = "none",
            text = element_text(size = 16),
            axis.title = element_text(size = 18),
            strip.text = element_text(size = 16, face = "bold"),
            panel.spacing = unit(1.5, "lines")
          )

        png(file, width = 1200, height = 1200)
        # Close the device even if rendering fails, so no graphics device leaks.
        on.exit(dev.off(), add = TRUE)
        # ggplot objects are only drawn when printed. Top-level auto-printing
        # does not happen under source() or inside a function, which would
        # leave the PNG blank, so print explicitly.
        print(p)
        invisible(file)
      }

      # Step 8: Plot - Top by mean RPM
      df$color_mean <- ifelse(df$highlight_meanRPM, "red", "darkblue")
      plot_manhattan(
        df, "highlight_meanRPM", "color_mean",
        "manhattan_plot_top_miRNAs_based_on_mean_RPM.png"
      )

      # Step 9: Plot - Selected miRNAs
      df$color_selected <- ifelse(df$highlight_selected, "red", "darkblue")
      plot_manhattan(
        df, "highlight_selected", "color_selected",
        "manhattan_plot_most_differentially_expressed_miRNAs.png"
      )

Example Raw Data

    "","parental_cells_1","parental_cells_2","parental_cells_3","untreated_1","untreated_2","scr_control_1","scr_control_2","scr_control_3","DMSO_control_1","DMSO_control_2","DMSO_control_3","scr_DMSO_control_1","scr_DMSO_control_2","scr_DMSO_control_3","sT_knockdown_1","sT_knockdown_2","sT_knockdown_3"
    "hsa-miR-375",34533,3262377,1301496,100825,188119,153531,349072,21074,323775,336095,82391,194233,450228,51320,364245,772745,111276
    "hsa-miR-10b-5p",9041,7118756,2898458,89378,300984,171106,420457,30970,244186,358048,79946,202764,460488,64718,404386,432872,99274
    "hsa-let-7a-5p",117441,5512997,1790596,60180,90384,100759,131398,7423,208599,187416,33942,99479,172489,17403,214496,169542,41342
    "hsa-let-7b-5p",67432,814417,267664,69114,87227,44756,126032,6136,125020,121212,28550,52388,315105,13506,170644,216544,37364
    "hsa-miR-182-5p",6309,1361486,524716,42333,88821,63989,100756,2591,99036,104442,9956,74656,140172,6816,150818,151969,15236
    "hsa-miR-30a-5p",8721,948558,400525,14555,29211,27433,56620,418,67797,60160,2198,37562,112903,1134,89586,102199,2749
    "hsa-let-7f-5p",8950,1379796,501830,9482,18786,19152,25105,1097,35743,32119,4992,19641,29458,2611,37753,31201,5464
    "hsa-miR-191-5p",5319,187270,64891,17598,40244,9230,51212,2764,30164,48766,5275,20608,87755,9099,113397,85939,2608
    "hsa-miR-92a-3p",6202,466546,186103,9520,25668,20740,16677,1354,28347,33256,5931,21893,16776,3690,16799,20150,6742
    "hsa-miR-30d-5p",7668,480810,189297,8698,19449,11807,26133,332,24259,28667,1447,17094,44136,853,35161,37334,1542
    "hsa-miR-320a",4558,61397,19613,46980,78771,8831,237689,1470,24235,90901,2678,19811,222769,3872,76030,307737,2645
    "hsa-miR-486-5p",7236,116907,41502,6994,12458,11211,13947,1232,16559,19186,4688,12813,12954,1767,13088,17925,4386
    "hsa-miR-26a-5p",7787,556023,193310,4516,7921,8216,12146,730,15873,15392,2673,8557,18329,1703,25686,13053,3582
    "hsa-miR-378a-3p",2771,97674,38061,12733,60619,5156,157491,1151,14690,42554,2166,9806,108211,3328,58851,190501,1880
    "hsa-miR-423-5p",832,19336,6629,15116,40901,4269,38871,1090,12925,30694,1943,11965,63670,5767,32188,45385,2476
    "hsa-miR-93-5p",8360,336124,124845,3186,6539,5086,13468,121,11579,14276,618,5501,20073,361,19077,24599,644
    "hsa-miR-25-3p",2264,202212,84611,3218,8618,4036,20093,361,9774,14423,1514,5694,26041,1247,24033,46581,1924
    "hsa-let-7i-5p",6916,463608,184832,1876,3820,3259,6402,185,8490,8160,1032,4088,7530,470,8197,10634,1027
    "hsa-miR-181a-5p",4531,133716,47864,2879,6362,4313,8111,153,7554,8516,1077,4485,14439,533,13982,10684,1482
    "hsa-miR-30c-5p",1038,124393,49852,1592,4234,3036,4471,52,6945,7165,210,3745,7523,129,7161,4287,279
    "hsa-miR-148a-3p",448,439462,186294,1840,4740,3691,5857,525,6879,6337,1094,4959,6094,990,6399,6527,1590
    "hsa-miR-103a-3p",9950,316450,119774,1614,2530,3312,9204,190,5862,5666,653,4019,6595,390,7861,12440,940
    "hsa-let-7g-5p",8977,507218,178238,1663,2563,2572,6304,211,5486,5458,989,2676,7290,510,8191,8110,1301
    "hsa-miR-16-5p",3108,273138,96609,1621,3897,2418,6767,243,5200,6464,779,3178,7436,510,8496,6497,943
    "hsa-miR-769-5p",405,75785,29561,1168,3628,2156,5473,252,4465,5318,1059,3457,6846,633,6831,5787,1568
    "hsa-miR-146b-5p",354,80850,28700,2174,3161,1834,4325,100,3937,3779,288,3025,9058,363,10583,7922,362
    "hsa-miR-196a-5p",1361,113874,38288,1777,3074,1893,3981,104,3932,4291,506,2058,5900,316,6358,5839,594
    "hsa-miR-30e-5p",1070,147574,55848,936,1604,2255,2556,30,3822,3247,134,2572,3733,76,4280,3211,170
    "hsa-miR-200c-3p",7532,118630,38580,1465,3558,2559,5566,466,3807,6115,1632,3008,5706,1002,6467,5903,2384
    "hsa-miR-92b-3p",2340,78803,28318,1672,2249,3339,2488,187,3480,2980,830,2987,2072,293,2654,2559,823
    "hsa-miR-345-5p",439,27482,11486,1368,2869,1625,9583,47,3395,4737,258,2124,13198,139,6879,11281,248
    "hsa-miR-98-5p",1390,220110,74613,716,1168,1982,2470,99,3073,2615,471,1487,2115,237,2781,3424,390
    "hsa-let-7d-5p",1612,36438,11128,724,1148,1460,2190,44,2846,2642,250,1471,2238,162,2200,2146,354
    "hsa-miR-106b-3p",1000,18194,8302,1023,2106,1650,5202,34,2786,4840,142,1960,6501,89,6581,7052,209
    "hsa-miR-151a-3p",940,50844,20614,1123,2186,1317,3103,104,2748,3537,416,2348,6036,322,6857,5410,501
    "hsa-miR-423-3p",473,21186,8905,2029,3062,1262,11931,161,2669,3863,679,1955,9282,496,3890,13902,910
    "hsa-miR-9-5p",876,272251,119702,601,1293,1038,1670,63,2639,2000,353,1172,2409,172,3382,2448,454
    "hsa-miR-885-3p",373,3458,1123,3490,4454,613,3781,63,2490,4312,108,1918,12307,194,8321,6196,140
    "hsa-miR-200b-3p",2250,79100,30268,737,1969,2322,3554,52,2468,3343,198,3273,3457,193,3859,3264,320
    "hsa-miR-181b-5p",1089,31926,12240,1210,1488,1288,2379,64,2326,3140,323,1108,5134,208,4514,4266,274
    "hsa-miR-186-5p",677,21082,7797,776,2212,1591,5698,120,2016,3253,371,1785,6175,298,3723,6488,571
    "hsa-miR-181d-5p",929,76318,28372,756,1060,1000,4864,57,1942,2736,340,998,5815,232,3884,6836,466
    "hsa-miR-183-5p",847,54855,19776,824,1137,1242,2801,83,1881,2258,269,1418,2692,170,2367,3509,421
    "hsa-miR-26b-5p",2164,214186,74552,518,803,1090,2181,172,1797,1956,653,1031,2385,503,3527,3143,752
    "hsa-miR-320b",73,349,99,2395,6147,684,10886,5,1660,6171,20,1415,13452,21,5202,14559,21
    "hsa-miR-181c-5p",488,66293,25997,496,1407,986,3665,76,1607,1980,294,1221,5804,243,2968,4268,512
    "hsa-miR-30a-3p",23,18099,7876,946,1383,952,3233,79,1534,2332,308,1396,5988,401,7291,7917,506
    "hsa-miR-7-5p",526,50288,20079,490,541,708,1095,34,1277,1467,81,650,1322,41,1821,1619,90
    "hsa-miR-101-3p",63,35823,16223,480,1303,682,1503,58,1208,1491,147,1051,1581,81,2218,1883,204
    "hsa-miR-151a-5p|hsa-miR-151b",976,9344,3234,356,692,828,1176,15,1194,1228,50,868,1588,36,1490,978,66
    "hsa-miR-92a-1-5p",110,4993,1743,636,1140,1086,1407,27,1191,1667,90,1280,2098,92,1822,1648,245
    "hsa-miR-20a-5p",865,109223,38996,308,532,704,1469,17,1142,1172,83,900,1835,55,1824,1890,88
    "hsa-miR-190b",308,51627,19416,256,416,428,2568,102,1090,1453,488,423,2427,291,1787,4725,489
    "hsa-miR-1307-3p",111,8751,3680,368,722,442,1726,11,1059,1376,84,670,1579,21,944,2229,79
    "hsa-miR-24-3p",724,58135,21371,452,696,400,2739,58,1057,1011,303,601,2545,212,2017,4056,252
    "hsa-miR-1-3p",932,276107,128481,875,926,666,1334,145,1045,1201,457,861,1176,190,2555,1807,430
    "hsa-miR-532-5p",152,16899,7525,396,1174,622,2020,57,1043,1677,210,896,2132,204,2131,4051,227
    "hsa-miR-744-5p",980,8950,3197,479,668,628,1103,18,972,1242,108,559,1288,44,1090,1430,113
    "hsa-miR-130b-3p",258,7803,3037,688,1517,318,2330,13,957,1133,12,787,4032,16,3854,4966,33
    "hsa-miR-192-5p",60,11210,4454,269,474,639,1158,20,948,1128,82,802,1287,80,1689,1322,85
    "hsa-miR-3615",78,1503,642,268,344,238,1602,1,877,805,25,537,2549,14,612,2883,35
    "hsa-miR-1224-5p",74,1116,344,694,1264,135,1305,65,876,1723,136,882,3643,249,2182,1845,50
    "hsa-miR-17-5p",1102,65959,24350,194,665,533,1324,14,851,1306,82,842,1196,62,1767,1721,87
    "hsa-miR-130b-5p",291,11405,3876,274,529,451,476,20,832,595,93,618,655,35,1021,518,110
    "hsa-miR-301a-3p",61,43874,17611,161,310,337,928,28,826,777,198,428,879,98,991,855,198
    "hsa-miR-148a-5p",105,14869,6081,348,613,392,1150,19,794,693,124,488,1137,84,1044,1329,154
    "hsa-miR-107",1048,29568,12292,226,461,375,1154,22,785,890,62,454,1089,34,1197,1462,62
    "hsa-miR-200a-3p",649,45836,16810,184,684,833,1214,19,772,909,54,987,1136,26,1626,1217,72
    "hsa-miR-27b-3p",346,15959,6048,204,690,480,1106,9,751,1012,31,430,1544,10,1408,1354,46
    "hsa-miR-196b-5p",315,56050,20888,261,422,467,531,56,744,720,146,266,849,104,815,804,199
    "hsa-miR-885-5p",60,4387,1793,410,441,604,570,8,728,702,55,594,687,36,922,734,62
    "hsa-miR-19b-3p",44,46497,20811,304,1171,384,2380,40,727,1052,143,475,2640,72,2695,2681,182
    "hsa-miR-181a-2-3p",41,2966,1304,272,468,422,313,8,692,845,63,371,663,20,1126,613,67
    "hsa-miR-873-3p",102,5299,1957,767,1717,208,4009,77,686,1944,276,596,3868,255,2004,4425,236
    "hsa-miR-21-5p",124,54200,23746,278,290,448,848,13,682,864,73,567,886,25,1212,839,65
    "hsa-miR-421",51,4594,1689,185,138,192,347,16,586,314,60,312,507,56,398,595,50
    "hsa-miR-128-3p",147,3435,1325,209,609,212,927,10,515,668,39,394,680,17,728,1154,34
    "hsa-miR-425-5p",298,13138,4499,267,609,363,1650,27,492,972,108,397,1872,72,759,1884,147
    "hsa-miR-877-5p",114,1014,353,299,269,114,243,1,483,426,9,305,765,29,871,502,9
    "hsa-miR-625-3p",19,3201,1475,151,318,312,372,3,473,591,52,331,473,23,531,347,26
    "hsa-miR-1180-3p",331,3493,1265,283,343,238,815,5,418,478,94,325,810,53,536,1038,98
    "hsa-miR-30b-5p",89,28134,11724,298,419,534,544,19,401,935,87,368,1037,42,514,368,83
    "hsa-miR-30e-3p",15,11217,4550,178,270,328,702,23,396,422,76,273,743,52,671,839,116
    "hsa-miR-148b-3p",335,62635,29143,86,243,237,315,34,383,247,43,166,226,36,374,391,63
    "hsa-miR-941",590,10640,3991,141,270,231,354,29,350,381,30,172,364,42,402,238,41
    "hsa-miR-122-5p",0,0,0,625,126,250,190,6,346,225,9,581,190,2,1942,253,4
    "hsa-miR-95-3p",193,1087,394,90,370,401,571,1,345,555,13,338,576,0,641,467,18
    "hsa-miR-92b-5p",43,834,270,94,440,84,238,9,333,400,27,200,1109,54,704,327,21
    "hsa-miR-483-5p",0,445,147,117,1361,48,259,138,320,1598,328,70,1762,232,564,270,508
    "hsa-let-7c-5p",135,2637,1038,222,335,228,455,8,320,646,48,529,601,24,471,534,36
    "hsa-miR-197-3p",89,2006,834,97,159,154,158,11,318,168,52,184,225,22,131,128,22
    "hsa-miR-133a-3p",452,19488,7714,302,650,386,946,22,310,462,146,502,970,107,1026,864,156
    "hsa-miR-708-5p",293,22077,8483,103,162,209,394,5,300,279,51,162,466,15,584,403,44
    "hsa-miR-760",60,771,278,209,315,149,515,1,298,292,26,235,642,9,591,792,16
    "hsa-miR-141-3p",198,5911,2362,132,160,318,338,0,292,314,7,328,320,2,501,319,10
    "hsa-miR-22-3p",88,4854,1881,109,234,172,299,9,287,259,21,200,401,5,395,510,45
    "hsa-miR-181c-3p",32,12886,5202,53,209,107,411,4,275,228,39,145,376,24,504,497,51
    "hsa-miR-25-5p",209,907,353,207,247,306,235,3,263,417,20,328,571,14,546,444,29
    "hsa-miR-429",45,8117,3117,50,249,318,455,2,262,260,31,190,454,11,629,554,6
    "hsa-miR-340-5p",49,25006,10179,58,261,142,297,18,250,261,40,226,182,34,405,324,48
    "hsa-miR-200a-5p",31,2757,1043,87,113,179,279,10,248,150,30,259,634,26,1137,695,37
    "hsa-miR-454-3p",128,20684,7584,80,80,65,231,6,237,232,23,164,210,29,264,285,26
    "hsa-miR-425-3p",96,3316,1345,76,321,186,579,14,232,324,51,168,1043,45,478,707,39
    "hsa-miR-484",281,5775,2227,168,353,120,394,5,227,304,40,155,425,6,312,302,13
    "hsa-miR-1270",104,1035,434,189,226,183,433,13,224,541,39,173,999,23,1232,1187,36
    "hsa-miR-342-3p",117,6675,2169,170,245,150,555,22,220,401,57,204,694,31,356,453,28
    "hsa-miR-185-5p",94,1450,552,152,852,95,926,5,220,334,8,153,1063,8,1154,1312,9
    "hsa-let-7e-5p",173,3800,1440,90,110,89,184,7,214,228,80,157,202,38,346,308,77
    "hsa-miR-339-5p",65,1151,470,114,370,161,636,1,205,493,10,183,1428,13,833,547,30
    "hsa-miR-629-5p",26,1989,764,114,368,149,973,13,190,307,17,111,778,23,350,1726,36
    "hsa-miR-454-5p",8,2713,979,53,145,78,171,4,188,285,26,207,124,16,223,296,32
    "hsa-miR-7706",69,1304,490,114,214,156,322,0,186,212,18,47,238,18,238,603,27
    "hsa-let-7a-3p",6,4014,1498,33,270,121,279,3,179,180,31,171,431,15,485,312,10
    "hsa-miR-1226-5p",5,23,8,81,52,169,73,6,179,218,2,180,42,6,100,92,5
    "hsa-miR-3180|hsa-miR-3180-3p",7,308,87,267,659,9,680,6,171,1111,9,123,1816,36,483,629,2
    "hsa-miR-143-3p",0,14,6,83,238,127,189,1,170,242,3,179,248,3,404,394,6
    "hsa-miR-132-3p",14,4593,1814,95,345,137,252,19,168,291,65,119,194,41,333,274,98
    "hsa-miR-139-3p",33,951,297,396,1087,56,393,30,154,475,49,196,903,120,844,673,24
    "hsa-miR-3187-3p",24,348,119,74,103,18,311,0,152,128,4,96,198,4,58,598,3
    "hsa-miR-660-5p",42,14230,6086,78,241,165,367,11,151,241,34,192,483,24,474,576,36
    "hsa-miR-23b-3p",63,2574,1030,47,49,45,92,1,150,245,1,70,188,2,110,67,4
    "hsa-let-7d-3p",164,1466,508,80,222,191,248,2,148,237,22,182,254,21,122,167,8
    "hsa-miR-342-5p",35,1405,488,136,162,100,164,26,147,187,56,137,280,54,431,173,119
    "hsa-miR-339-3p",23,2262,798,66,208,109,429,1,139,143,8,149,606,9,542,765,5
    "hsa-miR-130a-3p",50,2277,947,75,94,52,251,0,139,116,1,119,272,3,269,442,6
    "hsa-miR-30c-2-3p",19,1421,668,25,92,27,164,0,129,112,20,71,185,6,309,206,28
    "hsa-miR-320c",17,25,12,214,390,41,1085,2,128,480,2,161,1304,7,477,1499,3
    "hsa-miR-18a-5p",52,6597,2513,0,0,6,94,0,124,63,5,31,31,5,43,61,2
    "hsa-miR-23a-3p",44,1148,473,47,114,65,156,3,122,134,3,50,148,5,218,144,6
    "hsa-miR-328-3p",67,1241,530,34,73,27,131,5,113,76,20,16,138,9,82,72,15
    "hsa-miR-409-3p",11,198,98,112,166,41,281,2,111,171,33,45,228,8,244,355,16
    "hsa-miR-34a-5p",100,10066,3383,65,69,26,960,19,109,351,71,132,845,50,589,1125,58
    "hsa-miR-589-5p",27,377,134,32,49,55,81,0,107,93,3,26,66,0,68,96,3
    "hsa-miR-361-5p",246,10554,3534,59,188,153,203,12,107,189,47,72,178,19,210,257,40
    "hsa-miR-1246",4,20,2,112,738,40,1106,1,106,518,1,92,995,1,413,967,3
    "hsa-miR-106b-5p",163,5590,1969,32,166,134,228,0,106,127,2,133,70,10,197,120,25
    "hsa-miR-3127-5p",12,62,22,78,117,18,257,8,101,183,10,60,379,28,350,806,16
    "hsa-miR-500a-3p",12,1217,470,66,84,32,94,6,100,73,24,60,202,17,78,178,22
    "hsa-miR-3131",2,49,31,27,187,21,143,4,97,163,0,58,169,1,162,199,2
    "hsa-miR-301b-3p",16,15397,5867,65,16,77,109,12,97,105,32,40,60,11,154,139,33
    "hsa-miR-15b-5p",609,6681,2409,83,47,45,159,1,95,104,13,89,144,13,109,58,31
    "hsa-miR-10a-5p",8,1492,607,46,94,88,144,2,94,182,6,138,116,4,118,115,14
    "hsa-miR-200b-5p",28,101,37,38,108,71,118,4,90,84,2,48,277,5,297,259,0
    "hsa-miR-361-3p",106,2463,1016,31,67,43,105,1,88,82,5,28,221,0,151,157,8
    "hsa-miR-625-5p",40,3917,1346,6,13,37,77,6,87,43,11,39,176,7,149,116,3
    "hsa-miR-887-3p",21,545,186,24,70,11,122,0,86,61,2,47,120,0,42,52,2
    "hsa-miR-708-3p",11,2405,977,32,115,87,129,1,86,104,4,69,141,3,121,160,9
    "hsa-miR-19a-3p",13,14791,6422,118,419,137,716,6,84,509,46,128,589,42,782,722,51
    "hsa-miR-330-3p",14,486,148,73,393,39,192,9,82,362,26,98,284,31,328,391,4
    "hsa-miR-671-5p",27,721,229,41,64,21,217,0,82,55,1,97,143,2,129,276,0
    "hsa-miR-324-5p",123,1886,782,16,122,30,287,6,81,81,12,137,590,13,284,290,41
    "hsa-miR-181a-3p",6,1199,510,51,23,183,28,2,80,92,7,34,135,2,134,53,13
    "hsa-miR-210-3p",68,30874,14090,18,74,33,290,28,77,189,109,43,460,64,224,348,178
    "hsa-miR-411-5p",0,1,3,25,55,8,19,0,75,7,1,6,15,0,53,116,0
    "hsa-miR-1468-5p",12,1112,404,48,59,25,171,17,74,195,27,134,274,35,161,417,36
    "hsa-let-7b-3p",15,1366,424,30,27,52,11,0,70,37,6,55,63,5,37,13,1
    "hsa-miR-324-3p",70,291,90,64,114,116,396,0,69,220,0,58,491,0,255,377,3
    "hsa-miR-340-3p",14,2099,728,0,37,56,40,1,68,47,1,90,9,6,25,19,7
    "hsa-miR-4677-3p",1,1395,547,0,35,18,23,5,68,88,14,69,21,0,38,48,7
    "hsa-miR-873-5p",8,1826,692,21,43,24,164,0,68,132,12,36,119,8,143,192,4
    "hsa-miR-652-3p",72,1113,369,14,27,0,49,0,67,59,4,16,90,2,54,62,17
    "hsa-miR-4286",20,2,0,19,79,18,75,0,66,148,0,65,199,0,201,77,0
    "hsa-miR-671-3p",19,273,115,20,46,13,146,3,66,143,6,21,155,3,140,128,5
    "hsa-miR-218-5p",68,11916,4970,30,58,41,116,3,65,121,15,70,179,3,178,137,12
    "hsa-miR-505-3p",45,687,339,10,67,1,131,0,65,24,2,71,111,3,52,156,1
    "hsa-miR-382-5p",5,245,73,57,177,18,310,1,64,125,9,12,330,20,230,1000,0
    "hsa-miR-6721-5p",15,139,52,4,0,19,0,0,62,46,1,18,18,0,30,18,0
    "hsa-miR-140-3p",36,1859,937,15,74,44,159,2,62,142,15,76,298,15,156,362,20
    "hsa-miR-5187-5p",6,44,8,7,94,1,52,1,61,92,9,78,190,12,94,92,15
    "hsa-miR-548av-5p|hsa-miR-548k",9,956,388,8,48,19,48,1,61,74,7,96,16,7,0,18,2
    "hsa-miR-1908-5p",45,125,58,26,25,36,55,0,60,34,0,46,66,0,23,106,0
    "hsa-miR-3940-5p",0,0,0,24,38,30,25,0,58,23,0,56,47,0,30,84,0
    "hsa-miR-335-3p",15,4166,1655,17,21,86,120,4,58,114,14,194,47,18,133,174,15
    "hsa-miR-15a-5p",79,3126,1105,18,67,50,142,0,58,93,8,46,131,0,170,113,2
    "hsa-miR-3605-3p",10,285,87,0,0,10,0,0,57,33,9,39,26,1,9,15,5
    "hsa-miR-18a-3p",20,396,170,60,106,23,230,0,56,130,11,85,288,12,151,189,16
    "hsa-miR-4788",17,274,106,13,23,20,54,5,52,26,4,24,57,0,138,80,4
    "hsa-miR-3200-3p",24,650,272,16,51,21,154,2,51,29,6,46,102,7,41,67,15
    "hsa-miR-219a-1-3p",9,316,105,198,223,13,557,0,51,232,10,51,691,30,240,1145,19
    "hsa-miR-330-5p",1,238,118,0,0,20,0,0,49,0,2,0,0,0,15,0,0
    "hsa-miR-940",30,127,45,11,16,37,100,0,49,123,0,0,171,0,107,92,0
    "hsa-miR-3174",7,174,56,16,43,1,0,2,49,68,3,34,45,1,69,14,1
    "hsa-miR-6730-5p",6,14,1,13,43,0,7,1,48,41,0,33,67,1,11,27,0
    "hsa-miR-151b",18,13,4,16,36,31,104,0,46,80,0,32,144,0,92,121,0
    "hsa-miR-504-3p",0,4,1,7,72,0,6,2,46,24,0,0,117,4,16,13,0
    "hsa-miR-129-5p",4,301,92,23,74,0,183,0,44,42,0,10,229,2,81,502,2
    "hsa-miR-769-3p",56,1608,582,21,25,0,111,0,44,56,2,51,126,0,23,83,3
    "hsa-miR-4326",1,167,69,0,0,12,34,0,42,15,3,19,48,0,20,2,1
    "hsa-miR-1276",9,135,44,0,0,0,0,0,42,17,0,0,49,0,14,0,0
    "hsa-miR-6847-5p",3,36,9,0,0,0,0,0,41,0,0,17,4,1,13,52,0
    "hsa-miR-3605-5p",4,366,109,102,178,19,134,18,40,215,20,16,383,26,280,115,18
    "hsa-miR-33b-3p",22,16,8,0,0,31,0,0,38,12,0,35,1,0,10,12,0
    "hsa-miR-335-5p",2,1518,535,33,56,46,145,10,38,25,5,65,55,14,136,91,10
    "hsa-miR-877-3p",14,269,91,14,28,13,11,1,37,9,1,12,72,0,0,20,4
    "hsa-miR-148b-5p",7,1317,512,3,7,0,22,1,36,14,5,16,64,5,50,0,8
    "hsa-miR-128-1-5p",32,385,158,14,2,2,12,0,36,49,6,11,59,2,67,40,8
    "hsa-miR-576-3p",2,391,138,28,0,12,88,2,36,0,9,49,98,0,28,107,1
    "hsa-miR-320d",5,2,1,65,177,11,289,0,35,131,1,41,382,0,170,389,0
    "hsa-miR-3928-3p",11,114,44,36,175,16,201,0,35,105,0,71,305,5,150,299,3
    "hsa-miR-6891-5p",0,0,1,1,0,4,6,0,35,5,0,21,10,0,10,12,0
    "hsa-miR-4476",1,11,1,5,0,14,7,0,35,9,0,0,8,0,13,0,0
    "hsa-miR-937-3p",9,105,61,17,19,56,22,2,34,74,12,23,44,6,4,87,18
    "hsa-miR-204-5p",0,155,59,5,27,0,0,6,33,33,0,0,15,0,58,16,1
    "hsa-miR-486-3p",14,342,101,21,16,1,75,0,33,56,10,6,70,0,90,73,0
    "hsa-miR-151a-5p",38,913,310,0,17,0,7,0,33,41,1,18,26,1,2,38,9
    "hsa-miR-659-5p",1,132,56,7,0,12,8,0,33,0,0,0,10,3,0,0,0
    "hsa-miR-144-5p",0,26,8,1,0,29,29,0,32,30,5,37,6,2,1,9,0
    "hsa-miR-501-3p",4,568,174,20,88,6,90,0,32,74,10,62,64,14,32,204,4
    "hsa-miR-197-5p",3,5,7,13,51,0,26,0,31,34,3,15,65,3,20,19,0
    "hsa-miR-1301-3p",66,587,208,11,53,23,100,0,31,55,7,15,51,1,32,21,16
    "hsa-miR-5010-5p",2,7,5,15,190,35,40,0,30,79,10,28,141,5,45,61,12
    "hsa-miR-193b-3p",20,776,302,40,64,12,67,1,30,96,13,34,263,2,31,130,18
    "hsa-miR-2682-5p",1,120,63,5,0,17,44,0,30,117,11,0,70,4,166,94,1
    "hsa-miR-191-3p",16,99,54,3,35,2,20,0,30,50,0,23,57,0,12,75,3
    "hsa-miR-577",3,2156,808,8,37,0,61,0,30,27,13,0,51,2,58,36,10
    "hsa-miR-126-5p",1,50,21,10,0,27,31,0,30,14,0,9,0,0,64,28,0
    "hsa-miR-1296-5p",65,740,299,9,33,34,103,6,30,77,11,18,129,15,69,34,10
    "hsa-miR-193b-5p",3,59,11,73,149,10,136,5,29,125,0,17,155,17,78,129,11
    "hsa-miR-93-3p",32,853,334,19,61,59,192,4,29,39,15,41,173,7,104,88,2
    "hsa-miR-149-3p",2,7,5,6,0,31,15,1,28,32,1,22,40,2,23,16,0
    "hsa-miR-16-2-3p",17,1180,441,20,73,26,71,1,28,68,5,36,95,4,96,50,17
    "hsa-miR-3074-5p",4,74,40,5,0,12,72,2,28,35,4,24,32,1,14,11,0
    "hsa-miR-4667-5p",0,14,2,12,52,13,53,2,27,38,0,40,105,12,25,34,0
    "hsa-miR-27a-3p",2,1136,434,8,10,36,70,1,26,30,0,11,26,0,46,16,1
    "hsa-miR-3158-3p",1,105,32,0,0,9,72,0,26,28,0,0,47,3,22,58,0
    "hsa-miR-331-3p",33,667,212,0,25,0,18,0,26,39,5,22,50,4,37,29,0
    "hsa-miR-149-5p",29,814,431,45,25,63,64,1,26,104,11,90,216,6,133,103,40
    "hsa-miR-451a",0,9,5,25,55,78,126,10,25,71,4,76,111,3,195,75,12
    "hsa-miR-7854-3p",1,5,4,31,7,0,6,0,25,12,0,9,29,6,17,12,0
    "hsa-miR-126-3p",6,620,318,10,47,35,1,0,25,0,0,21,22,3,77,28,0
    "hsa-miR-374b-5p",5,703,263,0,0,11,24,0,25,82,0,0,10,0,26,0,9
    "hsa-miR-6858-5p",0,0,0,16,0,9,11,0,24,0,0,0,39,0,59,0,2
    "hsa-miR-432-5p",0,1,2,25,23,50,78,6,24,103,4,48,115,5,202,85,16
    "hsa-miR-146a-5p",6,792,253,25,24,18,22,0,24,11,5,16,95,10,34,16,10
    "hsa-miR-502-3p",4,78,18,4,0,0,14,0,24,8,0,4,26,0,22,22,0
    "hsa-miR-221-3p",0,0,0,0,10,45,68,0,23,23,7,31,43,2,90,58,0
    "hsa-miR-3127-3p",0,4,2,0,0,0,0,0,23,0,0,0,19,0,18,13,0
    "hsa-miR-210-5p",2,1624,638,0,3,51,0,0,23,17,7,49,12,1,0,0,7
    "hsa-miR-4739",0,0,0,0,0,0,0,0,23,0,0,0,13,1,0,0,2
    "hsa-miR-1269a",0,24,17,0,0,0,0,0,23,12,0,0,36,1,16,0,0
    "hsa-miR-550a-3-5p|hsa-miR-550a-5p",6,267,96,0,9,0,32,0,22,14,2,12,22,0,38,23,0
    "hsa-miR-2277-5p",2,99,43,0,1,11,19,0,22,18,0,0,11,0,0,10,0
    "hsa-miR-3909",2,445,148,0,17,0,31,0,22,53,2,0,20,0,0,11,0
    "hsa-miR-3200-5p",1,38,11,0,21,0,24,0,22,0,0,0,15,0,11,21,0
    "hsa-miR-20b-5p",19,2713,900,0,20,0,19,0,22,0,1,22,1,0,1,20,0
    "hsa-miR-766-5p",5,11,4,15,196,13,129,5,21,116,6,54,157,0,93,159,0
    "hsa-miR-652-5p",1,41,34,0,0,0,0,0,21,0,0,0,0,0,0,0,0
    "hsa-miR-6514-5p",7,75,39,46,0,1,64,0,21,5,0,13,32,0,36,59,2
    "hsa-miR-33a-5p",1,13,13,0,2,10,31,0,21,0,0,4,0,0,49,6,0
    "hsa-miR-4743-5p",1,5,8,0,0,0,39,0,21,6,0,5,0,0,45,15,0
    "hsa-miR-4786-5p",1,34,13,4,0,7,0,0,21,26,0,0,0,4,0,16,0
    "hsa-miR-29b-1-5p",1,24,12,0,1,21,0,1,21,0,0,23,0,0,0,0,0
    "hsa-miR-1292-5p",2,27,6,0,53,0,85,0,20,24,5,14,57,2,10,95,0
    "hsa-miR-6793-5p",0,12,5,2,19,0,0,0,20,38,0,0,33,4,0,6,1
    "hsa-miR-27b-5p",13,1177,434,0,0,24,27,0,20,92,1,14,94,4,88,42,8
    "hsa-miR-6734-5p",0,5,1,0,0,0,0,0,19,0,0,0,7,0,19,0,0
    "hsa-miR-1538",1,27,18,0,0,0,18,0,19,7,0,0,32,0,7,37,0
    "hsa-miR-195-5p",18,1300,537,9,20,10,11,0,19,0,0,1,34,0,0,12,0
    "hsa-miR-4750-5p",3,38,14,26,11,0,28,1,19,54,0,12,49,8,27,115,0
    "hsa-miR-6894-5p",1,1,3,15,21,6,0,0,18,0,0,0,9,0,0,30,4
    "hsa-miR-504-5p",41,108,41,0,16,7,4,0,18,39,3,17,27,0,9,9,0
    "hsa-miR-382-3p",0,8,5,0,7,0,0,0,18,0,0,0,0,0,13,41,1
    "hsa-miR-4741",0,5,0,0,0,0,42,0,18,21,0,0,20,0,11,21,0
    "hsa-miR-363-3p",11,630,202,0,0,0,9,0,18,14,2,6,1,0,0,0,1
    "hsa-miR-6794-5p",1,9,4,55,59,20,53,1,18,77,0,17,147,3,88,57,0
    "hsa-miR-212-3p",22,269,136,8,25,7,11,0,17,47,0,0,15,2,0,59,1
    "hsa-miR-3188",12,93,30,27,0,0,47,2,17,0,3,14,13,5,0,48,0
    "hsa-miR-190a-5p",0,2441,792,0,0,2,25,0,17,0,2,0,11,0,17,42,2
    "hsa-miR-939-5p",1,71,15,11,0,0,7,0,17,0,0,0,13,0,0,11,0
    "hsa-miR-4746-5p",5,208,65,16,58,29,22,0,17,55,4,50,69,2,144,102,7
    "hsa-miR-3613-3p",1,119,35,12,42,19,16,2,16,28,1,26,0,1,10,0,5
    "hsa-miR-3682-3p",0,4,0,0,0,0,0,0,16,0,0,0,5,0,0,0,0
    "hsa-miR-505-5p",4,111,48,23,10,8,25,3,16,17,0,13,35,1,19,0,0
    "hsa-miR-3130-5p",0,10,3,0,0,0,0,0,16,0,0,0,18,0,15,0,0
    "hsa-miR-4738-3p",0,0,0,0,24,0,24,0,16,0,1,0,0,0,0,0,0
    "hsa-miR-4646-5p",0,10,0,0,24,0,16,1,16,7,1,0,42,1,11,9,3
    "hsa-miR-6867-5p",2,9,0,0,0,0,28,0,16,7,2,11,42,0,12,31,0
    "hsa-miR-7111-3p",1,5,6,10,0,0,18,0,15,0,0,0,14,0,0,0,0
    "hsa-miR-127-3p",0,36,12,13,19,6,20,0,15,8,14,36,5,3,57,24,5
    "hsa-miR-6726-5p",0,0,3,15,49,0,0,0,15,24,0,13,46,0,32,48,0
    "hsa-miR-1225-3p",0,1,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0
    "hsa-miR-590-3p",0,463,234,0,0,21,10,0,15,13,2,10,0,0,0,9,1
    "hsa-miR-4660",2,8,1,0,0,0,0,0,15,0,0,0,0,0,0,0,0
    "hsa-miR-6738-3p",0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0
    "hsa-miR-1307-5p",19,354,185,14,54,23,66,0,14,71,0,59,86,2,175,68,4
    "hsa-miR-629-3p",0,12,11,0,0,0,10,0,14,0,0,4,11,0,0,3,0
    "hsa-miR-6862-5p",2,6,4,17,19,0,31,0,14,36,0,0,51,0,31,42,0
    "hsa-miR-412-5p",0,1,1,4,28,0,7,0,14,34,0,0,18,0,0,40,0
    "hsa-miR-1253",1,1,0,0,9,0,13,0,14,0,0,0,7,0,29,8,0
    "hsa-miR-98-3p",1,423,133,0,34,9,31,0,14,0,0,11,14,0,4,0,0
    "hsa-miR-33b-5p",14,0,0,10,15,22,35,0,14,42,0,25,21,0,21,27,0
    "hsa-miR-6877-5p",6,91,34,5,30,7,69,0,14,46,0,0,46,1,37,45,0
    "hsa-miR-664a-3p",0,89,27,20,0,0,0,0,14,14,0,0,7,0,0,0,0
    "hsa-miR-4707-3p",2,56,32,33,15,0,84,0,14,0,2,7,73,5,42,101,0
    "hsa-miR-206",69,81,28,280,125,56,142,2,14,135,4,115,649,4,371,329,7
    "hsa-miR-943",1,51,32,0,21,0,0,0,13,0,0,0,0,0,0,0,0
    "hsa-miR-3138",0,0,0,0,0,0,16,0,13,15,0,0,0,0,21,0,0
    "hsa-miR-26b-3p",3,210,77,12,0,0,0,8,13,7,0,23,18,0,45,0,1
    "hsa-miR-152-3p",10,854,328,8,51,0,24,3,13,33,5,17,67,0,0,88,9
    "hsa-miR-487b-3p",0,42,11,0,0,0,33,0,13,0,0,13,16,0,0,22,0
    "hsa-miR-99b-3p",0,3,0,0,0,0,0,0,13,6,2,0,0,0,0,0,0
    "hsa-miR-195-3p",10,477,159,0,106,0,14,0,13,19,3,0,18,0,19,21,0
    "hsa-miR-942-5p",4,133,32,17,0,11,10,0,13,0,1,0,49,0,0,5,0
    "hsa-miR-155-5p",8,128,33,41,23,49,13,0,13,144,36,0,78,1,78,61,18
    "hsa-miR-1908-3p",1,14,9,0,0,0,25,0,13,0,0,0,11,0,0,5,0
    "hsa-miR-3180-5p",0,4,1,0,0,0,0,0,13,0,0,0,0,0,0,0,0
    "hsa-miR-4763-3p",1,0,0,0,4,0,0,0,13,8,0,14,0,0,0,0,0
    "hsa-miR-4466",0,6,1,0,0,0,0,0,13,0,0,0,0,0,0,0,0
    "hsa-miR-4520-3p",0,22,14,30,19,13,59,5,12,17,9,15,134,7,127,94,4
    "hsa-miR-3918",0,7,1,12,0,0,11,0,12,28,2,0,10,0,32,24,0
    "hsa-miR-6812-5p",0,0,2,0,0,16,0,0,12,0,1,9,8,0,6,0,0
    "hsa-miR-501-5p",24,321,121,0,16,0,21,0,12,0,0,21,50,0,0,85,4
    "hsa-miR-6788-5p",0,9,1,5,8,0,0,2,12,19,0,0,3,0,9,7,0
    "hsa-let-7f-2-3p",0,334,104,8,13,0,10,0,12,26,4,39,33,3,55,19,0
    "hsa-miR-5002-5p",1,11,2,0,0,17,0,0,12,0,0,0,0,0,0,0,0
    "hsa-miR-4421",6,173,71,0,7,16,17,0,12,67,0,0,101,0,40,60,0
    "hsa-miR-3691-3p",0,9,3,0,0,0,0,0,12,20,2,0,0,0,0,12,0
    "hsa-miR-654-3p",4,44,22,14,13,13,33,0,11,21,0,4,32,0,31,27,1
    "hsa-miR-6785-5p",1,0,0,0,18,0,0,0,11,11,0,0,4,0,0,0,0
    "hsa-miR-96-5p",7,3263,1378,0,40,41,28,11,11,16,3,19,8,3,36,46,1
    "hsa-miR-616-3p",0,13,4,0,10,0,43,0,11,67,0,0,76,3,39,96,0
    "hsa-miR-503-5p",2,81,27,0,0,0,10,0,11,0,0,0,0,0,0,1,0
    "hsa-miR-500a-5p",6,242,80,0,0,0,0,0,11,0,0,0,0,0,9,0,0
    "hsa-miR-222-3p",0,0,0,0,19,15,14,0,11,0,1,0,21,0,0,0,2
    "hsa-miR-2116-3p",1,19,6,0,0,0,0,0,11,0,0,0,0,0,0,6,0
    "hsa-miR-1233-3p",0,3,1,0,0,0,0,0,11,0,0,0,0,0,0,0,0
    "hsa-miR-5188",0,0,2,0,0,0,0,0,11,0,0,0,0,0,0,0,0
    "hsa-miR-4423-5p",0,0,1,0,0,0,0,0,11,0,0,0,7,0,0,0,0
    "hsa-miR-6765-5p",0,0,0,0,0,0,0,0,11,0,0,0,17,0,0,0,0
    "hsa-miR-139-5p",31,549,271,10,110,15,35,1,11,2,4,2,30,3,0,45,8
    "hsa-miR-3663-3p",0,9,1,9,17,0,28,0,10,0,0,14,18,0,17,16,0
    "hsa-miR-1255b-5p",0,13,8,0,6,0,21,0,10,24,0,0,0,6,27,20,0
    "hsa-miR-1910-5p",3,3,1,8,0,0,31,0,10,14,0,0,10,0,0,59,0
    "hsa-miR-4647",2,8,0,0,18,0,29,0,10,29,0,27,77,0,64,57,6
    "hsa-miR-1343-3p",0,26,10,0,0,0,0,0,10,0,0,0,0,0,0,8,0
    "hsa-miR-106a-5p",14,1490,566,2,12,4,20,0,10,16,2,2,4,0,10,29,2
    "hsa-miR-378i",7,0,0,14,66,3,180,0,10,51,0,14,120,0,73,245,0
    "hsa-miR-3691-5p",0,83,32,26,0,0,0,0,9,10,1,0,5,0,3,17,0
    "hsa-miR-6855-5p",1,2,1,0,0,0,0,0,9,0,0,0,0,0,0,0,0
    "hsa-miR-3679-5p",6,33,14,4,28,0,15,0,9,8,0,0,54,1,48,81,0
    "hsa-miR-4748",1,4,1,31,0,1,30,2,9,9,0,0,35,0,24,14,0
    "hsa-miR-223-5p",0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0
    "hsa-miR-5581-3p",0,8,3,0,0,0,0,0,9,3,0,0,0,0,11,0,0
    "hsa-miR-4706",0,0,2,0,18,0,23,0,9,0,0,18,27,0,0,0,0
    "hsa-miR-2110",9,107,35,19,60,5,39,0,9,3,2,13,51,2,84,29,4
    "hsa-miR-1305",0,63,16,0,0,0,0,0,9,0,0,0,0,0,0,0,0
    "hsa-miR-1286",3,10,7,0,28,1,13,0,9,11,0,1,27,1,19,28,0
    "hsa-miR-4747-5p",0,0,2,0,18,0,3,0,9,19,0,3,28,0,14,5,6
    "hsa-miR-17-3p",10,289,119,0,0,0,0,0,9,0,0,19,27,1,15,25,0
    "hsa-miR-365a-5p",2,112,20,12,7,11,8,2,9,11,7,0,36,5,15,27,8
    "hsa-miR-133b",4,72,34,0,22,8,51,0,8,32,1,18,60,0,25,32,1
    "hsa-miR-150-5p",2,42,12,10,22,29,0,0,8,0,0,18,0,1,14,6,0
    "hsa-miR-889-3p",1,5,7,0,0,0,0,0,8,7,0,0,0,0,8,0,0
    "hsa-miR-30b-3p",29,535,167,43,0,12,25,0,8,0,0,0,39,0,48,8,20
    "hsa-miR-4707-5p",0,28,8,0,25,0,0,0,8,0,0,8,0,0,0,19,0
    "hsa-miR-6802-5p",1,7,1,0,0,0,0,0,8,0,0,0,0,0,6,6,0
    "hsa-miR-4648",0,9,0,0,0,9,7,0,8,0,0,0,10,0,0,16,0
    "hsa-miR-6747-5p",0,0,0,0,0,0,0,0,8,0,0,0,2,5,0,0,0
    "hsa-miR-676-3p",4,153,61,0,0,19,54,0,8,0,0,0,56,0,12,49,1
    "hsa-miR-5009-5p",0,11,0,0,13,0,0,0,8,3,0,18,11,1,0,34,0
    "hsa-miR-4749-5p",1,27,2,4,4,8,6,0,8,0,3,0,18,1,24,15,0
    "hsa-miR-1224-3p",9,95,87,18,0,41,0,0,8,0,5,22,5,0,0,8,0
    "hsa-miR-4689",1,14,0,0,45,0,0,0,8,19,0,0,30,1,21,8,2
    "hsa-miR-532-3p",32,357,124,10,24,21,48,0,8,9,3,50,109,0,84,94,3
    "hsa-miR-30c-1-3p",10,115,56,6,10,0,5,0,8,0,1,0,6,0,0,0,1
    "hsa-miR-6845-5p",0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,5,0
    "hsa-miR-6871-5p",0,14,5,0,0,5,23,0,7,11,0,0,21,6,11,31,0
    "hsa-miR-212-5p",7,237,86,0,0,0,0,0,7,9,0,0,0,0,0,0,0
    "hsa-miR-6859-5p",0,6,1,0,0,0,5,2,7,0,3,0,14,2,10,16,0
    "hsa-miR-185-3p",4,73,16,0,0,1,9,1,7,0,0,0,6,0,0,15,0
    "hsa-miR-6882-5p",0,44,24,0,16,0,6,0,7,13,0,0,8,0,0,22,0
    "hsa-miR-1250-5p",1,89,29,0,5,0,10,0,7,0,0,0,0,0,0,6,0
    "hsa-miR-6500-3p",2,31,17,0,0,0,0,0,7,0,0,0,0,2,16,4,0
    "hsa-miR-543",3,31,7,12,0,11,14,0,7,28,7,0,34,1,59,14,0
    "hsa-miR-6817-3p",2,1,1,10,23,0,7,0,6,0,0,5,13,0,0,0,0
    "hsa-miR-6803-3p",1,45,15,0,0,11,10,0,6,0,0,0,0,0,22,12,0
    "hsa-miR-4668-5p",0,7,1,0,0,0,0,0,6,0,0,0,0,0,0,0,0
    "hsa-miR-7111-5p",0,0,0,0,11,0,0,0,6,0,0,0,0,0,10,0,0
    "hsa-miR-6825-5p",0,3,1,5,7,0,0,0,6,52,0,0,13,2,25,8,0
    "hsa-miR-3944-3p",0,6,4,0,0,0,12,0,6,0,0,0,0,0,0,0,0
    "hsa-miR-15b-3p",6,538,204,0,1,9,10,0,6,0,1,0,20,0,6,23,3
    "hsa-miR-323a-3p",0,4,0,0,0,0,0,0,5,0,0,0,0,2,0,0,0
    "hsa-miR-26a-2-3p",0,114,47,4,0,0,0,0,5,0,0,0,0,0,13,0,0
    "hsa-miR-1275",9,13,1,0,9,0,19,0,5,17,0,0,12,0,20,7,0
    "hsa-miR-7977",1,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0
    "hsa-miR-1343-5p",0,1,2,2,0,0,0,0,5,19,0,0,4,0,0,26,0
    "hsa-miR-3159",0,55,20,0,0,0,16,2,5,14,0,0,0,0,0,16,0
    "hsa-miR-194-3p",1,32,12,0,0,0,0,0,5,0,0,0,0,0,0,15,0
    "hsa-mir-378c",1,18,14,49,18,20,77,0,5,13,0,18,40,1,175,755,0
    "hsa-miR-204-3p",0,0,0,15,0,0,12,0,4,18,0,10,0,0,0,12,0
    "hsa-miR-4467",1,12,5,0,0,0,0,0,4,0,0,0,0,0,0,0,0
    "hsa-miR-3115",4,16,4,0,0,1,33,0,4,13,0,0,15,0,0,23,0
    "hsa-miR-6128",0,0,0,3,21,2,50,0,4,13,0,4,39,0,23,66,0
    "hsa-miR-378d",1,17,8,5,15,1,58,1,4,14,0,5,46,0,17,75,0
    "hsa-miR-6747-3p",2,35,12,0,0,0,0,0,3,0,0,0,0,0,0,0,0
    "hsa-miR-664a-5p",2,2,3,0,0,0,1,0,3,0,0,0,5,0,24,2,0
    "hsa-miR-99a-5p",0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0
    "hsa-miR-320e",3,6,4,7,5,0,20,0,3,13,0,0,22,2,15,33,0
    "hsa-miR-1290",0,1,1,4,35,2,10,0,3,9,0,4,14,0,5,9,0
    "hsa-miR-4687-3p",1,14,6,1,0,0,7,0,3,0,0,0,16,0,24,0,0
    "hsa-miR-6838-5p",0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0
    "hsa-miR-6807-5p",0,10,0,0,0,0,45,0,2,0,0,0,0,0,19,0,0
    "hsa-miR-6511a-5p|hsa-miR-6511b-5p",4,88,57,5,15,0,15,1,2,32,0,18,22,3,11,8,0
    "hsa-miR-3064-5p",2,25,10,0,0,0,4,0,2,0,0,0,0,0,0,0,0
    "hsa-miR-374a-3p",2,2288,964,0,13,60,8,2,2,0,1,30,18,2,31,6,3
    "hsa-miR-378g",2,18,6,6,16,0,85,1,2,2,1,0,62,4,23,80,0
    "hsa-miR-4429",0,0,0,1,4,0,5,0,2,1,0,2,7,0,7,19,0
    "hsa-miR-144-3p",0,0,0,0,0,0,21,0,2,12,0,17,0,0,0,0,0
    "hsa-let-7c",1,15,10,0,0,0,0,0,2,1,0,0,0,0,0,5,0
    "hsa-miR-7974",14,878,330,0,0,0,0,0,1,0,0,0,0,0,0,0,0
    "hsa-miR-1271-5p",0,36,16,0,0,0,0,0,1,0,0,0,1,0,0,0,0
    "hsa-miR-654-5p",5,65,24,0,8,9,0,0,1,0,6,0,6,0,0,23,0
    "hsa-miR-2467-5p",0,318,125,3,18,18,10,0,1,28,0,3,31,3,15,1,0
    "hsa-miR-1244",0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
    "hsa-miR-373-5p",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
    "hsa-miR-550a-3p",8,141,61,0,14,0,0,0,1,0,0,0,25,0,18,18,0
    "hsa-miR-6511a-5p",0,10,3,18,0,11,0,0,1,0,0,0,0,0,0,11,0
    "hsa-miR-378e",1,11,12,2,13,0,20,0,1,4,0,0,24,0,21,52,0
    "hsa-miR-378h",1,2,1,5,9,0,28,0,1,11,1,3,20,0,13,33,0
    "hsa-miR-374c-5p",0,4,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
    "hsa-miR-4318",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548i",0,20,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4520-2-3p",0,4,1,0,0,0,0,0,0,0,0,0,6,0,31,4,0
    "hsa-miR-215-5p",2,13,6,0,0,3,2,0,0,4,0,2,2,0,10,2,0
    "hsa-mir-196a-2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-550a-3-5p",1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
    "hsa-miR-548av-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-107",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-28",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-373-3p",0,2,2,22,35,0,346,0,0,136,1,0,164,0,0,934,2
    "hsa-miR-371a-5p",0,2,0,8,15,0,28,0,0,136,0,0,9,0,17,88,3
    "hsa-miR-4661-5p",0,31,10,0,0,15,0,0,0,61,0,0,18,0,0,30,0
    "hsa-miR-765",0,6,10,0,0,17,0,0,0,53,0,0,24,0,0,8,0
    "hsa-miR-6777-5p",3,11,11,68,69,11,47,12,0,47,4,1,66,7,60,42,0
    "hsa-miR-29a-3p",12,902,352,0,0,0,25,5,0,45,16,0,24,3,0,21,0
    "hsa-miR-27a-5p",0,496,187,0,0,14,0,0,0,45,1,0,0,1,17,26,1
    "hsa-miR-4664-3p",4,48,26,0,9,0,29,0,0,40,3,0,8,2,0,54,0
    "hsa-miR-6767-5p",2,42,2,24,0,0,54,7,0,37,2,0,65,8,76,76,10
    "hsa-miR-125b-2-3p",1,384,152,0,35,18,1,0,0,36,3,13,29,0,16,45,7
    "hsa-miR-378a-5p",7,254,65,0,0,6,7,0,0,36,0,0,1,0,19,9,1
    "hsa-miR-744-3p",1,47,26,0,45,0,24,0,0,35,0,18,33,4,8,15,2
    "hsa-miR-6780a-5p",0,16,5,3,14,0,11,1,0,33,1,0,44,6,0,15,14
    "hsa-miR-10b-3p",10,101,32,22,19,20,86,0,0,30,0,0,15,0,10,52,0
    "hsa-miR-4511",0,23,7,0,0,0,0,0,0,30,0,0,26,0,0,7,0
    "hsa-miR-1306-5p",15,89,22,14,46,8,7,0,0,28,0,23,32,2,38,11,8
    "hsa-miR-6511a-3p",5,155,39,18,0,0,2,0,0,27,3,0,4,0,0,0,1
    "hsa-miR-365b-5p",0,4,8,7,16,0,0,0,0,27,3,0,16,0,13,11,5
    "hsa-miR-6884-5p",0,3,2,0,0,0,0,0,0,26,0,17,11,0,0,0,0
    "hsa-miR-1287-5p",6,133,48,19,14,0,55,0,0,26,1,16,60,0,24,46,1
    "hsa-miR-6805-5p",0,3,6,0,9,8,0,0,0,24,0,0,0,0,0,0,4
    "hsa-miR-28-3p",3,97,52,22,20,13,47,1,0,23,15,28,79,12,31,58,17
    "hsa-miR-548b-5p",6,158,63,0,0,0,9,0,0,23,0,15,35,0,0,7,0
    "hsa-miR-548l",1,77,26,0,8,15,0,0,0,22,2,19,0,0,7,18,0
    "hsa-miR-125b-5p",3,125,51,20,0,0,0,0,0,21,2,0,5,0,0,2,0
    "hsa-miR-6751-5p",1,1,1,3,0,0,0,0,0,20,1,0,10,0,0,0,0
    "hsa-miR-3934-5p",3,149,66,4,0,13,7,6,0,20,6,18,27,9,38,38,0
    "hsa-miR-548av-3p|hsa-miR-548o-3p",3,137,47,0,0,26,17,0,0,19,0,0,4,2,0,0,0
    "hsa-miR-493-5p",0,6,3,0,41,0,3,0,0,19,0,0,25,4,64,34,0
    "hsa-miR-548u",0,20,8,0,0,0,0,0,0,19,0,0,0,0,0,0,0
    "hsa-miR-3929",0,42,10,0,7,0,3,0,0,18,0,0,16,0,0,19,0
    "hsa-miR-3133",0,6,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0
    "hsa-miR-548e-3p",0,30,12,0,0,0,0,0,0,18,0,0,0,0,0,0,0
    "hsa-miR-548b-3p",0,46,12,0,0,0,0,0,0,18,0,0,0,0,5,11,0
    "hsa-miR-4665-5p",1,3,2,0,11,0,0,0,0,17,1,0,9,1,0,13,0
    "hsa-miR-194-5p",4,702,285,7,0,0,0,0,0,17,1,23,0,0,0,11,2
    "hsa-miR-1273d",0,8,5,0,0,0,0,0,0,17,0,0,5,0,0,0,0
    "hsa-miR-372-3p",0,0,1,0,0,0,0,0,0,17,0,0,0,0,0,20,0
    "hsa-miR-199a-3p|hsa-miR-199b-3p",2,72,31,8,0,21,26,0,0,17,0,15,8,0,47,32,0
    "hsa-miR-641",5,417,159,0,20,40,17,0,0,17,0,0,44,0,60,16,10
    "hsa-miR-628-3p",1,44,20,0,17,0,0,0,0,17,0,0,0,0,0,0,0
    "hsa-let-7f-1-3p",1,60,28,0,3,0,0,0,0,16,0,9,0,0,18,0,0
    "hsa-miR-323b-3p",0,0,1,0,13,0,0,0,0,16,0,6,14,0,0,25,0
    "hsa-miR-3173-3p",1,8,1,0,0,3,0,0,0,16,0,0,0,0,30,0,1
    "hsa-miR-6832-5p",1,31,7,14,0,0,0,0,0,15,2,0,16,1,59,15,3
    "hsa-miR-3938",2,95,19,0,0,0,10,0,0,15,12,0,31,0,11,9,9
    "hsa-miR-3198",0,3,8,0,0,0,35,0,0,15,5,10,28,0,38,0,0
    "hsa-miR-6778-5p",0,0,0,0,0,0,0,0,0,15,0,0,3,0,0,0,0
    "hsa-miR-3065-5p",16,943,345,9,0,66,26,0,0,15,6,36,72,1,91,48,6
    "hsa-miR-3911",2,27,14,0,0,0,18,0,0,15,0,0,18,0,6,0,0
    "hsa-miR-3620-3p",0,6,5,0,0,0,0,0,0,15,0,0,0,0,0,0,0
    "hsa-miR-6799-5p",0,0,0,5,0,0,0,0,0,14,0,0,0,0,0,0,0
    "hsa-miR-6765-3p",2,11,2,0,0,8,0,0,0,14,0,0,6,0,0,0,0
    "hsa-miR-5189-5p",0,0,0,0,0,0,0,0,0,14,2,0,19,2,12,0,0
    "hsa-miR-449a",0,122,49,0,0,0,4,0,0,14,0,13,0,0,0,0,0
    "hsa-miR-346",2,60,19,0,0,0,0,0,0,14,0,0,32,0,22,3,2
    "hsa-miR-5088-5p",0,6,0,0,0,0,16,0,0,14,3,0,37,0,0,0,0
    "hsa-miR-1277-5p",0,234,109,0,0,0,0,0,0,13,0,0,10,0,0,0,0
    "hsa-miR-642a-5p",4,107,47,0,0,0,0,0,0,13,2,0,12,0,0,14,0
    "hsa-miR-5001-5p",1,12,4,0,50,0,67,0,0,13,5,0,21,1,24,68,0
    "hsa-miR-184",0,6,2,6,32,5,19,1,0,13,0,0,11,1,1,23,8
    "hsa-miR-6801-5p",0,5,2,0,0,0,3,0,0,13,0,0,0,0,19,0,0
    "hsa-miR-5683",0,40,14,0,0,0,0,0,0,13,0,0,0,0,0,0,0
    "hsa-miR-6800-5p",0,0,2,11,0,28,27,0,0,12,0,17,8,0,0,0,0
    "hsa-miR-152-5p",0,23,3,0,0,0,9,0,0,12,0,0,0,0,0,0,0
    "hsa-miR-4783-3p",0,12,10,0,0,0,0,0,0,12,0,0,0,0,0,11,1
    "hsa-miR-6780b-5p",0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0
    "hsa-miR-6511b-3p",11,202,61,13,11,0,5,0,0,12,2,14,31,0,17,36,10
    "hsa-miR-485-3p",4,7,13,0,0,0,14,0,0,12,2,0,0,4,0,0,0
    "hsa-miR-4658",0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0
    "hsa-miR-5699-3p",0,2,1,0,0,0,12,0,0,11,0,0,0,0,0,0,0
    "hsa-miR-6764-5p",6,54,22,0,0,0,35,0,0,11,0,0,14,0,0,40,0
    "hsa-miR-5697",0,3,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0
    "hsa-miR-365a-3p|hsa-miR-365b-3p",0,542,170,8,0,0,9,0,0,11,1,4,1,0,13,0,0
    "hsa-miR-6755-5p",0,9,7,0,0,0,4,0,0,11,0,0,11,0,0,0,0
    "hsa-miR-3124-5p",4,15,5,0,21,7,6,0,0,11,0,0,20,2,27,26,0
    "hsa-miR-2276-3p",0,1,3,0,0,0,0,0,0,10,0,0,8,0,13,14,0
    "hsa-miR-6790-5p",0,0,1,0,0,0,0,0,0,10,0,13,15,0,0,12,0
    "hsa-miR-615-3p",23,446,190,0,5,0,32,0,0,10,0,38,53,1,0,45,0
    "hsa-miR-183-3p",12,374,85,0,0,9,0,0,0,10,2,12,0,0,16,0,0
    "hsa-miR-551b-3p",9,241,88,8,0,8,10,0,0,10,0,0,0,0,0,11,1
    "hsa-miR-301a-5p",1,398,153,0,1,0,0,0,0,10,0,0,0,0,16,7,0
    "hsa-miR-6727-5p",0,1,2,0,8,0,0,0,0,10,0,0,2,0,37,6,0
    "hsa-miR-141-5p",4,905,355,1,0,0,10,0,0,10,6,20,0,0,0,0,2
    "hsa-miR-1306-3p",19,141,42,0,0,14,52,0,0,9,0,0,53,0,52,37,7
    "hsa-miR-656-3p",0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0
    "hsa-miR-1226-3p",7,98,45,6,0,0,4,0,0,9,0,0,0,2,0,3,0
    "hsa-miR-2278",0,1,1,0,0,0,0,0,0,9,0,0,0,0,0,0,0
    "hsa-miR-1237-3p",8,16,3,7,0,0,6,0,0,9,0,0,9,0,6,13,0
    "hsa-miR-636",1,12,2,0,8,0,0,0,0,8,0,0,0,0,0,0,0
    "hsa-miR-1225-5p",0,0,0,0,0,0,0,0,0,8,0,0,5,0,4,2,0
    "hsa-miR-6763-5p",0,0,0,1,4,0,0,0,0,8,0,0,6,0,28,17,0
    "hsa-miR-6738-5p",0,0,0,0,0,0,0,0,0,8,0,0,5,0,0,0,0
    "hsa-miR-6819-5p",0,0,2,0,0,0,0,0,0,7,0,0,0,0,0,0,0
    "hsa-miR-203a-3p",0,122,46,0,0,0,0,0,0,7,0,0,7,0,2,15,0
    "hsa-miR-5698",0,11,1,0,0,1,7,0,0,7,3,0,9,0,0,7,0
    "hsa-miR-3173-5p",2,12,8,0,0,3,11,0,0,7,2,0,7,0,0,9,0
    "hsa-miR-6777-3p",9,23,5,0,6,0,6,0,0,7,0,0,0,0,0,14,0
    "hsa-miR-378f",0,16,6,2,7,1,23,0,0,6,0,1,8,1,7,16,0
    "hsa-miR-6834-5p",0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0
    "hsa-miR-8072",0,2,1,1,0,12,0,0,0,6,0,11,0,0,0,14,0
    "hsa-miR-4745-5p",2,1,1,0,0,7,0,0,0,5,0,0,4,0,0,0,0
    "hsa-miR-6887-3p",0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0
    "hsa-miR-5089-5p",0,7,1,0,0,0,0,0,0,4,0,0,0,0,0,0,0
    "hsa-mir-320b-1",0,0,0,0,3,0,10,0,0,3,0,0,10,0,4,5,0
    "hsa-mir-320a",0,0,0,0,3,0,9,0,0,3,0,0,10,0,8,4,0
    "hsa-miR-33a-3p",1,6,5,0,0,0,0,0,0,3,0,0,0,0,0,0,0
    "hsa-miR-876-3p",4,60,28,0,16,0,0,0,0,3,0,0,13,0,3,1,0
    "hsa-miR-378b",0,1,0,2,2,0,10,0,0,3,0,1,4,0,2,15,0
    "hsa-miR-3180-3p",0,0,0,0,0,0,1,0,0,3,0,0,2,0,0,9,0
    "hsa-miR-6842-3p",2,22,19,0,0,0,0,0,0,3,0,0,0,0,0,0,0
    "hsa-miR-6741-3p",0,42,9,0,0,0,0,0,0,3,8,0,0,0,0,5,1
    "hsa-mir-320b-2",0,0,0,1,3,0,26,0,0,3,0,0,19,0,8,2,0
    "hsa-miR-6849-5p",0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
    "hsa-miR-6740-5p",0,5,0,17,17,1,17,0,0,2,0,0,30,2,0,30,0
    "hsa-miR-378c",0,20,8,0,2,2,19,0,0,2,0,0,2,0,0,31,0
    "hsa-miR-30d-3p",3,657,238,0,15,0,18,0,0,2,0,0,10,2,1,21,0
    "hsa-let-7g-3p",0,53,30,0,0,0,0,0,0,1,0,18,12,0,0,0,0
    "hsa-let-7a-3",0,3,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0
    "hsa-miR-374a-5p",3,864,413,0,0,0,0,0,0,0,0,0,5,0,0,12,0
    "hsa-mir-320e",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-320c-2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-483-3p",0,250,108,0,0,19,8,1,0,0,8,0,0,0,9,0,0
    "hsa-miR-3162-5p",0,0,0,0,0,0,0,0,0,0,6,0,23,0,0,10,0
    "hsa-miR-598-3p",1,90,39,0,50,0,42,2,0,0,5,13,0,0,0,60,0
    "hsa-miR-6511b-5p",0,27,6,6,0,6,26,1,0,0,5,11,0,0,8,0,0
    "hsa-miR-6794-3p",0,9,0,0,0,0,0,0,0,0,4,0,3,0,0,29,0
    "hsa-miR-4449",1,5,3,0,11,12,0,0,0,0,3,0,44,1,23,11,2
    "hsa-miR-1289",0,73,19,10,0,14,13,0,0,0,3,0,16,0,8,0,0
    "hsa-miR-4525",0,16,0,0,0,0,0,3,0,0,3,0,13,8,0,0,0
    "hsa-miR-6515-3p",0,6,2,0,0,0,0,0,0,0,3,12,0,0,0,0,0
    "hsa-miR-134-5p",6,150,36,10,15,1,36,1,0,0,3,23,48,0,20,33,0
    "hsa-miR-6812-3p",5,10,5,0,0,13,0,0,0,0,3,0,0,0,0,0,0
    "hsa-miR-3913-5p",0,46,21,0,0,0,0,0,0,0,3,0,6,0,0,0,0
    "hsa-miR-219a-5p",0,106,21,0,10,0,0,0,0,0,2,0,0,0,20,12,2
    "hsa-miR-3065-3p",6,475,158,0,0,0,18,0,0,0,2,0,0,2,0,31,3
    "hsa-miR-1229-3p",0,21,6,0,0,0,0,0,0,0,2,0,11,0,0,11,0
    "hsa-miR-34c-3p",0,0,0,0,0,0,0,0,0,0,2,0,17,0,0,11,0
    "hsa-miR-548ap-5p|hsa-miR-548j-5p",0,37,10,0,13,0,0,0,0,0,2,0,0,0,0,0,0
    "hsa-miR-5699-5p",1,19,4,11,0,0,0,0,0,0,2,0,0,0,28,27,0
    "hsa-miR-6788-3p",1,36,13,0,0,0,0,0,0,0,2,0,0,0,0,0,0
    "hsa-miR-8061",0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,25,0
    "hsa-miR-3187-5p",0,12,4,0,0,0,0,0,0,0,2,0,0,0,0,5,0
    "hsa-miR-628-5p",2,199,64,0,0,0,0,0,0,0,2,0,0,1,10,12,4
    "hsa-miR-132-5p",7,304,90,0,27,2,10,0,0,0,2,12,0,0,0,0,0
    "hsa-miR-6873-3p",0,7,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0
    "hsa-miR-200c-5p",0,67,29,0,0,0,14,0,0,0,1,0,0,0,0,0,0
    "hsa-miR-766-3p",2,203,48,0,0,19,33,0,0,0,1,19,20,0,0,13,0
    "hsa-miR-1976",0,39,18,0,21,0,0,0,0,0,1,0,0,0,0,11,0
    "hsa-miR-6807-3p",3,16,4,0,0,0,0,0,0,0,1,0,0,0,0,15,0
    "hsa-miR-485-5p",0,10,4,0,16,0,0,0,0,0,1,0,0,0,0,14,4
    "hsa-miR-548k",0,12,5,0,1,0,0,0,0,0,1,0,0,0,0,0,0
    "hsa-miR-874-3p",0,14,4,0,0,0,5,0,0,0,1,0,0,0,0,10,0
    "hsa-miR-4745-3p",2,7,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0
    "hsa-miR-7705",0,33,9,0,0,0,0,0,0,0,1,0,0,0,0,0,0
    "hsa-miR-4793-3p",1,37,35,0,16,0,0,0,0,0,1,0,0,2,0,11,0
    "hsa-miR-576-5p",2,130,52,0,15,0,0,0,0,0,1,0,9,0,16,0,0
    "hsa-miR-6728-5p",0,13,8,20,0,0,0,0,0,0,1,0,0,0,0,11,0
    "hsa-miR-362-5p",33,2841,566,16,4,17,0,2,0,0,1,0,41,1,0,26,0
    "hsa-miR-9-3p",2,366,154,0,10,0,0,0,0,0,1,0,8,3,24,26,0
    "hsa-miR-548ad-5p|hsa-miR-548ae-5p|hsa-miR-548ay-5p",2,16,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0
    "hsa-miR-551b-5p",15,232,86,7,13,0,46,0,0,0,0,43,34,0,0,59,0
    "hsa-miR-4521",14,1333,438,0,0,0,2,0,0,0,0,0,0,1,0,0,0
    "hsa-miR-497-5p",13,507,191,0,0,0,22,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-32-3p",10,56,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6783-3p",8,52,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-146b-3p",8,141,55,0,0,0,0,0,0,0,0,0,7,0,16,9,2
    "hsa-miR-6720-3p",7,143,85,8,0,2,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-182-3p",6,23,8,0,0,0,0,0,0,0,0,13,8,0,0,0,0
    "hsa-miR-424-3p",6,21,4,6,29,0,13,0,0,0,0,0,0,0,0,11,0
    "hsa-miR-6760-3p",5,4,2,0,0,10,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3176",4,29,17,0,0,0,0,0,0,0,0,0,0,0,0,11,0
    "hsa-miR-7155-3p",4,7,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0
    "hsa-miR-6758-5p",4,10,8,0,36,0,0,0,0,0,0,0,0,0,25,9,0
    "hsa-miR-6786-3p",4,24,4,0,0,0,0,0,0,0,0,0,0,1,15,0,0
    "hsa-miR-1254",4,71,25,9,0,15,10,0,0,0,0,3,21,1,22,0,3
    "hsa-miR-4804-5p",4,15,2,0,16,0,0,0,0,0,0,8,0,2,0,0,0
    "hsa-miR-188-5p",4,151,85,7,0,0,1,0,0,0,0,0,4,0,0,0,0
    "hsa-miR-92a-2-5p",3,47,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-32-5p",3,28,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-24-2-5p",3,338,140,2,0,13,21,0,0,0,0,0,0,0,20,0,0
    "hsa-miR-2355-3p",3,25,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1303",3,39,10,0,0,0,0,0,0,0,0,0,0,1,19,16,0
    "hsa-miR-193a-5p",3,40,12,5,0,0,0,0,0,0,0,0,12,0,8,0,0
    "hsa-miR-3651",3,16,9,6,9,0,17,0,0,0,0,0,0,0,0,2,0
    "hsa-miR-1260b",3,0,0,4,0,0,1,0,0,0,0,0,6,0,15,0,0
    "hsa-miR-4458",3,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1273h-3p",2,40,24,15,27,18,18,0,0,0,0,0,0,0,0,4,0
    "hsa-miR-4485-3p",2,25,1,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-22-5p",2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-937-5p",2,5,0,0,0,0,18,0,0,0,0,0,0,0,0,9,0
    "hsa-miR-7851-3p",2,14,8,0,0,0,19,0,0,0,0,0,0,0,18,0,0
    "hsa-miR-3653-3p",2,11,3,0,0,0,2,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-876-5p",2,10,12,0,17,0,0,0,0,0,0,0,0,0,29,13,0
    "hsa-miR-3179",2,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6762-3p",2,16,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-138-5p",2,27,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1179",2,738,227,0,0,0,23,2,0,0,0,9,0,0,14,68,1
    "hsa-miR-1247-5p",2,7,1,0,0,14,9,0,0,0,0,0,0,0,0,5,0
    "hsa-miR-1227-3p",2,10,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4646-3p",2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-374b-3p",2,120,31,0,17,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1236-3p",2,0,1,0,0,0,16,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548e-5p",2,14,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-133a-5p",2,81,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3912-3p",2,74,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4781-3p",2,59,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6882-3p",2,1,1,9,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-381-3p",2,82,49,0,0,0,0,1,0,0,0,0,19,0,0,6,0
    "hsa-miR-6886-3p",2,4,0,11,0,0,0,0,0,0,0,4,0,0,0,0,0
    "hsa-miR-196a-3p",2,271,107,0,1,0,0,0,0,0,0,0,8,0,18,20,0
    "hsa-miR-34c-5p",2,30,9,0,0,0,0,0,0,0,0,0,23,0,24,8,6
    "hsa-miR-548s",1,13,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6800-3p",1,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0
    "hsa-miR-4498",1,12,5,0,0,0,0,0,0,0,0,0,0,0,0,6,0
    "hsa-miR-4517",1,55,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6894-3p",1,6,2,6,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1234-3p",1,16,4,0,0,0,0,0,0,0,0,0,0,0,6,0,0
    "hsa-miR-5579-3p",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-887-5p",1,8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4638-5p",1,9,2,0,19,0,0,0,0,0,0,0,19,0,0,4,0
    "hsa-miR-3146",1,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6840-5p",1,17,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7855-5p",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6865-5p",1,2,2,0,0,0,0,0,0,0,0,0,6,0,0,0,0
    "hsa-miR-6766-5p",1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6820-3p",1,3,2,0,0,0,7,0,0,0,0,0,9,0,0,0,0
    "hsa-miR-3609",1,11,3,0,0,0,0,0,0,0,0,0,0,0,0,1,0
    "hsa-miR-6759-5p",1,5,3,0,0,0,0,0,0,0,0,0,4,0,0,5,0
    "hsa-miR-6808-3p",1,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4440",1,57,11,8,0,0,1,0,0,0,0,0,7,0,0,0,0
    "hsa-miR-129-1-3p",1,10,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1914-3p",1,2,0,0,0,0,12,0,0,0,0,0,0,0,0,7,0
    "hsa-miR-6852-5p",1,22,6,0,0,0,0,0,0,0,0,0,5,0,0,19,0
    "hsa-miR-6769a-3p",1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-579-5p",1,4,3,0,17,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3122",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-23a-5p",1,27,7,0,12,0,9,0,0,0,0,0,0,2,0,0,0
    "hsa-miR-6873-5p",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-301b-5p",1,30,4,0,0,0,0,0,0,0,0,0,0,0,6,0,0
    "hsa-miR-3120-5p",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1185-1-3p",1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3136-5p",1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6775-3p",1,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-610",1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-592",1,45,17,0,0,0,0,0,0,0,0,0,6,0,0,0,0
    "hsa-miR-6893-5p",1,1,2,0,0,0,4,2,0,0,0,14,0,0,0,0,0
    "hsa-miR-3936",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0
    "hsa-miR-2277-3p",1,12,5,0,12,0,14,0,0,0,0,0,0,0,0,5,0
    "hsa-miR-6515-5p",1,17,2,11,34,0,21,0,0,0,0,0,10,0,22,15,0
    "hsa-miR-1236-5p",1,0,1,4,0,0,0,0,0,0,0,0,10,0,0,8,0
    "hsa-miR-3607-3p",1,48,33,0,0,0,1,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3157-3p",1,35,6,0,0,0,9,0,0,0,0,0,0,0,19,0,4
    "hsa-miR-1257",1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6857-3p",1,14,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-140-5p",1,201,72,1,0,0,0,0,0,0,0,0,0,0,22,0,0
    "hsa-miR-6751-3p",1,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3661",1,7,3,4,0,0,0,0,0,0,0,0,30,0,0,24,0
    "hsa-miR-1909-5p",1,6,2,0,0,2,0,0,0,0,0,0,12,0,0,9,0
    "hsa-miR-1273h-5p",1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6866-5p",1,23,7,0,2,0,0,0,0,0,0,0,16,0,0,12,3
    "hsa-miR-574-5p",1,0,0,7,8,0,0,0,0,0,0,0,0,0,27,2,0
    "hsa-miR-6820-5p",1,8,4,0,0,0,8,0,0,0,0,0,11,0,0,0,0
    "hsa-miR-4717-3p",1,23,20,0,13,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6850-5p",1,5,0,0,22,1,0,0,0,0,0,0,7,0,10,0,0
    "hsa-miR-6886-5p",1,0,2,0,0,0,10,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5587-3p",1,7,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-31-5p",1,97,25,0,0,0,0,0,0,0,0,0,17,0,0,0,0
    "hsa-miR-495-3p",1,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4708-3p",1,23,9,0,0,0,3,0,0,0,0,0,0,0,0,2,7
    "hsa-miR-4753-5p",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-942-3p",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5010-3p",1,45,27,0,0,0,0,0,0,0,0,0,3,0,0,0,0
    "hsa-miR-491-5p",1,8,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6827-5p",1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-101-5p",1,154,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6770-3p",1,10,8,0,19,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6853-3p",1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6895-3p",1,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3190-3p",1,13,4,0,14,0,0,0,0,0,0,0,11,0,18,13,0
    "hsa-miR-487a-5p",1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-150-3p",1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3192-5p",1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2355-5p",1,47,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4798-5p",1,18,22,0,0,0,0,0,0,0,0,0,0,0,1,0,0
    "hsa-miR-6726-3p",1,4,8,3,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-153-3p",0,424,163,0,5,0,6,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7-1-3p",0,347,113,0,1,0,20,0,0,0,0,0,17,0,12,7,0
    "hsa-miR-338-3p",0,289,100,0,0,0,0,1,0,0,0,0,0,0,0,0,3
    "hsa-miR-29b-3p",0,98,57,0,9,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-542-3p",0,93,42,0,0,0,0,0,0,0,0,18,0,0,0,0,0
    "hsa-miR-20a-3p",0,91,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1255a",0,81,25,0,0,0,0,1,0,0,0,0,0,0,36,0,3
    "hsa-miR-6798-3p",0,79,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-137",0,77,34,0,0,0,0,0,0,0,0,0,8,0,0,0,0
    "hsa-miR-3677-3p",0,66,21,0,0,0,10,0,0,0,0,0,9,0,18,0,0
    "hsa-miR-550a-5p",0,56,23,0,0,0,0,1,0,0,0,0,0,0,12,0,1
    "hsa-miR-135b-5p",0,53,17,0,0,0,0,0,0,0,0,0,0,0,0,15,0
    "hsa-miR-24-1-5p",0,52,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6716-3p",0,51,13,0,6,0,0,0,0,0,0,0,16,0,0,0,0
    "hsa-miR-217",0,51,22,0,0,0,0,0,0,0,0,0,14,0,0,0,6
    "hsa-miR-135b-3p",0,48,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-616-5p",0,46,11,0,4,0,13,0,0,0,0,0,8,0,0,0,2
    "hsa-miR-331-5p",0,42,10,0,0,0,0,0,0,0,0,0,0,0,12,0,0
    "hsa-miR-627-5p",0,39,16,0,0,15,0,0,0,0,0,0,0,0,0,9,0
    "hsa-miR-3940-3p",0,39,4,0,25,0,17,0,0,0,0,0,9,0,0,0,0
    "hsa-miR-5001-3p",0,37,13,0,0,0,0,0,0,0,0,0,0,0,0,11,0
    "hsa-miR-6806-3p",0,36,19,0,18,0,0,0,0,0,0,0,12,0,0,0,0
    "hsa-miR-4775",0,35,16,0,0,0,0,0,0,0,0,0,0,0,20,8,0
    "hsa-miR-675-5p",0,34,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3139",0,33,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-203b-3p",0,33,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1266-5p",0,32,9,0,0,0,0,0,0,0,0,22,0,0,0,36,0
    "hsa-miR-345-3p",0,31,8,0,0,0,22,0,0,0,0,0,10,0,0,11,0
    "hsa-miR-450b-5p",0,31,17,0,0,0,0,0,0,0,0,0,13,0,0,0,0
    "hsa-miR-660-3p",0,31,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-21-3p",0,29,29,0,0,0,3,0,0,0,0,21,11,0,0,0,0
    "hsa-miR-3145-3p",0,27,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-20b-3p",0,26,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-424-5p",0,26,10,0,0,0,0,0,0,0,0,0,0,2,0,0,0
    "hsa-miR-18b-5p",0,24,4,0,0,0,6,0,0,0,0,0,0,0,0,4,0
    "hsa-miR-449c-5p",0,24,21,0,0,0,0,0,0,0,0,0,0,1,0,0,2
    "hsa-miR-7703",0,24,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-585-3p",0,23,10,0,0,0,0,0,0,0,0,0,0,0,0,9,0
    "hsa-miR-362-3p",0,23,9,0,0,0,0,0,0,0,0,0,4,0,0,0,0
    "hsa-miR-3120-3p",0,20,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-124-5p",0,20,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-216b-5p",0,20,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6720-5p",0,19,10,0,0,0,0,0,0,0,0,0,12,0,0,0,0
    "hsa-miR-19b-1-5p",0,19,12,0,0,0,9,0,0,0,0,0,0,0,0,19,0
    "hsa-miR-590-5p",0,19,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-589-3p",0,19,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4742-3p",0,19,5,0,0,0,9,0,0,0,0,0,0,0,0,8,0
    "hsa-miR-548n",0,18,10,0,0,0,0,0,0,0,0,0,0,0,0,0,2
    "hsa-miR-6770-5p",0,18,15,0,0,0,29,0,0,0,0,0,18,0,6,0,0
    "hsa-miR-5696",0,18,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-539-3p",0,18,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-219b-3p",0,17,7,0,0,0,19,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3149",0,17,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-196b-3p",0,17,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548j-5p",0,17,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ba",0,17,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-103a-2-5p",0,16,19,0,0,0,0,0,0,0,0,6,0,0,0,0,0
    "hsa-miR-455-5p",0,16,5,0,0,0,0,0,0,0,0,0,0,0,0,16,0
    "hsa-miR-642a-3p",0,16,4,0,0,0,0,0,0,0,0,0,16,11,16,0,1
    "hsa-miR-664b-5p",0,16,3,0,0,0,0,0,0,0,0,0,12,0,0,1,0
    "hsa-miR-4999-5p",0,16,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-455-3p",0,15,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4640-3p",0,15,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4726-5p",0,15,4,0,0,0,2,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3074-3p",0,15,3,0,0,0,0,0,0,0,0,0,8,0,0,0,0
    "hsa-miR-3662",0,15,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-95-5p",0,14,7,0,0,0,0,0,0,0,0,14,0,0,36,0,0
    "hsa-miR-3684",0,14,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6837-3p",0,14,13,0,0,15,0,0,0,0,0,0,9,0,34,0,0
    "hsa-miR-5695",0,14,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-675-3p",0,14,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3150a-5p",0,14,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3150a-3p",0,14,2,0,0,0,0,0,0,0,0,0,8,0,0,18,0
    "hsa-miR-3177-5p",0,13,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3143",0,13,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1
    "hsa-miR-193a-3p",0,13,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548at-5p",0,13,7,0,15,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-545-5p",0,13,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5688",0,13,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0
    "hsa-miR-450a-5p",0,13,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6802-3p",0,13,4,0,0,0,10,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3939",0,12,8,0,15,0,0,0,0,0,0,0,13,0,0,0,0
    "hsa-miR-16-1-3p",0,12,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-216a-3p",0,12,4,0,0,0,0,0,0,0,0,0,0,0,16,0,0
    "hsa-miR-6833-3p",0,12,3,0,0,0,6,0,0,0,0,0,10,0,0,10,0
    "hsa-miR-6516-5p",0,12,2,0,16,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6750-3p",0,12,3,0,0,0,15,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4667-3p",0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-153-5p",0,11,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7112-3p",0,11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-26a-1-3p",0,11,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-181b-3p",0,11,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6508-3p",0,11,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0
    "hsa-miR-627-3p",0,11,4,0,0,0,0,0,0,0,0,0,0,0,12,5,0
    "hsa-miR-5583-3p",0,11,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6513-5p",0,10,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-34a-3p",0,10,6,0,0,0,0,0,0,0,0,0,22,0,0,0,0
    "hsa-miR-6516-3p",0,10,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7156-5p",0,10,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-597-3p",0,10,7,0,0,0,0,0,0,0,0,0,0,0,17,0,0
    "hsa-miR-7114-3p",0,10,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-488-3p",0,10,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5690",0,10,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3685",0,9,4,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-6823-5p",0,9,3,0,0,0,13,0,0,0,0,0,7,0,0,9,0
    "hsa-miR-4504",0,9,3,0,0,0,0,0,0,0,0,0,0,0,0,10,0
    "hsa-miR-6510-3p",0,9,4,0,0,0,0,0,0,0,0,0,8,0,0,11,0
    "hsa-miR-142-5p",0,9,1,23,37,10,0,0,0,0,0,19,20,0,44,18,0
    "hsa-miR-4766-3p",0,9,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1284",0,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-18b-3p",0,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4461",0,9,5,0,0,0,9,0,0,0,0,18,0,0,0,1,0
    "hsa-miR-3617-5p",0,9,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6737-3p",0,9,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-211-5p",0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548al",0,8,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-23b-5p",0,8,6,0,0,0,0,0,0,0,0,0,11,0,0,28,0
    "hsa-miR-5680",0,8,6,0,0,0,0,0,0,0,0,0,6,0,17,1,0
    "hsa-miR-99b-5p",0,8,1,0,0,0,0,0,0,0,0,0,0,0,23,11,0
    "hsa-miR-147b",0,8,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6864-5p",0,8,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-643",0,8,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7156-3p",0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-874-5p",0,8,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0
    "hsa-miR-548a-3p",0,8,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6513-3p",0,8,2,0,0,0,5,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ah-3p|hsa-miR-548av-3p",0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-30c-2",0,8,2,0,14,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-103a-1",0,8,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0
    "hsa-miR-6762-5p",0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3128",0,7,4,0,0,0,0,0,0,0,0,0,15,0,0,0,0
    "hsa-miR-363-5p",0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-28-5p",0,7,3,0,0,11,13,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6784-3p",0,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-199b-5p",0,7,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3688-3p",0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5003-3p",0,7,2,0,0,0,13,0,0,0,0,9,14,0,20,12,0
    "hsa-miR-6797-3p",0,7,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-542-5p",0,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4749-3p",0,7,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-34b-5p",0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1249-3p",0,7,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-580-3p",0,7,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5004-5p",0,7,1,0,0,0,25,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5094",0,7,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4671-3p",0,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-let-7i-3p",0,7,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-129-2-3p",0,6,6,0,0,0,0,0,0,0,0,0,0,2,0,0,0
    "hsa-let-7a-1",0,6,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0
    "hsa-miR-6826-5p",0,6,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0
    "hsa-miR-3191-3p",0,6,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6728-3p",0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6862-3p",0,6,1,0,0,0,0,0,0,0,0,0,11,0,0,0,0
    "hsa-miR-450a-1-3p",0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548az-5p",0,6,12,0,0,0,0,0,0,0,0,0,10,2,0,0,0
    "hsa-miR-3619-5p",0,6,4,0,0,4,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6804-3p",0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4289",0,6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5004-3p",0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-570-3p",0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-502-5p",0,6,3,0,0,0,0,0,0,0,0,0,0,0,13,1,0
    "hsa-miR-3177-3p",0,6,3,14,0,0,15,0,0,0,0,0,30,0,8,4,0
    "hsa-miR-3925-5p",0,6,1,0,0,0,18,0,0,0,0,0,27,3,0,28,0
    "hsa-miR-4760-5p",0,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4728-3p",0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6804-5p",0,6,2,0,0,15,20,0,0,0,0,0,0,0,0,13,0
    "hsa-miR-6855-3p",0,6,1,0,0,7,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5187-3p",0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3157-5p",0,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6868-3p",0,6,11,0,0,0,13,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4474-3p",0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6848-5p",0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-100-5p",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-199a-5p",0,5,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0
    "hsa-miR-6813-5p",0,5,4,0,0,0,8,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4751",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6769a-5p",0,5,0,0,6,0,8,0,0,0,0,0,7,0,0,0,0
    "hsa-miR-1278",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4641",0,5,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6732-3p",0,5,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0
    "hsa-miR-767-5p",0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6819-3p",0,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6750-5p",0,5,1,0,0,0,0,0,0,0,0,0,15,0,0,8,0
    "hsa-miR-6754-3p",0,5,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2115-3p",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4747-3p",0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-124-3p",0,5,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0
    "hsa-miR-6754-5p",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3163",0,5,2,0,0,0,5,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4754",0,5,1,0,0,0,7,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4762-3p",0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7-2-3p",0,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6840-3p",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-676-5p",0,5,4,0,0,0,11,0,0,0,0,0,10,0,0,15,0
    "hsa-miR-4666a-5p",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3942-3p",0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-377-3p",0,5,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-186-3p",0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3116",0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-579-3p",0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-125a-5p",0,4,6,0,0,0,11,0,0,0,0,0,0,0,0,5,2
    "hsa-miR-3064-3p",0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3160-3p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6735-3p",0,4,6,0,0,0,0,0,0,0,0,0,7,0,0,10,0
    "hsa-miR-6879-3p",0,4,5,0,0,13,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6796-3p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-612",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ad-5p|hsa-miR-548ae-5p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
    "hsa-miR-6793-3p",0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6743-3p",0,4,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6739-3p",0,4,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4671-5p",0,4,2,0,0,0,0,0,0,0,0,0,18,0,0,0,0
    "hsa-miR-496",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4457",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6852-3p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6851-5p",0,4,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0
    "hsa-miR-548t-5p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0
    "hsa-miR-6767-3p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-218-1-3p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3616-5p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4640-5p",0,4,1,0,0,0,0,0,0,0,0,0,8,0,0,0,0
    "hsa-miR-6755-3p",0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6818-5p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4656",0,4,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0
    "hsa-miR-6818-3p",0,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ap-5p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3125",0,4,0,0,17,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-548ab",0,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,2
    "hsa-miR-4685-3p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-188-3p",0,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6841-5p",0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6859-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-449b-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4730",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-664b-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4762-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-1304-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4803",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6885-5p",0,3,3,11,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4473",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548am-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6730-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6742-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7110-3p",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-29a-5p",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6827-3p",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5091",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6844",0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-154-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7845-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4683",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4528",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-219b-5p",0,3,0,0,0,0,0,0,0,0,0,0,14,0,0,22,0
    "hsa-miR-7106-3p",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1285-3p",0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
    "hsa-miR-6880-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6885-3p",0,3,2,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-4654",0,3,3,0,0,6,9,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1252-5p",0,3,2,8,0,14,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-541-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0
    "hsa-miR-597-5p",0,3,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4794",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2467-3p",0,3,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-933",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3193",0,3,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4632-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3121-3p",0,3,2,0,0,0,0,0,0,0,0,13,0,0,16,0,0
    "hsa-miR-190a-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6733-5p",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3164",0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6746-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3165",0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-573",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6783-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4716-3p",0,3,4,4,14,0,15,0,0,0,0,0,13,0,0,11,0
    "hsa-miR-1302",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-624-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5002-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6889-5p",0,3,2,11,11,0,0,0,0,0,0,0,9,0,0,24,0
    "hsa-miR-6866-3p",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-545-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3675-3p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6854-5p",0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1243",0,3,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6796-5p",0,3,1,0,0,16,0,0,0,0,0,0,0,0,0,12,0
    "hsa-let-7b",0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-192-3p",0,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-509-3p",0,2,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-379-5p",0,2,1,31,17,11,27,0,0,0,0,0,23,0,41,64,0
    "hsa-mir-106a",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1285-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-499a-5p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-615-5p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5587-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6871-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4690-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4713-5p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6851-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3923",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4638-3p",0,2,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0
    "hsa-miR-548o-3p",0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4750-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1911-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6773-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6814-3p",0,2,1,0,0,0,5,0,0,0,0,15,0,0,0,0,0
    "hsa-miR-624-5p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-136-3p",0,2,0,0,0,0,13,0,0,0,0,0,0,0,23,0,4
    "hsa-miR-219a-2-3p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6829-5p",0,2,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0
    "hsa-miR-6744-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6895-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4639-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-556-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5090",0,2,0,7,0,0,0,0,0,0,0,0,8,0,0,0,0
    "hsa-miR-4712-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4753-3p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6780b-3p",0,2,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0
    "hsa-miR-3664-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7151-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4723-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1292-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ag",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4684-3p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7152-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4677-5p",0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6797-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ak",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6792-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4470",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6874-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-581",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6884-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4642",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7109-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6791-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6874-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6888-3p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4687-5p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-642b",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548aj-5p|hsa-miR-548g-5p|hsa-miR-548x-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-651-5p",0,2,0,0,0,0,0,4,0,0,0,0,0,0,0,10,0
    "hsa-miR-3610",0,2,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0
    "hsa-miR-6752-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6734-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548w",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-586",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3680-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4725-3p",0,2,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0
    "hsa-miR-548au-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3145-5p",0,2,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0
    "hsa-miR-409-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4787-3p",0,2,2,0,0,0,0,0,0,0,0,0,14,0,0,0,0
    "hsa-miR-3692-5p",0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5682",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7112-5p",0,2,2,0,0,0,0,0,0,0,0,0,0,0,20,0,0
    "hsa-miR-4501",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ai|hsa-miR-570-5p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6830-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-449b-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7151-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-488-5p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5000-3p",0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-515-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1304-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3917",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4691-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-551a",0,2,3,0,0,0,1,0,0,0,0,0,11,0,16,0,0
    "hsa-miR-3616-3p",0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4709-3p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6816-5p",0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548av-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4791",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4714-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1914-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6890-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0
    "hsa-miR-6878-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548as-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6795-5p",0,1,0,0,6,8,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4800-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6798-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-216a-5p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5006-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1298-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4695-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6768-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3134",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4522",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-578",0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4743-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6831-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-585-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1262",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-500b-3p",0,1,1,0,13,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-491-3p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,21,0,0
    "hsa-miR-6780a-3p",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4655-5p",0,1,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7114-5p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1323",0,1,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6836-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-6776-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5095",0,1,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5692a",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ah-3p|hsa-miR-548p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5196-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6760-5p",0,1,0,0,0,0,0,0,0,0,0,18,23,0,16,0,0
    "hsa-miR-6842-5p",0,1,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0
    "hsa-miR-6739-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6505-5p",0,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-187-3p",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-31-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-670-3p",0,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-600",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6733-3p",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4740-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6854-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6781-3p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6741-5p",0,1,2,0,14,0,7,0,0,0,0,0,0,0,15,0,0
    "hsa-miR-5582-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548au-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4734",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0
    "hsa-miR-5088-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-96-3p",0,1,3,0,0,0,0,0,0,0,0,18,0,0,0,0,0
    "hsa-miR-3529-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-516a-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6865-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,7,5,0
    "hsa-miR-1277-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1238-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3135a",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ad-5p|hsa-miR-548ae-5p|hsa-miR-548ay-5p|hsa-miR-548d-5p",0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-181b-2",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4639-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6779-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-106a-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4755-3p",0,1,0,0,0,0,0,0,0,0,0,0,9,2,0,0,0
    "hsa-mir-92a-1",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4459",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6729-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-588",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1231",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-668-3p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6835-5p",0,1,3,0,12,0,11,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4786-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548d-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6125",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
    "hsa-miR-6735-5p",0,1,4,0,0,0,0,0,0,0,0,30,14,0,0,0,0
    "hsa-miR-4446-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4714-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-934",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3153",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-369-3p",0,1,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-154-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2113",0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-493-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,21,0,0
    "hsa-miR-4723-5p",0,1,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6749-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-518a-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-3142",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548ay-3p",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3142",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3942-5p",0,1,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6769b-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6742-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1537-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-338-5p",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6505-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4533",0,1,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-508-3p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7152-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4664-5p",0,1,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0
    "hsa-miR-3659",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4701-5p",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3908",0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6857-5p",0,1,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6861-3p",0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3183",0,1,1,0,0,0,0,0,0,0,0,0,9,0,0,12,0
    "hsa-miR-6893-3p",0,1,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6845-3p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1291",0,1,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5003-5p",0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1248",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1297",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-30c-1",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-7-2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-92a-2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-29c-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-500b",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-let-7a-2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-let-7f-1",0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6824-3p",0,0,3,0,0,0,0,0,0,0,0,0,0,0,18,0,0
    "hsa-miR-450a-2-3p",0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6748-5p",0,0,3,0,0,0,0,0,0,0,0,0,11,0,0,9,0
    "hsa-miR-548p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6749-5p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3613-5p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-556-5p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-383-5p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6823-3p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6781-5p",0,0,2,0,0,0,9,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-15a-3p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3151-5p",0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,8,0
    "hsa-miR-328-5p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6832-3p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4764-5p",0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7155-5p",0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0
    "hsa-miR-7108-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4465",0,0,1,0,0,0,0,0,0,0,0,0,0,0,10,0,0
    "hsa-miR-518f-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2114-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4644",0,0,1,0,3,0,0,0,0,0,0,0,10,0,0,13,0
    "hsa-miR-3192-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3199",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-539-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4254",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6757-5p",0,0,1,0,0,11,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6509-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4523",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1185-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-526b-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2116-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4784",0,0,1,0,0,0,3,0,0,0,0,0,7,0,24,8,0
    "hsa-miR-6782-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548a-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-559",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4688",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4453",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1909-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-548f-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4515",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-376b-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4798-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-944",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-7113-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,0
    "hsa-miR-7976",0,0,1,0,0,0,7,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-2276-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6849-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6890-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6869-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3611",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4767",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6880-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-889-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6848-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-642b-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4781-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5580-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6727-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4797-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6887-5p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-492",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-3663-5p",0,0,1,9,0,0,0,1,0,0,0,0,0,0,0,0,0
    "hsa-miR-512-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,18,3
    "hsa-miR-216b-3p",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-548aw",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,6,0
    "hsa-miR-548ah-3p|hsa-miR-548av-3p|hsa-miR-548p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-mir-10a",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6816-3p",0,0,0,0,0,0,12,0,0,0,0,0,8,0,0,0,0
    "hsa-miR-6791-5p",0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-1247-3p",0,0,0,0,0,0,11,0,0,0,0,0,28,3,0,0,0
    "hsa-miR-6737-5p",0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-299-5p",0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-223-3p",0,0,0,0,0,0,3,0,0,0,0,0,0,0,17,9,0
    "hsa-mir-378d-2",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6833-5p",0,0,0,0,0,0,0,1,0,0,0,0,17,0,0,0,0
    "hsa-miR-7108-5p",0,0,0,0,0,0,0,0,0,0,0,22,0,1,22,0,0
    "hsa-miR-6507-3p",0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0
    "hsa-miR-410-3p",0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0
    "hsa-miR-6891-3p",0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0
    "hsa-miR-3620-5p",0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0
    "hsa-miR-1827",0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,1,0
    "hsa-miR-3175",0,0,0,0,1,0,0,0,0,0,0,0,31,0,14,8,0
    "hsa-miR-1288-3p",0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0
    "hsa-miR-3653-5p",0,0,0,0,0,0,0,0,0,0,0,0,17,0,20,1,0
    "hsa-miR-5194",0,0,0,0,0,0,0,0,0,0,0,0,15,1,0,0,0
    "hsa-miR-6881-3p",0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0
    "hsa-miR-299-3p",0,0,0,5,0,0,0,0,0,0,0,0,12,0,0,0,0
    "hsa-miR-6510-5p",0,0,0,0,0,0,0,0,0,0,0,0,11,0,13,0,0
    "hsa-miR-6509-5p",0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0
    "hsa-miR-1180-5p",0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0
    "hsa-miR-4428",0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,13,0
    "hsa-miR-6861-5p",0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0
    "hsa-miR-6501-5p",0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0
    "hsa-miR-1233-5p",0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0
    "hsa-miR-4516",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
    "hsa-miR-494-3p",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
    "hsa-mir-151b",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-4695-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0
    "hsa-miR-145-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0
    "hsa-miR-4482-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0
    "hsa-miR-5006-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0
    "hsa-miR-3126-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,13,0
    "hsa-miR-602",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0
    "hsa-miR-127-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0
    "hsa-miR-376c-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0
    "hsa-miR-6821-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0
    "hsa-miR-6761-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0
    "hsa-miR-371a-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0
    "hsa-miR-4690-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0
    "hsa-miR-1910-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0
    "hsa-miR-3180",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0
    "hsa-miR-6764-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0
    "hsa-miR-1915-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
    "hsa-miR-6870-5p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
    "hsa-miR-6821-3p",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
    "hsa-miR-6799-3p",0,0,0,8,13,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-5703",0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0
    "hsa-miR-6785-3p",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0

Analysis of the RNA binding protein (RBP) motifs for RNA-Seq and miRNAs (v3)

There are several alternative R packages and tools to perform motif enrichment analysis for RNA-binding proteins (RBPs), beyond PWMEnrich::motifEnrichment(). Here are the most notable ones:

Tool / Package Enrichment Custom Motifs CLI or R? RNA-specific? Notes
PWMEnrich R Tried (see pipeline.v1-block3)
RBPmap ❌ (uses own db) Web/CLI Tried RBPmap, but it is too slow
Biostrings/TFBSTools ❌ (only scanning) R ATtRACT+Biostrings/TFBSTools (tried, pipeline.v1-block3)
rmap ✅ (CLIP-based) R
Homer CLI ⚠ RNA optional
MEME (AME, FIMO) Web/CLI ⚠ Generic Final choice: ATtRACT + FIMO (AME has a bug and could not be run)
#It was suggested to me to use “RBPmap” or “GraphProt” for this analysis.
  1. Get 3UTR.fasta, 5UTR.fasta, CDS.fasta and transcripts.fasta

             mRNA Transcript
     ┌────────────┬────────────┬────────────┐
     │   5′ UTR   │     CDS    │   3′ UTR   │
     └────────────┴────────────┴────────────┘
     ↑            ↑            ↑            ↑
     Start        Start        Stop         End
     of           Codon       Codon        of
     Transcript                             Transcript
    
     ✅ Option 1: Use GENCODE and python scripts (CHOSEN!)
    
     #Input: up- and down-, all-regulated files
     ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt    #20086
     ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt  #634
     ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt     #23832
     ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt   #375
    
     #Filtering the down-regulated genes to include only protein_coding genes before extracting 3' UTRs, because
     #1. Only protein_coding genes have well-annotated 3' UTRs
     #3' UTRs are defined as the region after the CDS (coding sequence) and before the poly-A tail.
     #Non-coding RNAs (e.g., lncRNA, snoRNA, miRNA precursors) do not have CDS, and therefore don't have canonical 3' UTRs.
     #2. In GENCODE, most UTR annotations are only provided for transcripts of gene_type = "protein_coding".
    
     cd ~/DATA/Data_Ute/RBPs_analysis/extract_3UTR_5UTR_CDS_transcript
     grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt > MKL-1_wt.EV_vs_parental-up_protein_coding.txt
     grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt > MKL-1_wt.EV_vs_parental-down_protein_coding.txt
     grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt > WaGa_wt.EV_vs_parental-up_protein_coding.txt
     grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt > WaGa_wt.EV_vs_parental-down_protein_coding.txt
    
     #Visit and Download: GENCODE FTP site https://www.gencodegenes.org/human/
         * GTF annotation file (e.g., gencode.v48.annotation.gtf.gz)
         * Corresponding genome FASTA (e.g., GRCh38.primary_assembly.genome.fa.gz)
     wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz
     wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/GRCh38.primary_assembly.genome.fa.gz
     gunzip gencode.v48.annotation.gtf.gz
     gunzip GRCh38.primary_assembly.genome.fa.gz
    
     python extract_transcript_parts.py MKL-1_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_down
     python extract_transcript_parts.py MKL-1_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_up  #5988
     python extract_transcript_parts.py WaGa_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_down  #93
     python extract_transcript_parts.py WaGa_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_up  #6538
    
     ✅ Option 2-5 see at the end!
  2. Why 3′ UTR?

     🧬 miRNA, RBP, or translation/post-transcriptional regulation
     ➡️ Use 3' UTR sequences
    
     Because:
    
     Most miRNA binding and many RBP motifs are located in the 3' UTR.
    
     It’s the primary region for mRNA stability, localization, and translation regulation.
    
     🧠 Example: You're looking for binding enrichment of miRNAs or RNA-binding proteins (PUM, HuR, etc.)
     ✅ Input = 3UTR.fasta
    
     🧪 If you're testing RBPs related to:
     - Translation initiation, upstream ORFs, or 5' cap interaction:
     ➡️ Use 5' UTR
    
     - Coding mutations, protein-level motifs, or translational efficiency:
     ➡️ Use CDS
    
     - General transcriptome-wide motif search (no preference):
     ➡️ Use transcripts, or test all regions separately to localize signal
  3. Recommended Workflow with RBPmap https://rbpmap.technion.ac.il (Too slow!)

     RBPmap itself does not compute enrichment p-values or FDR; it's a motif scanning tool.
    
     To get statistically meaningful RBP enrichments, combine RBPmap with custom permutation testing or Fisher’s exact test + multiple testing correction.
    
         1. Prepare foreground (target) and background sequences
    
             Extract 3′ UTRs of:
    
             📉 Downregulated mRNAs (foreground) — likely targeted by upregulated miRNAs
    
             ⚪ A control set of 3′ UTRs — e.g., non-differentially expressed protein-coding genes
    
                 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-all.txt > MKL-1_wt.EV_vs_parental-all_protein_coding.txt
                 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-all.txt > WaGa_wt.EV_vs_parental-all_protein_coding.txt
    
                 cut -d',' -f1 MKL-1_wt.EV_vs_parental-all_protein_coding.txt | sort > all_genes.txt  #19239
                 cut -d',' -f1 MKL-1_wt.EV_vs_parental-up_protein_coding.txt | sort > up_genes.txt  #5988
                 cut -d',' -f1 MKL-1_wt.EV_vs_parental-down_protein_coding.txt | sort > down_genes.txt  #112
                 cat up_genes.txt down_genes.txt | sort | uniq > regulated_genes.txt
                 comm -23 all_genes.txt regulated_genes.txt > background_genes.txt
                 grep -Ff background_genes.txt MKL-1_wt.EV_vs_parental-all_protein_coding.txt > MKL-1_wt.EV_vs_parental-background_protein_coding.txt  #13139
    
                 cut -d',' -f1 WaGa_wt.EV_vs_parental-all_protein_coding.txt | sort > all_genes.txt  #19239
                 cut -d',' -f1 WaGa_wt.EV_vs_parental-up_protein_coding.txt | sort > up_genes.txt  #6538
                 cut -d',' -f1 WaGa_wt.EV_vs_parental-down_protein_coding.txt | sort > down_genes.txt  #93
                 cat up_genes.txt down_genes.txt | sort | uniq > regulated_genes.txt
                 comm -23 all_genes.txt regulated_genes.txt > background_genes.txt
                 grep -Ff background_genes.txt WaGa_wt.EV_vs_parental-all_protein_coding.txt > WaGa_wt.EV_vs_parental-background_protein_coding.txt  #12608
    
                 python extract_transcript_parts.py MKL-1_wt.EV_vs_parental-background_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_background
                 python extract_transcript_parts.py WaGa_wt.EV_vs_parental-background_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_background
    
                 foreground.fasta: 你的目标(前景)序列,例如下调基因的 3′UTRs。
                 background.fasta: 你的背景对照序列,例如未显著差异表达的基因的 3′UTRs。
    
         2. Run RBPmap separately on both sets (6 runs in total)
    
             * Submit both sets of UTRs to RBPmap.
             * Use the same settings (e.g., “human genome”, “high stringency”, "Apply conservation filter" etc.)
             * Choose all RBPs
             * Download motif match outputs for both sets
    
         3. Count motif hits per RBP in each set
    
             You now have:
             For each RBP:
             a: number of target 3′ UTRs with a motif match
             b: number of background 3′ UTRs with a motif match
             c: total number of target 3′ UTRs
             d: total number of background 3′ UTRs
    
         4. Perform Fisher’s Exact Test per RBP
    
             For each RBP, construct a 2x2 table:
    
                                     Motif Present   Motif Absent
             Foreground (targets)    a               c - a
             Background              b               d - b
    
         5. Adjust p-values for multiple testing
         Use Benjamini-Hochberg (FDR) correction (e.g., in Python or R) across all RBPs tested.
    
         6.✅ Summary
    
             Step    Tool
             Prepare Database of RNA-binding motifs  ATtRACT
             3′ UTR extraction   extract_transcript_parts.py
             Motif scan  RBPmap or FIMO
             Count motif hits    Your own parser (Python or R)
             Fisher’s exact test scipy.stats or fisher.test()
             FDR correction  multipletests() or p.adjust()
    
         python rbp_enrichment.py rbpmap_downregulated.tsv rbpmap_background.tsv rbp_enrichment_results.csv
  4. Quick Drop-In Plan (RBPmap Alternative with FIMO for motif scan)

     1. [ATtRACT + FIMO (MEME suite)]
    
         ATtRACT: Database of RNA-binding motifs.
         FIMO: Fast and scriptable motif scanning tool.
    
         #Download RBP motifs (PWM) from ATtRACT DB; Convert to MEME format (if needed); Use FIMO to scan UTR sequences
    
         grep "Homo_sapiens" ATtRACT_db.txt > attract_human.txt
    
         #cut -f12 attract_human.txt | sort | uniq > valid_ids.txt
    
         python filter_short_fasta.py ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/MKL-1_background.3UTR.fasta MKL-1_background.filtered.3UTR.fasta
         ✅ 筛选完成: 总序列 = 70650
         🧹 已移除过短序列 (<16 nt): 1760
         🟢 保留有效序列 (≥16 nt): 68890
         📁 新背景文件保存为: MKL-1_background.filtered.3UTR.fasta
         # 检查背景文件中有多少序列:
         grep -c "^>" MKL-1_background.filtered.3UTR.fasta
         68890
         # 检查背景 FIMO 命中的总序列数:
         cut -f3 fimo_background_MKL-1_background/fimo.tsv | sort | uniq | wc -l
         67841
         python filter_short_fasta.py ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/MKL-1_up.3UTR.fasta MKL-1_up.filtered.3UTR.fasta
         python filter_short_fasta.py ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/MKL-1_down.3UTR.fasta MKL-1_down.filtered.3UTR.fasta
         python filter_short_fasta.py ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_background.3UTR.fasta WaGa_background.filtered.3UTR.fasta
         python filter_short_fasta.py ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_up.3UTR.fasta WaGa_up.filtered.3UTR.fasta
         python filter_short_fasta.py ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_down.3UTR.fasta WaGa_down.filtered.3UTR.fasta
    
         python convert_attract_pwm_to_meme.py
    
         fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_down attract_human.meme ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/MKL-1_down.3UTR.fasta
         fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_up attract_human.meme ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/MKL-1_up.3UTR.fasta
         fimo --thresh 1e-4 --oc fimo_background_MKL-1_background attract_human.meme ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/MKL-1_background.3UTR.fasta
         fimo --thresh 1e-4 --oc fimo_foreground_WaGa_down attract_human.meme ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_down.3UTR.fasta
         fimo --thresh 1e-4 --oc fimo_foreground_WaGa_up attract_human.meme ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_up.3UTR.fasta
         fimo --thresh 1e-4 --oc fimo_background_WaGa_background attract_human.meme ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_background.3UTR.fasta
    
         #Explanation for the table from FIMO (Find Individual Motif Occurrences), which scans sequences to find statistically significant matches to known motifs (e.g., RNA or DNA binding sites).
    
         Column  Meaning
         motif_id    ID of the motif, as defined in the .meme file
         motif_alt_id    Alternative ID or name for the motif (may be blank or unused)
         sequence_name   Name of the sequence where the motif was found (e.g., gene|transcript|region ID)
         start   Start position (1-based) of the motif match within the sequence
         stop    End position of the motif match
         strand  Strand on which the motif was found: + (forward) or - (reverse)
         score   Motif match score; higher scores indicate better matches
         p-value Statistical significance of the match (lower is more significant)
         q-value Adjusted p-value (False Discovery Rate corrected)
         matched_sequence    The actual sequence in the input that matches the motif
    
         ✅ Example Interpretation
         1338 ENSG00000134871|ENST00000714397|3UTR 103 114 + 23.4126 5.96e-08 0.111 GGAGAGAAGGGA
         motif_id: 1338 — a numeric ID from your motif file
         sequence_name: ENSG00000134871|ENST00000714397|3UTR — refers to the gene, transcript, and region (3′ UTR)
         start–stop: 103–114 — the motif occurs from position 103 to 114
         strand: + — found on the positive strand
         score: 23.41 — high score means strong motif match
         p-value: 5.96e-08 — very statistically significant
         q-value: 0.111 — FDR-corrected p-value
         matched_sequence: GGAGAGAAGGGA — the actual sequence match in the UTR
    
         💡 Tips
         You can map motif_id to RBP (RNA-binding protein) names using an annotation file like ATtRACT_db.txt.
         Typically, q-value < 0.05 is considered significant.
         Duplicate matches in different transcripts of the same gene may occur and are valid.
         Would you like help converting motif_id to RBP names for clarity?
    
         🧠 In most biological contexts:
             * Counting a motif as present multiple times because it's in several transcripts can inflate significance.
             * If you're using Fisher's exact test (as in enrichment), this transcript-level duplication can bias results.
    
         ⚠️ Caveat: If you're studying isoform-specific regulation, then transcript-level data may be valuable and shouldn't be collapsed. But for most general RBP enrichment or gene expression studies, the gene-level collapse is preferred.
    
         #Keep only one match per gene (based on Ensembl Gene ID like ENSG00000134871) for each RBP motif, even if multiple transcripts have hits.
         #python filter_fimo_best_per_gene.py --input fimo_foreground/fimo.tsv --output fimo_foreground/fimo.filtered.tsv
         convert_gtf_to_Gene_annotation_TSV_file.py  #generate gene_annotation.tsv
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_foreground_MKL-1_down/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_foreground_MKL-1_down/fimo.filtered.tsv \
         --output_annotated fimo_foreground_MKL-1_down/fimo.filtered.annotated.tsv
         #21559
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_foreground_MKL-1_up/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_foreground_MKL-1_up/fimo.filtered.tsv \
         --output_annotated fimo_foreground_MKL-1_up/fimo.filtered.annotated.tsv
         #(736661 rows)
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_background_MKL-1_background/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_background_MKL-1_background/fimo.filtered.tsv \
         --output_annotated fimo_background_MKL-1_background/fimo.filtered.annotated.tsv
         #(1869075 rows)
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_foreground_WaGa_down/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_foreground_WaGa_down/fimo.filtered.tsv \
         --output_annotated fimo_foreground_WaGa_down/fimo.filtered.annotated.tsv
         #(20364 rows)
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_foreground_WaGa_up/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_foreground_WaGa_up/fimo.filtered.tsv \
         --output_annotated fimo_foreground_WaGa_up/fimo.filtered.annotated.tsv
         #(805634 rows)
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_background_WaGa_background/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_background_WaGa_background/fimo.filtered.tsv \
         --output_annotated fimo_background_WaGa_background/fimo.filtered.annotated.tsv
         #(1811615 rows)
    
         python run_enrichment.py \
             --attract ATtRACT_db.txt \
             --fimo_fg fimo_foreground_MKL-1_up/fimo.filtered.tsv \
             --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
             --output rbp_enrichment_MKL-1_up.csv \
             --strategy inclusive
         python run_enrichment.py \
             --attract ATtRACT_db.txt \
             --fimo_fg fimo_foreground_MKL-1_down/fimo.filtered.tsv \
             --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
             --output rbp_enrichment_MKL-1_down.csv
         python run_enrichment.py \
             --attract ATtRACT_db.txt \
             --fimo_fg fimo_foreground_WaGa_up/fimo.filtered.tsv \
             --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
             --output rbp_enrichment_WaGa_up.csv
         python run_enrichment.py \
             --attract ATtRACT_db.txt \
             --fimo_fg fimo_foreground_WaGa_down/fimo.filtered.tsv \
             --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
             --output rbp_enrichment_WaGa_down.csv
    
         python plot_volcano.py --csv rbp_enrichment_MKL-1_up.csv --output MKL-1_volcano_up.pdf --title "Upregulated MKL-1"
         python plot_rbp_heatmap.py \
         --csvs rbp_enrichment_MKL-1_up.csv rbp_enrichment_MKL-1_down.csv \
         --labels Upregulated Downregulated \
         --output MKL-1_rbp_enrichment_heatmap.pdf
    
         #Column Meaning
         #a  Number of unique foreground UTRs hit by the RBP
         #b  Number of unique background UTRs hit by the RBP
         #c  Total number of foreground UTRs
         #d  Total number of background UTRs (⬅️ this is the value you're asking about)
         #p_value, fdr   From Fisher's exact test on enrichment
    
         #-- Get all genes the number 1621 refers to --
         #AGO2,1621,5050,5732,12987,1.0,1.0   #MKL-1_up
         #motif_ids are 414 and 399
         grep "^414" fimo.filtered.annotated.tsv > AGO2.txt
         grep "^399" fimo.filtered.annotated.tsv >> AGO2.txt
         cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
         wc -l AGO2_uniq.txt
         #1621 AGO2_uniq.txt
    
         #工具 功能  关注点 应用场景
         FIMO    精确查找 motif 出现位置 motif 在什么位置出现   找出具体结合位点
         AME 统计 motif 富集情况   哪些 motif 在某组序列中更富集  比较 motif 是否显著出现更多
    
         如你还在做差异表达后的RBP富集分析,可以考虑先用 FIMO 扫描,再用你自己写的代码 + Fisher’s exact test 做类似 AME 的工作,或直接用 AME 做分析
    
         # Generate the attract_human.meme incl. Gene_name!
         #python generate_named_meme.py pwm.txt attract_human.txt
         python generate_attract_human_meme.py pwm.txt ATtRACT_db.txt
    
         #ERROR during running ame --> DEBUG!
         #--control ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_all.3UTR.fasta \
         ame --control --shuffle-- \
         --oc ame_out \
         --scoring avg \
         --method fisher --verbose 5 ../Data_RNA-Seq_MKL-1+WaGa/motif_analysis/WaGa_down.3UTR.fasta attract_human.meme
    
     2. GraphProt2 (ALTERNATIVE_TODO)
    
         ML-based tool using sequence + structure
    
         Pre-trained models for many RBPs
    
         ✅ Advantages:
    
         Local, GPU/CPU supported
    
         More biologically realistic (includes structure)
  5. miRNAs motif analysis using ATtRACT + FIMO

     ✅ Goal
    
         * Extract their sequences
         * Generate a background set
         * Run RBP enrichment (e.g., with RBPmap or FIMO)
         * Get p-adjusted enrichment stats (e.g., Fisher + BH)
    
         Input_1. DE results (differential expression file from smallRNA-seq)
             #Input: up- and down-, all-regulated files
             #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/EV_vs_parental-up.txt  #83
             #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/EV_vs_parental-down.txt  #34
             #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/EV_vs_parental-all.txt  #1304
             ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-up.txt  #66
             ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-down.txt  #38
             ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-all.txt  #1304
             #Format: 1st column = miRNA ID (e.g., hsa-miR-21-5p), optionally with other stats.
    
         Input_2. Reference FASTA (Reference sequences from miRBase or GENCODE)
             #From miRBase: https://mirbase.org/download/  https://mirbase.org/download/CURRENT/
             ##miRBase_v21
             #mature.fa.gz → contains mature miRNA sequences
             #hairpin.fa.gz → for pre-miRNAs
    
             cp ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-*.txt .
             #"hsa-miR-3180|hsa-miR-3180-3p"
             #>hsa-miR-3180 MIMAT0018178 Homo sapiens miR-3180
             #UGGGGCGGAGCUUCCGGAG
             #>hsa-miR-3180-3p MIMAT0015058 Homo sapiens miR-3180-3p
             #UGGGGCGGAGCUUCCGGAGGCC
    
         5.1 (Optional, not used!)
    
             #python extract_miRNA_fasta.py EV_vs_parental-up.txt mature_v21.fa up_mature_miRNAs.fa --unmatched up_mature_unmatched.txt  #84+0
             #python extract_miRNA_fasta.py EV_vs_parental-up.txt hairpin_v21.fa up_precursor_miRNAs.fa --unmatched up_precursor_unmatched.txt  #0
             #python extract_miRNA_fasta.py EV_vs_parental-down.txt mature_v21.fa down_mature_miRNAs.fa --unmatched down_mature_unmatched.txt  #34+0
             #python extract_miRNA_fasta.py EV_vs_parental-down.txt hairpin_v21.fa down_precursor_miRNAs.fa --unmatched down_precursor_unmatched.txt  #0
             #python extract_miRNA_fasta.py EV_vs_parental-all.txt mature_v21.fa all_mature_miRNAs.fa --unmatched all_mature_unmatched.txt         #1304+16
             #python extract_miRNA_fasta.py EV_vs_parental-all.txt hairpin_v21.fa all_precursor_miRNAs.fa --unmatched all_precursor_unmatched.txt  #16
             python extract_miRNA_fasta.py untreated_vs_parental_cells-up.txt mature_v21.fa up_mature_miRNAs.fa --unmatched up_mature_unmatched.txt  #67+0
             python extract_miRNA_fasta.py untreated_vs_parental_cells-up.txt hairpin_v21.fa up_precursor_miRNAs.fa --unmatched up_precursor_unmatched.txt  #0
             python extract_miRNA_fasta.py untreated_vs_parental_cells-down.txt mature_v21.fa down_mature_miRNAs.fa --unmatched down_mature_unmatched.txt  #38+0
             python extract_miRNA_fasta.py untreated_vs_parental_cells-down.txt hairpin_v21.fa down_precursor_miRNAs.fa --unmatched down_precursor_unmatched.txt  #0
             python extract_miRNA_fasta.py untreated_vs_parental_cells-all.txt mature_v21.fa all_mature_miRNAs.fa --unmatched all_mature_unmatched.txt         #1304+16
             python extract_miRNA_fasta.py untreated_vs_parental_cells-all.txt hairpin_v21.fa all_precursor_miRNAs.fa --unmatched all_precursor_unmatched.txt  #16
    
         5.2 (Advanced)
             Extract Sequences + Background Set
    
             Inputs:
                 * up_miRNA.txt and down_miRNA.txt: DE results (first column = miRNA name, e.g., hsa-miR-21-5p)
                 * mature.fa or hairpin.fa from miRBase
    
             Outputs:
                 * mirna_up.fa
                 * mirna_down.fa
                 * mirna_background.fa
    
             #Use all remaining miRNAs as background:
             python prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna --full-background
             mv mirna_background.fa mirna_full-background.fa
             #Use a random-subset background. Note that the generated background contains max(n_up, n_down) sequences; in this case that is the size of the up set (84 records):
             python prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna
             # grep ">" mature_v21.fa | wc -l  #35828
             # grep ">" mirna_full-background.fa | wc -l  #35710-->35723
             # grep ">" mirna_up.fa | wc -l  #84
             # grep ">" mirna_down.fa | wc -l  #34
             # grep ">" mirna_background.fa | wc -l  #84-->67
             # #35,710 + 84 + 34 = 35,828
    
         🔬 What You Can Do Next
         Goal    Tool    Input
         * RBP motif enrichment in pre-miRNAs    RBPmap, FIMO, AME   up_precursor_miRNAs.fa
         * Motif comparison (up vs down miRNAs)  DREME, MEME, HOMER  Up/down mature miRNAs
         * Build background for enrichment   Random subset of other miRNAs   Filtered from hairpin.fa
    
         fimo --thresh 1e-4 --oc fimo_mirna_down attract_human.meme mirna_down.fa
         fimo --thresh 1e-4 --oc fimo_mirna_up attract_human.meme mirna_up.fa
         fimo --thresh 1e-4 --oc fimo_mirna_full-background attract_human.meme mirna_full-background.fa
         fimo --thresh 1e-4 --oc fimo_mirna_background attract_human.meme mirna_background.fa
         #END
    
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_mirna_down/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_mirna_down/fimo.filtered.tsv \
         --output_annotated fimo_mirna_down/fimo.filtered.annotated.tsv  #21
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_mirna_up/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_mirna_up/fimo.filtered.tsv \
         --output_annotated fimo_mirna_up/fimo.filtered.annotated.tsv  #48
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_mirna_full-background/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_mirna_full-background/fimo.filtered.tsv \
         --output_annotated fimo_mirna_full-background/fimo.filtered.annotated.tsv  #896
         python filter_fimo_best_per_gene_annotated.py \
         --input fimo_mirna_background/fimo.tsv \
         --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
         --output_filtered fimo_mirna_background/fimo.filtered.tsv \
         --output_annotated fimo_mirna_background/fimo.filtered.annotated.tsv  #57
    
         python run_enrichment_miRNAs.py \
             --attract ATtRACT_db.txt \
             --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
             --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
             --output rbp_enrichment_mirna_up.csv \
             --strategy inclusive
         python run_enrichment_miRNAs.py \
             --attract ATtRACT_db.txt \
             --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
             --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
             --output rbp_enrichment_mirna_down.csv \
             --strategy inclusive
         #python run_enrichment_miRNAs.py \
         #    --attract ATtRACT_db.txt \
         #    --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
         #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
         #    --output rbp_enrichment_mirna_up_on_subset-background.csv \
         #    --strategy inclusive
         #python run_enrichment_miRNAs.py \
         #    --attract ATtRACT_db.txt \
         #    --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
         #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
         #    --output rbp_enrichment_mirna_down_on_subset-background.csv \
         #    --strategy inclusive
    
         #FXR2   1 (hsa-miR-92b-5p)  1   1   118 0.0168067226890756  0.365546218487395
         #ORB2   1 (hsa-miR-4748)    1   1   118 0.0168067226890756  0.365546218487395
    
         #-- Get the miRNAs that the FXR2/ORB2 hit counts (1 each) refer to --
         grep "^FXR2" ATtRACT_db.txt
         #motif_id is M020_0.6
         grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv > FXR2.txt
         grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
         #cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
         #wc -l AGO2_uniq.txt (1621 records)
    
         grep "^ORB2" ATtRACT_db.txt
         grep "^M120_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
  6. RBP Enrichment from RBPmap Results (NOT implemented!) 🔹 Use RBPmap output (typically CSV or TSV) 🔹 Compare hit counts in input vs background 🔹 Perform Fisher’s exact test + Benjamini-Hochberg correction 🔹 Plot significantly enriched RBPs

         📁 Requirements
         You’ll need:
    
         File    Description
         rbpmap_up.tsv   RBPmap result file for upregulated set
         rbpmap_background.tsv   RBPmap result file for background set
    
         📝 These should have columns like:
    
         Motif Name or Protein
    
         Sequence Name or Sequence ID
         (If different, I’ll show you how to adjust.)
    
         python analyze_rbpmap_enrichment.py rbpmap_up.tsv rbpmap_background.tsv enriched_up.csv enriched_up_plot.png
    
         ✅ Output
         enriched_up.csv
         RBP FG_hits BG_hits pval    padj    enriched
         ELAVL1  24  2   0.0001  0.003   ✅
         HNRNPA1 15  10  0.04    0.06    ❌
    
         enriched_up_plot.png
         Barplot showing top significant RBPs (lowest FDR)
    
         🧰 Customization Options
         Would you like:
    
             * Support for multiple RBPmap files at once?
             * To match by RBP family?
             * A full report (PDF/HTML) of top hits?
             * Let me know, and I’ll tailor the next script!
  7. The planned pipeline is very reasonable and scientifically sound for RBP (RNA-binding protein) enrichment using miRNAs as input for FIMO motif scanning and downstream analysis.

     Here’s a breakdown of why your steps make sense, plus a few important suggestions:
    
     ✅ YOUR PIPELINE LOGIC (EXCELLENT)
     1. Motif Scanning with FIMO
    
     fimo --thresh 1e-4 --oc fimo_mirna_* attract_human.meme mirna_*.fa
     ✔️ Uses ATtRACT motifs to scan miRNA sequences.
    
     ✔️ Applies a reasonable p-value threshold (1e-4) to detect motif hits.
    
     📌 If you want more sensitivity (but more noise), you could try 1e-3; for higher stringency, 1e-5.
    
     2. Filter FIMO hits to best per gene and annotate
    
     python filter_fimo_best_per_gene_annotated.py --input ... --annot ... --output ...
     ✔️ Filters redundant hits.
    
     ✔️ Annotates with gene symbols or other metadata — essential for interpretability.
    
     3. RBP Enrichment
    
     python run_enrichment.py --fimo_fg ... --fimo_bg ... --attract ... --output ...
     ✔️ Uses foreground (up/down-regulated) and background (non-differential) miRNAs.
    
     ✔️ The --strategy inclusive is typical and includes all motif hits, not just strongest per gene or region.
    
     ✔️ Running up and down separately = ✅ best practice.
    
     🔍 MINOR IMPROVEMENTS / SUGGESTIONS
     ✅ Use the same background for both up and down analyses:
     You already are — good!
    
     This ensures your enrichment comparison is balanced and interpretable.
    
     ⚠️ Check motif directionality (optional):
     Some RBPs are strand-specific — but since miRNAs are single-stranded, and your FASTA is from mature sequences, you’re probably okay.
    
     ✅ Consider including summary stats:
     How many motifs hit in up/down/background?
    
     How many unique RBPs?
    
     This helps interpret enrichment p-values.
    
     🧪 Statistical validation (optional but helpful):
     If you have many miRNAs (e.g. >30), consider bootstrapping enrichment from random subsets of the background to assess robustness.
    
     ✅ FINAL VERDICT
     Your pipeline is:
    
     ✔️ Technically correct
    
     ✔️ Biologically appropriate
    
     ✔️ Reproducible and clear
    
     🧠 Well-structured for miRNA-RBP motif enrichment
  8. Why background differs between RNA-seq and miRNA motif analysis:

     In RNA-seq differential expression analysis, especially for gene-level enrichment (e.g., GO or pathway analysis), the background typically includes all genes detected or expressed, not just a random subset. The logic is:
    
     ✅ “If a gene was testable (i.e., expressed above a threshold), it should be part of the universe we consider when evaluating enrichment.”
    
     This is because the statistical model assumes:
    
     Your universe is all genes tested.
    
     You're asking: Are my differentially expressed genes overrepresented in some category, compared to all testable genes?
    
     🔬 Why a random subset background was used in the miRNA motif pipeline (e.g., RBP enrichment):
     In the prepare_miRNA_sets.py script, the subset was used for one reason only:
    
     📉 To balance the number of background sequences relative to the foreground (up/down) — especially for statistical tests like Fisher's exact test.
    
     But this is optional and not required for correctness. A larger background improves statistical power, but must match distributional properties (e.g., GC content, sequence length) to avoid bias.
    
     ✅ So: Should you use all non-differentially expressed miRNAs as background?
     Yes, that is often more appropriate, assuming:
    
     You have a full set of detectable/testable miRNAs (e.g., all from the FASTA).
    
     You exclude the foreground (up/down) sets.
    
     The background matches the general properties of the foreground set.
  9. RBP enrichment via FIMO (another description; it is the same as the workflow in point 4)

     1. Collect the 3′ UTR sequences: Use the 3UTR.fasta file generated earlier, filtered to protein-coding and downregulated genes.
    
     2. Prepare Motif Database (MEME format)
    
         * ATtRACT: https://attract.cnic.es
         * RBPDB: http://rbpdb.ccbr.utoronto.ca
         * Ray2013 (CISBP-RNA motifs) — available via MEME Suite
         * [RBPmap motifs (if downloadable)]
         #Example format: rbp_motifs.meme
    
     2. Run FIMO to Scan for RBP Motifs (Similar to RBPmap)
    
         fimo --oc fimo_up rbp_motifs.meme mirna_up.fa
         fimo --oc fimo_down rbp_motifs.meme mirna_down.fa
         fimo --oc fimo_background rbp_motifs.meme mirna_background.fa
         #This produces fimo.tsv in each output folder.
    
     3. Run RBP motif enrichment with AME (Analysis of Motif Enrichment) from the MEME Suite. Note that FIMO + run_enrichment.py is equivalent to AME; however, running AME directly returned an error:
    
         ame \
         --control control_3UTRs.fasta \
         --oc ame_out \
         --scoring avg \
         --method fisher \
         3UTR.fasta \
         rbp_motifs.meme
    
         Where:
    
         * 3UTR.fasta = your downregulated genes’ 3′ UTRs
         * control_3UTRs.fasta = background UTRs (e.g., random protein-coding genes not downregulated)
         * rbp_motifs.meme = motif file from RBPDB or Ray2013
    
     4. Interpret Results: Output includes RBP motifs enriched in your downregulated mRNAs' 3′ UTRs.
    
         You can then link enriched RBPs to known interactions with your upregulated miRNAs, or explore their regulatory roles.
    
     5. ✅ Bonus: Predict Which mRNAs Are Targets of Your miRNAs
    
         Use tools like: miRanda, TargetScan, miRDB
    
         Then intersect predicted targets with your downregulated genes to identify likely functional interactions.
    
     6. Summary
    
         Goal    Input   Tool / Approach
         RBP enrichment  3UTR.fasta of downregulated genes   AME with RBP motifs
         Background/control  3′ UTRs from non-differential or upregulated genes
         Link miRNA to targets   Use TargetScan / miRanda    Intersect with down genes
    
     7. Would you like:
    
         * Ready-to-use RBP motif .meme file?
         * Script to generate background sequences?
         * Visualization options for the enrichment results?
  10. Other options to get sequences of 3UTR, 5UTR, CDS and mRNA transcripts

     ✅ Option 2: Use Ensembl BioMart (web-based, no coding) --> Takes too long!
    
         Go to Ensembl BioMart https://www.ensembl.org/biomart/martview/7b826bcbd0cec79021977f8dc12a8f61
    
         Select:
    
         Database: Ensembl Genes
         Dataset: Homo sapiens genes (GRCh38 or latest)
    
         Click on “Filters” → expand Region or Gene to limit your selection (optional).
         Click on “Attributes”:
         Under Sequences, check:
         Sequences
         3' UTR sequences
    
         Optionally add gene IDs, transcript IDs, etc.
    
         Click “Results” to view/download the FASTA of 3' UTRs.
    
     ✅ Option 3: Use GENCODE (precompiled annotations) and gffread
    
         Use a tool like gffread (from the Cufflinks or gffread package) to extract 3' UTRs:
    
             #gffread gencode.v44.annotation.gtf -g GRCh38.primary_assembly.genome.fa -w all_utrs.fa -U
             #gffread -w three_prime_utrs.fa -g GRCh38.fa -x cds.fa -y proteins.fa -U -F gencode.gtf
    
             grep -P "\tthree_prime_utr\t" gencode.v48.annotation.gtf > three_prime_utrs.gtf
             gtf2bed < three_prime_utrs.gtf > three_prime_utrs.bed
             bedtools getfasta -fi GRCh38.primary_assembly.genome.fa -bed three_prime_utrs.bed -name -s > three_prime_utrs.fa
    
             gffread gencode.v48.annotation.gtf -g GRCh38.primary_assembly.genome.fa -U -w all_with_utrs.fa
    
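         Note: in gffread, the -U flag discards single-exon transcripts; it does not extract UTRs. The -w output contains the spliced exons of each transcript (UTRs included), so 3' UTRs still have to be isolated separately, e.g. with the grep/bedtools approach above.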
    
     ✅ Option 4: Use Bioconductor in R (UCSC-ID, not suitable!)
    
         # Install if not already installed
         if (!requireNamespace("BiocManager", quietly = TRUE))
             install.packages("BiocManager")
         BiocManager::install("GenomicFeatures")
         BiocManager::install("txdbmaker")
         #sudo apt-get update
         #sudo apt-get install libmariadb-dev
         #(optional)sudo apt-get install libmysqlclient-dev
         install.packages("RMariaDB")
    
         # Load library
         library(GenomicFeatures)
    
         # Create TxDb object for human genome
         txdb <- txdbmaker::makeTxDbFromUCSC(genome="hg38", tablename="refGene")
    
         # Extract 3' UTRs by transcript
         utr3 <- threeUTRsByTranscript(txdb, use.names=TRUE)
    
     # View or export as needed
    
     ✅ Option 5: Extract 3′ UTRs Using UCSC Table Browser (GUI method)
         🔗 Website:
         UCSC Table Browser
    
         🔹 Step-by-Step Instructions
         1. Set the basic parameters:
         Clade: Mammal
    
         Genome: Human
    
         Assembly: GRCh38/hg38
    
         Group: Genes and Gene Predictions
    
         Track: GENCODE v44 (or latest)
    
         Table: knownGene or wgEncodeGencodeBasicV44
    
         Choose knownGene for RefSeq-like models or wgEncodeGencodeBasicV44 for GENCODE
    
         2. Region:
         Select: genome (default)
    
         3. Output format:
         Select: sequence
    
         4. Click "get output"
         🔹 Sequence Retrieval Options:
         On the next page (after clicking "get output"), you’ll see sequence options.
    
         Configure as follows:
         ✅ Output format: FASTA
    
         ✅ Which part of the gene: Select only
         → UTRs → 3' UTR only
    
         ✅ Header options: choose if you want gene name,
  11. ⚡️ Bonus: Combine with miRNA-mRNA predictions

     Once you have RBPs enriched in downregulated mRNAs, you can intersect:
         * Which RBPs overlap miRNA binding regions (e.g., via CLIPdb or POSTAR)
         * Check if miRNAs and RBPs compete or co-bind
     This can lead to identifying miRNA-RBP regulatory modules.
  12. Reports

Please find attached the results of the RNA-binding protein (RBP) enrichment analysis using FIMO and the ATtRACT motif database, along with a brief description of the procedures used for both the 3′ UTR-based analysis (RNA-seq) and the miRNA-based analysis (small RNA-seq).

    1. RBP Motif Enrichment from RNA-seq (3′ UTRs)

    We focused on 3′ UTRs, as they are key regulatory regions for RBPs. Sequences shorter than 16 nucleotides were excluded. Using FIMO (from the MEME suite) with motifs from the ATtRACT database, we scanned both foreground and background 3′ UTR sets to identify motif occurrences.

    Foreground: Differentially expressed transcripts (e.g., MKL-1 up/down, WaGa up/down)
    Background: All non-differentially expressed transcripts

    Analysis: Fisher’s exact test was used to assess motif enrichment; p-values were adjusted using the Benjamini–Hochberg method.

    Output files (RNA-seq):

        * rbp_enrichment_MKL-1_down.xlsx / .png
        * rbp_enrichment_MKL-1_up.xlsx / .png
        * rbp_enrichment_WaGa_down.xlsx / .png
        * rbp_enrichment_WaGa_up.xlsx / .png

    2. RBP Motif Enrichment from Small RNA-seq (miRNAs)

    This analysis focused on differentially expressed miRNAs, using mature miRNA sequences from miRBase. We scanned for RBP binding motifs within these sequences using FIMO and assessed motif enrichment relative to background sets.

    Foreground: DE miRNAs (up/down) from small RNA-seq comparisons
    Background: All other miRNAs from miRBase

    Analysis: FIMO was used with --thresh 1e-4, followed by annotation and filtering. Enrichment was assessed using Fisher’s test + BH correction.

    Output files (miRNAs):

        * rbp_enrichment_mirna_down.xlsx
        * rbp_enrichment_mirna_up.xlsx

    How to Interpret the Numbers
    Each row in the result tables represents one RBP and its enrichment statistics:

    a: foreground genes/sequences with the motif
    b: background genes/sequences with the motif
    c: total number of foreground genes/sequences
    d: total number of background genes/sequences

    These values are used to compute p-values and FDRs.

    For example, in rbp_enrichment_MKL-1_up.xlsx, AGO2 has a = 1621, meaning FIMO detected AGO2 motifs in 1,621 genes in the MKL-1 upregulated set. These genes are listed in AGO2_uniq.txt.

    Similarly, for the miRNA analysis (e.g., rbp_enrichment_mirna_up.xlsx and rbp_enrichment_mirna_down.xlsx), the numbers represent counts of unique miRNAs with at least one significant motif hit. As examples, I calculated the detailed membership for FXR2 and ORB2.

Post-processing of DAMIAN results

  1. Prepare input raw data

     # -- Ringversuch --
     ~/DATA/Data_Damian/241213_VH00358_120_AAG523FM5_Ringversuch
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20579/01_RV1_DNA_S1_R1_001.fastq.gz RV1_DNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20579/01_RV1_DNA_S1_R2_001.fastq.gz RV1_DNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20580/02_RV2_DNA_S2_R1_001.fastq.gz RV2_DNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20580/02_RV2_DNA_S2_R2_001.fastq.gz RV2_DNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20581/03_RV3_DNA_S3_R1_001.fastq.gz RV3_DNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20581/03_RV3_DNA_S3_R2_001.fastq.gz RV3_DNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20582/04_RV4_DNA_S4_R1_001.fastq.gz RV4_DNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20582/04_RV4_DNA_S4_R2_001.fastq.gz RV4_DNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20583/05_RV5_DNA_S5_R1_001.fastq.gz RV5_DNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20583/05_RV5_DNA_S5_R2_001.fastq.gz RV5_DNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20584/06_RV6_DNA_S6_R1_001.fastq.gz RV6_DNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20584/06_RV6_DNA_S6_R2_001.fastq.gz RV6_DNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20585/07_RV1_RNA_S7_R1_001.fastq.gz RV1_RNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20585/07_RV1_RNA_S7_R2_001.fastq.gz RV1_RNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20586/08_RV2_RNA_S8_R1_001.fastq.gz RV2_RNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20586/08_RV2_RNA_S8_R2_001.fastq.gz RV2_RNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20587/09_RV3_RNA_S9_R1_001.fastq.gz RV3_RNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20587/09_RV3_RNA_S9_R2_001.fastq.gz RV3_RNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20588/10_RV4_RNA_S10_R1_001.fastq.gz RV4_RNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20588/10_RV4_RNA_S10_R2_001.fastq.gz RV4_RNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20589/11_RV5_RNA_S11_R1_001.fastq.gz RV5_RNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20589/11_RV5_RNA_S11_R2_001.fastq.gz RV5_RNA_R2.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20590/12_RV6_RNA_S12_R1_001.fastq.gz RV6_RNA_R1.fastq.gz
     ln ../241213_VH00358_120_AAG523FM5_Ringversuch/p20590/12_RV6_RNA_S12_R2_001.fastq.gz RV6_RNA_R2.fastq.gz
  2. Prepare virus database and select 8 representatives for the eight given viruses from the database

     # -- Download all genomes --
     # enterovirus D68
     # HSV-1
     # HSV-2
     # Influenza A H1N1
     # Cytomegalovirus AD169 (Human herpesvirus 5 (HHV-5), more commonly known as Cytomegalovirus (CMV))
     # Influenza A H3N2
     # Monkeypox
     # HIV-1
    
     esearch -db nucleotide -query "txid42789[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_42789_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_42789_ncbi.fasta complete_42789_ncbi.fasta    #899
     esearch -db nucleotide -query "txid10298[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10298_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_10298_ncbi.fasta complete_10298_ncbi.fasta    #162
     esearch -db nucleotide -query "txid10310[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10310_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_10310_ncbi.fasta complete_10310_ncbi.fasta    #33
     esearch -db nucleotide -query "txid1323429[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_1323429_ncbi.fasta
     python ~/Scripts/filter_fasta2.py genome_1323429_ncbi.fasta complete_1323429_ncbi.fasta    #465
     esearch -db nucleotide -query "txid10360[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10360_ncbi.fasta
     python ~/Scripts/filter_fasta2.py genome_10360_ncbi.fasta complete_10360_ncbi.fasta    #1
     esearch -db nucleotide -query "txid41857[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_41857_ncbi.fasta
     python ~/Scripts/filter_fasta2.py genome_41857_ncbi.fasta complete_41857_ncbi.fasta    #120
     esearch -db nucleotide -query "txid10244[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10244_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_10244_ncbi.fasta complete_10244_ncbi.fasta    #2525
     esearch -db nucleotide -query "txid11676[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_11676_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_11676_ncbi.fasta complete_11676_ncbi.fasta    #485995-->7416
    
     # ---- Alternatively, using ENA instead to download the genomes ----
     # https://www.ebi.ac.uk/ena/browser/view/11676 (1138065 records)
     # #Click "Sequence" and download "Counts" (1132648) and "Taxon descendants count" (1138065) if there is enough time! Download date: 09.04.2025.
     # python ~/Scripts/filter_fasta.py  ena_11676_sequence.fasta complete_11676_ena.fasta  #1138065-->????
    
     # Virus Name    NCBI TaxID
     # ------------------------
     # Enterovirus D68   42789                             >PQ895337.1 Enterovirus D68 isolate SH2024-25870
     # HSV-1 (Herpes Simplex Virus 1)    10298             >PQ569920.1 Human alphaherpesvirus 1 isolate MacIntyre, complete genome
     # HSV-2 (Herpes Simplex Virus 2)    10310             >OM370995.1 Human alphaherpesvirus 2 strain G, complete genome
    
         samtools faidx complete_42789_ncbi.fasta PQ895337.1 > Enterovirus_D68_isolate_SH2024-25870.fasta
         samtools faidx complete_10298_ncbi.fasta PQ569920.1 > HSV-1_isolate_MacIntyre.fasta
         samtools faidx complete_10310_ncbi.fasta OM370995.1 > HSV-2_strain_G.fasta
    
     # Influenza A virus (H1N1)  1323429
     # The Influenza A virus (H1N1) genome is composed of eight single-stranded negative-sense RNA segments, and the total genome size is approximately 13,500 nucleotides (13.5 kb).
     # Segment   Gene    Protein Product(s)  Approx. Length (nt)
     # 1 PB2 Polymerase basic 2  ~2,341
     # 2 PB1 Polymerase basic 1, PB1-F2  ~2,341
     # 3 PA  Polymerase acidic   ~2,233
     # 4 HA  Hemagglutinin   ~1,778
     # 5 NP  Nucleoprotein   ~1,565
     # 6 NA  Neuraminidase   ~1,413
     # 7 M   Matrix proteins (M1, M2)    ~1,027
     # 8 NS  Nonstructural (NS1, NS2)    ~890
    
     # >LC662544.1 Influenza A virus (H1N1) A/PR/8/34 NEP, NS1 genes for nonstructural protein 2, nonstructural protein 1, complete cds
     # >LC662543.1 Influenza A virus (H1N1) A/PR/8/34 M2, M1 genes for matrix protein 2, matrix protein 1, complete cds
     # >LC662542.1 Influenza A virus (H1N1) A/PR/8/34 NA gene for neuraminidase, complete cds
     # >LC662541.1 Influenza A virus (H1N1) A/PR/8/34 NP gene for nucleoprotein, complete cds
     # >LC662540.1 Influenza A virus (H1N1) A/PR/8/34 HA gene for haemagglutinin, complete cds
     # >LC662539.1 Influenza A virus (H1N1) A/PR/8/34 PA, PA-X genes for polymerase PA, PA-X protein, complete cds
     # >LC662538.1 Influenza A virus (H1N1) A/PR/8/34 PB1, PB1-F2 genes for polymerase PB1, PB1-F2 protein, complete cds
     # >LC662537.1 Influenza A virus (H1N1) A/PR/8/34 PB2 gene for polymerase PB2, complete cds
    
         samtools faidx complete_1323429_ncbi.fasta LC662537.1 > H1N1_A-PR-8-34_PB2.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662538.1 > H1N1_A-PR-8-34_PB1.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662539.1 > H1N1_A-PR-8-34_PA.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662540.1 > H1N1_A-PR-8-34_HA.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662541.1 > H1N1_A-PR-8-34_NP.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662542.1 > H1N1_A-PR-8-34_NA.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662543.1 > H1N1_A-PR-8-34_M.fasta
         samtools faidx complete_1323429_ncbi.fasta LC662544.1 > H1N1_A-PR-8-34_NS.fasta
    
     # Human cytomegalovirus AD169   10360: >X17403.1 Human cytomegalovirus strain AD169, complete genome
     # This file is required later when building viruses_representative.fasta.
     # NOTE(review): accession taken from the report section; confirm it is the record kept in complete_10360_ncbi.fasta.
         samtools faidx complete_10360_ncbi.fasta X17403.1 > Human_cytomegalovirus_strain_AD169.fasta
    
     # Influenza A virus (H3N2)  41857
    
     # >LC817411.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 8, complete sequence
     # >LC817410.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 7, complete sequence
     # >LC817409.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 6, complete sequence
     # >LC817408.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 5, complete sequence
     # >LC817407.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 4, complete sequence
     # >LC817406.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 3, complete sequence
     # >LC817405.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 2, complete sequence
     # >LC817404.1 Influenza A virus H3N2 A_Fukushima_OR808_2023 RNA, seqment 1, complete sequence
    
         samtools faidx complete_41857_ncbi.fasta LC817404.1 > H3N2_A-Fukushima-OR808-2023_PB2.fasta
         samtools faidx complete_41857_ncbi.fasta LC817405.1 > H3N2_A-Fukushima-OR808-2023_PB1.fasta
         samtools faidx complete_41857_ncbi.fasta LC817406.1 > H3N2_A-Fukushima-OR808-2023_PA.fasta
         samtools faidx complete_41857_ncbi.fasta LC817407.1 > H3N2_A-Fukushima-OR808-2023_HA.fasta
         samtools faidx complete_41857_ncbi.fasta LC817408.1 > H3N2_A-Fukushima-OR808-2023_NP.fasta
         samtools faidx complete_41857_ncbi.fasta LC817409.1 > H3N2_A-Fukushima-OR808-2023_NA.fasta
         samtools faidx complete_41857_ncbi.fasta LC817410.1 > H3N2_A-Fukushima-OR808-2023_M.fasta
         samtools faidx complete_41857_ncbi.fasta LC817411.1 > H3N2_A-Fukushima-OR808-2023_NS.fasta
    
     # Monkeypox virus   10244: >OP689666.1 Monkeypox virus isolate MPXV/Germany/2022/RKI513, complete genome
         samtools faidx complete_10244_ncbi.fasta OP689666.1 > Monkeypox_isolate_MPXV-Germany-2022-RKI513.fasta
    
     # Human immunodeficiency virus 1    11676: >AJ866558.1 Human immunodeficiency virus 1 complete genome, isolate 01IC-PCI127
         samtools faidx complete_11676_ncbi.fasta AJ866558.1 >  HIV-1_isolate_01IC-PCI127.fasta
    
     # -- Selected genomes saved in the fasta-files --
     # Enterovirus_D68_isolate_SH2024-25870.fasta
     # HSV-1_isolate_MacIntyre.fasta
     # HSV-2_strain_G.fasta
     # H1N1_A-PR-8-34_PB2.fasta
     # H1N1_A-PR-8-34_PB1.fasta
     # H1N1_A-PR-8-34_PA.fasta
     # H1N1_A-PR-8-34_HA.fasta
     # H1N1_A-PR-8-34_NP.fasta
     # H1N1_A-PR-8-34_NA.fasta
     # H1N1_A-PR-8-34_M.fasta
     # H1N1_A-PR-8-34_NS.fasta
     # Human_cytomegalovirus_strain_AD169.fasta
     # H3N2_A-Fukushima-OR808-2023_PB2.fasta
     # H3N2_A-Fukushima-OR808-2023_PB1.fasta
     # H3N2_A-Fukushima-OR808-2023_PA.fasta
     # H3N2_A-Fukushima-OR808-2023_HA.fasta
     # H3N2_A-Fukushima-OR808-2023_NP.fasta
     # H3N2_A-Fukushima-OR808-2023_NA.fasta
     # H3N2_A-Fukushima-OR808-2023_M.fasta
     # H3N2_A-Fukushima-OR808-2023_NS.fasta
     # Monkeypox_isolate_MPXV-Germany-2022-RKI513.fasta
     # HIV-1_isolate_01IC-PCI127.fasta
  3. (Optional) Run the first round of vrap (--virus=viruses_selected.fasta)

     ln -s ~/Tools/vrap/ .
     mamba activate /home/jhuang/miniconda3/envs/vrap
    
     cd ~/DATA/Data_Damian/vrap_Ringversuch
     # Concatenate the complete genomes of all eight target viruses (one file per NCBI taxid,
     # in the order of the virus list above) into the database passed to vrap via --virus.
     # Fix: Enterovirus D68 (42789) was missing and Monkeypox (10244) was listed twice.
     cat complete_42789_ncbi.fasta complete_10298_ncbi.fasta complete_10310_ncbi.fasta complete_1323429_ncbi.fasta complete_10360_ncbi.fasta complete_41857_ncbi.fasta complete_10244_ncbi.fasta complete_11676_ncbi.fasta > viruses_selected.fasta
    
     #Run vrap (first round): replace --virus to the specific taxonomy (e.g. viruses_selected.fasta) --> change virus_user_db --> specific_bacteria_user_db
     (vrap) for sample in RV1_DNA RV2_DNA RV3_DNA RV4_DNA RV5_DNA RV6_DNA  RV1_RNA RV2_RNA RV3_RNA RV4_RNA RV5_RNA RV6_RNA; do
         vrap/vrap.py  -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz  -o vrap_${sample} --bt2idx=/home/jhuang/REFs/genome --host=/home/jhuang/REFs/genome.fa --virus=/home/jhuang/DATA/Data_Damian/vrap_Ringversuch/viruses_selected.fasta --nt=/mnt/nvme1n1p1/blast/nt --nr=/mnt/nvme1n1p1/blast/nr  -t 100 -l 200  -g
     done
  4. Run the second round of vrap (--host=viruses_representative.fasta)

     cat Enterovirus_D68_isolate_SH2024-25870.fasta HSV-1_isolate_MacIntyre.fasta HSV-2_strain_G.fasta H1N1_A-PR-8-34_PB2.fasta H1N1_A-PR-8-34_PB1.fasta H1N1_A-PR-8-34_PA.fasta H1N1_A-PR-8-34_HA.fasta H1N1_A-PR-8-34_NP.fasta H1N1_A-PR-8-34_NA.fasta H1N1_A-PR-8-34_M.fasta H1N1_A-PR-8-34_NS.fasta Human_cytomegalovirus_strain_AD169.fasta H3N2_A-Fukushima-OR808-2023_PB2.fasta H3N2_A-Fukushima-OR808-2023_PB1.fasta H3N2_A-Fukushima-OR808-2023_PA.fasta H3N2_A-Fukushima-OR808-2023_HA.fasta H3N2_A-Fukushima-OR808-2023_NP.fasta H3N2_A-Fukushima-OR808-2023_NA.fasta H3N2_A-Fukushima-OR808-2023_M.fasta H3N2_A-Fukushima-OR808-2023_NS.fasta Monkeypox_isolate_MPXV-Germany-2022-RKI513.fasta HIV-1_isolate_01IC-PCI127.fasta > viruses_representative.fasta
    
     # Run vrap (second round): select representative viruses from the Excel files generated by the previous step and use them as --host
     (vrap) for sample in RV1_DNA RV2_DNA RV3_DNA RV4_DNA RV5_DNA RV6_DNA  RV1_RNA RV2_RNA RV3_RNA RV4_RNA RV5_RNA RV6_RNA; do
         vrap/vrap_until_bowtie2.py  -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz  -o vrap_${sample}_on_representatives --host /home/jhuang/DATA/Data_Damian/vrap_Ringversuch/viruses_representative.fasta   -t 100 -l 200  --gbt2 --noblast
     done
  5. Generate the mapping statistics for the sam-files generated from last step

     # For every sample, convert the alignment produced by the second vrap round into a
     # sorted, indexed BAM and append the samtools flagstat summary to LOG_mapping.
     # All error output is appended to the same LOG_mapping file in the top-level directory.
     for sample in RV1_DNA RV2_DNA RV3_DNA RV4_DNA RV5_DNA RV6_DNA  RV1_RNA RV2_RNA RV3_RNA RV4_RNA RV5_RNA RV6_RNA; do
         echo "-----${sample}_on_representatives------" >> LOG_mapping
         #cd vrap_${sample}_on_${virus}/bowtie
         # Skip the sample when its output directory is missing: otherwise the commands below
         # would run in the wrong directory and "cd ../.." would climb out of the project directory.
         cd "vrap_${sample}_on_representatives/bowtie" || { echo "ERROR: cannot enter vrap_${sample}_on_representatives/bowtie" >> LOG_mapping; continue; }
         # Rename and convert SAM to BAM (the alignment file is named "mapped", without extension)
         mv mapped mapped.sam 2>> ../../LOG_mapping
         samtools view -S -b mapped.sam > mapped.bam 2>> ../../LOG_mapping
         samtools sort mapped.bam -o mapped_sorted.bam 2>> ../../LOG_mapping
         samtools index mapped_sorted.bam 2>> ../../LOG_mapping
         # Write flagstat output to log (go up two levels to write correctly)
         samtools flagstat mapped_sorted.bam >> ../../LOG_mapping 2>&1
         cd ../..
     done
    
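     # Draw coverage plots for representative isolates that were found in the first round (see Excel file); run the following inside a sample's vrap_${sample}_on_representatives/bowtie directory, where mapped_sorted.bam was created.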
     samtools depth -m 0 -a mapped_sorted.bam > coverage.txt
     grep "PQ895337.1" coverage.txt > PQ895337_coverage.txt
     grep "PQ569920.1" coverage.txt > PQ569920_coverage.txt
    
             import pandas as pd
             import matplotlib.pyplot as plt
    
             # Load coverage data
             df = pd.read_csv("PQ895337_coverage.txt", sep="\t", header=None, names=["chr", "pos", "coverage"])
    
             # Plot
             plt.figure(figsize=(10,4))
             plt.plot(df["pos"], df["coverage"], color="blue", linewidth=0.5)
             plt.xlabel("Genomic Position")
             plt.ylabel("Coverage Depth")
             plt.title("BAM Coverage Plot")
             plt.show()
  6. Report

     Subject: Mapping Results and Selected Reference Genomes
    
     Dear XXXX,
    
     Please find below the results. For each of the viruses you sent me, a representative isolate has been selected, as listed below:
    
     Selected Reference Isolates:
    
         Enterovirus D68:
             PQ895337.1 – Enterovirus D68 isolate SH2024-25870
    
         HSV-1 (Herpes Simplex Virus 1):
             PQ569920.1 – Human alphaherpesvirus 1 isolate MacIntyre, complete genome
    
         HSV-2 (Herpes Simplex Virus 2):
             OM370995.1 – Human alphaherpesvirus 2 strain G, complete genome
    
         Influenza A virus (H1N1):
    
             LC662537.1 – Influenza A virus (H1N1) A/PR/8/34 PB2 gene for polymerase PB2, complete cds
             LC662538.1 – Influenza A virus (H1N1) A/PR/8/34 PB1, PB1-F2 genes for polymerase PB1, PB1-F2 protein, complete cds
             LC662539.1 – Influenza A virus (H1N1) A/PR/8/34 PA, PA-X genes for polymerase PA, PA-X protein, complete cds
             LC662540.1 – Influenza A virus (H1N1) A/PR/8/34 HA gene for haemagglutinin, complete cds
             LC662541.1 – Influenza A virus (H1N1) A/PR/8/34 NP gene for nucleoprotein, complete cds
             LC662542.1 – Influenza A virus (H1N1) A/PR/8/34 NA gene for neuraminidase, complete cds
             LC662543.1 – Influenza A virus (H1N1) A/PR/8/34 M2, M1 genes for matrix protein 2, matrix protein 1, complete cds
             LC662544.1 – Influenza A virus (H1N1) A/PR/8/34 NEP, NS1 genes for nonstructural protein 2, nonstructural protein 1, complete cds
    
         Cytomegalovirus (strain AD169):
             X17403.1 – Human cytomegalovirus strain AD169, complete genome
    
         Influenza A virus (H3N2):
    
             LC817404.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 PB2 gene, complete sequence
             LC817405.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 PB1 gene, complete sequence
             LC817406.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 PA gene, complete sequence
             LC817407.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 HA gene, complete sequence
             LC817408.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 NP gene, complete sequence
             LC817409.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 NA gene, complete sequence
             LC817410.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 M gene, complete sequence
             LC817411.1 – Influenza A virus H3N2 A_Fukushima_OR808_2023 NS gene, complete sequence
    
         Monkeypox virus:
             OP689666.1 – Isolate MPXV/Germany/2022/RKI513, complete genome
    
         Human Immunodeficiency Virus 1 (HIV-1):
             AJ866558.1 – Isolate 01IC-PCI127, complete genome
    
     Mapping Results:
    
     Then, we mapped the paired-end reads from 12 samples of the Ringversuch project against the reference genomes listed above. The following are the mapping statistics. Coverage plots are attached for each case where reads map to the reference genome (see attachments).
    
     Mapping statistics:
    
         RV1_DNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV2_DNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV3_DNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV4_DNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV5_DNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV6_DNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV1_RNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV2_RNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV3_RNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV4_RNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV5_RNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)
         RV6_RNA_on_Enterovirus_D68_isolate_SH2024-25870: 0 + 0 mapped (0.00% : N/A)

Variant calling for Data_Pietschmann_229ECoronavirus_Mutations_2025 (via docker own_viral_ngs)

  1. Input data:

     ln -s ../raw_data_2024/hCoV229E_Rluc_R1.fastq.gz hCoV229E_Rluc_R1.fastq.gz
     ln -s ../raw_data_2024/hCoV229E_Rluc_R2.fastq.gz hCoV229E_Rluc_R2.fastq.gz
     ln -s ../raw_data_2024/p10_DMSO_R1.fastq.gz p10_DMSO_R1.fastq.gz
     ln -s ../raw_data_2024/p10_DMSO_R2.fastq.gz p10_DMSO_R2.fastq.gz
     ln -s ../raw_data_2024/p10_K22_R1.fastq.gz p10_K22_R1.fastq.gz
     ln -s ../raw_data_2024/p10_K22_R2.fastq.gz p10_K22_R2.fastq.gz
     ln -s ../raw_data_2024/p10_K7523_R1.fastq.gz p10_K7523_R1.fastq.gz
     ln -s ../raw_data_2024/p10_K7523_R2.fastq.gz p10_K7523_R2.fastq.gz
     ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R1_001.fastq.gz p16_DMSO_R1.fastq.gz
     ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20606/p16_DMSO_S29_R2_001.fastq.gz p16_DMSO_R2.fastq.gz
     ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R1_001.fastq.gz p16_K22_R1.fastq.gz
     ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20607/p16_K22_S30_R2_001.fastq.gz p16_K22_R2.fastq.gz
     ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R1_001.fastq.gz p16_X7523_R1.fastq.gz
     ln -s ../raw_data_2025/250506_VH00358_136_AAG3YJ5M5/p20608/p16_X7523_S31_R2_001.fastq.gz p16_X7523_R2.fastq.gz
  2. Call variants using snippy

     ln -s ~/Tools/bacto/db/ .;
     ln -s ~/Tools/bacto/envs/ .;
     ln -s ~/Tools/bacto/local/ .;
     cp ~/Tools/bacto/Snakefile .;
     cp ~/Tools/bacto/bacto-0.1.json .;
     cp ~/Tools/bacto/cluster.json .;
    
     #download PP810610.gb from GenBank
     mv ~/Downloads/sequence\(2\).gb db/PP810610.gb
    
     #setting the following in bacto-0.1.json
         "fastqc": false,
         "taxonomic_classifier": false,
         "assembly": true,
         "typing_ariba": false,
         "typing_mlst": true,
         "pangenome": true,
         "variants_calling": true,
         "phylogeny_fasttree": true,
         "phylogeny_raxml": true,
         "recombination": false, (due to gubbins-error set false)
         "genus": "Alphacoronavirus",
         "kingdom": "Viruses",
         "species": "Human coronavirus 229E",
         "mykrobe": {
             "species": "corona"
         },
         "reference": "db/PP810610.gb"
    
     mamba activate /home/jhuang/miniconda3/envs/bengal3_ac3
     (bengal3_ac3) /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
  3. Summarize all SNPs and Indels from the snippy result directory.

     #Output: snippy/summary_snps_indels.csv
     # IMPORTANT_ADAPT the array isolates = ["AYE-S", "AYE-Q", "AYE-WT on Tig4", "AYE-craA on Tig4", "AYE-craA-1 on Cm200", "AYE-craA-2 on Cm200"]
     python3 ~/Scripts/summarize_snippy_res.py snippy
     cd snippy
     #grep -v "None,,,,,,None,None" summary_snps_indels.csv > summary_snps_indels_.csv
  4. Calling variants using spandx (almost the same results as those from viral-ngs!)

     mamba activate /home/jhuang/miniconda3/envs/spandx
     mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
     cp PP810610.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
     vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
     /home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610    #-d
     ~/Scripts/genbank2fasta.py PP810610.gb
     mv PP810610.gb_converted.fna PP810610.fasta    #rename "NC_001348.1 xxxxx" to "NC_001348" in the fasta-file
     ln -s /home/jhuang/Tools/spandx/ spandx
     (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq" --ref PP810610.fasta --annotation --database PP810610 -resume
    
     # Rerun SNP_matrix.sh due to the error ERROR_CHROMOSOME_NOT_FOUND in the variants annotation
     cd Outputs/Master_vcf
     (spandx) cp -r ../../snippy/hCoV229E_Rluc/reference .
     (spandx) cp ../../spandx/bin/SNP_matrix.sh ./
     #Note that ${variant_genome_path}=NC_001348 in the following command, but it was not used after command replacement.
     #Adapt "snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -v ${variant_genome_path} out.vcf > out.annotated.vcf" to
     "/home/jhuang/miniconda3/envs/bengal3_ac3/bin/snpEff eff -no-downstream -no-intergenic -ud 100 -formatEff -c reference/snpeff.config -dataDir . ref out.vcf > out.annotated.vcf" in SNP_matrix.sh
     (spandx) bash SNP_matrix.sh PP810610 .
  5. Calling inter-host variants by merging the results from snippy+spandx (Manually!)

     # Inter-host variants(宿主间变异):一种病毒在两个人之间有不同的基因变异,这些变异可能与宿主的免疫反应、疾病表现或病毒传播的方式相关。
     cp All_SNPs_indels_annotated.txt All_SNPs_indels_annotated_backup.txt
     vim All_SNPs_indels_annotated.txt
    
     #in the file ids: grep "$(echo -e '\t')353$(echo -e '\t')" All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt
     #Replace \n with " All_SNPs_indels_annotated.txt >> All_SNPs_indels_annotated_.txt\ngrep "
     #Replace grep " --> grep "$(echo -e '\t')
     #Replace " All_ --> $(echo -e '\t')" All_
    
     # Potential intra-host variants: 10871, 19289, 23435.
     CHROM   POS     REF     ALT     TYPE    hCoV229E_Rluc_trimmed   p10_DMSO_trimmed        p10_K22_trimmed p10_K7523_trimmed       p16_DMSO_trimmed        p16_K22_trimmed p16_X7523_trimmed       Effect  Impact  Functional_Class        Codon_change    Protein_and_nucleotide_change   Amino_Acid_Length       Gene_name       Biotype
     PP810610        1464    T       C       SNP     C       C       C       C       C       C       C       missense_variant        MODERATE        MISSENSE        gTt/gCt p.Val416Ala/c.1247T>C   6757    CDS_1   protein_coding
     PP810610        1699    C       T       SNP     T       T       T       T       T       T       T       synonymous_variant      LOW     SILENT  gtC/gtT p.Val494Val/c.1482C>T   6757    CDS_1   protein_coding
     PP810610        6691    C       T       SNP     T       T       T       T       T       T       T       synonymous_variant      LOW     SILENT  tgC/tgT p.Cys2158Cys/c.6474C>T  6757    CDS_1   protein_coding
     PP810610        6919    C       G       SNP     G       G       G       G       G       G       G       synonymous_variant      LOW     SILENT  ggC/ggG p.Gly2234Gly/c.6702C>G  6757    CDS_1   protein_coding
     PP810610        7294    T       A       SNP     A       A       A       A       A       A       A       missense_variant        MODERATE        MISSENSE        agT/agA p.Ser2359Arg/c.7077T>A  6757    CDS_1   protein_coding
     * PP810610       10871   C       T       SNP     C       C/T     T       C/T     C/T     T       C/T     missense_variant        MODERATE        MISSENSE        Ctt/Ttt p.Leu3552Phe/c.10654C>T 6757    CDS_1   protein_coding
     PP810610        14472   T       C       SNP     C       C       C       C       C       C       C       missense_variant        MODERATE        MISSENSE        aTg/aCg p.Met4752Thr/c.14255T>C 6757    CDS_1   protein_coding
     PP810610        15458   T       C       SNP     C       C       C       C       C       C       C       synonymous_variant      LOW     SILENT  Ttg/Ctg p.Leu5081Leu/c.15241T>C 6757    CDS_1   protein_coding
     PP810610        16035   C       A       SNP     A       A       A       A       A       A       A       stop_gained     HIGH    NONSENSE        tCa/tAa p.Ser5273*/c.15818C>A   6757    CDS_1   protein_coding
     PP810610        17430   T       C       SNP     C       C       C       C       C       C       C       missense_variant        MODERATE        MISSENSE        tTa/tCa p.Leu5738Ser/c.17213T>C 6757    CDS_1   protein_coding
     * PP810610       19289   G       T       SNP     G       G       T       G       G       G/T     G       missense_variant        MODERATE        MISSENSE        Gtt/Ttt p.Val6358Phe/c.19072G>T 6757    CDS_1   protein_coding
     PP810610        21183   T       G       SNP     G       G       G       G       G       G       G       missense_variant        MODERATE        MISSENSE        tTt/tGt p.Phe230Cys/c.689T>G    1173    CDS_2   protein_coding
     PP810610        22636   T       G       SNP     G       G       G       G       G       G       G       missense_variant        MODERATE        MISSENSE        aaT/aaG p.Asn714Lys/c.2142T>G   1173    CDS_2   protein_coding
     PP810610        23022   T       C       SNP     C       C       C       C       C       C       C       missense_variant        MODERATE        MISSENSE        tTa/tCa p.Leu843Ser/c.2528T>C   1173    CDS_2   protein_coding
     * PP810610       23435   C       T       SNP     C       C       T       C/T     C       C/T     C/T     missense_variant        MODERATE        MISSENSE        Ctt/Ttt p.Leu981Phe/c.2941C>T   1173    CDS_2   protein_coding
     PP810610        24512   C       T       SNP     T       T       T       T       T       T       T       missense_variant        MODERATE        MISSENSE        Ctc/Ttc p.Leu36Phe/c.106C>T     88      CDS_4   protein_coding
     PP810610        24781   C       T       SNP     T       T       T       T       T       T       T       missense_variant        MODERATE        MISSENSE        aCt/aTt p.Thr36Ile/c.107C>T     77      CDS_5   protein_coding
     PP810610        25163   C       T       SNP     T       T       T       T       T       T       T       missense_variant        MODERATE        MISSENSE        Ctt/Ttt p.Leu82Phe/c.244C>T     225     CDS_6   protein_coding
     PP810610        25264   C       T       SNP     T       T       T       T       T       T       T       synonymous_variant      LOW     SILENT  gtC/gtT p.Val115Val/c.345C>T    225     CDS_6   protein_coding
     PP810610        26838   G       T       SNP     T       T       T       T       T       T       T
  6. Calling intra-host variants using viral-ngs

     # Intra-host variants(宿主内变异):同一个人感染了某种病毒,但在其体内的不同细胞或器官中可能存在多个不同的病毒变异株。
    
     #How to run and debug the viral-ngs docker?
     # ---- DEBUG_2025_1: using docker instead ----
     mkdir viralngs; cd viralngs
     ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile
     ln -s  ~/Tools/viral-ngs_docker/bin bin
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/refsel.acids refsel.acids
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/lastal.acids lastal.acids
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/config.yaml config.yaml
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-runs.txt samples-runs.txt
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-depletion.txt samples-depletion.txt
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-metagenomics.txt samples-metagenomics.txt
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly.txt samples-assembly.txt
     cp  ~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2024/samples-assembly-failures.txt samples-assembly-failures.txt
     # Adapt the samples-*.txt files
    
     mkdir viralngs/data
     mkdir viralngs/data/00_raw
    
     mkdir bams
     ref_fa="PP810610.fasta";
     #for sample in hCoV229E_Rluc p10_DMSO p10_K22; do
     for sample in p10_K7523 p16_DMSO p16_K22 p16_X7523; do
         bwa index ${ref_fa}; \
         bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_trimmed_P_1.fastq trimmed/${sample}_trimmed_P_2.fastq | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
     done
    
     conda activate viral-ngs4
     #for sample in hCoV229E_Rluc p10_DMSO p10_K22; do
     #for sample in p10_K7523 p16_DMSO p16_K22 p16_X7523; do
     for sample in p16_K22; do
         picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=~/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2025/viralngs/data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
     done
     conda deactivate
    
     # -- ! Firstly, leave samples-assembly.txt empty so that only the depletion step is run!
     docker run -it -v /mnt/md1/DATA_D/Data_Pietschmann_229ECoronavirus_Mutations_2025/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk   own_viral_ngs bash
     cd /work
     snakemake --directory /work --printshellcmds --cores 40
    
     # -- ! Secondly, manually run the assembly steps
     # --> Iteratively add the unfinished assemblies to the list, replacing one each time, and run "snakemake --directory /work --printshellcmds --cores 40"
    
         # # ---- NOTE that the following steps need rerun --> DOES NOT WORK, USE STRATEGY ABOVE ----
         # #for sample in p10_K22 p10_K7523; do
         # for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523  p16_DMSO p16_K22 p16_X7523; do
         #     bin/read_utils.py merge_bams data/01_cleaned/${sample}.cleaned.bam tmp/01_cleaned/${sample}.cleaned.bam --picardOptions SORT_ORDER=queryname
         #     bin/read_utils.py rmdup_mvicuna_bam tmp/01_cleaned/${sample}.cleaned.bam data/01_per_sample/${sample}.cleaned.bam --JVMmemory 30g
         # done
         #
         # #Note that the error generated by nextflow is from the step gapfill_gap2seq!
         # for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523  p16_DMSO p16_K22 p16_X7523; do
         #     bin/assembly.py assemble_spades data/01_per_sample/${sample}.taxfilt.bam /home/jhuang/REFs/viral_ngs_dbs/trim_clip/contaminants.fasta tmp/02_assembly/${sample}.assembly1-spades.fasta --nReads 10000000 --threads 15 --memLimitGb 12
         # done
         # for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523  p16_DMSO p16_K22 p16_X7523; do
         # for sample in p10_K22 p10_K7523; do
         #     bin/assembly.py order_and_orient tmp/02_assembly/${sample}.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/${sample}.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/${sample}.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta --threads 15
         # done
         #
         # for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523  p16_DMSO p16_K22 p16_X7523; do
         #     bin/assembly.py gapfill_gap2seq tmp/02_assembly/${sample}.assembly2-scaffolded.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly2-gapfilled.fasta --memLimitGb 12 --maskErrors --randomSeed 0 --loglevel DEBUG
         # done
    
     #IMPORTANT: Rerun the following commands!
     for sample in hCoV229E_Rluc  p10_DMSO p10_K22 p10_K7523  p16_DMSO p16_K22 p16_X7523; do
    
         bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index  --loglevel DEBUG
     done
    
         # for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523  p16_DMSO p16_K22 p16_X7523; do
         #     bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly3-modify.fasta data/01_per_sample/${sample}.cleaned.bam tmp/02_assembly/${sample}.assembly4-refined.fasta --outVcf tmp/02_assembly/${sample}.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 15  --loglevel DEBUG
         #     bin/assembly.py refine_assembly tmp/02_assembly/${sample}.assembly4-refined.fasta data/01_per_sample/${sample}.cleaned.bam data/02_assembly/${sample}.fasta --outVcf tmp/02_assembly/${sample}.assembly4.vcf.gz --min_coverage 3 --novo_params '-r Random -l 20 -g 40 -x 20 -t 100' --threads 15  --loglevel DEBUG
         # done
    
     # -- ! Thirdly, fill in samples-assembly.txt completely and run "snakemake --directory /work --printshellcmds --cores 40"
  7. Merge intra- and inter-host variants, comparing the variants to the alignments of the assemblies to confirm their correctness.

     cat NC_001348.fasta viralngs/data/02_assembly/VZV_20S.fasta viralngs/data/02_assembly/VZV_60S.fasta > aligned_1.fasta
     mafft --clustalout aligned_1.fasta > aligned_1.aln
     #~/Scripts/convert_fasta_to_clustal.py aligned_1.fasta_orig aligned_1.aln
     ~/Scripts/convert_clustal_to_clustal.py aligned_1.aln aligned_1_.aln
     #manually delete the positions with all or '-' in aligned_1_.aln
     ~/Scripts/check_sequence_differences.py aligned_1_.aln
     ~/Scripts/check_sequence_differences.py aligned_1_.aln > aligned_1.res
     grep -v " = n" aligned_1.res > aligned_1_.res
    
     cat NC_001348.fasta viralngs/tmp/02_assembly/VZV_20S.assembly4-refined.fasta viralngs/tmp/02_assembly/VZV_60S.assembly4-refined.fasta > aligned_1.fasta
     mafft --clustalout aligned_1.fasta > aligned_1.aln
     ~/Scripts/convert_clustal_to_clustal.py aligned_1.aln aligned_1_.aln
     ~/Scripts/check_sequence_differences.py aligned_1_.aln > aligned_1.res
     grep -v " = n" aligned_1.res > aligned_1_.res
    
     #Differences found at the following positions (150):
     Position 8956: OP297860.1 = A, HSV1_S1-1 = A, HSV-Klinik_S2-1 = G
     Position 8991: OP297860.1 = A, HSV1_S1-1 = A, HSV-Klinik_S2-1 = C
     Position 8992: OP297860.1 = T, HSV1_S1-1 = C, HSV-Klinik_S2-1 = C
     Position 8995: OP297860.1 = T, HSV1_S1-1 = T, HSV-Klinik_S2-1 = C
     Position 9190: OP297860.1 = T, HSV1_S1-1 = A, HSV-Klinik_S2-1 = T
     * Position 13659: OP297860.1 = G, HSV1_S1-1 = T, HSV-Klinik_S2-1 = G
     * Position 47969: OP297860.1 = C, HSV1_S1-1 = T, HSV-Klinik_S2-1 = C
     * Position 53691: OP297860.1 = G, HSV1_S1-1 = T, HSV-Klinik_S2-1 = G
     * Position 55501: OP297860.1 = T, HSV1_S1-1 = C, HSV-Klinik_S2-1 = C
     * Position 63248: OP297860.1 = G, HSV1_S1-1 = T, HSV-Klinik_S2-1 = G
     Position 63799: OP297860.1 = T, HSV1_S1-1 = C, HSV-Klinik_S2-1 = T
     * Position 64328: OP297860.1 = C, HSV1_S1-1 = A, HSV-Klinik_S2-1 = C
     Position 65179: OP297860.1 = T, HSV1_S1-1 = T, HSV-Klinik_S2-1 = C
     * Position 65225: OP297860.1 = G, HSV1_S1-1 = G, HSV-Klinik_S2-1 = A
     * Position 95302: OP297860.1 = C, HSV1_S1-1 = A, HSV-Klinik_S2-1 = C
    
     gunzip isnvs.annot.txt.gz
     ~/Scripts/filter_isnv.py isnvs.annot.txt 0.05
     cut -d$'\t' filtered_isnvs.annot.txt -f1-7
     chr     pos     sample  patient time    alleles iSNV_freq
     OP297860        13203   HSV1_S1 HSV1_S1         T,C,A   1.0
     OP297860        13203   HSV-Klinik_S2   HSV-Klinik_S2           T,C,A   1.0
     OP297860        13522   HSV1_S1 HSV1_S1         G,T     1.0
     OP297860        13522   HSV-Klinik_S2   HSV-Klinik_S2           G,T     0.008905554253573941
     OP297860        13659   HSV1_S1 HSV1_S1         G,T     1.0
     OP297860        13659   HSV-Klinik_S2   HSV-Klinik_S2           G,T     0.008383233532934131
    
     ~/Scripts/convert_clustal_to_fasta.py aligned_1_.aln aligned_1.fasta
     samtools faidx aligned_1.fasta
     samtools faidx aligned_1.fasta OP297860.1 > OP297860.1.fasta
     samtools faidx aligned_1.fasta HSV1_S1-1 > HSV1_S1-1.fasta
     samtools faidx aligned_1.fasta HSV-Klinik_S2-1 > HSV-Klinik_S2-1.fasta
     seqkit seq OP297860.1.fasta -w 70 > OP297860.1_w70.fasta
     diff OP297860.1_w70.fasta ../../refsel_db/refsel.fasta
  8. Consensus sequences of each and of all isolates

     cp data/02_assembly/*.fasta ./
     for sample in 838_S1 840_S2 820_S3 828_S4 815_S5 834_S6 808_S7 811_S8 837_S9 768_S10 773_S11 767_S12 810_S13 814_S14 10121-16_S15 7510-15_S16 828-17_S17 8806-15_S18 9881-16_S19 8981-14_S20; do
     for sample in p953-84660-tsek p938-16972-nra p942-88507-nra p943-98523-nra p944-103323-nra p947-105565-nra p948-112830-nra; do \
     mv ${sample}.fasta ${sample}.fa
     cat all.fa ${sample}.fa >> all.fa
     done
     cat RSV_dedup.fa all.fa > RSV_all.fa
     mafft --adjustdirection RSV_all.fa > RSV_all.aln
     snp-sites RSV_all.aln -o RSV_all_.aln
  9. Download all Human alphaherpesvirus 3 (Varicella-zoster virus) genomes

     Human alphaherpesvirus 3
     acronym: HHV-3 VZV
     equivalent: Human herpes virus 3
    
     Human alphaherpesvirus 3 (Varicella-zoster virus)
         * Human herpesvirus 3 strain Dumas
         * Human herpesvirus 3 strain Oka vaccine
         * Human herpesvirus 3 VZV-32
    
     #Taxonomy ID: 10335
     esearch -db nucleotide -query "txid10335[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10335_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_10335_ncbi.fasta complete_genome_10335_ncbi.fasta  #2041-->165
     # ---- Download related genomes from ENA ----
     https://www.ebi.ac.uk/ena/browser/view/10335
     #Click "Sequence" and download "Counts" (2003) and "Taxon descendants count" (2005) if there is enough time! Download date: 11.03.2025.
     python ~/Scripts/filter_fasta.py  ena_10335_sequence.fasta complete_genome_10335_ena_taxon_descendants_count.fasta  #2005-->153
     #python ~/Scripts/filter_fasta.py ena_10335_sequence_Counts.fasta complete_genome_10335_ena_Counts.fasta  #xxx, 5.8G
     https://www.ebi.ac.uk/ena/browser/view/10239
     https://www.ebi.ac.uk/ena/browser/view/2497569
     https://www.ebi.ac.uk/ena/browser/view/Taxon:2497569
     ena_10239_sequence.fasta
     esearch -db nucleotide -query "txid10239[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10239_ncbi.fasta
  10. Using Multi-CAR for scaffolding the contigs (If not useful, choose another scaffolding tool, e.g. https://github.com/malonge/RagTag)

      All contigs over 500 bp were successfully scaffolded to the graft genome using Multi-CAR (13), resulting in a chromosomal assembly of 4,506,689 bp.
      https://genome.cs.nthu.edu.tw/Multi-CAR/
      https://github.com/ablab-nthu/Multi-CSAR
  11. Using the bowtie of vrap to map the reads on ref_genome/reference.fasta (The reference refers to the closest related genome found from the list generated by vrap)

     (vrap) vrap/vrap.py  -1 trimmed/VZV_20S_trimmed_P_1.fastq -2 trimmed/VZV_20S_trimmed_P_2.fastq  -o VZV_20S_on_X04370 --host /home/jhuang/DATA/Data_Huang_Human_herpesvirus_3/X04370.fasta   -t 100 -l 200  -g
     cd bowtie
     mv mapped mapped.sam
     samtools view -S -b mapped.sam > mapped.bam
     samtools sort mapped.bam -o mapped_sorted.bam
     samtools index mapped_sorted.bam
     samtools view -H mapped_sorted.bam
     samtools flagstat mapped_sorted.bam
  12. Show the bw on IGV

  13. Reports

     diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly4-refined.fasta
    
     diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly1-spades.fasta
     diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly2-scaffolded.fasta
     diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly2-gapfilled.fasta
     diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly3-modify.fasta
     diff data/02_assembly/2040_04.fasta tmp/02_assembly/2040_04.assembly4-refined.fasta
     ./2040_04.assembly2-alternate_sequences.fasta
     ./2040_04.assembly2-scaffold_ref.fasta

How to debug and construct the docker image own_viral_ngs?

    mkdir viralngs; cd viralngs
    ln -s ~/Tools/viral-ngs_docker/Snakefile Snakefile
    ln -s  ~/Tools/viral-ngs_docker/bin bin
    cp  ~/Tools/viral-ngs_docker/refsel.acids refsel.acids
    cp  ~/Tools/viral-ngs_docker/lastal.acids lastal.acids
    cp  ~/Tools/viral-ngs_docker/config.yaml config.yaml
    cp  ~/Tools/viral-ngs_docker/samples-runs.txt samples-runs.txt
    cp  ~/Tools/viral-ngs_docker/samples-depletion.txt samples-depletion.txt
    cp  ~/Tools/viral-ngs_docker/samples-metagenomics.txt samples-metagenomics.txt
    cp  ~/Tools/viral-ngs_docker/samples-assembly.txt samples-assembly.txt
    cp  ~/Tools/viral-ngs_docker/samples-assembly-failures.txt samples-assembly-failures.txt

    docker run -it -v /mnt/md1/DATA/Data_Huang_Human_herpesvirus_3/viralngs:/work -v /home/jhuang/Tools/viral-ngs_docker:/home/jhuang/Tools/viral-ngs_docker -v /home/jhuang/REFs:/home/jhuang/REFs -v /home/jhuang/Tools/GenomeAnalysisTK-3.6:/home/jhuang/Tools/GenomeAnalysisTK-3.6 -v /home/jhuang/Tools/novocraft_v3:/home/jhuang/Tools/novocraft_v3 -v /usr/local/bin/gatk:/usr/local/bin/gatk   own_viral_ngs bash
    cd /work
    snakemake --directory /work --printshellcmds --cores 40

    #BUG_1: FileNotFoundError: [Errno 2] No such file or directory: '/home/jhuang/Tools/samtools-1.9/samtools': '/home/jhuang/Tools/samtools-1.9/samtools'
    #DEBUG_1 (DEPRECATED):
            # - In docker install independent samtools
            conda create -n samtools-1.9-env samtools=1.9 -c bioconda -c conda-forge
            # - persist the modified container as an image; next time run the own docker image
            docker ps
            #CONTAINER ID   IMAGE                              COMMAND   CREATED         STATUS         PORTS     NAMES
            #881a1ad6a990   quay.io/broadinstitute/viral-ngs   "bash"    8 minutes ago   Up 8 minutes             intelligent_yalow
            docker commit 881a1ad6a990 own_viral_ngs
            docker image ls
            docker run -it own_viral_ngs bash
            #Change the path as "/opt/miniconda/envs/samtools-1.9-env/bin/samtools" in /work/bin/tools/samtools.py
            #         If another tool except for samtools could not be installed, also use the same method above to install it on own_viral_ngs!
    #DEBUG_1_BETTER_SIMPLE: TOOL_VERSION = '1.6' --> '1.9' in ~/Tools/viral-ngs_docker/bin/tools/samtools.py

    #BUG_2:
            bin/taxon_filter.py deplete data/00_raw/2040_04.bam tmp/01_cleaned/2040_04.raw.bam tmp/01_cleaned/2040_04.bmtagger_depleted.bam tmp/01_cleaned/2040_04.rmdup.bam data/01_cleaned/2040_04.cleaned.bam --bmtaggerDbs /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/hg19 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/metagenomics_contaminants_v3 /home/jhuang/REFs/viral_ngs_dbs/bmtagger_dbs_remove/GRCh37.68_ncRNA-GRCh37.68_transcripts-HS_rRNA_mitRNA --blastDbs /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/hybsel_probe_adapters /home/jhuang/REFs/viral_ngs_dbs/blast_dbs_remove/metag_v3.ncRNA.mRNA.mitRNA.consensus --threads 15 --srprismMemory 14250 --JVMmemory 50g --loglevel DEBUG
            #2025-05-23 09:58:45,326 - __init__:445:_attempt_install - DEBUG - Currently installed version of blast: 2.7.1-h4422958_6
            #2025-05-23 09:58:45,327 - __init__:448:_attempt_install - DEBUG - Expected version of blast:            2.6.0
            #2025-05-23 09:58:45,327 - __init__:449:_attempt_install - DEBUG - Incorrect version of blast installed. Removing it...
    #DEBUG_2: TOOL_VERSION = "2.6.0" --> "2.7.1" in ~/Tools/viral-ngs_docker/bin/tools/blast.py

    #BUG_3:
            bin/read_utils.py bwamem_idxstats data/01_cleaned/1762_04.cleaned.bam /home/jhuang/REFs/viral_ngs_dbs/spikeins/ercc_spike-ins.fasta --outStats reports/spike_count/1762_04.spike_count.txt --minScoreToFilter 60 --loglevel DEBUG
    #DEBUG_3: TOOL_VERSION = "0.7.15" --> "0.7.17" in ~/Tools/viral-ngs_docker/bin/tools/bwa.py

    #BUG_4: FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/bin/trimmomatic': '/usr/local/bin/trimmomatic'
    #DEBUG_4: TOOL_VERSION = "0.36" --> "0.38" in ~/Tools/viral-ngs_docker/bin/tools/trimmomatic.py

    #BUG_5: FileNotFoundError: [Errno 2] No such file or directory: '/usr/bin/spades.py': '/usr/bin/spades.py'
    #DEBUG_5:  TOOL_VERSION = "0.36" --> "0.38" in ~/Tools/viral-ngs_docker/bin/tools/trimmomatic.py
    #                def install_and_get_path(self):
    #                        # the conda version wraps the jar file with a shell script
    #                        return 'trimmomatic'

    #BUG_6: bin/assembly.py order_and_orient tmp/02_assembly/2039_04.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/2039_04.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/2039_04.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta --threads 15 --loglevel DEBUG
    2025-05-23 17:40:19,526 - __init__:445:_attempt_install - DEBUG - Currently installed version of mummer4: 4.0.0beta2-pl526hf484d3e_4
    2025-05-23 17:40:19,527 - __init__:448:_attempt_install - DEBUG - Expected version of mummer4:            4.0.0rc1
    2025-05-23 17:40:19,527 - __init__:449:_attempt_install - DEBUG - Incorrect version of mummer4 installed. Removing it..
    DEBUG_6:  TOOL_VERSION = "4.0.0rc1" --> "4.0.0beta2" in ~/Tools/viral-ngs_docker/bin/tools/mummer.py

    #BUG_7: bin/assembly.py order_and_orient tmp/02_assembly/2039_04.assembly1-spades.fasta refsel_db/refsel.fasta tmp/02_assembly/2039_04.assembly2-scaffolded.fasta --min_pct_contig_aligned 0.05 --outAlternateContigs tmp/02_assembly/2039_04.assembly2-alternate_sequences.fasta --nGenomeSegments 1 --outReference tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta --threads 15 --loglevel DEBUG
            File "bin/assembly.py", line 549, in 
              base_counts = [sum([len(seg.seq.replace("N", "")) for seg in scaffold]) \
            AttributeError: 'Seq' object has no attribute 'replace'
    #DEBUG_7: base_counts = [sum([len(seg.seq.replace("N", "")) for seg in scaffold]) --> base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) in ~/Tools/viral-ngs_docker/bin/assembly.py

    #BUG_8: bin/assembly.py refine_assembly tmp/02_assembly/1243_2.assembly3-modify.fasta data/01_per_sample/1243_2.cleaned.bam tmp/02_assembly/1243_2.assembly4-refined.fasta --outVcf tmp/02_assembly/1243_2.assembly3.vcf.gz --min_coverage 2 --novo_params '-r Random -l 20 -g 40 -x 20 -t 502' --threads 15 --loglevel DEBUG
            File "/work/bin/tools/gatk.py", line 75, in execute
            FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/bin/gatk': '/usr/local/bin/gatk'
    #DEBUG_8: -v /usr/local/bin/gatk:/usr/local/bin/gatk in 'docker run' and change default python in the script via a shebang; TOOL_VERSION = "3.8" --> "3.6" in ~/Tools/viral-ngs_docker/bin/tools/gatk.py

    #BUG_9: pyyaml is missing!
    #DEBUG_9: NO_ERROR if rerun!

    bin/assembly.py impute_from_reference tmp/02_assembly/2039_04.assembly2-gapfilled.fasta tmp/02_assembly/2039_04.assembly2-scaffold_ref.fasta tmp/02_assembly/2039_04.assembly3-modify.fasta --newName 2039_04 --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index --loglevel DEBUG
    for sample in 2039_04 2040_04; do
    for sample in 1762_04 1243_2 875_04; do
        bin/assembly.py impute_from_reference tmp/02_assembly/${sample}.assembly2-gapfilled.fasta tmp/02_assembly/${sample}.assembly2-scaffold_ref.fasta tmp/02_assembly/${sample}.assembly3-modify.fasta --newName ${sample} --replaceLength 55 --minLengthFraction 0.05 --minUnambig 0.05 --index --loglevel DEBUG
    done

    #BUG_10: bin/reports.py consolidate_fastqc reports/fastqc/2039_04/align_to_self reports/fastqc/2040_04/align_to_self reports/fastqc/1762_04/align_to_self reports/fastqc/1243_2/align_to_self reports/fastqc/875_04/align_to_self reports/summary.fastqc.align_to_self.txt
    #DEBUG_10: File "bin/intrahost.py", line 527 and line 579 in merge_to_vcf
    #
    #MODIFIED_BACK
    samp_to_seqIndex[sampleName] = seq.seq.ungap('-')
    #samp_to_seqIndex[sampleName] = seq.seq.replace("-", "")

    #BUG_11: bin/interhost.py multichr_mafft ref_genome/reference.fasta data/02_assembly/2039_04.fasta data/02_assembly/2040_04.fasta data/02_assembly/1762_04.fasta data/02_assembly/1243_2.fasta data/02_assembly/875_04.fasta data/03_multialign_to_ref --ep 0.123 --maxiters 1000 --preservecase --localpair --outFilePrefix aligned --sampleNameListFile data/03_multialign_to_ref/sampleNameList.txt --threads 15 --loglevel DEBUG
    2025-05-26 15:04:19,014 - cmd:195:main_argparse - INFO - command: bin/interhost.py multichr_mafft inFastas=['ref_genome/reference.fasta', 'data/02_assembly/2039_04.fasta', 'data/02_assembly/2040_04.fasta', 'data/02_assembly/1762_04.fasta', 'data/02_assembly/1243_2.fasta', 'data/02_assembly/875_04.fasta'] localpair=True globalpair=None preservecase=True reorder=None gapOpeningPenalty=1.53 ep=0.123 verbose=False outputAsClustal=None maxiters=1000 outDirectory=data/03_multialign_to_ref outFilePrefix=aligned sampleRelationFile=None sampleNameListFile=data/03_multialign_to_ref/sampleNameList.txt threads=15 loglevel=DEBUG tmp_dir=/tmp tmp_dirKeep=False
    2025-05-26 15:04:19,014 - cmd:209:main_argparse - DEBUG - using tempDir: /tmp/tmp-interhost-multichr_mafft-nuws9mhp
    2025-05-26 15:04:21,085 - __init__:445:_attempt_install - DEBUG - Currently installed version of mafft: 7.402-0
    2025-05-26 15:04:21,085 - __init__:448:_attempt_install - DEBUG - Expected version of mafft: 7.221
    2025-05-26 15:04:21,085 - __init__:449:_attempt_install - DEBUG - Incorrect version of mafft installed. Removing it...
    #DEBUG_11: TOOL_VERSION = "7.221" --> "7.402" in ~/Tools/viral-ngs_docker/bin/tools/mafft.py

Processing Data_Tam_RNAseq_2025_LB_vs_Mac_ATCC19606

  1. Targets

     Could you please assist me with processing RNA-seq data? The reference genome is CP059040. I aim to analyze the data using PCA, a Venn diagram, and KEGG and GO annotation enrichment analysis.
     The samples are labeled as follows (where 'x' indicates the replicate number):
    
         LB-AB-x
         LB-IJ-x
         LB-W1-x
         LB-WT19606-x
         LB-Y1-x
         Mac-AB-x
         Mac-IJ-x
         Mac-W1-x
         Mac-WT19606-x
         Mac-Y1-x
  2. Download the raw data

     ./lnd login -u X101SC25015922-Z02-J002 -p m*********5
     ./lnd list
     ./lnd cp -d oss://  ./
     ./lnd cp oss://CP2024102300053 .  #Error
     ./lnd list oss://CP2024102300053
     ./lnd cp -d oss://CP2024102300053/H101SC25015922/RSMR00204 .
     #CP2024102300053/H101SC25015922/RSMR00204/X101SC25015922-Z02/X101SC25015922-Z02-J002
  3. Prepare raw data

     mkdir raw_data; cd raw_data
    
     # Link every delivered paired-end FASTQ into the current directory, renaming
     # <group>-<n>_<1|2>.fq.gz to <group>-r<n>_R<1|2>.fq.gz.
     src=../RSMR00204/X101SC25015922-Z02/X101SC25015922-Z02-J002/01.RawData
     for lib in LB-AB-1 LB-AB-2 LB-AB-3  LB-IJ-1 LB-IJ-2 LB-IJ-4  LB-W1-1 LB-W1-2 LB-W1-3  LB-WT19606-2 LB-WT19606-3 LB-WT19606-4  LB-Y1-2 LB-Y1-3 LB-Y1-4 \
                Mac-AB-1 Mac-AB-2 Mac-AB-3  Mac-IJ-1 Mac-IJ-2 Mac-IJ-4  Mac-W1-1 Mac-W1-2 Mac-W1-3  Mac-WT19606-2 Mac-WT19606-3 Mac-WT19606-4  Mac-Y1-2 Mac-Y1-3 Mac-Y1-4; do
         group=${lib%-*}     # text before the last dash, e.g. LB-WT19606
         rep=${lib##*-}      # replicate number after the last dash
         for mate in 1 2; do
             ln -s ${src}/${lib}/${lib}_${mate}.fq.gz ${group}-r${rep}_R${mate}.fq.gz
         done
     done
  4. Preparing the directory trimmed

     mkdir trimmed trimmed_unpaired;
     # Run Trimmomatic in paired-end mode on every sample. Paired output is
     # written to trimmed/, unpaired output to trimmed_unpaired/, and the stderr
     # of the whole loop is collected in trimmomatic_pe.log.
     # (The loop is closed exactly once; a second 'done' is a shell syntax error.)
     for sample_id in LB-AB-r1 LB-AB-r2 LB-AB-r3  LB-IJ-r1 LB-IJ-r2 LB-IJ-r4  LB-W1-r1 LB-W1-r2 LB-W1-r3  LB-WT19606-r2 LB-WT19606-r3 LB-WT19606-r4  LB-Y1-r2 LB-Y1-r3 LB-Y1-r4    Mac-AB-r1 Mac-AB-r2 Mac-AB-r3  Mac-IJ-r1 Mac-IJ-r2 Mac-IJ-r4  Mac-W1-r1 Mac-W1-r2 Mac-W1-r3  Mac-WT19606-r2 Mac-WT19606-r3 Mac-WT19606-r4  Mac-Y1-r2 Mac-Y1-r3 Mac-Y1-r4; do
             java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 raw_data/${sample_id}_R1.fq.gz raw_data/${sample_id}_R2.fq.gz trimmed/${sample_id}_R1.fq.gz trimmed_unpaired/${sample_id}_R1.fq.gz trimmed/${sample_id}_R2.fq.gz trimmed_unpaired/${sample_id}_R2.fq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20;
     done 2> trimmomatic_pe.log
  5. Preparing samplesheet.csv

     sample,fastq_1,fastq_2,strandedness
     LB-AB-r1,LB-AB-r1_R1.fq.gz,LB-AB-r1_R2.fq.gz,auto
     LB-AB-r2,LB-AB-r2_R1.fq.gz,LB-AB-r2_R2.fq.gz,auto
     LB-AB-r3,LB-AB-r3_R1.fq.gz,LB-AB-r3_R2.fq.gz,auto
     LB-IJ-r1,LB-IJ-r1_R1.fq.gz,LB-IJ-r1_R2.fq.gz,auto
     LB-IJ-r2,LB-IJ-r2_R1.fq.gz,LB-IJ-r2_R2.fq.gz,auto
     LB-IJ-r4,LB-IJ-r4_R1.fq.gz,LB-IJ-r4_R2.fq.gz,auto
     LB-W1-r1,LB-W1-r1_R1.fq.gz,LB-W1-r1_R2.fq.gz,auto
     LB-W1-r2,LB-W1-r2_R1.fq.gz,LB-W1-r2_R2.fq.gz,auto
     LB-W1-r3,LB-W1-r3_R1.fq.gz,LB-W1-r3_R2.fq.gz,auto
     LB-WT19606-r2,LB-WT19606-r2_R1.fq.gz,LB-WT19606-r2_R2.fq.gz,auto
     LB-WT19606-r3,LB-WT19606-r3_R1.fq.gz,LB-WT19606-r3_R2.fq.gz,auto
     LB-WT19606-r4,LB-WT19606-r4_R1.fq.gz,LB-WT19606-r4_R2.fq.gz,auto
     LB-Y1-r2,LB-Y1-r2_R1.fq.gz,LB-Y1-r2_R2.fq.gz,auto
     LB-Y1-r3,LB-Y1-r3_R1.fq.gz,LB-Y1-r3_R2.fq.gz,auto
     LB-Y1-r4,LB-Y1-r4_R1.fq.gz,LB-Y1-r4_R2.fq.gz,auto
     Mac-AB-r1,Mac-AB-r1_R1.fq.gz,Mac-AB-r1_R2.fq.gz,auto
     Mac-AB-r2,Mac-AB-r2_R1.fq.gz,Mac-AB-r2_R2.fq.gz,auto
     Mac-AB-r3,Mac-AB-r3_R1.fq.gz,Mac-AB-r3_R2.fq.gz,auto
     Mac-IJ-r1,Mac-IJ-r1_R1.fq.gz,Mac-IJ-r1_R2.fq.gz,auto
     Mac-IJ-r2,Mac-IJ-r2_R1.fq.gz,Mac-IJ-r2_R2.fq.gz,auto
     Mac-IJ-r4,Mac-IJ-r4_R1.fq.gz,Mac-IJ-r4_R2.fq.gz,auto
     Mac-W1-r1,Mac-W1-r1_R1.fq.gz,Mac-W1-r1_R2.fq.gz,auto
     Mac-W1-r2,Mac-W1-r2_R1.fq.gz,Mac-W1-r2_R2.fq.gz,auto
     Mac-W1-r3,Mac-W1-r3_R1.fq.gz,Mac-W1-r3_R2.fq.gz,auto
     Mac-WT19606-r2,Mac-WT19606-r2_R1.fq.gz,Mac-WT19606-r2_R2.fq.gz,auto
     Mac-WT19606-r3,Mac-WT19606-r3_R1.fq.gz,Mac-WT19606-r3_R2.fq.gz,auto
     Mac-WT19606-r4,Mac-WT19606-r4_R1.fq.gz,Mac-WT19606-r4_R2.fq.gz,auto
     Mac-Y1-r2,Mac-Y1-r2_R1.fq.gz,Mac-Y1-r2_R2.fq.gz,auto
     Mac-Y1-r3,Mac-Y1-r3_R1.fq.gz,Mac-Y1-r3_R2.fq.gz,auto
     Mac-Y1-r4,Mac-Y1-r4_R1.fq.gz,Mac-Y1-r4_R2.fq.gz,auto
    
     #mv trimmed/* .
  6. nextflow run

     #Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/
     #docker pull nfcore/rnaseq
     ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq
    
     # ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ----
     (host_env) /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results    --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040_m.gff"        -profile docker -resume  --max_cpus 55 --max_memory 512.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon'    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'
  7. Import data and pca-plot

      #mamba activate r_env
    
     #install.packages("ggfun")
     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
     library(tximport)
     library(DESeq2)
     #library("org.Hs.eg.db")
     library(dplyr)
     library(tidyverse)
     #install.packages("devtools")
     #devtools::install_version("gtable", version = "0.3.0")
     library(gplots)
     library("RColorBrewer")
     #install.packages("ggrepel")
     library("ggrepel")
     # install.packages("openxlsx")
     library(openxlsx)
     library(EnhancedVolcano)
     library(DESeq2)
     library(edgeR)
    
     setwd("~/DATA/Data_Tam_RNAseq_2025_LB_vs_Mac_ATCC19606/results/star_salmon")
     # Define paths to your Salmon output quantification files
    
     # Sample directories produced by the pipeline, in the order used for colData
     sample_dirs <- c(
       "LB-AB-r1", "LB-AB-r2", "LB-AB-r3",
       "LB-IJ-r1", "LB-IJ-r2", "LB-IJ-r4",
       "LB-W1-r1", "LB-W1-r2", "LB-W1-r3",
       "LB-WT19606-r2", "LB-WT19606-r3", "LB-WT19606-r4",
       "LB-Y1-r2", "LB-Y1-r3", "LB-Y1-r4",
       "Mac-AB-r1", "Mac-AB-r2", "Mac-AB-r3",
       "Mac-IJ-r1", "Mac-IJ-r2", "Mac-IJ-r4",
       "Mac-W1-r1", "Mac-W1-r2", "Mac-W1-r3",
       "Mac-WT19606-r2", "Mac-WT19606-r3", "Mac-WT19606-r4",
       "Mac-Y1-r2", "Mac-Y1-r3", "Mac-Y1-r4"
     )
     # Named vector of quant.sf paths; each name is the directory name with the
     # dash before the replicate tag turned into an underscore
     # (e.g. "LB-AB-r1" -> "LB-AB_r1").
     files <- setNames(
       file.path(".", sample_dirs, "quant.sf"),
       sub("-(r[0-9]+)$", "_\\1", sample_dirs)
     )
    
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
     # Define the replicates and condition of the samples
     #replicate <- factor(c("r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3"))
     #adeA and adeB encode a membrane fusion protein that is part of the AdeABC efflux pump, which contributes to multidrug resistance.
     #System: Part of the AdeIJK efflux pump, which includes: adeI — membrane fusion protein, adeJ — RND transporter, adeK — outer membrane factor
     # One label per sample: ten medium/strain groups with three replicates each,
     # in the same order as the entries of 'files'
     condition <- factor(rep(
       c("LB-AB", "LB-IJ", "LB-W1", "LB-WT19606", "LB-Y1",
         "Mac-AB", "Mac-IJ", "Mac-W1", "Mac-WT19606", "Mac-Y1"),
       each = 3
     ))
     # Define the colData for DESeq2
     colData <- data.frame(condition=condition, row.names=names(files))
    
     # ------------------------
     # 1️⃣ Setup and input files
     # ------------------------
    
     # Read in transcript-to-gene mapping
     tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    
     # Prepare tx2gene for gene-level summarization (remove gene_name if needed)
     tx2gene_geneonly <- tx2gene[, c("transcript_id", "gene_id")]
    
     # -------------------------------
     # 2️⃣ Transcript-level counts
     # -------------------------------
     # Create DESeqDataSet directly from tximport (transcript-level)
     dds_tx <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     write.csv(counts(dds_tx), file="transcript_counts.csv")
    
     # --------------------------------
     # 3️⃣ Gene-level summarization
     # --------------------------------
     # Re-import Salmon data summarized at gene level
     txi_gene <- tximport(files, type="salmon", tx2gene=tx2gene_geneonly, txOut=FALSE)
    
     # Create DESeqDataSet for gene-level counts
     #dds <- DESeqDataSetFromTximport(txi_gene, colData=colData, design=~condition+replicate)
     dds <- DESeqDataSetFromTximport(txi_gene, colData=colData, design=~condition)
    
     # --------------------------------
     # 4️⃣ Raw counts table (with gene names)
     # --------------------------------
     # Extract raw gene-level counts
     counts_data <- as.data.frame(counts(dds, normalized=FALSE))
     counts_data$gene_id <- rownames(counts_data)
    
     # Add gene names
     tx2gene_unique <- unique(tx2gene[, c("gene_id", "gene_name")])
     counts_data <- merge(counts_data, tx2gene_unique, by="gene_id", all.x=TRUE)
    
     # Reorder columns: gene_id, gene_name, then counts
     count_cols <- setdiff(colnames(counts_data), c("gene_id", "gene_name"))
     counts_data <- counts_data[, c("gene_id", "gene_name", count_cols)]
    
     # --------------------------------
     # 5️⃣ Calculate CPM
     # --------------------------------
     library(edgeR)
     library(openxlsx)
    
     # Prepare count matrix for CPM calculation
     count_matrix <- as.matrix(counts_data[, !(colnames(counts_data) %in% c("gene_id", "gene_name"))])
    
     # Calculate CPM
     #cpm_matrix <- cpm(count_matrix, normalized.lib.sizes=FALSE)
     total_counts <- colSums(count_matrix)
     cpm_matrix <- t(t(count_matrix) / total_counts) * 1e6
     cpm_matrix <- as.data.frame(cpm_matrix)
    
     # Add gene_id and gene_name back to CPM table
     cpm_counts <- cbind(counts_data[, c("gene_id", "gene_name")], cpm_matrix)
    
     # --------------------------------
     # 6️⃣ Save outputs
     # --------------------------------
     write.csv(counts_data, "gene_raw_counts.csv", row.names=FALSE)
     write.xlsx(counts_data, "gene_raw_counts.xlsx", row.names=FALSE)
     write.xlsx(cpm_counts, "gene_cpm_counts.xlsx", row.names=FALSE)
    
     # -- (Optional) Save the rlog-transformed counts --
     dim(counts(dds))
     head(counts(dds), 10)
     rld <- rlogTransformation(dds)
     rlog_counts <- assay(rld)
     # write.xlsx() does not write row names unless asked to; request them so
     # every row of the exported table keeps its gene ID.
     write.xlsx(as.data.frame(rlog_counts), "gene_rlog_transformed_counts.xlsx", rowNames = TRUE)
    
     # ---- (Optional for NACHREIHEN) split the factors media and strain out of condition (for the comparison Mac vs LB) ----
     # AdeIJK vs. AdeABC Efflux Pumps
     #     * AdeIJK is the "housekeeping" pump — always active, broadly expressed, contributing to background resistance.
     #     * AdeABC is the "emergency" pump — induced under stress or mutations, more potent in contributing to clinical multidrug resistance.
     #LB = Luria-Bertani broth (a standard rich growth medium)
     #Mac = MacConkey agar or broth (selective for Gram-negative bacteria)
     # - Growth medium   Media or Condition, GrowthMedium
     # - Bacterial strain/genotype   Strain or Isolate, Genotype, SampleType
     # Growth medium per sample: the first 15 samples are LB, the last 15 are Mac
     media <- factor(rep(c("LB", "Mac"), each = 15))
     # Strain per sample: five strains x three replicates, repeated for each medium
     strain <- factor(rep(rep(c("AB", "IJ", "W1", "WT19606", "Y1"), each = 3), times = 2))
     # Define the colData for DESeq2
     colData <- data.frame(media=media, strain=strain, row.names=names(files))
     # -- transcript-level count data (x2) --
     # Create DESeqDataSet object
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~media+strain)
     #write.csv(counts(dds), file="transcript_counts_media_strain.csv")  #check correctness, it should be identical to transcript_counts.csv
     # -- gene-level count data (x2) --
     # Read in the tx2gene map from salmon_tx2gene.tsv
     tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(media=media, strain=strain, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~media+strain)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #3796->????
     #write.csv(counts(dds, normalized=FALSE), file="gene_counts_media_strain.csv")  #check correctness, it should be identical to gene_counts.csv
     # ---- (Optional for NACHREIHEN) END ----
    
     # -- pca --
     png("pca2.png", 1200, 800)
     plotPCA(rld, intgroup=c("condition"))
     dev.off()
     # -- heatmap --
     png("heatmap2.png", 1200, 800)
     distsRL <- dist(t(assay(rld)))
     mat <- as.matrix(distsRL)
     hc <- hclust(distsRL)
     hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
     heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
     dev.off()
    
     # -- pca_media_strain --
     # rld was derived from the ~condition dataset, so its colData has no 'media'
     # or 'strain' column and plotPCA() would reject them as intgroup. Attach both
     # factors first (they follow the same sample order as names(files)).
     rld$media <- media
     rld$strain <- strain
     # print() makes sure the plot is drawn into the png device even when this
     # code is run via source(), where top-level values are not auto-printed.
     png("pca_media.png", 1200, 800)
     print(plotPCA(rld, intgroup=c("media")))
     dev.off()
     png("pca_strain.png", 1200, 800)
     print(plotPCA(rld, intgroup=c("strain")))
     dev.off()
  8. (Optional; ERROR --> needs to be debugged!) Estimate size factors and dispersion values.

     #Size Factors: These are used to normalize the read counts across different samples. The size factor for a sample accounts for differences in sequencing depth (i.e., the total number of reads) and other technical biases between samples. After normalization with size factors, the counts should be comparable across samples. Size factors are usually calculated in a way that they reflect the median or mean ratio of gene expression levels between samples, assuming that most genes are not differentially expressed.
     #Dispersion: This refers to the variability or spread of gene expression measurements. In RNA-seq data analysis, each gene has its own dispersion value, which reflects how much the counts for that gene vary between different samples, more than what would be expected just due to the Poisson variation inherent in counting. Dispersion is important for accurately modeling the data and for detecting differentially expressed genes.
     #So in summary, size factors are specific to samples (used to make counts comparable across samples), and dispersion values are specific to genes (reflecting variability in gene expression).
    
     sizeFactors(dds)
     #NULL
     # Estimate size factors
     dds <- estimateSizeFactors(dds)
     # Estimate dispersions
     dds <- estimateDispersions(dds)
     #> sizeFactors(dds)
    
     #control_r1 control_r2  HSV.d2_r1  HSV.d2_r2  HSV.d4_r1  HSV.d4_r2  HSV.d6_r1
     #2.3282468  2.0251928  1.8036883  1.3767551  0.9341929  1.0911693  0.5454526
     #HSV.d6_r2  HSV.d8_r1  HSV.d8_r2
     #0.4604461  0.5799834  0.6803681
    
     # (DEBUG) If avgTxLength is Necessary
     #To simplify the computation and ensure sizeFactors are calculated:
     assays(dds)$avgTxLength <- NULL
     dds <- estimateSizeFactors(dds)
     sizeFactors(dds)
     #If you want to retain avgTxLength but suspect it is causing issues, you can compute plain median-of-ratios size factors directly from the count matrix, i.e. without the average-transcript-length correction:
     # (estimateSizeFactors() has no 'use' argument and does not accept controlGenes = NULL, so that call fails.)
     sizeFactors(dds) <- estimateSizeFactorsForMatrix(counts(dds))
     sizeFactors(dds)
    
     # When run on the virus data alone, the following BUG occurred:
     #Still NULL --> BUG --> using manual calculation method for sizeFactor calculation!
     #                    HeLa_TO_r1                      HeLa_TO_r2
     #                    0.9978755                       1.1092227
     data.frame(genes = rownames(dds), dispersions = dispersions(dds))
    
     #Given the raw counts, the control_r1 and control_r2 samples seem to have a much lower sequencing depth (total read count) than the other samples. Therefore, when normalization methods are applied, the normalization factors for these control samples will be relatively high, boosting the normalized counts.
     # 1/0.9978755 = 1.002129023
     # 1/1.1092227 = 0.901532217
     #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220
    
     raw_counts <- counts(dds)
     normalized_counts <- counts(dds, normalized=TRUE)
     #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA)
     #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
     #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
     # Compute median-of-ratios size factors by hand.
     #
     # cds: object from which counts(cds) returns a genes x samples count matrix.
     # Returns a numeric vector with one scaling factor per column (sample): the
     # median, over genes, of each count divided by that gene's geometric mean
     # across samples. Genes with a geometric mean of 0 (at least one zero count)
     # are ignored.
     estimSf <- function (cds){
         # Get the count matrix
         cts <- counts(cds)
         # Geometric mean, computed in log space: prod(x) overflows to Inf for
         # large counts or many samples, which would turn every ratio into 0.
         # A zero count gives log(0) = -Inf and therefore a geometric mean of 0,
         # exactly as the plain product would.
         geomMean <- function(x) exp(mean(log(x)))
         # Compute the geometric mean of every row (gene)
         gm.mean  <-  apply(cts, 1, geomMean)
         # Zero values are set to NA (avoids a subsequent division by 0)
         gm.mean[gm.mean == 0] <- NA
         # Divide each row by its corresponding geometric mean
         # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
         # MARGIN: 1 or 2 (rows or columns)
         # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
         # FUN: the function to be applied
         cts <- sweep(cts, 1, gm.mean, FUN="/")
         # Compute the median over the columns, skipping the NA rows
         med <- apply(cts, 2, median, na.rm=TRUE)
         # Return the scaling factor
         return(med)
     }
     #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
     #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
     #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
     #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
     #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
     #DESeq2’s median of ratios [1]
     #EdgeR’s trimmed mean of M values (TMM) [2]
     #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html  #very good website!
     test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
     sum(test_normcount != normalized_counts)
  9. Select the differentially expressed genes

     #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
     #https://www.biostars.org/p/282295/
     #https://www.biostars.org/p/335751/
     #> dds$condition
     #LB-AB       LB-IJ       LB-W1       LB-WT19606  LB-Y1       Mac-AB     Mac-IJ      Mac-W1      Mac-WT19606 Mac-Y1
     #CONSOLE: mkdir star_salmon/degenes
    
     setwd("degenes")
     #---- relevel to control ----
     dds$condition <- relevel(dds$condition, "LB-WT19606")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("LB.AB_vs_LB.WT19606","LB.IJ_vs_LB.WT19606","LB.W1_vs_LB.WT19606","LB.Y1_vs_LB.WT19606")
    
     dds$condition <- relevel(dds$condition, "Mac-WT19606")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("Mac.AB_vs_Mac.WT19606","Mac.IJ_vs_Mac.WT19606","Mac.W1_vs_Mac.WT19606","Mac.Y1_vs_Mac.WT19606")
    
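     # - If the experiment focuses on bacterial growth, gene expression or general behaviour without selective pressure, LB is the better control.
     # - If you want to study bacterial behaviour under selective pressure (e.g. targeting Gram-negative bacteria, testing antibiotic resistance or distinguishing lactose fermenters), MacConkey is the more suitable control.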
     dds$media <- relevel(dds$media, "LB")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("Mac_vs_LB")
    
     dds$media <- relevel(dds$media, "Mac")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("LB_vs_Mac")
    
     # Export the DESeq2 results for every comparison in 'clist': the full table
     # sorted by p-value, plus the up- and down-regulated subsets
     # (padj <= 0.05 and log2FoldChange >= 2 or <= -2).
     for (i in clist) {
       # Coefficients are named "<factor>_<comparison>". Look the name up for both
       # factors used in this analysis instead of hard-coding one prefix, so the
       # same loop serves the media comparisons and the condition comparisons.
       contrast <- intersect(paste(c("media", "condition"), i, sep="_"), resultsNames(dds))[1]
       if (is.na(contrast)) {
         stop("No coefficient for '", i, "' in resultsNames(dds); re-run DESeq() with the matching reference level", call. = FALSE)
       }
       res <- results(dds, name=contrast)
       # Drop genes for which no fold change was estimated
       res <- res[!is.na(res$log2FoldChange),]
       res_df <- as.data.frame(res)

       write.csv(res_df[order(res_df$pvalue),], file = paste(i, "all.txt", sep="-"))
       # subset() also discards rows whose padj is NA
       up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
       down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
       write.csv(up[order(up$log2FoldChange,decreasing=TRUE),], file = paste(i, "up.txt", sep="-"))
       write.csv(down[order(abs(down$log2FoldChange),decreasing=TRUE),], file = paste(i, "down.txt", sep="-"))
     }
    
     # -- Under host-env --
     grep -P "\tgene\t" CP059040.gff > CP059040_gene.gff
     # Run replace_gene_names.py on the all/up/down table of every comparison,
     # turning <comparison>-<set>.txt into <comparison>-<set>.csv.
     gene_gff=/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040_gene.gff
     for comparison in LB.AB_vs_LB.WT19606 LB.IJ_vs_LB.WT19606 LB.W1_vs_LB.WT19606 LB.Y1_vs_LB.WT19606 \
                       Mac.AB_vs_Mac.WT19606 Mac.IJ_vs_Mac.WT19606 Mac.W1_vs_Mac.WT19606 Mac.Y1_vs_Mac.WT19606 \
                       Mac_vs_LB LB_vs_Mac; do
         for gene_set in all up down; do
             python3 ~/Scripts/replace_gene_names.py ${gene_gff} ${comparison}-${gene_set}.txt ${comparison}-${gene_set}.csv
         done
     done
    
     # ---- Mac_vs_LB ----
     # Export the Mac vs LB results to Excel and draw the volcano plot.
     res <- read.csv("Mac_vs_LB-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]
     #print(duplicated_genes)
     # [1] "bfr"  "lipA" "ahpF" "pcaF" "alr"  "pcaD" "cydB" "lpdA" "pgaC" "ppk1"
     #[11] "pcaF" "tuf"  "galE" "murI" "yccS" "rrf"  "rrf"  "arsB" "ptsP" "umuD"
     #[21] "map"  "pgaB" "rrf"  "rrf"  "rrf"  "pgaD" "uraH" "benE"
     #res[res$GeneName == "bfr", ]

     #1st_strategy First occurrence is kept and Subsequent duplicates are removed
     #res <- res[!duplicated(res$GeneName), ]
     #2nd_strategy keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_Mac_vs_LB.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-150 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-150

     #library(EnhancedVolcano)
     png("Mac_vs_LB.png", width=1200, height=2000)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("Mac versus LB"))
     )
     dev.off()
    
     # ---- LB.AB_vs_LB.WT19606 ----
     # Export the LB.AB vs LB.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("LB.AB_vs_LB.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_LB.AB_vs_LB.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("LB.AB_vs_LB.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("LB.AB versus LB.WT19606"))
     )
     dev.off()
    
     # ---- LB.IJ_vs_LB.WT19606 ----
     # Export the LB.IJ vs LB.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("LB.IJ_vs_LB.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_LB.IJ_vs_LB.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("LB.IJ_vs_LB.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("LB.IJ versus LB.WT19606"))
     )
     dev.off()
    
     # ---- LB.W1_vs_LB.WT19606 ----
     # Export the LB.W1 vs LB.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("LB.W1_vs_LB.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_LB.W1_vs_LB.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("LB.W1_vs_LB.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("LB.W1 versus LB.WT19606"))
     )
     dev.off()
    
     # ---- LB.Y1_vs_LB.WT19606 ----
     # Export the LB.Y1 vs LB.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("LB.Y1_vs_LB.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_LB.Y1_vs_LB.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("LB.Y1_vs_LB.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("LB.Y1 versus LB.WT19606"))
     )
     dev.off()
    
     # ---- Mac.AB_vs_Mac.WT19606 ----
     # Export the Mac.AB vs Mac.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("Mac.AB_vs_Mac.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_Mac.AB_vs_Mac.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("Mac.AB_vs_Mac.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("Mac.AB versus Mac.WT19606"))
     )
     dev.off()
    
     # ---- Mac.IJ_vs_Mac.WT19606 ----
     # Export the Mac.IJ vs Mac.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("Mac.IJ_vs_Mac.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_Mac.IJ_vs_Mac.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("Mac.IJ_vs_Mac.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("Mac.IJ versus Mac.WT19606"))
     )
     dev.off()
    
     # ---- Mac.W1_vs_Mac.WT19606 ----
     # Export the Mac.W1 vs Mac.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("Mac.W1_vs_Mac.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_Mac.W1_vs_Mac.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("Mac.W1_vs_Mac.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("Mac.W1 versus Mac.WT19606"))
     )
     dev.off()
    
     # ---- Mac.Y1_vs_Mac.WT19606 ----
     # Export the Mac.Y1 vs Mac.WT19606 results to Excel and draw the volcano plot.
     res <- read.csv("Mac.Y1_vs_Mac.WT19606-all.csv")
     # Replace empty GeneName with modified GeneID ("gene-" prefix removed)
     res$GeneName <- ifelse(
       res$GeneName == "" | is.na(res$GeneName),
       gsub("gene-", "", res$GeneID),
       res$GeneName
     )
     # Gene names that occur more than once (kept for manual inspection)
     duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]

     # Keep the row with the smallest padj value for each GeneName
     res <- res %>%
       group_by(GeneName) %>%
       slice_min(padj, with_ties = FALSE) %>%
       ungroup()
     res <- as.data.frame(res)
     # Sort res first by padj (ascending) and then by log2FoldChange (descending)
     res <- res[order(res$padj, -res$log2FoldChange), ]

     # Up-/down-regulated genes: |log2FoldChange| > 2 and padj < 1e-2.
     # which() discards tests that evaluate to NA (e.g. a missing padj); plain
     # logical indexing would add an all-NA row to the sheet for each of them.
     up_regulated <- res[which(res$log2FoldChange > 2 & res$padj < 1e-2), ]
     down_regulated <- res[which(res$log2FoldChange < -2 & res$padj < 1e-2), ]

     # Write the complete table plus both subsets as three sheets of one workbook
     wb <- createWorkbook()
     addWorksheet(wb, "Complete_Data")
     writeData(wb, "Complete_Data", res)
     addWorksheet(wb, "Up_Regulated")
     writeData(wb, "Up_Regulated", up_regulated)
     addWorksheet(wb, "Down_Regulated")
     writeData(wb, "Down_Regulated", down_regulated)
     saveWorkbook(wb, "Gene_Expression_Mac.Y1_vs_Mac.WT19606.xlsx", overwrite = TRUE)

     # Use GeneName as row names (they label the points in the plot) and drop
     # the now redundant column
     rownames(res) <- res$GeneName
     res$GeneName <- NULL
     head(res)

     ## Ensure the data frame matches the expected format
     ## For example, it should have columns: log2FoldChange, padj, etc.
     #res <- as.data.frame(res)
     ## Remove rows with NA in log2FoldChange (if needed)
     #res <- res[!is.na(res$log2FoldChange),]

     # Replace padj = 0 with a small value.
     # NOTE(review): non-zero padj values below 1e-12 would plot above these
     # genes - confirm the floor suits this comparison.
     res$padj[res$padj == 0] <- 1e-12

     #library(EnhancedVolcano)
     png("Mac.Y1_vs_Mac.WT19606.png", width=1200, height=1200)
     #max.overlaps = 10
     # print() renders the returned plot even when this code is run via source()
     # or inside a function, where top-level auto-printing does not happen.
     print(
       EnhancedVolcano(res,
                       lab = rownames(res),
                       x = 'log2FoldChange',
                       y = 'padj',
                       pCutoff = 1e-2,
                       FCcutoff = 2,
                       title = '',
                       subtitleLabSize = 18,
                       pointSize = 3.0,
                       labSize = 5.0,
                       colAlpha = 1,
                       legendIconSize = 4.0,
                       drawConnectors = TRUE,
                       widthConnectors = 0.5,
                       colConnectors = 'black',
                       subtitle = expression("Mac.Y1 versus Mac.WT19606"))
     )
     dev.off()
    
     #TODO: annotate the Gene_Expression_xxx_vs_yyy.xlsx
  10. Cluster the genes and draw a heatmap

     #http://xgenes.com/article/article-content/150/draw-venn-diagrams-using-matplotlib/
     #http://xgenes.com/article/article-content/276/go-terms-for-s-epidermidis/
    
     # Print (not execute) the cut commands that save the first comma-separated
     # column of every -up.txt / -down.txt table into a matching -up.id / -down.id file.
     for i in Mac_vs_LB LB.AB_vs_LB.WT19606 LB.IJ_vs_LB.WT19606 LB.W1_vs_LB.WT19606 LB.Y1_vs_LB.WT19606 Mac.AB_vs_Mac.WT19606 Mac.IJ_vs_Mac.WT19606 Mac.W1_vs_Mac.WT19606 Mac.Y1_vs_Mac.WT19606; do
       for direction in up down; do
         printf '%s\n' "cut -d',' -f1-1 ${i}-${direction}.txt > ${i}-${direction}.id"
       done
     done
     #5 LB.AB_vs_LB.WT19606-down.id
     #20 LB.AB_vs_LB.WT19606-up.id
     #64 LB.IJ_vs_LB.WT19606-down.id
     #69 LB.IJ_vs_LB.WT19606-up.id
     #23 LB.W1_vs_LB.WT19606-down.id
     #97 LB.W1_vs_LB.WT19606-up.id
     #9 LB.Y1_vs_LB.WT19606-down.id
     #20 LB.Y1_vs_LB.WT19606-up.id
     #20 Mac.AB_vs_Mac.WT19606-down.id
     #29 Mac.AB_vs_Mac.WT19606-up.id
     #65 Mac.IJ_vs_Mac.WT19606-down.id
     #197 Mac.IJ_vs_Mac.WT19606-up.id
     #359 Mac_vs_LB-down.id
     #308 Mac_vs_LB-up.id
     #290 Mac.W1_vs_Mac.WT19606-down.id
     #343 Mac.W1_vs_Mac.WT19606-up.id
     #75 Mac.Y1_vs_Mac.WT19606-down.id
     #0 Mac.Y1_vs_Mac.WT19606.png-down.id
     #0 Mac.Y1_vs_Mac.WT19606.png-up.id
     #68 Mac.Y1_vs_Mac.WT19606-up.id
     #2061 total
    
     # Merge all .id files into one sorted, de-duplicated list
     sort -u *.id > ids
     # Manual clean-up of the file `ids`: delete the "GeneName" entry,
     # add Gene_Id as the first line, and delete the "" entry.
     # Note: GeneID is used as the index rather than GeneName, since the .txt files contain only GeneID.
     # Draw a row-clustered heatmap of all genes listed in `ids`.
     # Gene IDs of interest are read from the Gene_Id column of `ids`.
     GOI <- read.csv("ids")$Gene_Id    #1329
     # Expression matrix extracted from `rld`
     # (presumably the rlog-transformed DESeq2 object created earlier - TODO confirm)
     RNASeq.NoCellLine <- assay(rld)
     #install.packages("gplots")
     library("gplots")
     #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
     # drop = FALSE keeps a matrix even if only a single gene is selected
     datamat <- RNASeq.NoCellLine[GOI, , drop = FALSE]
     #datamat = RNASeq.NoCellLine
     write.csv(as.data.frame(datamat), file ="DEGs_heatmap_expression_data.txt")

     # Rows with zero variance make the Pearson correlation undefined; drop them
     constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
     if (any(constant_rows)) {
       cat("Removing", sum(constant_rows), "constant rows.\n")
       datamat <- datamat[!constant_rows, , drop = FALSE]
     }
     # Rows (genes): 1 - Pearson correlation; columns (samples): 1 - Spearman correlation
     hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
     # Cut the gene dendrogram at a fixed fraction of its total height
     mycl <- cutree(hr, h=max(hr$height)/1.15)
     # One colour per cluster id. "LIGHTRED" is not a valid R colour name and the
     # second "MAGENTA" made clusters 4 and 11 indistinguishable, so both were
     # replaced by valid, distinct colours.
     mycol <- c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "PURPLE", "LIGHTCYAN", "LIGHTCORAL", "LIGHTGREEN")
     # Map every gene to the colour of its cluster
     mycol <- mycol[as.vector(mycl)]
     #png("DEGs_heatmap.png", width=900, height=800)
     #cex.lab=10, labRow="",
     png("DEGs_heatmap.png", width=1200, height=1000)
     heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',labRow="",
                 scale='row',trace='none',col=bluered(75), cexCol=1.8,
                 RowSideColors = mycol, margins=c(10,2), cexRow=1.5, srtCol=30, lhei = c(1, 8), lwid=c(2, 8))  #rownames(datamat)
     #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
     dev.off()
     #### cluster members #####
     # Write the gene ids of clusters 1..5 (as assigned in `mycl`) to one file each;
     # the position in this vector is the cluster id.
     cluster_files <- c('cluster1_YELLOW.txt', 'cluster2_DARKBLUE.txt', 'cluster3_DARKORANGE.txt',
                        'cluster4_DARKMAGENTA.txt', 'cluster5_DARKCYAN.txt')
     for (k in seq_along(cluster_files)) {
       write.csv(names(mycl)[which(mycl == k)], file = cluster_files[k])
     }
     #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
     #~/Tools/csv2xls-0.4/csv_to_xls.py DEGs_heatmap_expression_data.txt -d',' -o DEGs_heatmap_expression_data.xls;
    
     #### (NOT_WORKING) cluster members (adding annotations; note that it does not work for the bacteria, since it is not a model species and we cannot use mart=ensembl) #####
     # Relies on getBM() and an `ensembl` mart object that must be defined elsewhere.
     # Each cluster is now annotated and written separately; previously `data` was
     # overwritten five times, so only cluster 5 was annotated and the same table
     # was written to all five CSV files.
     annotation_attributes <- c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description')
    
     # Return one row for `gene`: the annotation columns followed by the gene's
     # expression values taken from `data` (a data frame with gene ids as row names).
     annotate_gene <- function(gene, data) {
         result <- getBM(attributes = annotation_attributes,
                         filters = 'ensembl_gene_id',
                         values = gene,
                         mart = ensembl)
         # If multiple rows are returned, take the first one
         if (nrow(result) > 1) {
             result <- result[1, ]
         }
         # No hit: keep the gene with empty annotation fields
         if (nrow(result) == 0) {
             result <- data.frame(ensembl_gene_id = gene,
                                 external_gene_name = NA,
                                 gene_biotype = NA,
                                 entrezgene_id = NA,
                                 chromosome_name = NA,
                                 start_position = NA,
                                 end_position = NA,
                                 strand = NA,
                                 description = NA)
         }
         # Combine gene information and expression data
         cbind(result, data[gene, , drop = FALSE])
     }
    
     # Cluster id -> output file
     cluster_outputs <- c('1' = "cluster1_YELLOW.csv",
                          '2' = "cluster2_DARKBLUE.csv",
                          '3' = "cluster3_DARKORANGE.csv",
                          '4' = "cluster4_DARKMAGENTA.csv",
                          '5' = "cluster5_DARKCYAN.csv")
     for (cluster_id in names(cluster_outputs)) {
         cluster_genes <- names(subset(mycl, mycl == cluster_id))
         # drop = FALSE keeps a one-gene cluster as a one-row table
         data <- as.data.frame(datamat[rownames(datamat) %in% cluster_genes, , drop = FALSE])
         total_genes <- nrow(data)
         # Collect the rows in a preallocated list instead of growing a data frame
         annotated_rows <- vector("list", total_genes)
         for (i in seq_len(total_genes)) {
             annotated_rows[[i]] <- annotate_gene(rownames(data)[i], data)
             # Print progress every 100 genes
             if (i %% 100 == 0) {
                 cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
             }
         }
         annotated_data <- do.call(rbind, annotated_rows)
         if (is.null(annotated_data)) {
             annotated_data <- data.frame()  # empty cluster
         }
         # Save the annotated data of this cluster to its own CSV file
         write.csv(annotated_data, cluster_outputs[[cluster_id]], row.names=FALSE)
     }
     #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls

KEGG and GO annotations in non-model organisms

https://www.biobam.com/functional-analysis/

Blast2GO_workflow

  1. Assign KEGG and GO Terms (see diagram above)

    Since your organism is non-model, standard R databases (org.Hs.eg.db, etc.) won’t work. You’ll need to manually retrieve KEGG and GO annotations.

    Option 1 (KEGG Terms): EggNog based on orthology and phylogenies

     EggNOG-mapper assigns both KEGG Orthology (KO) IDs and GO terms.
    
     Install EggNOG-mapper:
    
         mamba create -n eggnog_env python=3.8 eggnog-mapper -c conda-forge -c bioconda  #eggnog-mapper_2.1.12
         mamba activate eggnog_env
    
     Run annotation:
    
         #diamond makedb --in eggnog6.prots.faa -d eggnog_proteins.dmnd
         mkdir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
         download_eggnog_data.py --dbname eggnog.db -y --data_dir /home/jhuang/mambaforge/envs/eggnog_env/lib/python3.8/site-packages/data/
         #NOT_WORKING: emapper.py -i CP059040_gene.fasta -o eggnog_dmnd_out --cpu 60 -m diamond[hmmer,mmseqs] --dmnd_db /home/jhuang/REFs/eggnog_data/data/eggnog_proteins.dmnd
         python ~/Scripts/update_fasta_header.py CP059040_protein_.fasta CP059040_protein.fasta
         emapper.py -i CP059040_protein.fasta -o eggnog_out --cpu 60 --resume
         #----> result annotations.tsv: Contains KEGG, GO, and other functional annotations.
         #---->  470.IX87_14445:
             * 470 likely refers to the organism or strain (e.g., Acinetobacter baumannii ATCC 19606 or another related strain).
             * IX87_14445 would refer to a specific gene or protein within that genome.
    
     Extract KEGG KO IDs from annotations.emapper.annotations.

    Option 2 (GO Terms from ‘Blast2GO 5 Basic’, saved in blast2go_annot.annot): Using Blast/Diamond + Blast2GO_GUI based on sequence alignment + GO mapping

    • ‘Load protein sequences’ (Tags: NONE, generated columns: Nr, SeqName) –>
    • Buttons ‘blast’ (Tags: BLASTED, generated columns: Description, Length, #Hits, e-Value, sim mean),
    • Button ‘mapping’ (Tags: MAPPED, generated columns: #GO, GO IDs, GO Names), “Mapping finished – Please proceed now to annotation.”
    • Button ‘annot’ (Tags: ANNOTATED, generated columns: Enzyme Codes, Enzyme Names), “Annotation finished.”
      • Used parameter ‘Annotation CutOff’: The Blast2GO Annotation Rule seeks to find the most specific GO annotations with a certain level of reliability. An annotation score is calculated for each candidate GO, which is composed of the sequence similarity of the Blast Hit, the evidence code of the source GO and the position of the particular GO in the Gene Ontology hierarchy. This annotation score cutoff selects the most specific GO term for a given GO branch which lies above this value.
      • Used parameter ‘GO Weight’ is a value which is added to Annotation Score of a more general/abstract Gene Ontology term for each of its more specific, original source GO terms. In this case, more general GO terms which summarise many original source terms (those ones directly associated to the Blast Hits) will have a higher Annotation Score.

    or blast2go_cli_v1.5.1 (NOT_USED)

         #https://help.biobam.com/space/BCD/2250407989/Installation
         #see ~/Scripts/blast2go_pipeline.sh

    Option 3 (GO Terms from ‘Blast2GO 5 Basic’, saved in blast2go_annot.annot2): Interpro based protein families / domains –> Button interpro

    • Button ‘interpro’ (Tags: INTERPRO, generated columns: InterPro IDs, InterPro GO IDs, InterPro GO Names) –> “InterProScan Finished – You can now merge the obtained GO Annotations.”

    MERGE the results of InterPro GO IDs (Option 3) to GO IDs (Option 2) and generate final GO IDs

    • Button ‘interpro’/’Merge InterProScan GOs to Annotation’ –> “Merge (add and validate) all GO terms retrieved via InterProScan to the already existing GO annotation.” –> “Finished merging GO terms from InterPro with annotations. Maybe you want to run ANNEX (Annotation Augmentation).” #* Button ‘annot’/’ANNEX’ –> “ANNEX finished. Maybe you want to do the next step: Enzyme Code Mapping.”
     #-- before merging (blast2go_annot.annot) --
     #H0N29_18790     GO:0004842      ankyrin repeat domain-containing protein
     #H0N29_18790     GO:0085020
     #-- after merging (blast2go_annot.annot2) -->
     #H0N29_18790     GO:0031436      ankyrin repeat domain-containing protein
     #H0N29_18790     GO:0070531
     #H0N29_18790     GO:0004842
     #H0N29_18790     GO:0005515
     #H0N29_18790     GO:0085020

    Option 4 (NOT_USED): RFAM for non-coding RNA

    Option 5 (NOT_USED): PSORTb for subcellular localizations

    Option 6 (NOT_USED): KAAS (KEGG Automatic Annotation Server)

    * Go to KAAS
    * Upload your FASTA file.
    * Select an appropriate gene set.
    * Download the KO assignments.
  2. Find the Closest KEGG Organism Code (NOT_USED)

    Since your species isn’t directly in KEGG, use a closely related organism.

    * Check available KEGG organisms:
    
         library(clusterProfiler)
         library(KEGGREST)
    
         kegg_organisms <- keggList("organism")
    
         # Pick the closest relative (e.g., zebrafish "dre" for fish, Arabidopsis "ath" for plants).
    
         # Search for Acinetobacter in the list
         grep("Acinetobacter", kegg_organisms, ignore.case = TRUE, value = TRUE)
         # Gammaproteobacteria
         #Extract KO IDs from the eggnog results for  "Acinetobacter baumannii strain ATCC 19606"
  3. Find the Closest KEGG Organism for a Non-Model Species

    If your organism is not in KEGG, search for the closest relative:

         grep("fish", kegg_organisms, ignore.case = TRUE, value = TRUE)  # Example search

    For KEGG pathway enrichment in non-model species, use “ko” instead of a species code (this code has been integrated into point 4):

         kegg_enrich <- enrichKEGG(gene = gene_list, organism = "ko")  # "ko" = KEGG Orthology
  4. Perform KEGG and GO Enrichment in R (under dir ~/DATA/Data_Tam_RNAseq_2025_LB_vs_Mac_ATCC19606/results/star_salmon/degenes)

         #BiocManager::install("GO.db")
         #BiocManager::install("AnnotationDbi")
    
         # Load required libraries
         library(openxlsx)  # For Excel file handling
         library(dplyr)     # For data manipulation
         library(tidyr)
         library(stringr)
         library(clusterProfiler)  # For KEGG and GO enrichment analysis
         #library(org.Hs.eg.db)  # Replace with appropriate organism database
         library(GO.db)
         library(AnnotationDbi)
    
         setwd("~/DATA/Data_Tam_RNAseq_2025_LB_vs_Mac_ATCC19606/results/star_salmon/degenes")
         # PREPARING go_terms and ec_terms: annot_* file: cut -f1-2 -d$'\t' blast2go_annot.annot2 > blast2go_annot.annot2_
         # Step 1: Load the blast2go annotation file (fill = TRUE pads short lines)
         annot_df <- read.table("/home/jhuang/b2gWorkspace_Tam_RNAseq_2024/blast2go_annot.annot2_",
                             header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
    
         # The `cut -f1-2` step above leaves exactly 2 columns: gene id and GO/EC term
         colnames(annot_df) <- c("GeneID", "Term")
         # Step 2: Collapse all GO terms and all EC numbers of a gene into one
         # comma-separated string per gene
         go_terms <- annot_df %>%
         filter(grepl("^GO:", Term)) %>%
         group_by(GeneID) %>%
         summarize(GOs = paste(Term, collapse = ","), .groups = "drop")
         ec_terms <- annot_df %>%
         filter(grepl("^EC:", Term)) %>%
         group_by(GeneID) %>%
         summarize(EC = paste(Term, collapse = ","), .groups = "drop")
    
         # Load the results (activate exactly one comparison per run)
         #res <- read.csv("Mac_vs_LB-all.csv")     #up307, down358
         #res <- read.csv("LB.AB_vs_LB.WT19606-all.csv")     #up307, down358
         #res <- read.csv("LB.IJ_vs_LB.WT19606-all.csv")     #up307, down358
         #res <- read.csv("LB.W1_vs_LB.WT19606-all.csv")     #up307, down358
         #res <- read.csv("LB.Y1_vs_LB.WT19606-all.csv")     #up307, down358
         #res <- read.csv("Mac.AB_vs_Mac.WT19606-all.csv")     #up307, down358
         #res <- read.csv("Mac.IJ_vs_Mac.WT19606-all.csv")     #up307, down358
         #res <- read.csv("Mac.W1_vs_Mac.WT19606-all.csv")     #up307, down358
         res <- read.csv("Mac.Y1_vs_Mac.WT19606-all.csv")     #up307, down358
    
         # Replace empty GeneName with modified GeneID
         res$GeneName <- ifelse(
             res$GeneName == "" | is.na(res$GeneName),
             gsub("gene-", "", res$GeneID),
             res$GeneName
         )
    
         # Names occurring more than once (kept for inspection only; not used below)
         duplicated_genes <- res[duplicated(res$GeneName), "GeneName"]
    
         # Remove duplicated genes by keeping, per GeneName, the row with the smallest padj
         res <- res %>%
         group_by(GeneName) %>%
         slice_min(padj, with_ties = FALSE) %>%
         ungroup()
    
         res <- as.data.frame(res)
         # Sort res first by padj (ascending) and then by log2FoldChange (descending)
         res <- res[order(res$padj, -res$log2FoldChange), ]
         # Read eggnog annotations
         eggnog_data <- read.delim("~/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_ATCC19606/eggnog_out.emapper.annotations.txt", header = TRUE, sep = "\t")
         # Remove the "gene-" prefix from GeneID in res to match eggnog 'query' format
         res$GeneID <- gsub("gene-", "", res$GeneID)
         # Merge eggnog data with res based on GeneID
         res <- res %>% left_join(eggnog_data, by = c("GeneID" = "query"))
    
         # Add the blast2go GO/EC terms; both tables carry EC and GOs columns, so the
         # eggnog versions (.x) are dropped and the blast2go versions (.y) are kept
         res_updated <- res %>%
         left_join(go_terms, by = "GeneID") %>%
         left_join(ec_terms, by = "GeneID") %>% dplyr::select(-EC.x, -GOs.x) %>% dplyr::rename(EC = EC.y, GOs = GOs.y)
    
         # which() drops comparisons that evaluate to NA (e.g. genes without padj);
         # indexing with the raw logical vector would insert all-NA rows instead.
         # Filter up-regulated genes
         up_regulated <- res_updated[which(res_updated$log2FoldChange > 2 & res_updated$padj < 0.01), ]
         # Filter down-regulated genes
         down_regulated <- res_updated[which(res_updated$log2FoldChange < -2 & res_updated$padj < 0.01), ]
    
         # Create a new workbook
         wb <- createWorkbook()
         # Add the complete dataset as the first sheet (with annotations)
         addWorksheet(wb, "Complete_Data")
         writeData(wb, "Complete_Data", res_updated)
         # Add the up-regulated genes as the second sheet (with annotations)
         addWorksheet(wb, "Up_Regulated")
         writeData(wb, "Up_Regulated", up_regulated)
         # Add the down-regulated genes as the third sheet (with annotations)
         addWorksheet(wb, "Down_Regulated")
         writeData(wb, "Down_Regulated", down_regulated)
         # Save the workbook to a file (renamed per comparison afterwards, see the notes below)
         saveWorkbook(wb, "Gene_Expression_with_Annotations_Urine_vs_MHB.xlsx", overwrite = TRUE)
    
         # Set GeneName as row names after the join
         rownames(res_updated) <- res_updated$GeneName
         res_updated <- res_updated %>% dplyr::select(-GeneName)
         ## Set the 'GeneName' column as row.names
         #rownames(res_updated) <- res_updated$GeneName
         ## Drop the 'GeneName' column since it's now the row names
         #res_updated$GeneName <- NULL
         # -- BREAK_1 --
    
         # ---- KEGG enrichment helpers (shared by the up- and down-regulated sets) ----
         # Turn the KEGG_ko column ("ko:K00001,ko:K00002"; "-" or NA = no assignment)
         # into a unique vector of bare K numbers. Previously multi-KO strings were
         # passed to enrichKEGG() unsplit and therefore never matched a KO id.
         extract_ko_ids <- function(kegg_ko) {
             kegg_ko <- as.character(kegg_ko)
             ko_ids <- unlist(strsplit(gsub("ko:", "", kegg_ko[!is.na(kegg_ko)]), ",", fixed = TRUE))
             ko_ids <- trimws(as.character(ko_ids))
             unique(ko_ids[ko_ids != "" & ko_ids != "-"])
         }
    
         # Run the KO-based enrichment; returns NULL when there is nothing to test.
         run_ko_enrichment <- function(ko_ids) {
             if (length(ko_ids) == 0) {
                 return(NULL)
             }
             enrichKEGG(gene = ko_ids, organism = 'ko')  # "ko" = KEGG Orthology
         }
    
         # Convert an enrichment result to a data frame whose geneID column lists the
         # GeneIDs from `deg_table` (instead of K numbers), joined by "/".
         map_ko_to_gene_ids <- function(enrichment, deg_table) {
             enrichment_df <- as.data.frame(enrichment)
             if (nrow(enrichment_df) == 0) {
                 return(enrichment_df)  # nothing enriched: no ids to translate
             }
             # One row per (K number, GeneID) pair
             ko_to_gene <- deg_table %>%
                 dplyr::select(KEGG_ko, GeneID) %>%
                 filter(!is.na(KEGG_ko)) %>%                      # Remove missing KEGG KO entries
                 mutate(KEGG_ko = str_remove_all(KEGG_ko, "ko:")) %>%  # Remove 'ko:' prefixes
                 separate_rows(KEGG_ko, sep = ",") %>%            # One KEGG ID per row
                 filter(KEGG_ko != "-") %>%                       # Remove invalid KEGG IDs ("-")
                 distinct()
             # One K number may belong to several genes and vice versa, hence many-to-many
             gene_ids_per_pathway <- enrichment_df %>%
                 separate_rows(geneID, sep = "/") %>%             # Split multiple KEGG IDs (Kxxxxx)
                 left_join(ko_to_gene, by = c("geneID" = "KEGG_ko"), relationship = "many-to-many") %>%
                 group_by(ID) %>%
                 summarise(GeneID = paste(unique(na.omit(GeneID)), collapse = "/"), .groups = "drop")
             enrichment_df %>%
                 dplyr::select(-geneID) %>%                       # Remove old geneID column
                 left_join(gene_ids_per_pathway, by = "ID") %>%   # Merge new GeneID column
                 dplyr::rename(geneID = GeneID)                   # Rename column back to geneID
         }
    
         # ---- Perform KEGG enrichment analysis (up_regulated) ----
         gene_list_kegg_up <- extract_ko_ids(up_regulated$KEGG_ko)
         kegg_enrichment_up <- run_ko_enrichment(gene_list_kegg_up)
         kegg_enrichment_up_df <- map_ko_to_gene_ids(kegg_enrichment_up, up_regulated)
         # -- BREAK_2 --
    
         # ---- Perform KEGG enrichment analysis (down_regulated) ----
         gene_list_kegg_down <- extract_ko_ids(down_regulated$KEGG_ko)
         kegg_enrichment_down <- run_ko_enrichment(gene_list_kegg_down)
         kegg_enrichment_down_df <- map_ko_to_gene_ids(kegg_enrichment_down, down_regulated)
         # View the updated dataframe
         head(kegg_enrichment_down_df)
    
         # Start a fresh workbook and store both KEGG enrichment tables, one sheet each
         wb <- createWorkbook()
         kegg_sheets <- list(KEGG_Enrichment_Up = kegg_enrichment_up_df,
                             KEGG_Enrichment_Down = kegg_enrichment_down_df)
         for (sheet_name in names(kegg_sheets)) {
             addWorksheet(wb, sheet_name)
             writeData(wb, sheet_name, as.data.frame(kegg_sheets[[sheet_name]]))
         }
    
         # Gene lists for the GO enrichment: DEG ids per direction
         gene_list_go_up <- up_regulated[["GeneID"]]
         gene_list_go_down <- down_regulated[["GeneID"]]
    
         # Background (universe): every gene present in the result table
         background_genes <- res_updated[["GeneID"]]
    
         # TERM2GENE table (term first, gene second) with one GO term per row
         go_annotation <- tidyr::separate_rows(res_updated[, c("GOs","GeneID")], GOs, sep = ",")
         # -- BREAK_3 --
    
         # Run a hypergeometric GO enrichment of `gene_list` against the custom
         # GO annotation and background defined above; returns a plain data frame.
         run_go_enrichment <- function(gene_list) {
             enrichment <- enricher(
                 gene = gene_list,                # Genes to test
                 TERM2GENE = go_annotation,       # Custom GO annotation
                 pvalueCutoff = 0.05,             # Significance threshold
                 pAdjustMethod = "BH",
                 universe = background_genes      # Define the background gene set
             )
             as.data.frame(enrichment)
         }
    
         # Look up the GO term name for each id in GO.db. Ids that GO.db does not
         # know (e.g. obsolete terms) are reported via message() and yield NA.
         # vapply() guarantees a character vector even when `go_ids` is empty.
         lookup_go_descriptions <- function(go_ids) {
             vapply(as.character(go_ids), function(go_id) {
                 term <- tryCatch(
                     AnnotationDbi::select(GO.db, keys = go_id, columns = "TERM", keytype = "GOID"),
                     error = function(e) {
                         message(paste("Error for GO term:", go_id))  # Print which GO ID caused the error
                         data.frame(TERM = NA_character_)
                     }
                 )
                 if (nrow(term) > 0) as.character(term$TERM[1]) else NA_character_
             }, character(1), USE.NAMES = FALSE)
         }
    
         go_enrichment_up <- run_go_enrichment(gene_list_go_up)      # Up-regulated genes
         go_enrichment_down <- run_go_enrichment(gene_list_go_down)  # Down-regulated genes
    
         ## Remove the 'p.adjust' column since no adjusted methods have been applied --> In this version we have used pvalue filtering (see above)!
         #go_enrichment_up <- go_enrichment_up[, !names(go_enrichment_up) %in% "p.adjust"]
         #go_enrichment_down <- go_enrichment_down[, !names(go_enrichment_down) %in% "p.adjust"]
         # Update the Description column with the term descriptions
         go_enrichment_up$Description <- lookup_go_descriptions(go_enrichment_up$ID)
         go_enrichment_down$Description <- lookup_go_descriptions(go_enrichment_down$ID)
         ## Print the updated data frame
         #print(go_enrichment_up)
    
         addWorksheet(wb, "GO_Enrichment_Up")
         writeData(wb, "GO_Enrichment_Up", as.data.frame(go_enrichment_up))
    
         addWorksheet(wb, "GO_Enrichment_Down")
         writeData(wb, "GO_Enrichment_Down", as.data.frame(go_enrichment_down))
    
         # Save the workbook with enrichment results
         saveWorkbook(wb, "KEGG_and_GO_Enrichments_Urine_vs_MHB.xlsx", overwrite = TRUE)
    
         #Error for GO term: GO:0006807: replace "GO:0006807 obsolete nitrogen compound metabolic process"
         #obsolete nitrogen compound metabolic process #https://www.ebi.ac.uk/QuickGO/term/GO:0006807
         #TODO: marked the color as yellow if the p.adjusted <= 0.05 in GO_enrichment!
    
         #mv KEGG_and_GO_Enrichments_Urine_vs_MHB.xlsx KEGG_and_GO_Enrichments_Mac_vs_LB.xlsx
         #Mac_vs_LB
         #LB.AB_vs_LB.WT19606
         #LB.IJ_vs_LB.WT19606
         #LB.W1_vs_LB.WT19606
         #LB.Y1_vs_LB.WT19606
         #Mac.AB_vs_Mac.WT19606
         #Mac.IJ_vs_Mac.WT19606
         #Mac.W1_vs_Mac.WT19606
         #Mac.Y1_vs_Mac.WT19606
  5. (DEBUG) Draw the Venn diagram to compare the total DEGs of the AB, IJ, W1 and Y1 strains (each vs. WT19606) within the LB and within the Mac condition, irrespective of up- or down-regulation.

             library(openxlsx)
    
             # Read a gene-id list (one id per line) from `file_path` and return the ids
             # as a character vector with double quotes and surrounding whitespace
             # stripped and empty entries removed.
             read_gene_ids <- function(file_path) {
               raw_lines <- readLines(file_path)
               cleaned <- trimws(gsub('"', '', raw_lines))
               keep <- !is.na(cleaned) & cleaned != ""
               cleaned[keep]
             }
    
             # Strains compared against WT19606 in both conditions
             strain_ids <- c("AB", "IJ", "W1", "Y1")
    
             # ---- LB: id files of up- and down-regulated genes per strain ----
             lb_files_up <- paste0("LB.", strain_ids, "_vs_LB.WT19606-up.id")
             lb_files_down <- paste0("LB.", strain_ids, "_vs_LB.WT19606-down.id")
             lb_files <- c(lb_files_up, lb_files_down)
    
             # Stripping the -up/-down suffix gives each comparison name twice;
             # make.unique() keeps the first (up) name and appends ".1" to the second (down).
             #lb_degs <- setNames(lapply(lb_files, read_gene_ids), gsub("-(up|down).id", "", lb_files))
             lb_degs <- lapply(lb_files, read_gene_ids)
             names(lb_degs) <- make.unique(gsub("-(up|down).id", "", lb_files))
    
             # Concatenate up and down ids into one DEG set per strain
             lb_degs_ <- list()
             for (strain in strain_ids) {
               comparison <- paste0("LB.", strain, "_vs_LB.WT19606")
               combined_set <- c(lb_degs[[comparison]], lb_degs[[paste0(comparison, ".1")]])
               #unique_combined_set <- unique(combined_set)
               lb_degs_[[strain]] <- combined_set
             }
    
             # ---- Mac: same procedure ----
             mac_files_up <- paste0("Mac.", strain_ids, "_vs_Mac.WT19606-up.id")
             mac_files_down <- paste0("Mac.", strain_ids, "_vs_Mac.WT19606-down.id")
             mac_files <- c(mac_files_up, mac_files_down)
    
             mac_degs <- lapply(mac_files, read_gene_ids)
             names(mac_degs) <- make.unique(gsub("-(up|down).id", "", mac_files))
    
             mac_degs_ <- list()
             for (strain in strain_ids) {
               comparison <- paste0("Mac.", strain, "_vs_Mac.WT19606")
               combined_set <- c(mac_degs[[comparison]], mac_degs[[paste0(comparison, ".1")]])
               mac_degs_[[strain]] <- combined_set
             }
    
             # Truncate every name in `names_list` to at most 31 characters (the limit
             # the original comment gives for sheet names). Returns a character vector
             # named like sapply() would name it; unlike the sapply()/if version it is
             # type-stable for empty input and passes NA through instead of failing.
             # Note: truncation can make two long names identical; callers that need
             # unique sheet names must check for that themselves.
             truncate_sheet_name <- function(names_list) {
               vapply(names_list, function(name) substr(name, 1, 31), character(1), USE.NAMES = TRUE)
             }
    
             # Assuming lb_degs_ is already a list of gene sets (LB.AB, LB.IJ, etc.)
             # Builds `venn_list_lb`: genes unique to each strain plus every pairwise,
             # three-way and four-way intersection. Note that the intersection entries
             # are full intersections (a gene shared by three strains also appears in
             # the corresponding pairwise entries), not exclusive Venn regions.
    
             # Define intersections between different conditions for LB
             inter_lb_ab_ij <- intersect(lb_degs_$AB, lb_degs_$IJ)
             inter_lb_ab_w1 <- intersect(lb_degs_$AB, lb_degs_$W1)
             inter_lb_ab_y1 <- intersect(lb_degs_$AB, lb_degs_$Y1)
             inter_lb_ij_w1 <- intersect(lb_degs_$IJ, lb_degs_$W1)
             inter_lb_ij_y1 <- intersect(lb_degs_$IJ, lb_degs_$Y1)
             inter_lb_w1_y1 <- intersect(lb_degs_$W1, lb_degs_$Y1)
    
             # Define intersections between three conditions for LB
             inter_lb_ab_ij_w1 <- Reduce(intersect, list(lb_degs_$AB, lb_degs_$IJ, lb_degs_$W1))
             inter_lb_ab_ij_y1 <- Reduce(intersect, list(lb_degs_$AB, lb_degs_$IJ, lb_degs_$Y1))
             inter_lb_ab_w1_y1 <- Reduce(intersect, list(lb_degs_$AB, lb_degs_$W1, lb_degs_$Y1))
             inter_lb_ij_w1_y1 <- Reduce(intersect, list(lb_degs_$IJ, lb_degs_$W1, lb_degs_$Y1))
    
             # Define intersection between all four conditions for LB
             inter_lb_ab_ij_w1_y1 <- Reduce(intersect, list(lb_degs_$AB, lb_degs_$IJ, lb_degs_$W1, lb_degs_$Y1))
    
             # Genes unique to one strain = its set minus the union of the other three.
             # (The W1 and Y1 entries previously subtracted the AB/W1/Y1 triple
             # intersection instead of the W1/Y1 pair, so genes shared only by W1 and
             # Y1 were wrongly reported as unique to each of them.)
             venn_list_lb <- list()
    
             # For LB.AB, remove genes that are also in other conditions
             venn_list_lb[["LB.AB_only"]] <- setdiff(lb_degs_$AB, Reduce(union, list(lb_degs_$IJ, lb_degs_$W1, lb_degs_$Y1)))
    
             # For LB.IJ, remove genes that are also in other conditions
             venn_list_lb[["LB.IJ_only"]] <- setdiff(lb_degs_$IJ, Reduce(union, list(lb_degs_$AB, lb_degs_$W1, lb_degs_$Y1)))
    
             # For LB.W1, remove genes that are also in other conditions
             venn_list_lb[["LB.W1_only"]] <- setdiff(lb_degs_$W1, Reduce(union, list(lb_degs_$AB, lb_degs_$IJ, lb_degs_$Y1)))
    
             # For LB.Y1, remove genes that are also in other conditions
             venn_list_lb[["LB.Y1_only"]] <- setdiff(lb_degs_$Y1, Reduce(union, list(lb_degs_$AB, lb_degs_$IJ, lb_degs_$W1)))
    
             # Add the pairwise intersections for LB
             venn_list_lb[["LB.AB_AND_LB.IJ"]] <- inter_lb_ab_ij
             venn_list_lb[["LB.AB_AND_LB.W1"]] <- inter_lb_ab_w1
             venn_list_lb[["LB.AB_AND_LB.Y1"]] <- inter_lb_ab_y1
             venn_list_lb[["LB.IJ_AND_LB.W1"]] <- inter_lb_ij_w1
             venn_list_lb[["LB.IJ_AND_LB.Y1"]] <- inter_lb_ij_y1
             venn_list_lb[["LB.W1_AND_LB.Y1"]] <- inter_lb_w1_y1
    
             # Add the intersections between three conditions for LB
             venn_list_lb[["LB.AB_AND_LB.IJ_AND_LB.W1"]] <- inter_lb_ab_ij_w1
             venn_list_lb[["LB.AB_AND_LB.IJ_AND_LB.Y1"]] <- inter_lb_ab_ij_y1
             venn_list_lb[["LB.AB_AND_LB.W1_AND_LB.Y1"]] <- inter_lb_ab_w1_y1
             venn_list_lb[["LB.IJ_AND_LB.W1_AND_LB.Y1"]] <- inter_lb_ij_w1_y1
    
             # Add the intersection between all four conditions for LB
             venn_list_lb[["LB.AB_AND_LB.IJ_AND_LB.W1_AND_LB.Y1"]] <- inter_lb_ab_ij_w1_y1
    
             # Build the Mac gene sets for export.
             # Expects mac_degs_ to be a list of gene-ID vectors with elements
             # AB, IJ, W1 and Y1 (these are the elements accessed below).

             # Pairwise intersections
             inter_mac_ab_ij <- intersect(mac_degs_$AB, mac_degs_$IJ)
             inter_mac_ab_w1 <- intersect(mac_degs_$AB, mac_degs_$W1)
             inter_mac_ab_y1 <- intersect(mac_degs_$AB, mac_degs_$Y1)
             inter_mac_ij_w1 <- intersect(mac_degs_$IJ, mac_degs_$W1)
             inter_mac_ij_y1 <- intersect(mac_degs_$IJ, mac_degs_$Y1)
             inter_mac_w1_y1 <- intersect(mac_degs_$W1, mac_degs_$Y1)

             # Three-way intersections
             inter_mac_ab_ij_w1 <- Reduce(intersect, list(mac_degs_$AB, mac_degs_$IJ, mac_degs_$W1))
             inter_mac_ab_ij_y1 <- Reduce(intersect, list(mac_degs_$AB, mac_degs_$IJ, mac_degs_$Y1))
             inter_mac_ab_w1_y1 <- Reduce(intersect, list(mac_degs_$AB, mac_degs_$W1, mac_degs_$Y1))
             inter_mac_ij_w1_y1 <- Reduce(intersect, list(mac_degs_$IJ, mac_degs_$W1, mac_degs_$Y1))

             # Four-way intersection
             inter_mac_ab_ij_w1_y1 <- Reduce(intersect, list(mac_degs_$AB, mac_degs_$IJ, mac_degs_$W1, mac_degs_$Y1))

             # NOTE: the "*_AND_*" entries below hold the full intersections of the
             # named conditions; only the "*_only" entries are exclusive.
             venn_list_mac <- list()

             # Condition-specific genes: subtract every pairwise intersection that
             # involves the condition. The three- and four-way intersections are
             # subsets of these pairwise sets, so they need not be subtracted again.
             venn_list_mac[["Mac.AB_only"]] <- setdiff(
               mac_degs_$AB,
               Reduce(union, list(inter_mac_ab_ij, inter_mac_ab_w1, inter_mac_ab_y1))
             )
             venn_list_mac[["Mac.IJ_only"]] <- setdiff(
               mac_degs_$IJ,
               Reduce(union, list(inter_mac_ab_ij, inter_mac_ij_w1, inter_mac_ij_y1))
             )
             # W1 and Y1 must subtract the pairwise W1/Y1 intersection. Subtracting the
             # three-way AB/W1/Y1 set instead (as before) left genes shared only by
             # W1 and Y1 inside both "only" sets.
             venn_list_mac[["Mac.W1_only"]] <- setdiff(
               mac_degs_$W1,
               Reduce(union, list(inter_mac_ab_w1, inter_mac_ij_w1, inter_mac_w1_y1))
             )
             venn_list_mac[["Mac.Y1_only"]] <- setdiff(
               mac_degs_$Y1,
               Reduce(union, list(inter_mac_ab_y1, inter_mac_ij_y1, inter_mac_w1_y1))
             )

             # Pairwise intersections
             venn_list_mac[["Mac.AB_AND_Mac.IJ"]] <- inter_mac_ab_ij
             venn_list_mac[["Mac.AB_AND_Mac.W1"]] <- inter_mac_ab_w1
             venn_list_mac[["Mac.AB_AND_Mac.Y1"]] <- inter_mac_ab_y1
             venn_list_mac[["Mac.IJ_AND_Mac.W1"]] <- inter_mac_ij_w1
             venn_list_mac[["Mac.IJ_AND_Mac.Y1"]] <- inter_mac_ij_y1
             venn_list_mac[["Mac.W1_AND_Mac.Y1"]] <- inter_mac_w1_y1

             # Three-way intersections
             venn_list_mac[["Mac.AB_AND_Mac.IJ_AND_Mac.W1"]] <- inter_mac_ab_ij_w1
             venn_list_mac[["Mac.AB_AND_Mac.IJ_AND_Mac.Y1"]] <- inter_mac_ab_ij_y1
             venn_list_mac[["Mac.AB_AND_Mac.W1_AND_Mac.Y1"]] <- inter_mac_ab_w1_y1
             venn_list_mac[["Mac.IJ_AND_Mac.W1_AND_Mac.Y1"]] <- inter_mac_ij_w1_y1

             # Four-way intersection
             venn_list_mac[["Mac.AB_AND_Mac.IJ_AND_Mac.W1_AND_Mac.Y1"]] <- inter_mac_ab_ij_w1_y1
    
             # Optional: dump the DEG ID lists to Excel for manual inspection
             write.xlsx(lb_degs, file = "LB_DEGs.xlsx")
             write.xlsx(mac_degs, file = "Mac_DEGs.xlsx")

             # Export the Venn gene sets for both groups. Sheet names are passed
             # through truncate_sheet_name() first (presumably to respect Excel's
             # sheet-name length limit -- confirm against its definition).
             write.xlsx(
               venn_list_lb,
               file = "Venn_LB_Genes_Intersect.xlsx",
               sheetName = truncate_sheet_name(names(venn_list_lb)),
               rowNames = FALSE
             )
             write.xlsx(
               venn_list_mac,
               file = "Venn_Mac_Genes_Intersect.xlsx",
               sheetName = truncate_sheet_name(names(venn_list_mac)),
               rowNames = FALSE
             )

             # Four-set Venn diagram of the LB gene lists, saved as a 7x7 inch PNG
             venn1 <- ggvenn(
               lb_degs_,
               fill_color = c("skyblue", "tomato", "gold", "orchid"),
               stroke_size = 0.4,
               set_name_size = 5
             )
             ggsave("Venn_LB_Genes.png", plot = venn1, width = 7, height = 7, dpi = 300)

             # Four-set Venn diagram of the Mac gene lists, saved as a 7x7 inch PNG
             venn2 <- ggvenn(
               mac_degs_,
               fill_color = c("lightgreen", "slateblue", "plum", "orange"),
               stroke_size = 0.4,
               set_name_size = 5
             )
             ggsave("Venn_Mac_Genes.png", plot = venn2, width = 7, height = 7, dpi = 300)

             cat("✅ All Venn intersection sets exported to Excel successfully.\n")

How to correlate RNA-seq Data with Mass Spectrometry Proteomics Data?

Correlating RNA-seq data with mass spectrometry (MS)-based proteomics data is a powerful way to link transcript-level expression with protein-level abundance. Here’s a step-by-step outline of how to approach it:

  1. Preprocessing and Normalization

    For RNA-Seq data:

    • Obtain gene-level expression data, usually as raw counts or TPM (transcripts per million) / FPKM (fragments per kilobase of transcript per million mapped reads).
    • Normalize the data (e.g., using DESeq2’s variance stabilizing transformation (VST) or edgeR’s TMM normalization).

    For MS proteomics data:

    • Quantify protein abundances, often using spectral counts, iBAQ, LFQ intensities, or other measures.
    • Log-transform the data if needed to stabilize variance.
  2. Data Mapping and Integration

    • Gene/Protein Mapping: Use gene symbols, Ensembl IDs, or UniProt IDs to map transcript-level data (RNA-seq) to protein-level data (MS). Be cautious of differences in annotation – e.g., some genes might have multiple protein isoforms.

    • Common Identifiers:

      • Convert all IDs to a common identifier (e.g., gene symbols or Ensembl IDs).
      • Remove entries without matching pairs to ensure one-to-one correspondence.
  3. Data Filtering

    • Filter out lowly expressed genes/proteins or those not reliably detected in both datasets.
    • Optionally, keep only genes/proteins of interest or those with high coverage.
  4. Correlation Analysis

    • For each matched gene/protein pair, calculate correlation (usually Pearson or Spearman) across the samples.

      Steps:

      • Construct a table with rows as genes/proteins and columns as samples.
      • For each row, you’ll have two vectors:
        • RNA expression (e.g., normalized RNA counts)
        • Protein abundance (e.g., log-transformed LFQ intensity)
      • Calculate:

          # Illustrative template: correlate one gene's RNA and protein values.
          from scipy.stats import pearsonr, spearmanr
        
          # Placeholders -- replace with the per-sample values of one gene/protein,
          # in the same sample order for both vectors.
          rna_vector = [...]
          protein_vector = [...]
        
          # Each call returns (coefficient, p-value); only the coefficient is kept.
          pearson_corr, _ = pearsonr(rna_vector, protein_vector)
          spearman_corr, _ = spearmanr(rna_vector, protein_vector)
  5. Visualize and Interpret

    • Plot scatter plots of RNA vs protein levels for:

      • All genes/proteins together (aggregate view)
      • Selected genes of interest
    • Plot correlation coefficients:

      • Histogram of all gene/protein correlations
      • Heatmap if you have sample-wise data
    • Assess overall agreement:

      • Typically, moderate correlation (~0.3–0.6) is observed in many studies.
  6. Consider Batch Effects and Biological Variability

    • If the datasets come from different experiments or platforms, consider batch correction methods (e.g., ComBat from the sva R package).

    • Be mindful that:

      • Post-transcriptional regulation affects how well mRNA levels correlate with protein levels.
      • Some genes/proteins might show no correlation due to translational regulation, stability, etc.
  7. Summary Workflow

    ✅ Preprocess & normalize both datasets ✅ Map genes/proteins to common IDs ✅ Filter to shared, high-quality data ✅ Calculate correlations ✅ Visualize and interpret

  8. Python script that walks through the key steps of correlating RNA-seq data with proteomics data:

     import pandas as pd
     import numpy as np
     from scipy.stats import pearsonr, spearmanr
     import matplotlib.pyplot as plt
     import seaborn as sns
    
     # Correlate RNA-seq expression with proteomics abundance, gene by gene,
     # across samples; writes a CSV of coefficients and shows summary plots.
    
     # Minimum number of paired, finite observations needed to report a
     # correlation for a gene (with only two points r is always +/-1).
     MIN_PAIRED_SAMPLES = 3
    
     # --- Step 1: Load your data ---
    
     # Example: CSVs with genes/proteins as rows, samples as columns
     rna_data = pd.read_csv('rna_seq_data.csv', index_col=0)  # genes x samples
     protein_data = pd.read_csv('proteomics_data.csv', index_col=0)  # proteins x samples
    
     # Keep the first row of any repeated identifier so that .loc[gene] below
     # always yields a single row rather than a multi-row DataFrame.
     rna_data = rna_data[~rna_data.index.duplicated(keep='first')]
     protein_data = protein_data[~protein_data.index.duplicated(keep='first')]
    
     # --- Step 2: Map genes to proteins (assuming same identifiers) ---
    
     # Filter to common genes/proteins
     common_genes = rna_data.index.intersection(protein_data.index)
     if len(common_genes) == 0:
         raise ValueError("No shared gene/protein identifiers between the two tables.")
    
     # Pair the samples: correlations are only meaningful when column i of both
     # tables is the same sample. Prefer matching by column name; fall back to
     # positional pairing only when no names are shared and the counts agree.
     common_samples = rna_data.columns.intersection(protein_data.columns)
     if len(common_samples) > 0:
         rna_data_filtered = rna_data.loc[common_genes, common_samples]
         protein_data_filtered = protein_data.loc[common_genes, common_samples]
     elif rna_data.shape[1] == protein_data.shape[1]:
         print("Warning: no shared sample names; pairing samples by column order.")
         rna_data_filtered = rna_data.loc[common_genes]
         protein_data_filtered = protein_data.loc[common_genes]
     else:
         raise ValueError("Sample columns cannot be paired: no shared names and different column counts.")
    
     print(f"Number of common genes/proteins: {len(common_genes)}")
    
     # --- Step 3: Log transform if needed (optional) ---
    
     rna_data_log = np.log2(rna_data_filtered + 1)
     protein_data_log = np.log2(protein_data_filtered + 1)
    
     # --- Step 4: Calculate gene-wise correlations across samples ---
    
     pearson_corrs = []
     spearman_corrs = []
    
     for gene in common_genes:
         rna_vector = rna_data_log.loc[gene].to_numpy(dtype=float)
         protein_vector = protein_data_log.loc[gene].to_numpy(dtype=float)
    
         # Use only samples where both measurements are finite (missing
         # protein intensities are common in MS data).
         valid = np.isfinite(rna_vector) & np.isfinite(protein_vector)
         x = rna_vector[valid]
         y = protein_vector[valid]
    
         # A correlation is undefined for too few points or a constant vector.
         if x.size < MIN_PAIRED_SAMPLES or np.ptp(x) == 0 or np.ptp(y) == 0:
             pearson_corr = np.nan
             spearman_corr = np.nan
         else:
             pearson_corr, _ = pearsonr(x, y)
             spearman_corr, _ = spearmanr(x, y)
    
         pearson_corrs.append(pearson_corr)
         spearman_corrs.append(spearman_corr)
    
     # Save results
     correlation_df = pd.DataFrame({
         'Gene': common_genes,
         'Pearson': pearson_corrs,
         'Spearman': spearman_corrs
     })
     correlation_df.to_csv('gene_protein_correlations.csv', index=False)
     print("Saved gene-wise correlation data to 'gene_protein_correlations.csv'")
    
     # --- Step 5: Visualize the correlation distributions ---
    
     # Genes without a defined correlation (NaN) are left out of the histograms.
     sns.histplot(correlation_df['Pearson'].dropna(), bins=30, kde=True, color='skyblue')
     plt.xlabel('Pearson Correlation')
     plt.title('Distribution of Pearson Correlations (RNA vs Protein)')
     plt.show()
    
     sns.histplot(correlation_df['Spearman'].dropna(), bins=30, kde=True, color='salmon')
     plt.xlabel('Spearman Correlation')
     plt.title('Distribution of Spearman Correlations (RNA vs Protein)')
     plt.show()
    
     # --- Step 6: Scatter plot for a selected gene/protein ---
    
     example_gene = common_genes[0]  # change to your gene of interest
     plt.scatter(rna_data_log.loc[example_gene], protein_data_log.loc[example_gene])
     plt.xlabel('Log2 RNA Expression')
     plt.ylabel('Log2 Protein Abundance')
     plt.title(f'RNA vs Protein for {example_gene}')
     plt.grid(True)
     plt.show()
    
     # Key Notes:
     #✅ Replace the filenames (rna_seq_data.csv and proteomics_data.csv) with your actual files.
     #✅ The script expects rows to be genes/proteins and columns to be samples.
     #✅ Modify or add steps if you have different normalization needs (e.g., DESeq2 normalization).
  9. R script that covers the same steps as above:

     # Correlate RNA-seq expression with proteomics abundance, gene by gene,
     # across samples; writes a CSV of coefficients and draws summary plots.
    
     # --- Load libraries ---
     library(ggplot2)
     library(dplyr)
    
     # --- Step 1: Load your data ---
     # Example: CSVs with genes/proteins as rows, samples as columns
     rna_data <- read.csv("rna_seq_data.csv", row.names = 1)
     protein_data <- read.csv("proteomics_data.csv", row.names = 1)
    
     # --- Step 2: Find common genes/proteins ---
     common_genes <- intersect(rownames(rna_data), rownames(protein_data))
     if (length(common_genes) == 0) {
       stop("No shared gene/protein identifiers between the two tables.", call. = FALSE)
     }
    
     # Pair the samples: correlations are only meaningful when column i of both
     # tables is the same sample. Prefer matching by column name; fall back to
     # positional pairing only when no names are shared and the counts agree.
     common_samples <- intersect(colnames(rna_data), colnames(protein_data))
     if (length(common_samples) > 0) {
       rna_data <- rna_data[, common_samples, drop = FALSE]
       protein_data <- protein_data[, common_samples, drop = FALSE]
     } else if (ncol(rna_data) == ncol(protein_data)) {
       warning("No shared sample names; pairing samples by column order.", call. = FALSE)
     } else {
       stop("Sample columns cannot be paired: no shared names and different column counts.",
            call. = FALSE)
     }
    
     # drop = FALSE keeps a data frame even when only one sample column exists
     rna_data_filtered <- rna_data[common_genes, , drop = FALSE]
     protein_data_filtered <- protein_data[common_genes, , drop = FALSE]
    
     cat("Number of common genes/proteins:", length(common_genes), "\n")
    
     # --- Step 3: Log-transform if needed (optional) ---
     rna_data_log <- log2(rna_data_filtered + 1)
     protein_data_log <- log2(protein_data_filtered + 1)
    
     # --- Step 4: Calculate gene-wise correlations across samples ---
     # Correlation of two vectors using only samples where both values are
     # finite; NA when fewer than 3 such samples remain or either side is constant.
     paired_cor <- function(x, y, method) {
       ok <- is.finite(x) & is.finite(y)
       if (sum(ok) < 3 || sd(x[ok]) == 0 || sd(y[ok]) == 0) {
         return(NA_real_)
       }
       cor(x[ok], y[ok], method = method)
     }
    
     # Matrices make per-gene row extraction cheap and always numeric
     rna_mat <- as.matrix(rna_data_log)
     protein_mat <- as.matrix(protein_data_log)
    
     pearson_corrs <- numeric(length(common_genes))
     spearman_corrs <- numeric(length(common_genes))
    
     for (i in seq_along(common_genes)) {
       gene <- common_genes[i]
       rna_vector <- as.numeric(rna_mat[gene, ])
       protein_vector <- as.numeric(protein_mat[gene, ])
    
       pearson_corrs[i] <- paired_cor(rna_vector, protein_vector, method = "pearson")
       spearman_corrs[i] <- paired_cor(rna_vector, protein_vector, method = "spearman")
     }
    
     # Save the results
     correlation_df <- data.frame(
       Gene = common_genes,
       Pearson = pearson_corrs,
       Spearman = spearman_corrs
     )
    
     write.csv(correlation_df, "gene_protein_correlations.csv", row.names = FALSE)
     cat("Saved gene-wise correlation data to 'gene_protein_correlations.csv'\n")
    
     # --- Step 5: Visualize the correlation distributions ---
     # print() is explicit so the plots are also drawn when the script is run
     # via source(), where top-level values are not auto-printed.
     # Genes without a defined correlation (NA) are left out of the histograms.
     print(
       ggplot(correlation_df[!is.na(correlation_df$Pearson), ], aes(x = Pearson)) +
         geom_histogram(bins = 30, fill = "skyblue", color = "black") +
         labs(title = "Distribution of Pearson Correlations (RNA vs Protein)",
              x = "Pearson Correlation", y = "Frequency") +
         theme_minimal()
     )
    
     print(
       ggplot(correlation_df[!is.na(correlation_df$Spearman), ], aes(x = Spearman)) +
         geom_histogram(bins = 30, fill = "salmon", color = "black") +
         labs(title = "Distribution of Spearman Correlations (RNA vs Protein)",
              x = "Spearman Correlation", y = "Frequency") +
         theme_minimal()
     )
    
     # --- Step 6: Scatter plot for a selected gene/protein ---
     example_gene <- common_genes[1]  # change this to your gene of interest
     df_example <- data.frame(
       RNA = as.numeric(rna_mat[example_gene, ]),
       Protein = as.numeric(protein_mat[example_gene, ])
     )
    
     print(
       ggplot(df_example, aes(x = RNA, y = Protein)) +
         geom_point() +
         labs(title = paste("RNA vs Protein for", example_gene),
              x = "Log2 RNA Expression", y = "Log2 Protein Abundance") +
         theme_minimal() +
         geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "red")
     )
    
     # Key Notes:
     #✅ Replace "rna_seq_data.csv" and "proteomics_data.csv" with your real file names.
     #✅ Rows: genes/proteins, columns: samples.
     #✅ Change example_gene to any gene of interest for plotting.
     # Possible extensions: adapt to a new dataset, add batch correction or other normalizations.

All tools and services of BV-BRC

Genomics

  • Genome Assembly
  • Genome Annotation
  • Comprehensive Genome Analysis (B)
  • BLAST
  • Primer Design
  • Similar Genome Finder
  • Genome Alignment
  • Variation Analysis
  • Tn-Seq Analysis

Phylogenomics

  • Bacterial Genome Tree
  • Viral Genome Tree
  • Gene/Protein Tree

Protein Tools

  • MSA and SNP Analysis
  • Meta-CATS
  • Proteome Comparison
  • Protein Family Sorter
  • Comparative Systems
  • Docking

Metagenomics

  • Taxonomic Classification
  • Metagenomic Binning
  • Metagenomic Read Mapping

Transcriptomics

  • RNA-Seq Analysis
  • Expression Import

Utilities

  • Fastq Utilities
  • ID Mapper

Viral Tools

  • SARS-CoV-2 Genome Analysis
  • SARS-CoV-2 Wastewater Analysis
  • Influenza Sequence Submission
  • Influenza HA Subtype Conversion
  • Subspecies Classification
  • Viral Assembly

Outbreak Tracker

  • Measles 2025
  • Mpox 2024
  • Influenza H5N1 2024
  • SARS-CoV-2

DAMIAN Post-processing for Flavivirus and FSME

  1. Prepare input raw data

     # Work inside the project directory (a bare path on its own is not a command)
     cd ~/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME
    
     # Hard-link the raw paired-end FASTQ files under short sample names
     ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R1_001.fastq.gz p20534_7448_R1.fastq.gz
     ln ./240621_M03701_0312_000000000-GHL9N/p20534/7448_7501_S0_R2_001.fastq.gz p20534_7448_R2.fastq.gz
  2. Prepare virus database and select 8 representatives for the eight given viruses from the database

     # -- Download genomes --
     # ---- Date is 13.06.2025. ----
     #Taxonomy ID: 3044782
     #The genus Orthoflavivirus (formerly Flavivirus) comprises enveloped viruses with a positive-sense single-stranded RNA genome that are transmitted to birds and mammals by arthropod vectors (ticks and mosquitoes).
     #The English name for Flavivirus is simply: Flavivirus
     #It is both the scientific and common name for the genus of viruses in the family Flaviviridae. This genus includes several well-known viruses such as:
     #        * Dengue virus
     #        * Zika virus
     #        * West Nile virus
     #        * Yellow fever virus
     #        * Tick-borne encephalitis virus (TBEV / FSME virus)
    
     # Fetch all nucleotide records below the taxon, then keep complete genomes only
     esearch -db nucleotide -query "txid3044782[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_3044782_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_3044782_ncbi.fasta complete_genome_3044782_ncbi.fasta  #96579-->9431
     #https://www.ebi.ac.uk/ena/browser/view/3044782
    
     #Download FSME (taxonomy ID 11084)
     esearch -db nucleotide -query "txid11084[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_11084_ncbi.fasta
     python ~/Scripts/filter_fasta.py genome_11084_ncbi.fasta complete_genome_11084_ncbi.fasta  #3426-->219
     #https://www.ebi.ac.uk/ena/browser/view/11084
    
     # Extract the single record PV626569.1 to use as the FSME mapping reference
     samtools faidx complete_genome_11084_ncbi.fasta PV626569.1 > PV626569.fasta
  3. Run the second round of vrap (--host=${virus}.fasta)

     #cat FluB_PB1.fasta FluB_PB2.fasta FluB_PA.fasta FluB_HA.fasta FluB_NP.fasta FluB_NB_NA.fasta FluB_M1_BM2.fasta FluB_NEP_NS1.fasta > FluB.fasta
    
     # Run vrap (second round): select some representative viruses from the Excel files generated by the last step and pass them as --host
     # NOTE: run these loops with the "vrap" conda environment activated
     # (the "(vrap)" shell-prompt prefix is not part of the command).
     # Mapping against the single FSME reference:
     for sample in p20534_7448; do
         vrap/vrap_until_bowtie2.py  -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz  -o vrap_${sample}_on_FSME --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/PV626569.fasta   -t 100 -l 200  --gbt2 --noblast
     done
    
     # Mapping against all complete Flavivirus genomes:
     for sample in p20534_7448; do
         vrap/vrap_until_bowtie2.py  -1 ${sample}_R1.fastq.gz -2 ${sample}_R2.fastq.gz  -o vrap_${sample}_on_Flavivirus --host /home/jhuang/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/complete_genome_3044782_ncbi.fasta   -t 100 -l 200  --gbt2 --noblast
     done
  4. Generate the mapping statistics for the sam-files generated from last step

     # For each sample: convert the bowtie SAM output to a sorted, indexed BAM
     # and append the flagstat summary to LOG_mapping in the project directory.
     for sample in p20534_7448; do
         echo "-----${sample}_on_representatives------" >> LOG_mapping
         #cd vrap_${sample}_on_${virus}/bowtie
         # Skip the sample if its bowtie directory is missing; otherwise the
         # commands and the final "cd ../.." would run in the wrong directory.
         cd "vrap_${sample}_on_Flavivirus/bowtie" || {
             echo "cannot enter vrap_${sample}_on_Flavivirus/bowtie, skipping" >> LOG_mapping
             continue
         }
         # Rename and convert SAM to BAM
         mv mapped mapped.sam 2>> ../../LOG_mapping
         samtools view -S -b mapped.sam > mapped.bam 2>> ../../LOG_mapping
         samtools sort mapped.bam -o mapped_sorted.bam 2>> ../../LOG_mapping
         samtools index mapped_sorted.bam 2>> ../../LOG_mapping
         # Write flagstat output to log (go up two levels to write correctly)
         samtools flagstat mapped_sorted.bam >> ../../LOG_mapping 2>&1
         #samtools idxstats mapped_sorted.bam >> ../../LOG_mapping 2>&1
         cd ../..
     done
    
     (bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_FSME/bowtie$ samtools flagstat mapped_sorted.bam
     7836046 + 0 in total (QC-passed reads + QC-failed reads)
     7836046 + 0 primary
     0 + 0 secondary
     0 + 0 supplementary
     0 + 0 duplicates
     0 + 0 primary duplicates
     0 + 0 mapped (0.00% : N/A)
     0 + 0 primary mapped (0.00% : N/A)
     5539082 + 0 paired in sequencing
     2769541 + 0 read1
     2769541 + 0 read2
     0 + 0 properly paired (0.00% : N/A)
     0 + 0 with itself and mate mapped
     0 + 0 singletons (0.00% : N/A)
     0 + 0 with mate mapped to a different chr
     0 + 0 with mate mapped to a different chr (mapQ>=5)
    
     (bakta) jhuang@WS-2290C:/mnt/md1/DATA/Data_DAMIAN_Post-processing_Flavivirus_and_FSME/vrap_p20534_7448_on_Flavivirus/bowtie$ samtools flagstat mapped_sorted.bam
     7836234 + 0 in total (QC-passed reads + QC-failed reads)
     7836234 + 0 primary
     0 + 0 secondary
     0 + 0 supplementary
     0 + 0 duplicates
     0 + 0 primary duplicates
     52 + 0 mapped (0.00% : N/A)
     52 + 0 primary mapped (0.00% : N/A)
     5539458 + 0 paired in sequencing
     2769729 + 0 read1
     2769729 + 0 read2
     0 + 0 properly paired (0.00% : N/A)
     4 + 0 with itself and mate mapped
     13 + 0 singletons (0.00% : N/A)
     0 + 0 with mate mapped to a different chr
     0 + 0 with mate mapped to a different chr (mapQ>=5)
    
     # Keep only mapped reads (-F 4 excludes records flagged as unmapped)
     samtools view -F 4 mapped_sorted.bam > mapped_reads.sam
     # Count mapped reads per reference: column 3 of a SAM record is the reference name
     awk '{print $3}' mapped_reads.sam | sort | uniq -c
     52 KY766069.1 Zika virus isolate Pf13/251013-18, complete genome
    
     # ------------------ DEBUG ----------------------
     # List the reference names known to the BAM index (first idxstats column)
     samtools idxstats mapped_sorted.bam | cut -f 1
    
     # flagstat restricted to the reads placed on each listed reference
     refs=(PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1)
     for ref in "${refs[@]}"; do
         printf 'Reference: %s\n' "$ref"
         samtools view -b mapped_sorted.bam "$ref" | samtools flagstat -
     done
    
     When I run samtools flagstat mapped_sorted.bam
    
     49572521 + 0 in total (QC-passed reads + QC-failed reads)
     0 + 0 secondary
     0 + 0 supplementary
     0 + 0 duplicates
     1169 + 0 mapped (0.00% : N/A)
     38247374 + 0 paired in sequencing
     19123687 + 0 read1
     19123687 + 0 read2
     884 + 0 properly paired (0.00% : N/A)
     934 + 0 with itself and mate mapped
     227 + 0 singletons (0.00% : N/A)
     0 + 0 with mate mapped to a different chr
     0 + 0 with mate mapped to a different chr (mapQ>=5)
    
     However, when I run for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
             echo "Reference: $ref"
             samtools view -b mapped_sorted.bam "$ref" | samtools flagstat -
             done
    
     Reference: PV424647.1
     83 + 0 in total (QC-passed reads + QC-failed reads)
     0 + 0 secondary
     0 + 0 supplementary
     0 + 0 duplicates
     72 + 0 mapped (86.75% : N/A)
     82 + 0 paired in sequencing
     41 + 0 read1
     41 + 0 read2
     56 + 0 properly paired (68.29% : N/A)
     60 + 0 with itself and mate mapped
     11 + 0 singletons (13.41% : N/A)
     0 + 0 with mate mapped to a different chr
     0 + 0 with mate mapped to a different chr (mapQ>=5)
    
     I also want each per-reference report to show the same total as "samtools flagstat mapped_sorted.bam". How?
    
     samtools view -b mapped_sorted.bam PV424649.1
    
     # Per-reference flagstat via a text filter on the SAM stream.
     # Header lines (starting with "@") are kept so samtools can re-parse the
     # stream; alignment records are kept only when column 3 (reference name)
     # equals the accession exactly. A plain grep on "$ref" would treat the dots
     # as wildcards and also match the accession in other columns.
     for ref in PV424649.1 PV424650.1 PV424648.1 PV424643.1 PV424646.1 PV424645.1 PV424644.1 PV424647.1; do
         echo "Reference: $ref"
         samtools view -h mapped_sorted.bam | awk -v ref="$ref" '/^@/ || $3 == ref' | samtools view -Sb - | samtools flagstat -
     done
    
     # ---- DEBUG END ----
    
     # Draw coverage plots for representative isolates found in the first round (see Excel file).
     # Per-position depth for the whole BAM; -a also reports zero-coverage positions.
     samtools depth -m 0 -a mapped_sorted.bam > coverage.txt
     # Optional per-segment extraction (accession -> FluB segment), kept for reference:
     #grep "PV424649.1" coverage.txt > FluB_PB1_coverage.txt
     #grep "PV424650.1" coverage.txt > FluB_PB2_coverage.txt
     #grep "PV424648.1" coverage.txt > FluB_PA_coverage.txt
     #grep "PV424643.1" coverage.txt > FluB_HA_coverage.txt
     #grep "PV424646.1" coverage.txt > FluB_NP_coverage.txt
     #grep "PV424645.1" coverage.txt > FluB_NB_NA_coverage.txt
     #grep "PV424644.1" coverage.txt > FluB_M1_BM2_coverage.txt
     #grep "PV424647.1" coverage.txt > FluB_NEP_NS1_coverage.txt
    
             import pandas as pd
             import matplotlib.pyplot as plt
    
             # Load coverage data (samtools depth output: reference, position, depth)
             df = pd.read_csv("coverage.txt", sep="\t", header=None, names=["chr", "pos", "coverage"])
    
             # Plot one line per reference sequence. Drawing the whole table as a
             # single line would join the end of one reference to the start of the
             # next, because positions restart for every reference.
             plt.figure(figsize=(10,4))
             per_reference = list(df.groupby("chr", sort=False))
             single_reference = len(per_reference) == 1
             for ref_name, ref_df in per_reference:
                 plt.plot(
                     ref_df["pos"],
                     ref_df["coverage"],
                     color="blue" if single_reference else None,
                     linewidth=0.5,
                     label=str(ref_name),
                 )
             # A legend is only readable for a handful of references
             if 1 < len(per_reference) <= 10:
                 plt.legend(fontsize="small")
             plt.xlabel("Genomic Position")
             plt.ylabel("Coverage Depth")
             plt.title("BAM Coverage Plot")
             plt.show()
  5. Report

     Subject: Mapping Results for Flavivirus and FSME (Sample P20534/7448)
    
     I have re-analyzed sample P20534 (7448) with a focus on Flaviviruses and FSME.
    
     Using curated reference sets from NCBI (Taxonomy ID 3044782 for Flavivirus, comprising 9,431 complete genomes—see attached flavivirus_names.txt for details; and Taxonomy ID 11084 for FSME, with 219 complete genomes), I performed targeted mapping. The key findings are summarized below:
    
         * Total reads: 7,836,234
         * Mapped to Flavivirus: 52 reads
           All 52 reads mapped specifically to Zika virus (KY766069.1, complete genome of isolate Pf13/251013-18)
         * Mapped to FSME: No significant hits detected
    
     Please find attached a coverage plot for the Zika virus genome (KY766069).
  6. Preparing a database containing all representative viruses from NCBI Virus #https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/ #Download All Records (18,708) on 26.05.2025

     # ------------ Manually update the internal viral databases --------------
     ##https://www.ebi.ac.uk/ena/browser/view/10239
     #esearch -db nucleotide -query "txid10239[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > genome_10239_ncbi.fasta
     #esearch -db protein -query "txid11520[Organism:exp]" | efetch -format fasta -email j.huang@uke.de > protein_11520_ncbi.fasta
     #mv ~/Tools/vrap/database/viral_db/nucleotide.fa ~/Tools/vrap/database/viral_db/nucleotide_Human_alphaherpesvirus_1.fa
     #mv ~/Tools/vrap/database/viral_db/protein.fa ~/Tools/vrap/database/viral_db/protein_Human_alphaherpesvirus_1.fa
     #cp genome_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/nucleotide.fa
     #cp protein_11520_ncbi.fasta ~/Tools/vrap/database/viral_db/protein.fa
     #cd ~/Tools/vrap/database/viral_db
     #~/Tools/vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide
     #~/Tools/vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein
     #vrap/vrap_noassembly.py  -1 AW005486_R1.fastq.gz -2 AW005486_R2.fastq.gz -o vrap_AW005486_on_InfluB  --bt2idx=/home/jhuang/REFs/genome  --host=/home/jhuang/REFs/genome.fa --virus=/mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/complete_11520_ncbi.fasta  -t 20 -l 200  -g
    
     # ----------- Three databases ----------
     # Commands for the three BLAST databases of one vrap run. The long
     # repeated paths are factored into variables; the commands are unchanged.
     blast_bin=/home/jhuang/Tools/vrap/external_tools/blast
     viral_db=/home/jhuang/Tools/vrap/database/viral_db
     run_dir=/mnt/md1/DATA/Data_Nicole_DAMIAN_Post-processing_Pathoprobe_FluB_Links/vrap_AW005486_on_InfluB
    
     #db is [virus_user_db]
     # Build a nucleotide BLAST database from the user-supplied viral sequences;
     # stdout and stderr are both appended to the run log.
     "${blast_bin}/makeblastdb" -in "${run_dir}/blast/custom_viral_seq.fa" -dbtype nucl -parse_seqids -out "${run_dir}/blast/db/virus" >> "${run_dir}/vrap.log" 2>&1
    
     #db is ~/Tools/vrap/database/viral_db/nucleotide.fa  [Human alphaherpesvirus 1] [virus_nt_db]
     # blastn of the contigs against the viral nucleotide database plus the user
     # database (two database paths, space-separated inside one quoted -db value).
     "${blast_bin}/blastn" -num_threads 20 -query "${run_dir}/vrap_contig.fasta" -db "${viral_db}/viral_nucleotide ${run_dir}/blast/db/virus"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out "${run_dir}/blast/blastn.csv" >> "${run_dir}/vrap.log"
     # Output seen on the terminal for the command above (not a command):
     # Warning: [blastn] Examining 5 or more matches is recommended
    
     #db is ~/Tools/vrap/database/viral_db/protein.fa [Human alphaherpesvirus 1] [virus_aa_db]
     # blastx of blast/blastn.fa against the viral protein database
     "${blast_bin}/blastx" -num_threads 20 -query "${run_dir}/blast/blastn.fa" -db "${viral_db}/viral_protein"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out "${run_dir}/blast/blastx.csv" >> "${run_dir}/vrap.log"

阳光房漏水怎么办?丁基胶带才是最佳密封选择

TODO: 以后只用 丁基胶带 粘Wintergarten上 玻璃的侧边和金属的连接处!

HSButyl FoilBand https://www.hsbutyl.com/de/product/foilband-butyl-tape/

Das Premium Glasfalz Silikon OTTOSEAL S120 310ml Alle Farben (Transparent) Innen- und Außen https://www.ebay.de/itm/266565929135?var=566331892689

  1. 丁基胶带(Butyl Tape)简介:

    • 材料:由丁基橡胶制成,是一种柔软、自粘、防水的密封带。

    • 特点: ✅ 强力粘附:适用于玻璃、金属等材料 ✅ 防水防漏:密封性优良 ✅ 耐高低温、抗紫外线:适合户外使用 ✅ 无毒无味:环保 ✅ 易施工:可剪裁、手贴,干净整洁

    • 适用场景:

      • 屋顶玻璃、窗户缝隙密封
      • 房车、冷藏车、金属屋顶防水修补
      • 管道和通风口密封
  2. 沥青胶带(Bitumen Tape)简介:

    • 材料:由改性沥青制成,表面有铝箔,常用于建筑屋顶大面积防水。
    • 特点:

      ✅ 防水性能强 ❌ 易脏、气味重 ❌ 夏天会软化、流淌,污染玻璃 ❌ 冬天易变硬、粘性下降 ❌ 不适合用于玻璃细缝密封

✅ 哪种更适合密封阳光房玻璃缝?

比较项 丁基胶带(推荐)    沥青胶带(不推荐)
与玻璃的兼容性 ✅ 非常好   ❌ 易弄脏玻璃,附着差
抗紫外线能力  ✅ 优秀,适合阳光房长期使用  ❌ 容易老化、融化
外观美观    ✅ 整洁,可选灰黑色  ❌ 黑色粗糙,难清理
温度适应性   ✅ 热不融、冷不裂   ❌ 高温融化、低温变硬
安装简便    ✅ 自粘易贴、整洁   ❌ 易弄脏手、施工复杂
气味环保    ✅ 无味环保  ❌ 有强烈沥青味道

3 使用建议:

* 清洁玻璃与框架表面(干净、干燥、无油)
* 贴上丁基胶带,沿缝隙压实贴牢
* 可选:在外部再加一层铝箔反光带或密封压条,增加防晒耐久性
* 使用橡胶滚轮压实效果更佳

4 德国购买关键词:

* Butyl-Dichtband
* UV-beständig
* Für Glas und Aluminium
* 推荐品牌:Tesa、Sika、3M 等
* 购买渠道:Amazon.de、OBI、Bauhaus、Hornbach

✅ 总结结论:你要密封Wintergarten屋顶玻璃缝隙,请优先选择丁基胶带,不要使用沥青胶带。前者更干净、耐久、环保,且长期使用不影响玻璃美观与结构密封性。