Author Archives: gene_x

GSVA calculation for RNA-seq data

#------------------------------
#---- 1. prepare gene sets ----

library(org.Hs.eg.db)
library(dplyr)
library(AnnotationDbi)

#1. Platelets
data <- data.frame(
  geneSymbol = c("GP1BA","GP5","GP6","GP9","LY6G6D","MMRN1","PEAR1","PF4","PF4V1","PPBP","SLC35D3"),
  geneEntrezID = c("2811","2814","51206","2815","58530","22915","375033","5196","5197","5473","340146"),
  GeneSet = rep("Platelets", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Platelets.csv", row.names = FALSE)

#2. Granulocytes
data <- data.frame(
  geneSymbol = c("CXCR2","CD177","CLC","CTSS","DEFA1","FUT7","LTB4R","MMP25","OSM","RETN"),
  geneEntrezID = c("3579","57126","1178","1520","1667","2529","1241","64386","5008","56729"),
  GeneSet = rep("Granulocytes", 10)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Granulocytes.csv", row.names = FALSE)

#3. LDG
data <- data.frame(
  geneSymbol = c("AZU1","CAMP","CEACAM6","CEACAM8","CTSG","DEFA4","ELANE","LCN2","LTF","MPO","OLFM4","RNASE3"),
  geneEntrezID = c("566","820","4680","1088","1511","1669","1991","3934","4057","4353","10562","6037"),
  GeneSet = rep("LDG", 12)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "LDG.csv", row.names = FALSE)

#4. pDC
data <- data.frame(
  geneSymbol = c("CLEC4C","NRP1"),
  geneEntrezID = c("170482","8829"),
  GeneSet = rep("pDC", 2)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "pDC.csv", row.names = FALSE)

#5. Anti-inflammation
data <- data.frame(
  geneSymbol = c("IL1RN","TNFAIP3","SOCS3"),
  geneEntrezID = c("3557","7128","9021"),
  GeneSet = rep("Anti-inflammation", 3)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Anti-inflammation.csv", row.names = FALSE)

#6. Pro-inflam. IL-1
data <- data.frame(
  geneSymbol = c("IL1A","IL1B","IL18","IL36A","IL36B","IL36G","IL33","IL1R1","IL1RAP","IL18R1","IL18RAP"),
  geneEntrezID = c("3552","3553","3606","27179","27177","56300","90865","3554","3556","8809","8807"),
  GeneSet = rep("Pro-inflam. IL-1", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Pro-inflam._IL-1.csv", row.names = FALSE)

#7. Dendritic cells
data <- data.frame(
  geneSymbol = c("CLEC12A","CLEC10A","CLEC9A","CSF1R","IGIP","LILRA4","LY75","XCR1"),
  geneEntrezID = c("160364","10462","283420","1436","492311","23547","4065","2829"),
  GeneSet = rep("Dendritic cells", 8)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Dendritic_cells.csv", row.names = FALSE)

#8. MHC II
data <- data.frame(
  geneSymbol = c("HLA-DMA","HLA-DMB","HLA-DPA1","HLA-DPB1","HLA-DPB2","HLA-DQA1","HLA-DQA2","HLA-DQB1","HLA-DQB2","HLA-DRA","HLA-DRB1","HLA-DRB3","HLA-DRB4","HLA-DRB5","HLA-DRB6"),
  geneEntrezID = c("3108","3109","3113","3115","3116","3117","3118","3119","3120","3122","3123","3125","3126","3127","3128"),
  GeneSet = rep("MHC II", 15)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "MHC_II.csv", row.names = FALSE)

#9. Alt. complement
data <- data.frame(
  geneSymbol = c("C3","C5","C6","C7","C8A","C9","CFB","CFD","CFH","CFHR5","CFP","CR1","GZMM","MIR520B","MIR520E","VSIG4"),
  geneEntrezID = c("718","727","729","730","731","735","629","1675","3075","81494","5199","1378","3004","574473","574461","11326"),
  GeneSet = rep("Alt. complement", 16)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Alt._complement.csv", row.names = FALSE)

#10. TNF
data <- data.frame(
  geneSymbol = c("BRCA1","CASP1","CXCL1","CXCL2","GBP1","GP1BA","IFI44","IL18","IL1B","IL1RN","MX1","NAMPT","NR3C1","OAS3","PATJ","SH3BP5","TAP2","TNF","TNFAIP3","TNFRSF11A","WT1","ACLY","ACSL1","ADGRE2","AK3","AKAP10","AMPD3","APOL3","ARID3A","ARSE","ASAP1","B4GALT5","BCL2A1","BHLHE41","BHMT","BIRC3","CALD1","CASP10","CCL15","CCL20","CCL23","CCL3L1","CD37","CD38","CD83","CDKN3","CKB","CR2","CTNND2","CXCL3","CXCL8","CYP27B1","DAB2","EBI3","EGR1","EGR2","EPB41","EREG","ETAA1","F3","FABP1","FBXL2","FCER2","FCGR2A","FLJ11129","FLNA","G0S2","GCH1","GJB2","GLS","GMIP","GRK3","HCAR3","HHEX","HOMER2","HP","ICAM1","IDO1","IKBKG","IL16","IL1A","IL6","INHBA","INSIG1","ITGA6","KITLG","KLF1","KMO","LGALS3BP","MAP3K4","MARCKS","MGLL","MMP19","MN1","MRPS15","MSC","MTF1","NELL2","NFKB1","NFKB2","NFKBIA","NFKBIZ","NKX3-2","PDE4DIP","PDPN","PIAS4","PLAUR","PTGES","PTGS2","RELB","RPGR","RPS9","SDC4","SERPIND1","SFRP1","SLAMF1","SLC30A4","SOD2","SPI1","SSPN","STAT4","TAF15","TBX3","TFF1","TNFAIP2","TRAF1","TSC22D1","TYROBP","UBE2C","VEGFA"),
  geneEntrezID = c("672","834","2919","2920","2633","2811","10561","3606","3553","3557","4599","10135","2908","4940","10207","9467","6891","7124","7128","8792","7490","47","2180","30817","50808","11216","272","80833","1820","415","50807","9334","597","79365","635","330","800","843","6359","6364","6368","6349","951","952","9308","1033","1152","1380","1501","2921","3576","1594","1601","10148","1958","1959","2035","2069","54465","2152","2168","25827","2208","2212","54674","2316","50486","2643","2706","2744","51291","157","8843","3087","9455","3240","3383","3620","8517","3603","3552","3569","3624","3638","3655","4254","10661","8564","3959","4216","4082","11343","4327","4330","64960","9242","4520","4753","4790","4791","4792","64332","579","9659","10630","51588","5329","9536","5743","5971","6103","6203","6385","3053","6422","6504","7782","6648","6688","8082","6775","8148","6926","7031","7127","7185","8848","7305","11065","7422"),
  GeneSet = rep("TNF", 130)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TNF.csv", row.names = FALSE)

#11. NLRP3 inflammasome
data <- data.frame(
  geneSymbol = c("APP","ATAT1","CARD8","CASP1","CASP4","CD36","CPTP","DHX33","EIF2AK2","GBP5","GSDMD","HSP90AB1","MEFV","NFKB1","NFKB2","NLRC3","NLRP3","P2RX7","PANX1","PSTPIP1","PYCARD","PYDC2","RELA","SIRT2","SUGT1","TLR4","TLR6","TXN","TXNIP","USP50"),
  geneEntrezID = c("351","79969","22900","834","837","948","80772","56919","5610","115362","79792","3326","4210","4790","4791","197358","114548","5027","24145","9051","29108","152138","5970","22933","10910","7099","10333","7295","10628","373509"),
  GeneSet = rep("NLRP3 inflammasome", 30)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "NLRP3_inflammasome.csv", row.names = FALSE)

#12. Unfolded protein
data <- data.frame(
  geneSymbol = c("B4GALT3","CALR","CALU","CANX","CDS2","CHST12","CHST2","DERL1","DERL2","DNAJC3","EDEM2","EDEM3","EMC9","ERAP1","ERGIC2","ERO1L","EXT1","GALNT2","GOLT1B","HERPUD1","HYOU1","IER3IP1","IMPAD1","KDELC1","KDELR2","LMAN2","LPGAT1","MAN1A1","MANEA","MANF","NUCB2","PDIA4","PDIA6","PIGK","PPIB","SEC24D","SEC61G","SPCS3","SSR1","SSR3","TRAM1","TRAM2","UGGT1","XBP1"),
  geneEntrezID = c("8703","811","813","821","8760","55501","9435","79139","51009","5611","55741","80267","51016","51752","51290","30001","2131","2590","51026","9709","10525","51124","54928","79070","11014","10960","9926","4121","79694","7873","4925","9601","10130","10026","5479","9871","23480","60559","6745","6747","23471","9697","56886","7494"),
  GeneSet = rep("Unfolded protein", 44)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Unfolded_protein.csv", row.names = FALSE)

#13. B cells
data <- data.frame(
  geneSymbol = c("IGHD","SH3BP5","BANK1","BLK","BLNK","CD19","CD22","CD79A","CD79B","DAPP1","FCRL1","FCRL2","FCRL3","FCRLA","GON4L","GPR183","IGHM","KLHL6","MS4A1","PAX5","PLCL2","VPREB1","ZNF318"),
  geneEntrezID = c("3495","9467","55024","640","29760","930","933","973","974","27071","115350","79368","115352","84824","54856","1880","3507","89857","931","5079","23228","7441","24149"),
  GeneSet = rep("B cells", 23)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "B_cells.csv", row.names = FALSE)

#14. Monocyte cell surface
data <- data.frame(
  geneSymbol = c("CD14","CD300C","CD33","CD68","CLEC12A","CLEC4D","CLEC4E","FCGR1A","FCGR1B","FCGR3B","LILRA5","LILRA6","LILRB2","LILRB3","OSCAR","SEMA4A","SIGLEC1"),
  geneEntrezID = c("929","10871","945","968","160364","338339","26253","2209","2210","2215","353514","79168","10288","11025","126014","64218","6614"),
  GeneSet = rep("Monocyte cell surface", 17)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocyte_cell_surface.csv", row.names = FALSE)

#15. Inflammasome
data <- data.frame(
  geneSymbol = c("CASP1","AIM2","CASP5","CTSB","GSDMB","GSDMD","NAIP","NEK7","NLRC4","NLRP1","NLRP3","NOD2","P2RX7","PANX1","PYCARD","RIPK1"),
  geneEntrezID = c("834","9447","838","1508","55876","79792","4671","140609","58484","22861","114548","64127","5027","24145","29108","8737"),
  GeneSet = rep("Inflammasome", 16)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Inflammasome.csv", row.names = FALSE)

#16. Monocytes_40
data <- data.frame(
  geneSymbol = c("C1QA","C1QB","C1QC","C1RL","CCL2","CCL8","CD14","CD300C","CD33","CD68","CLEC12A","CLEC4D","CLEC4E","CXCL1","CXCL2","CXCR2","FCGR1A","FCGR1B","FCGR3B","GRN","IK","IL18RAP","IL1B","IL1RN","LILRA5","LILRA6","LILRB2","LILRB3","OSCAR","S100A8","SEMA4A","SIGLEC1","THBD","BST1","C1R","CD163","CSF2","FUT4","MNDA","MRC1"),
  geneEntrezID = c("712","713","714","51279","6347","6355","929","10871","945","968","160364","338339","26253","2919","2920","3579","2209","2210","2215","2896","3550","8807","3553","3557","353514","79168","10288","11025","126014","6279","64218","6614","7056","683","715","9332","1437","2526","4332","4360"),
  GeneSet = rep("Monocytes_40", 40)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocytes_40.csv", row.names = FALSE)

#17. Monocyte secreted
data <- data.frame(
  geneSymbol = c("C1QA","C1QB","C1QC","C1RL","CCL2","CCL8","CXCL1","CXCL2","GRN","IK","IL18RAP","IL1B","IL1RN","S100A8","THBD","TNF","CXCL10"),
  geneEntrezID = c("712","713","714","51279","6347","6355","2919","2920","2896","3550","8807","3553","3557","6279","7056","7124","3627"),
  GeneSet = rep("Monocyte secreted", 17)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocyte_secreted.csv", row.names = FALSE)

#18. IL-1 cytokines
data <- data.frame(
  geneSymbol = c("IL18","IL1B"),
  geneEntrezID = c("3606","3553"),
  GeneSet = rep("IL-1 cytokines", 2)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IL-1_cytokines.csv", row.names = FALSE)

#19. SNOR low UP
data <- data.frame(
  geneSymbol = c("FCGR1A","CEACAM1","LGALS1","SNORD24","SNORD44","SNORD47","SNORD80"),
  geneEntrezID = c("2209","634","3956","26820","26806","26802","26774"),
  GeneSet = rep("SNOR low UP", 7)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "SNOR_low_UP.csv", row.names = FALSE)

#20. CD40 activated
data <- data.frame(
  geneSymbol = c("BUB1B","CCL22","CD58","DBI","DUSP4","FABP5","FDPS","FEN1","GAPDH","GARS","GRHPR","H2AFX","H2AFZ","HMGB2","HMMR","HPRT1","IMPDH2","LDHA","LDHB","LMNB1","LPXN","MCM2","MCM3","MCM6","MSH6","MTHFD2","MYBL2","NDC80","NME1","PAICS","PCNA","PKM","PRDX3","RFTN1","RGS10","SLAMF1","TK1","TOP2A","TPX2","TRAF1","TUBA1B","TXN","TYMS","UBE2S","VDAC1","WEE1","WSB2","ZWINT"),
  geneEntrezID = c("701","6367","965","1622","1846","2171","2224","2237","2597","2617","9380","3014","3015","3148","3161","3251","3615","3939","3945","4001","9404","4171","4172","4175","2956","10797","4605","10403","4830","10606","5111","5315","10935","23180","6001","6504","7083","7153","22974","7185","10376","7295","7298","27338","7416","7465","55884","11130"),
  GeneSet = rep("CD40 activated", 48)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "CD40_activated.csv", row.names = FALSE)

#21. Lectin complement
data <- data.frame(
  geneSymbol = c("A2M","C2","C3","C4A","C4B","C4B_2","C5","C6","C7","C8A","C9","COLEC10","COLEC11","FCN1","FCN2","FCN3","KRT1","MASP1","MASP2","MBL2","SERPING1"),
  geneEntrezID = c("2","717","718","720","721","100293534","727","729","730","731","735","10584","78989","2219","2220","8547","3848","5648","10747","4153","710"),
  GeneSet = rep("Lectin complement", 21)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Lectin_complement.csv", row.names = FALSE)

#22. Classical complement
data <- data.frame(
  geneSymbol = c("C1QA","CIQB","C1QC","C1R","C1S","C2","C3","C4A","C4B","C4B_2","C5","C6","C7","C8A","C9"),
  geneEntrezID = c("712","713","714","715","716","717","718","720","721","100293534","727","729","730","731","735"),
  GeneSet = rep("Classical complement", 15)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Classical_complement.csv", row.names = FALSE)

#23. Cell cycle
data <- data.frame(
  geneSymbol = c("BRCA1","ASPM","AURKA","AURKB","CCNB1","CCNB2","CCNE1","CDC20","CENPM","CEP55","E2F3","GINS2","MCM10","MCM2","MKI67","NCAPG","NDC80","PTTG1","TYMS"),
  geneEntrezID = c("672","259266","6790","9212","891","9133","898","991","79019","55165","1871","51659","55388","4171","4288","64151","10403","9232","7298"),
  GeneSet = rep("Cell cycle", 19)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Cell_cycle.csv", row.names = FALSE)

#24. Plasma cells
data <- data.frame(
  geneSymbol = c("IGHV4-28","IGHV4-34","IGHD","IGHG1","IGHV2-5","IGHV3-20","IGHV3-23","IGKC","IGLV1-40","IGLV3-1","IGLV3-19","IGLV3-25","IGLV4-3","IGLV4-60","IGLV5-45","IGLV6-57","IGLVI-70","C19orf10","IGH","IGHMBP2","IGHV4-31","IGK","IGL","IGLJ3","IGLL1","IGLV@","IGLV1-44","IGLV2-14","IGLV2-5","MZB1","PRDM1","SDC1","THEMIS2","TNFRSF17"),
  geneEntrezID = c("28400","28395","3495","3500","28457","28445","28442","3514","28825","28809","28797","28793","28786","28785","28781","28778","28763","56005","3492","3508","28396","50802","3535","28831","3543","3546","28823","28815","28818","51237","639","6382","9473","608"),
  GeneSet = rep("Plasma cells", 34)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Plasma_cells.csv", row.names = FALSE)

#25. IG chains
data <- data.frame(
  geneSymbol = c("IGHG1","IGHV2-5","IGHV3-20","IGHV3-23","IGHV4-28","IGHV4-34","IGKC","IGLV1-40","IGLV3-1","IGLV3-19","IGLV3-25","IGLV4-3","IGLV4-60","IGLV5-45","IGLV6-57","IGLVI-70","IGHA1","IGHA2","IGHD2-15","IGHD2-2","IGHD2-21","IGHD3-10","IGHD3-16","IGHD3-3","IGHD3-9","IGHG2","IGHG3","IGHG4","IGHJ1","IGHJ2","IGHJ3","IGHJ4","IGHJ5","IGHJ6","IGHV1-18","IGHV1-2","IGHV1-24","IGHV1-3","IGHV1-45","IGHV1-46","IGHV1-58","IGHV1-69-2","IGHV2-26","IGHV2-70","IGHV3-13","IGHV3-15","IGHV3-21","IGHV3-33","IGHV3-43","IGHV3-48","IGHV3-49","IGHV3-53","IGHV3-62","IGHV3-64","IGHV3-7","IGHV3-72","IGHV3-73","IGHV3-74","IGHV4-30-2","IGHV4-39","IGHV4-59","IGHV4-61","IGHV5-51","IGHV6-1","IGHV7-81","IGKJ1","IGKJ2","IGKJ3","IGKJ4","IGKJ5","IGKV1-16","IGKV1-17","IGKV1-27","IGKV1-5","IGKV1-6","IGKV1-9","IGKV1D-16","IGKV1D-17","IGKV1D-43","IGKV1D-8","IGKV2-24","IGKV2D-26","IGKV2D-29","IGKV2D-30","IGKV3-20","IGKV3D-20","IGKV3D-7","IGKV4-1","IGKV5-2","IGLC2","IGLC7","IGLJ6","IGLV10-54","IGLV1-36","IGLV1-47","IGLV2-11","IGLV2-18","IGLV2-23","IGLV2-33","IGLV2-8","IGLV3-10","IGLV3-12","IGLV3-16","IGLV3-21","IGLV3-27","IGLV3-32","IGLV4-69","IGLV5-37","IGLV7-43","IGLV8-61","IGLV9-49"),
  geneEntrezID = c("3500","28457","28445","28442","28400","28395","3514","28825","28809","28797","28793","28786","28785","28781","28778","28763","3493","3494","28503","28505","28502","28499","28498","28501","28500","3501","3502","3503","28483","28481","28479","28477","28476","28475","28468","28474","28467","28473","28466","28465","28464","28458","28455","28454","28449","28448","28444","28434","28426","28424","28423","28420","28416","28414","28452","28410","28409","28408","28398","28394","28392","28391","28388","28385","28378","28950","28949","28948","28947","28946","28938","28937","28935","28299","28943","28941","28901","28900","28891","28904","28923","28884","28882","28881","28912","28874","28877","28908","28907","3538","28834","28828","28772","28826","28822","28816","28814","28813","28811","28817","28803","28802","28799","28796","28791","28787","28784","28783","28776","28774","28773"),
  GeneSet = rep("IG chains", 111)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IG_chains.csv", row.names = FALSE)

#26. Erythrocytes
data <- data.frame(
  geneSymbol = c("EPO","GFI1B","GYPA","GYPB","GYPE","ICAM4","NFE2","SLC4A1","TRIM10","TSPO2","ZNF367"),
  geneEntrezID = c("2056","8328","2993","2994","2996","3386","4778","6521","10107","222642","195828"),
  GeneSet = rep("Erythrocytes", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Erythrocytes.csv", row.names = FALSE)

#27. IL-6R complex
data <- data.frame(
  geneSymbol = c("IL6","IL6R","IL6ST"),
  geneEntrezID = c("3569","3570","3572"),
  GeneSet = rep("IL-6R complex", 3)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IL-6R_complex.csv", row.names = FALSE)

#28. IFN
data <- data.frame(
  geneSymbol = c("EIF2AK2","GBP1","GBP2","GBP4","HERC5","HERC6","IFI27","IFI30","IFI35","IFI44","IFI44L","IFI6","IFIT1","IFIT2","IFIT3","IFIT5","IFITM1","IFITM2","IFITM3","ISG15","ISG20","MX1","MX2","OAS1","OAS2","OAS3","OASL","RSAD2","SAMD9","SAMD9L","SP100","SP110"),
  geneEntrezID = c("5610","2633","2634","115361","51191","55008","3429","10437","3430","10561","10964","2537","3434","3433","3437","24138","8519","10581","10410","9636","3669","4599","4600","4938","4939","4940","8638","91543","54809","219285","6672","3431"),
  GeneSet = rep("IFN", 32)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IFN.csv", row.names = FALSE)

#29. TCRB
data <- data.frame(
  geneSymbol = c("TRBC2","TRBJ2-1","TRBJ2-2","TRBJ2-2P","TRBJ2-3","TRBJ2-4","TRBJ2-5","TRBJ2-6","TRBJ2-7","TRBV1","TRBV10-1","TRBV10-2","TRBV11-1","TRBV11-2","TRBV19","TRBV2","TRBV20-1","TRBV21-1","TRBV23-1","TRBV24-1","TRBV25-1","TRBV27","TRBV28","TRBV3-1","TRBV4-1","TRBV4-2","TRBV5-1","TRBV5-3","TRBV5-4","TRBV5-5","TRBV5-6","TRBV5-7","TRBV6-1","TRBV6-4","TRBV6-5","TRBV6-6","TRBV6-7","TRBV6-8","TRBV7-1","TRBV7-3","TRBV7-4","TRBV7-5","TRBV7-6","TRBV7-7","TRBV9"),
  geneEntrezID = c("28638","28629","28628","28627","28626","28625","28624","28623","28622","28621","28585","28584","28582","28581","28568","28620","28567","28566","28564","28563","28562","28560","28559","28619","28617","28616","28614","28612","28611","28610","28609","28608","28606","28603","28602","28601","28600","28599","28597","28595","28594","28593","28592","28591","28586"),
  GeneSet = rep("TCRB", 45)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRB.csv", row.names = FALSE)

#30. TCRA
data <- data.frame(
  geneSymbol = c("TRAV10","TRAV1-1","TRAV1-2","TRAV12-1","TRAV12-2","TRAV12-3","TRAV13-1","TRAV13-2","TRAV14DV4","TRAV16","TRAV17","TRAV18","TRAV19","TRAV2","TRAV20","TRAV21","TRAV22","TRAV23DV6","TRAV24","TRAV25","TRAV26-1","TRAV26-2","TRAV27","TRAV29DV5","TRAV3","TRAV30","TRAV34","TRAV35","TRAV36DV7","TRAV38-1","TRAV38-2DV8","TRAV39","TRAV4","TRAV40","TRAV41","TRAV5","TRAV7","TRAV8-1","TRAV8-2","TRAV8-3","TRAV8-4","TRAV8-6","TRAV8-7","TRAV9-1","TRAV9-2"),
  geneEntrezID = c("28676","28693","28692","28674","28673","28672","28671","28670","28669","28667","28666","28665","28664","28691","28663","28662","28661","28660","28659","28658","28657","28656","28655","28653","28690","28652","28648","28647","28646","28644","28643","28642","28689","28641","28640","28688","28686","28685","28684","28683","28682","28680","28679","28678","28677"),
  GeneSet = rep("TCRA", 45)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRA.csv", row.names = FALSE)

#31. Cyt. act. T cells
data <- data.frame(
  geneSymbol = c("GZMB","TAGAP","CD69","EOMES","GZMH","IFNG","IL2RB","PRF1","SGK1","TBX21","TFRC","ZNF683"),
  geneEntrezID = c("3002","117289","969","8320","2999","3458","3560","5551","6446","30009","7037","257101"),
  GeneSet = rep("Cyt. act. T cells", 12)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Cyt._act._T_cells.csv", row.names = FALSE)

#32. TCRG
data <- data.frame(
  geneSymbol = c("TRGC2","TRGV1","TRGV10","TRGV11","TRGV2","TRGV3","TRGV4","TRGV5","TRGV7","TRGV8","TRGV9"),
  geneEntrezID = c("6967","6973","6984","6985","6974","6976","6977","6978","6981","6982","6983"),
  GeneSet = rep("TCRG", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRG.csv", row.names = FALSE)

#33. T cells
data <- data.frame(
  geneSymbol = c("CD8A","CD8B","TRDC","CCR3","CD226","CD247","CD28","CD3D","CD3E","CD3G","CD4","CD5","ETS1","GATA3","GRAP2","LEF1","SH2D1A","TRAC","TRBC1"),
  geneEntrezID = c("925","926","28526","1232","10666","919","940","915","916","917","920","921","2113","2625","9402","51176","4068","28755","28639"),
  GeneSet = rep("T cells", 19)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "T_cells.csv", row.names = FALSE)

#34. CD8T-NK-NKT
data <- data.frame(
  geneSymbol = c("CD8A","CD8B","GZMB","NKTR","CD2","CD7","CRTAM","GNLY","GZMA","GZMK","GZMM","HCST","KIR2DL3","KIR3DL1","KIR3DL2","KLRB1","KLRC3","KLRC4","KLRD1","NKG7","RASAL3","TIA1","TXK"),
  geneEntrezID = c("925","926","3002","4820","914","924","56253","10578","3001","3003","3004","10870","3804","3811","3812","3820","3823","8302","3824","4818","64926","7072","7294"),
  GeneSet = rep("CD8T-NK-NKT", 23)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "CD8T-NK-NKT.csv", row.names = FALSE)

#35. Anergic or act. T cells
data <- data.frame(
  geneSymbol = c("CD160","CD244","CTLA4","HAVCR2","ICOS","KLRG1","LAG3","PDCD1"),
  geneEntrezID = c("11126","51744","1493","84868","29851","10219","3902","5133"),
  GeneSet = rep("Anergic or act. T cells", 8)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Anergic_or_act._T_cells.csv", row.names = FALSE)

#36. T activated
data <- data.frame(
  geneSymbol = c("FASLG","TAGAP","CD40LG","CREM","IL17A","IL17F","IL23R","JAKMIP1","KCNA3","KCNN4","P2RX5","PRKCQ","RELT","RNF125","SATB1","TNFRSF4","TNFRSF9"),
  geneEntrezID = c("356","117289","959","1390","3605","112744","149233","152789","3738","3783","5026","5588","84957","54941","6304","7293","3604"),
  GeneSet = rep("T activated", 17)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "T_activated.csv", row.names = FALSE)

#37. NK cells
data <- data.frame(
  geneSymbol = c("KLRF1","NCAM1","NCR1","NCR3","SH2D1B"),
  geneEntrezID = c("51348","4684","9437","259197","117157"),
  GeneSet = rep("NK cells", 5)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "NK_cells.csv", row.names = FALSE)

#38. TCRD
data <- data.frame(
  geneSymbol = c("TRDC","TRDJ1","TRDJ2","TRDJ3","TRDJ4","TRDV1","TRDV2","TRDV3"),
  geneEntrezID = c("28526","28522","28521","28520","28519","28518","28517","28516"),
  GeneSet = rep("TCRD", 8)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRD.csv", row.names = FALSE)

#39. T regs
data <- data.frame(
  geneSymbol = c("FOXP3","IKZF2"),
  geneEntrezID = c("50943","22807"),
  GeneSet = rep("T regs", 2)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "T_regs.csv", row.names = FALSE)

#40. SNOR low DOWN
data <- data.frame(
  geneSymbol = c("RNU4ATAC","SNORA11","SNORA16A","SNORA23","SNORA38B","SNORA73A","SNORA73B"),
  geneEntrezID = c("100151683","677799","692073","677808","100124536","6080","26768"),
  GeneSet = rep("SNOR low DOWN", 7)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "SNOR_low_DOWN.csv", row.names = FALSE)

#--

#41. Monocytes
data <- data.frame(
  geneSymbol = c("ACE","ADAM8","ADAMDEC1","ADGRE1","ADGRE2","APOBEC3B","APOBEC3G","APOBR","ART4","C1QA","C1QC","C2","C4A","C4B","C4BPA","C4BPB","C5","C6","C8A","C9","CCL17","CCL18","CCL22","CCL28","CCL7","CCL8","CD14","CD163","CD209","CD300C","CD300E","CD5L","CD80","CFB","CFD","CFP","CHI3L1","CHIT1","CLEC12A","CLEC12B","CLEC4D","CLEC4E","CLEC5A","CLEC7A","CLEC9A","CSF1R","CXCL1","CXCL10","CXCL11","CXCL13","CXCL8","CXCL9","CYBB","F12","FCGR3B","FFAR2","HMMR","HVCN1","IGSF6","IL10RA","IL12A","IL12B","IL1RAP","IL20","IL23A","IL27","IL31RA","LAMP3","LGALS12","LGALS4","LGALS9","LGALS9B","LGALS9C","LILRA2","LILRA5","LILRA6","LILRB5","LMNB1","LY86","LYZ","MARCO","MEGF10","MERTK","MFGE8","MPEG1","MRC1","MS4A4A","MSR1","NTSR1","OCM","OLR1","OSCAR","OTOF","PDCD1LG2","PILRA","PLA2G2D","PLA2G5","PYHIN1","S100A8","S100A9","S1PR5","SCARB1","SCARF2","SECTM1","SEMA4A","SERPINB9","SERPING1","SIGLEC1","SIGLEC14","SIGLEC5","SIGLEC7","SLC11A1","SLITRK4","SMPDL3B","SPIC","STAB2","STAP2","TEK","TGM2","TIMD4","TLR2","TLR8","TNF","TNFAIP8L2","TNFRSF1B","TNIP3","TULP1","UBD","VENTX","VSTM1"),
  geneEntrezID = c("1636","101","27299","2015","30817","9582","60489","55911","420","712","714","717","720","721","722","725","727","729","731","735","6361","6362","6367","56477","6354","6355","929","9332","30835","10871","342510","922","941","629","1675","5199","1116","1118","160364","387837","338339","26253","23601","64581","283420","1436","2919","3627","6373","10563","3576","4283","1536","2161","2215","2867","3161","84329","10261","3587","3592","3593","3556","50604","51561","246778","133396","27074","85329","3960","3965","284194","654346","11027","353514","79168","10990","4001","9450","4069","8685","84466","10461","4240","219972","4360","51338","4481","4923","654231","4973","126014","9381","80380","29992","26279","5322","149628","6279","6280","53637","949","91179","6398","64218","5272","710","6614","100049587","8778","27036","6556","139065","27293","121599","55576","55620","7010","7052","91937","7097","51311","7124","79626","7133","79931","7287","10537","27287","284415"),
  GeneSet = rep("Monocytes", 130)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocytes.csv", row.names = FALSE)

#42. Myeloid cells
data <- data.frame(
  geneSymbol = c("ADGRE3","AIF1","AOAH","APOC1","BMX","C1QB","CD101","CD300LF","CD33","CD68","CLEC10A","CLEC16A","CLEC1A","CLEC2B","CLEC4A","CLEC4G","CLEC6A","CSF2RA","CSF2RB","CSF3R","CST3","CYBA","FCER1A","FCER1G","FCGR1A","FCGR1B","FCGR2A","FCGR2B","FCGR2C","FCGR3A","FGR","FLT3","FOLR2","IFI30","IFNL1","IFNL2","IFNL3","IL18","IL1A","IL1B","IL1F10","IL1RN","IL26","IL37","ITGAX","LILRA1","LILRA3","LILRB1","LILRB2","LILRB3","LILRB4","LYVE1","MEFV","MNDA","MS4A6A","MS4A7","NLRP12","NLRP3","NOD2","PECAM1","PLEK","PRAM1","RNASE3","RNASE6","RNASE7","SIGLEC10","SKAP2","SLAMF8","SLPI","SPI1","TNFSF13B","TNFSF14","TREM1","TREML2","TREML4","TWIST1","TYROBP","UNC93B1","VSIG4"),
  geneEntrezID = c("84658","199","313","341","660","713","9398","146722","945","968","10462","23274","51267","9976","50856","339390","93978","1438","1439","1441","1471","1535","2205","2207","2209","2210","2212","2213","9103","2214","2268","2322","2350","10437","282618","282616","282617","3606","3552","3553","84639","3557","55801","27178","3687","11024","11026","10859","10288","11025","11006","10894","4210","4332","64231","58475","91662","114548","64127","5175","5341","84106","6037","6039","84659","89790","8935","56833","6590","6688","10673","8740","54210","79865","285852","7291","7305","81622","11326"),
  GeneSet = rep("Myeloid cells", 79)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Myeloid_cells.csv", row.names = FALSE)

#43. Neutrophils
data <- data.frame(
  geneSymbol = c("ABHD3","ARG1","BPI","CAMP","CD177","CD83","CRISP3","DEFA1","DEFA1B","DEFA3","DEFB103A","DEFB103B","DEFB106B","DEFB136","DEFB4A","FUT4","LY6E","MMP8","OLFM4","OR1J2","PGLYRP1","PRTN3","S100A6"),
  geneEntrezID = c("171586","383","671","820","57126","9308","10321","1667","728358","1668","414325","55894","503841","613210","1673","2526","4061","4317","10562","26740","8993","5657","6277"),
  GeneSet = rep("Neutrophils", 23)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                      keys=data$geneEntrezID, 
                      keytype="ENTREZID", 
                      columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Neutrophils.csv", row.names = FALSE)

    #44. Neutrophils_
    data <- data.frame(
     geneSymbol = c("EMR2","C5AR2","PLEKHG3","DPEP2","PADI4","FAM212B","TREML2","REPS2","MAK","STEAP4","PGLYRP1","MXD1","MEFV","BTNL8","CREB5","ALOX5","BST1","CEACAM3","VNN3","LILRA2","HSPA6","NFE2","HAL","FFAR2","MMP25","QPCT","CDA","P2RY13","CHST15","TNFRSF10C","FPR2","MGAM","APOBEC3A","TLR2","CXCR1","FAM65B","LST1","TREM1","CSF3R","C5AR1","SELL","AQP9","CXCR2","VNN2","MNDA","FPR1","FCGR3B"),
     GeneSet = rep("Neutrophils_", 47)
    )
    data$geneSymbol <- sub("\\..*", "", data$geneSymbol)

    # Fetch ENSEMBL IDs for your gene symbols
    ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                          keys=data$geneSymbol, 
                          keytype="SYMBOL", 
                          columns=c("ENSEMBL", "ENTREZID"))

    # Join the original data with the fetched IDs
    result <- left_join(data, ensembl_ids, by = c("geneSymbol"="SYMBOL"))

    # Arrange columns as desired
    sorted_result <- result[, c("geneSymbol", "ENTREZID", "ENSEMBL", "GeneSet")]
    write.csv(sorted_result, file = "Neutrophils_.csv", row.names = FALSE)

    #45. Inflammatory_neutrophils
    data <- data.frame(
     geneSymbol = c("CD177.1","ZDHHC19.1","PLAC8.1","IFI6.2","MT2A.2","SERPING1.2","RSAD2.2","GBP1.2","ISG15.2","LY6E.2","S100A12.2","GBP5.2","XAF1.2","IFI44L.2","IFIT3.2","IFITM3.2","SELL.2","TRIM22.2","EPSTI1.2","FCER1G.2","IFI44.2","CST7.1","OAS1.2","TNFSF13B.2","SLFN5","IFIT2.2","MX1.2","PLSCR1.2","MMP9.2","LAP3","OAS2.2","IFIT1.2","TAP1.2","SAMD9L.2","HIST1H2AC.1","APOL6.2","OASL.1","DDX58.1","WSB1.1","RPL28.1","IL1RN.2","MCEMP1.1","HERC5.1","IRF7.2","CKAP4.1","UBE2L6.1","RNF213.2","GBP2.2","CLEC4D.1","LILRA6.1","SHISA5.2","PHF11.1","IFI16.2","OAS3.1","PIM1.1","SERPINB1.1","GYG1.1","IFITM1.1","PSTPIP2","TMSB10.1","GBP4","NFKBIA.2","EIF2AK2.1","FFAR2.2","PARP9.1","NT5C3A","TNFSF10.2","MYL12A.1","PARP14.1","DDX60","DDX60L.2","STAT1.1","XRN1","SAMD9.2","SNX3","HIST1H2BD","PGD.1","FCGR1A","JUN","CD44","PLP2.1","CARD16.2","LIMK2.2","CYSTM1.1","HMGB2.1","IFIH1.2","C1orf162","KLF4","STAT2","ZBP1","ALPL.1","GSTK1","SP110.2","ANXA1","ANXA3.1","METTL9","PML","NUCB1","CEACAM1","SECTM1","PSMB9.1","SAT1","RNF10.1","MAPK14.1","LILRA5","C3AR1","ZCCHC2","SAMHD1","LGALS9","CR1","C4orf3","IRF1.1","CD63","GRINA","TXN.1","MX2.1","GAPDH.1","RBCK1","NBN.2","NMI.1","CAST","HIST2H2BE","NTNG2","SP100.1","CASP1.1","TNFAIP6","PLEK.1","LMNB1","NFIL3","APOL2","BST1","NUB1","PIK3AP1","CCR1.2","ALOX5AP.1","EMB","ISG20","TMEM123.1","ADAR","GIMAP4","SPTLC2","SAMSN1.1","SPI1","GRN.1","APOBEC3A","GLRX","CD53.1","TRIM38","CD82","SH3GLB1","VIM.1","ADM.1","CASP4.1","S100A6.1","RAC2.1","BAZ1A","ADD3","ITGAM","LY96.2","MSRB1.1","DYSF.1","MOB1A","TPM3.1","FGR","ACSL1","IL2RG.1","CDKN2D","FYB1.1","CLEC4E.1","HLA-F","LILRB3","RBMS1","PLBD1","FLOT1","GNG5","PTEN.1","PROK2.1","UBE2J1","CD37.1","KCNJ15.1","BRI3.1","HIF1A","PRR13.1","LRG1","MAX","RHOG","NFE2","STXBP2","B4GALT5","CAPZA1","SERPINA1","ZYX","RGS19","FKBP5","GCA.1","FKBP1A","MTPN","VNN2","AC245128.3","CD55","CFL1.1"),  # shortened for clarity
     GeneSet = rep("Inflammatory neutrophils", 201)
    )
    data$geneSymbol <- sub("\\..*", "", data$geneSymbol)

    # Fetch ENSEMBL IDs for your gene symbols
    ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                          keys=data$geneSymbol, 
                          keytype="SYMBOL", 
                          columns=c("ENSEMBL", "ENTREZID"))

    # Join the original data with the fetched IDs
    result <- left_join(data, ensembl_ids, by = c("geneSymbol"="SYMBOL"))

    # Arrange columns as desired
    sorted_result <- result[, c("geneSymbol", "ENTREZID", "ENSEMBL", "GeneSet")]
    write.csv(sorted_result, file = "Inflammatory_neutrophils.csv", row.names = FALSE)

    #46. Suppressive_neutrophils
    data <- data.frame(
     geneSymbol = c("LY6E","XAF1","RSAD2","ISG15","IFI44L","IFITM3","MT2A","IFIT3","EPSTI1","IFIT1","IFI6","IFIT2","MX1","GBP5","PLSCR1","SELL","SERPING1","SHISA5","TRIM22","GBP1","IFI44","IRF7","FFAR2","OAS1","OAS2","IL1RN","IFI16","CCR1","SP110","APOL6","NFKBIA","TNFSF10","FCER1G","IFIH1","TNFSF13B","SAMD9","SAMD9L","DDX60L","TAP1","NBN","RNF213","CARD16","GBP2","LY96","CLEC2B","LIMK2"),
     GeneSet = rep("Suppressive neutrophils", 46)
    )
    data$geneSymbol <- sub("\\..*", "", data$geneSymbol)

    # Fetch ENSEMBL IDs for your gene symbols
    ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                          keys=data$geneSymbol, 
                          keytype="SYMBOL", 
                          columns=c("ENSEMBL", "ENTREZID"))

    # Join the original data with the fetched IDs
    result <- left_join(data, ensembl_ids, by = c("geneSymbol"="SYMBOL"))

    # Arrange columns as desired
    sorted_result <- result[, c("geneSymbol", "ENTREZID", "ENSEMBL", "GeneSet")]
    write.csv(sorted_result, file = "Suppressive_neutrophils.csv", row.names = FALSE)

    #47. Apoptosis
    data <- data.frame(
      geneSymbol = c("AIFM1","APAF1","BAD","BAK1","BAX","BCL2L11","BID","CASP10","CASP2","CASP3","CASP6","CASP7","CASP8","CASP9","DIABLO","ENDOG","FAS","FASLG","HTRA2","TNFRSF1A","TP53"),
      geneEntrezID = c("9131","317","572","578","581","10018","637","843","835","836","839","840","841","842","56616","2021","355","356","27429","7132","7157"),
      GeneSet = rep("Apoptosis", 21)
    )
    ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                          keys=data$geneEntrezID, 
                          keytype="ENTREZID", 
                          columns="ENSEMBL")
    result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
    sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
    write.csv(sorted_result, file = "Apoptosis.csv", row.names = FALSE)

    #48. NFkB complex
    data <- data.frame(
      geneSymbol = c("IKBKB","IKBKG","NFKB1","NFKB2","RELA","RELB","REL"),
      geneEntrezID = c("3551","8517","4790","4791","5970","5971","5966"),
      GeneSet = rep("NFkB complex", 7)
    )
    ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db, 
                          keys=data$geneEntrezID, 
                          keytype="ENTREZID", 
                          columns="ENSEMBL")
    result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
    sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
    write.csv(sorted_result, file = "NFkB_complex.csv", row.names = FALSE)

#~/Tools/csv2xls-0.4/csv_to_xls.py Alt._complement.csv Anergic_or_act._T_cells.csv Anti-inflammation.csv B_cells.csv CD40_activated.csv CD8T-NK-NKT.csv Cell_cycle.csv Classical_complement.csv Cyt._act._T_cells.csv Dendritic_cells.csv Erythrocytes.csv Granulocytes.csv IFN.csv IG_chains.csv IL-1_cytokines.csv IL-6R_complex.csv Inflammasome.csv LDG.csv Lectin_complement.csv MHC_II.csv Monocytes.csv Monocytes_40.csv Monocyte_cell_surface.csv Monocyte_secreted.csv Myeloid_cells.csv Neutrophils.csv NK_cells.csv NLRP3_inflammasome.csv pDC.csv Plasma_cells.csv Platelets.csv Pro-inflam._IL-1.csv SNOR_low_DOWN.csv SNOR_low_UP.csv TCRA.csv TCRB.csv TCRD.csv TCRG.csv TNF.csv T_activated.csv T_cells.csv T_regs.csv Unfolded_protein.csv Neutrophils_.csv Inflammatory_neutrophils.csv Suppressive_neutrophils.csv Apoptosis.csv NFkB_complex.csv -d',' -o Signatures.xls

#---------------------------------------------------------------------
#---- 2. Prepare gene expression matrix: calculate DESeq2 results ----

setwd("/home/jhuang/DATA/Data_Susanne_Carotis_RNASeq/results/featureCounts_2023")

library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
#BiocManager::install("org.Hs.eg.db")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)

d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1)

colnames(d.raw)<-c("gene_name", "leer_mock_2h_r2", "Ace2_mock_2h_r2", "leer_inf_24h_r1", "Ace2_inf_2h_r1", "leer_inf_24h_r2", "leer_inf_2h_r1", "leer_mock_2h_r1", "leer_inf_2h_r2", "Ace2_inf_2h_r2", "Ace2_mock_2h_r1", "Ace2_inf_24h_r2", "Ace2_inf_24h_r1")

col_order <- c("gene_name", "leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2")

reordered.raw <- d.raw[,col_order]
reordered.raw$gene_name <- NULL
#d <- d.raw[rowSums(reordered.raw>3)>2,]

condition = as.factor(c("leer_mock_2h","leer_mock_2h","leer_inf_2h","leer_inf_2h","leer_inf_24h","leer_inf_24h","Ace2_mock_2h","Ace2_mock_2h","Ace2_inf_2h","Ace2_inf_2h","Ace2_inf_24h","Ace2_inf_24h"))
ids = as.factor(c("leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2"))

#cData = data.frame(row.names=colnames(reordered.raw), condition=condition,  batch=batch, ids=ids)
#dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~batch+condition)
cData = data.frame(row.names=colnames(reordered.raw), condition=condition, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~condition)

#----more detailed and specific with the following code!----
dds$condition <- relevel(dds$condition, "Ace2_mock_2h")
dds = DESeq(dds, betaPrior=FALSE)  # betaPrior default value is FALSE
resultsNames(dds)

#--------------------------------------
#---- 3. draw violin on signatures ----

library(GSVA)
library(ggplot2)
#install.packages("readxl")
library(readxl)

# Path to the Excel file
file_path <- "Signatures.xls"

#example of a signature:
#geneSymbol geneEntrezID    ENSEMBL GeneSet
#CD160  11126   ENSG00000117281 Anergic or act. T cells
#CD244  51744   ENSG00000122223 Anergic or act. T cells
#CTLA4  1493    ENSG00000163599 Anergic or act. T cells
#HAVCR2 84868   ENSG00000135077 Anergic or act. T cells
#ICOS   29851   ENSG00000163600 Anergic or act. T cells
#KLRG1  10219   ENSG00000139187 Anergic or act. T cells
#LAG3   3902    ENSG00000089692 Anergic or act. T cells
#PDCD1  5133    ENSG00000188389 Anergic or act. T cells
#PDCD1  5133    ENSG00000276977 Anergic or act. T cells

# Get the names of the sheets
sheet_names <- excel_sheets(file_path)

# Initialize an empty list to hold gene sets
geneSets <- list()

# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
  # Read the sheet
  data <- read_excel(file_path, sheet = sheet)

  # Process the GeneSet names (replacing spaces with underscores, for example)
  gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])

  # Add ENSEMBL IDs to the list
  geneSets[[gene_set_name]] <- as.character(data$ENSEMBL)
}

# Print the result to check
print(geneSets)

# 0. for Nanostring, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
#exprs <- exprs(filtered_or_neg_target_m666Data)
# 0. for RNAseq, the GSVA input requires a gene expression matrix where rows are genes and columns are samples.
exprs <- counts(dds, normalized=TRUE)

# 1. Compute GSVA scores:
library(GSVA)
gsva_scores <- gsva(exprs, geneSets, method="gsva")

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
gsva_df$Condition <- dds$condition

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "mock", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "infection", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("mock", "infection"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "mock", gene_name]
  condition2_data <- data[data$Condition == "infection", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value

  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))

  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )

  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)

  return(p)
}

# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated",  "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes",  "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("All_Violin_Plots_RNAseq.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()

# ------------------------ Heatmap Presentation --------------------

#-- NanoString GSVA scores --
exprs <- exprs(target_m666Data)
#exprs <- exprs(filtered_or_neg_target_m666Data)

# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))

# Filter the data for "Group1" samples
group1_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1", genes]

# Set the median values for each column in the data.frame
# Calculate median values for each column
group1_medians <- apply(group1_data, 2, median)

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "1b", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("1b", "Group1b", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group1b", "Group3"))

# Filter the data for "Group1a" samples
group1a_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1a", genes]
group1a_medians <- apply(group1a_data, 2, median)

# Filter the data for "Group1b" samples
group1b_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1b", genes]
group1b_medians <- apply(group1b_data, 2, median)

# Filter the data for "Group3" samples
group3_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group3", genes]
group3_medians <- apply(group3_data, 2, median)

#-- RNAseq GSVA scores --
exprs <- counts(dds, normalized=TRUE)

# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
gsva_df$Condition <- dds$condition

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "mock", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "infection", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("mock", "infection"))

# Filter the data for "mock" samples
mock_data <- gsva_df_filtered[gsva_df_filtered$Condition == "mock", genes]
mock_medians <- apply(mock_data, 2, median)

# Filter the data for "infection" samples
infection_data <- gsva_df_filtered[gsva_df_filtered$Condition == "infection", genes]
infection_medians <- apply(infection_data, 2, median)

# Create a new data frame with the middle values for both groups
heatmap_data_rnaseq <- data.frame(mock = mock_medians, infection = infection_medians)

# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)

# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)

# Specify heatmap colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(50)

#TODO: adjust the color, so that it is not so dark and light!!!!, add RNAseq data for mock and infection!!
# Create the heatmap without hierarchical clustering
library(gplots)
png("Heatmap.png", width=1000, height=1000)
heatmap.2(
  heatmap_matrix,
  Colv = FALSE,  # Turn off column dendrogram
  Rowv = FALSE,  # Turn off row dendrogram
  trace = "none",  # Turn off row and column dendrograms
  scale = "row",   # Scale the rows
  col = color_scheme,
  main = "GSVA Heatmap",  # Title of the heatmap
  key = TRUE,            # Show color key
  key.title = "GSVA Score",  # Color key title
  key.xlab = NULL,       # Remove x-axis label from color key
  density.info = "none",  # Remove density plot
  cexRow = 1.2,  # Increase font size of row names
  cexCol = 1.2,  # Increase font size of column names
  srtCol = 40, keysize = 0.72, margins = c(5, 14)
)
dev.off()

GSVA calculation for NanoString data

library("rmarkdown")
library("tidyverse")
library(rmarkdown)
library("GeomxTools")
library("GeoMxWorkflows")
library("NanoStringNCTools")
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023")
render("run.Rmd", c("html_document"))

# ------------------------ Violin Presentation 1 --------------------
#identical(exprs(target_m666Data), assay(target_m666Data, "exprs")): contains the gene expression values that have been both normalized and log-transformed, which are often used for downstream analysis and visualization in many genomic studies.

#assay(target_m666Data, "log_q"): These are likely log2-transformed values of the quantile-normalized data. Quantile normalization ensures that the distributions of gene expression values across samples are the same, and taking the log2 of these normalized values is a common step in the analysis of high-throughput gene expression data. It's a way to make the data more suitable for downstream statistical analysis and visualization.

#assay(target_m666Data, "q_norm"): This typically represents the quantile-normalized gene expression values. Quantile normalization adjusts the expression values in each sample so that they have the same distribution. This normalization method helps remove systematic biases and makes the data more comparable across samples.

library(readxl)

# Path to the Excel file
file_path <- "Signatures.xls"

#example of a signature:
#geneSymbol geneEntrezID    ENSEMBL GeneSet
#CD160  11126   ENSG00000117281 Anergic or act. T cells
#CD244  51744   ENSG00000122223 Anergic or act. T cells
#CTLA4  1493    ENSG00000163599 Anergic or act. T cells
#HAVCR2 84868   ENSG00000135077 Anergic or act. T cells
#ICOS   29851   ENSG00000163600 Anergic or act. T cells
#KLRG1  10219   ENSG00000139187 Anergic or act. T cells
#LAG3   3902    ENSG00000089692 Anergic or act. T cells
#PDCD1  5133    ENSG00000188389 Anergic or act. T cells
#PDCD1  5133    ENSG00000276977 Anergic or act. T cells

# Get the names of the sheets
sheet_names <- excel_sheets(file_path)

# Initialize an empty list to hold gene sets
geneSets <- list()

# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
  # Read the sheet
  data <- read_excel(file_path, sheet = sheet)

  # Process the GeneSet names (replacing spaces with underscores, for example)
  gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])

  # Add ENSEMBL IDs to the list
  geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}

# Print the result to check
print(geneSets)

library(gridExtra)
library(ggplot2)
library(GSVA)

# 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
exprs <- exprs(target_m666Data)
#exprs <- exprs(filtered_or_neg_target_m666Data)

# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "Group1", gene_name]
  condition2_data <- data[data$Condition == "Group3", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value

  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))

  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )

  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)

  return(p)
}

# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated",  "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes",  "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 7. Pad the list of plots:
#plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
#remaining_plots <- plots_needed_for_full_grid - length(plots_list)
#plots_list <- c(plots_list, rep(list(NULL), remaining_plots))

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("All_Violin_Plots_NanoString_3_vs_1.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()

# ------------------------ Violin Presentation 2 --------------------

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "Group1a", gene_name]
  condition2_data <- data[data$Condition == "Group3", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value

  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))

  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )

  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)

  return(p)
}

# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated",  "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes",  "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 7. Pad the list of plots:
#plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
#remaining_plots <- plots_needed_for_full_grid - length(plots_list)
#plots_list <- c(plots_list, rep(list(NULL), remaining_plots))

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("All_Violin_Plots_NanoString_3_vs_1a.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()

# ------------------------ Heatmap Presentation --------------------

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))

# Filter the data for "Group1" samples
group1_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1", genes]

# Set the median values for each column in the data.frame
# Calculate median values for each column
group1_medians <- apply(group1_data, 2, median)

# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID

# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "1b", "3"), ]

# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("1b", "Group1b", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group1b", "Group3"))

# Filter the data for "Group1a" samples
group1a_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1a", genes]
group1a_medians <- apply(group1a_data, 2, median)

# Filter the data for "Group1b" samples
group1b_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1b", genes]
group1b_medians <- apply(group1b_data, 2, median)

# Filter the data for "Group3" samples
group3_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group3", genes]
group3_medians <- apply(group3_data, 2, median)

# Create a new data frame with the middle values for both groups
heatmap_data_nanostring <- data.frame(Group1 = group1_medians, Group1a = group1a_medians, Group1b = group1b_medians, Group3 = group3_medians)

# Merge two heatmap_data_* to a matrix
heatmap_data <- merge(heatmap_data_nanostring, heatmap_data_rnaseq, by="row.names", all=TRUE)
rownames(heatmap_data) <- heatmap_data$Row.names
heatmap_data$Row.names <- NULL

# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)

# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)

# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)

# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)

# Specify heatmap colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(50)

#TODO: adjust the color, so that it is not so dark and light!!!!, add RNAseq data for mock and infection!!
# Create the heatmap without hierarchical clustering
library(gplots)
png("Heatmap.png", width=1000, height=1000)
heatmap.2(
  heatmap_matrix,
  Colv = FALSE,  # Turn off column dendrogram
  Rowv = FALSE,  # Turn off row dendrogram
  trace = "none",  # Turn off row and column dendrograms
  scale = "row",   # Scale the rows
  col = color_scheme,
  main = "GSVA Heatmap",  # Title of the heatmap
  key = TRUE,            # Show color key
  key.title = "GSVA Score",  # Color key title
  key.xlab = NULL,       # Remove x-axis label from color key
  density.info = "none",  # Remove density plot
  cexRow = 1.2,  # Increase font size of row names
  cexCol = 1.2,  # Increase font size of column names
  srtCol = 40, keysize = 0.72, margins = c(5, 14)
)
dev.off()

# Write the table to Excel-file
library(writexl)
# Specify the file path where you want to save the Excel file
excel_file <- "heatmap_data.xlsx"

# Save the data frame to the Excel file
write_xlsx(heatmap_data, excel_file)

# Verify that the Excel file has been saved
if (file.exists(excel_file)) {
  cat("Data frame has been saved to", excel_file, "\n")
} else {
  cat("Error: Failed to save the data frame to", excel_file, "\n")
}

Ordinary vs. Moderated P-values: A Key Comparison in limma Differential Expression Analysis

In the context of differential expression analysis, limma is a popular R package that originally was designed for microarray data but has since been adapted for RNA-seq data (using voom transformation). One of the unique features of limma is its use of moderated statistics. Here’s a breakdown of the difference between ordinary p-values and moderated p-values in limma:

  • Ordinary P-values:

    • These are the p-values you would get if you were to do standard hypothesis testing on each gene individually without borrowing information from other genes.
    • Calculated based on the ordinary standard errors.
  • Moderated P-values:

    • Limma borrows information across genes to get more precise estimates of the variability for each gene. This is especially useful when the number of samples (replicates) is small.
    • The process involves “shrinking” the gene-wise sample variances towards a pooled estimate, resulting in moderated t-statistics, which are more stable than ordinary t-statistics.
    • Moderated p-values are calculated based on these moderated t-statistics.
    • As a result, these moderated p-values tend to be more reliable, especially in experiments with small sample sizes.
  • Why is Moderation Necessary?:

    • In many genomics experiments, there’s a challenge: While there are thousands of genes (or more), there might be a relatively small number of replicates or samples. This can make estimates of variance for each gene unreliable.
    • By borrowing strength from the ensemble of genes, limma can stabilize these variance estimates, which, in turn, makes the resulting p-values more reliable.
  • Empirical Bayes Method:

    • The moderation in limma is achieved through an empirical Bayes method. This doesn’t mean it’s a fully Bayesian approach but rather that it borrows some concepts from Bayesian statistics to stabilize variance estimates across genes.

In practice, when using limma, researchers often focus on the moderated p-values because of their enhanced reliability, especially in the context of multiple hypothesis testing in genomics. The moderated statistics help reduce the number of false positives that might arise from genes with unusually low variance estimates due to chance alone.

GSVA calculation for Mass spectrometry (MS)-based proteomics

library("rmarkdown")
library("tidyverse")
library(rmarkdown)
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_MS/LIMMA-pipeline-proteomics/Results_20231006_165122")

# -1. prepare and read signatures (namely gene sets)
#Monocytes.csv #66
#Monocytes2.csv #201
#Neutrophils.csv #40
#Neutrophils2.csv #61
#~/Tools/csv2xls-0.4/csv_to_xls.py Alt._complement.csv Anergic_or_act._T_cells.csv Anti-inflammation.csv B_cells.csv CD40_activated.csv CD8T-NK-NKT.csv Cell_cycle.csv Classical_complement.csv Cyt._act._T_cells.csv Dendritic_cells.csv Erythrocytes.csv Granulocytes.csv IFN.csv IG_chains.csv IL-1_cytokines.csv IL-6R_complex.csv Inflammasome.csv LDG.csv Lectin_complement.csv MHC_II.csv Monocytes.csv Monocyte_cell_surface.csv Monocyte_secreted.csv Myeloid_cells.csv Neutrophils.csv NK_cells.csv NLRP3_inflammasome.csv pDC.csv Plasma_cells.csv Platelets.csv Pro-inflam._IL-1.csv SNOR_low_DOWN.csv SNOR_low_UP.csv TCRA.csv TCRB.csv TCRD.csv TCRG.csv TNF.csv T_activated.csv T_cells.csv T_regs.csv Unfolded_protein.csv  Inflammatory_neutrophils.csv Suppressive_neutrophils.csv Apoptosis.csv NFkB_complex.csv -d',' -o Signatures_46.xls 

library(readxl)
file_path <- "../../Signatures_46.xls"
# Get the names of the sheets
sheet_names <- excel_sheets(file_path)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
  # Read the sheet
  data <- read_excel(file_path, sheet = sheet)
  # Process the GeneSet names (replacing spaces with underscores, for example)
  gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
  # Add ENSEMBL IDs to the list
  geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}
# Print the result to check
print(geneSets)
length(geneSets)

# 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
library(gridExtra)
library(ggplot2)
library(GSVA)
exprs_ <- read.csv(sep="\t", file="20231006_165122_final_data.tsv")

# List of desired columns
desired_columns <- c("CA1", "CA2", "CA3", "CA10", "CA5", "CA6", "CA8", "CA14", "CA12", "CA13", "Con1", "Con2", "Con3", "Con4", "Con5", "Con6")
# # Find duplicated symbols
# duplicated_symbols <- exprs_$Symbol[duplicated(exprs_$Symbol) | duplicated(exprs_$Symbol, fromLast = TRUE)]
# print(duplicated_symbols)
# Filter out rows with empty 'Symbol' values
exprs_ <- exprs_[exprs_$Symbol != "", ]
# Subset the dataframe
exprs <- exprs_[, desired_columns]
# Set the 'Symbol' column as row names
rownames(exprs) <- exprs_$Symbol
exprs_matrix <- as.matrix(exprs)

# 1. Compute GSVA scores:
library(GSVA)
gsva_scores <- gsva(exprs_matrix, geneSets, method="gsva", useNames=TRUE)

# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))

# 3. Add conditions to gsva_df:
gsva_df$Condition <- c("COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","control","control","control","control","control","control")

# 4. Filter the gsva_df to retain only the desired conditions:

# 5. Define a function to plot violin plots:
gsva_df$Condition <- factor(gsva_df$Condition, levels = c("control", "COVID"))
plot_violin <- function(data, gene_name) {
  # Calculate the t-test p-value for the two conditions
  condition1_data <- data[data$Condition == "control", gene_name]
  condition2_data <- data[data$Condition == "COVID", gene_name]
  p_value <- t.test(condition1_data, condition2_data)$p.value
  # Convert p-value to annotation
  p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
  rounded_p_value <- paste0("p = ", round(p_value, 2))
  plot_title <- gsub("_", " ", gene_name)
  p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
    geom_violin(linewidth=1.2) + 
    labs(title=plot_title, y="GSVA Score") +
    ylim(-1, 1) +
    theme_light() +
    theme(
      axis.title.x = element_text(size=12),
      axis.title.y = element_text(size=12),
      axis.text.x  = element_text(size=10),
      axis.text.y  = element_text(size=10),
      plot.title   = element_text(size=12, hjust=0.5)
    )
  # Add p-value annotation to the plot
  p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
  return(p)
}

# 6. Generate the list of plots in a predefined order:
# [1] "Platelets"                "Granulocytes"            
# [3] "LDG"                      "pDC"                     
# [5] "Anti-inflammation"        "Pro-inflam._IL-1"        
# [7] "Dendritic_cells"          "MHC_II"                  
# [9] "Alt._complement"          "TNF"                     
#[11] "NLRP3_inflammasome"       "Unfolded_protein"        
#[13] "B_cells"                  "Monocyte_cell_surface"   
#[15] "Inflammasome"             "Monocyte_secreted"       
#[17] "IL-1_cytokines"           "SNOR_low_UP"             
#[19] "CD40_activated"           "Lectin_complement"       
#[21] "Classical_complement"     "Cell_cycle"              
#[23] "Plasma_cells"             "IG_chains"               
#[25] "Erythrocytes"             "IL-6R_complex"           
#[27] "IFN"                      "TCRB"                    
#[29] "TCRA"                     "Cyt._act._T_cells"       
#[31] "TCRG"                     "T_cells"                 
#[33] "CD8T-NK-NKT"              "Anergic_or_act._T_cells" 
#[35] "T_activated"              "Neutrophils"             
#[37] "NK_cells"                 "TCRD"                    
#[39] "T_regs"                   "SNOR_low_DOWN"           
#[41] "Monocytes"                "Myeloid_cells"           
#[43] "Inflammatory_neutrophils" "Suppressive_neutrophils" 
#[45] "Apoptosis"                "NFkB_complex"
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation",  "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF",  "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome",  "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement",  "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes",  "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells",  "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "Neutrophils", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells", "Inflammatory_neutrophils", "Suppressive_neutrophils", "Apoptosis", "NFkB_complex")
genes <- colnames(gsva_df)[!colnames(gsva_df) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
filtered_genes <- genes[!is.na(genes)]
print(filtered_genes)
plots_list <- lapply(filtered_genes, function(filtered_genes) plot_violin(gsva_df, filtered_genes))

# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
  plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}

# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))

# 9. Save the plots to a PNG:
png("Violin_Plots_GSVA_MS.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()

# 10. Save GSVA scores as Excel-file.
library(writexl)
write_xlsx(gsva_df, "GSVA_Scores_MS.xlsx")

LIMMA pipeline processing proteomics (MS)

Statistical analysis on LC-MS data

In order to detect significant changes between two experimental groups we performed statistical analysis on LC-MS data. We have chosen LIMMA moderated t test statistics as proposed in Kammers et al. (2015) over standard t test, since it utilizes full data to shrink the observed sample variance toward a pooled estimate, resulting in far more stable and powerful inference compared to ordinary t test. Missing values in proteomic data are a common problem and it is widely used in practice to impute missing information before applying statistical methods. We used the imputation algorithm proposed in Tyanova et al. (2016) because it allows us to randomly draw values from a distribution meant to simulate values below the detection limit of the MS instrument. We implemented statistical analysis in R and codes are freely available at the following GitHub repository https://github.com/wasimaftab/LIMMA-pipeline– proteomics/tree/master/Mitochondrial_Loop. The final volcano plots only feature proteins that are part of the high-confidence mito- chondrial proteome (Morgenstern et al., 2017).

DATA AND CODE AVAILABILITY

The codes generated during this study to perform statistical analysis on the BioID LC-MS data are freely available at GitHub repository https://github.com/wasimaftab/LIMMA-pipeline-proteomics/tree/master/Mitochondrial_Loop. The data supporting this study have been uploaded on the Mendeley Dataset public repository at https://doi.org/10.17632/9symckg9wn.

#This is a pipeline to analyze proteiomic data in a proteinGroups.txt (MaxQuant output) file for two group comparision
#install.packages("matlab")
library(dplyr)
library(stringr)
library(MASS)
library(matlab)
library(plotly)
library(limma)
library(qvalue)
library(htmlwidgets)
library(rstudioapi)

source("limma_helper_functions.R")

# ---- INPUT for Gunnar Project, directly read the sheet "Gene Prot (FOUND)" ----
## Load the proteingroups file
proteingroups_ <- read.csv("210618_MG_Carotisartery_consensus.csv", sep=",", header=TRUE)
proteingroups <- proteingroups_ %>% filter(Checked == "WAHR")

#Gene Symbol    Accession   CA1 CA2 CA3 CA10    CA5 CA6 CA8 CA14    CA12    CA13    CA11    CA4 Con1    Con2    Con3    Con4    Con5    Con6
#IGKV3-7    A0A075B6H7      195532822   276145022   75324344    179451608   93312672    188741058   244072994   540080672   133307714   102293349.5 133798065   166553494.5 138312451   45263876    361462136   185500736   357708246   156721253

### Kill the code if proteingroups does not contain crap columns
#temp <-
#  select(
#    proteingroups,
#    matches("(Reverse|Potential.contaminant|Only.identified.by.site)")
#  )
#if (!nrow(temp) * ncol(temp)) {
#  stop("File error, It does not contain crap...enter another file with crap")
#}

## Display data to faciliate choice of treatment and control
temp <- select(proteingroups, matches("Abundance"))
print(names(temp))

### Remove "+" identifeid rows from proteingroups
#idx <- NULL
#temp <-
#  select(
#    proteingroups,
#    matches("(Only.identified.by.site|Reverse|Potential.contaminant)")
#  )
#for (i in 1:ncol(temp)) {
#  index <- which(unlist(!is.na(match(temp[, i], "+"))))
#  idx <- union(idx, index)
#}
#proteingroups <- proteingroups[-idx, ] # removing indexed rows

### Remove proteins with Sequence coverage [%] < 5 [%] and Unique peptides == 1
#temp <-
#  select(proteingroups,
#         matches("(^Sequence.coverage(.){4}$|^Unique.peptides$)"))
#proteingroups <-
#  proteingroups[-union(which(temp[, 1] == 1), which(temp[, 2] < 5)) , ] # removing indexed rows

## Extract Uniprot and gene symbols
#splits <- unlist(strsplit("CAD protein OS=Mus musculus OX=10090 GN=Cad PE=1 SV=1", '\\|'))
#splits <- unlist(str_match("CAD protein OS=Mus musculus OX=10090 GN=Cad PE=1 SV=1", "GN=(.*?) PE="))
Uniprot <- character(length = nrow(proteingroups))
Symbol <- character(length = nrow(proteingroups))
Descriptions <- character(length = nrow(proteingroups))
for (i in 1:nrow(proteingroups)) {
  temp <- as.character(proteingroups$Description[i])
  Descriptions[i] <- temp
  Uniprot[i] <- as.character(proteingroups$Accession[i])
  splits <- unlist(str_match(temp, "GN=(.*?) PE="))
  Symbol[i] <- splits[2]
}

# WAP_1, WAP_2, WAP_3, 
# D-ALPHA_1, D-ALPHA_2, D-ALPHA_3
# Q-ALPHA_1, Q-ALPHA_2, Q-ALPHA_3
# pTTSS+Q_1, pTTSS+Q_2, pTTSS+Q_3
# pTTSS+Q-ALPHA_1, pTTSS+Q-ALPHA_2, pTTSS+Q-ALPHA_3
##Extract required data for Limma
#treatment <- readline('Enter treatment name(case insensitive) as it appeared in the iBAQ/LFQ column= ')
#control <- readline('Enter control name(case insensitive) as it appeared in the iBAQ/LFQ column= ')
#ibaq <- readinteger_binary('Enter 1 for iBAQ or 0 for LFQ= ')
treatment <- c("Abundance..F4..Sample","Abundance..F5..Sample","Abundance..F6..Sample")
control <- c("Abundance..F1..Sample","Abundance..F2..Sample","Abundance..F3..Sample")
temp <- select(proteingroups, matches(paste('^.*', "Abundance", '.*$', sep = '')))  #LFQ
treatment_reps <- select(temp, matches(treatment))
#treatment_reps <- data_sanity_check(temp, 'treatment', treatment)
control_reps <- select(temp, matches(control))
#control_reps <- data_sanity_check(temp, 'control', control)
data <- cbind(treatment_reps,
        control_reps,
        #select(proteingroups, matches("^id$")),  #maybe problematic.
        Uniprot,
        Symbol)
##Extract required data for Limma
treatment <- c("Abundance..F7..Sample","Abundance..F8..Sample","Abundance..F9..Sample")
control <- c("Abundance..F1..Sample","Abundance..F2..Sample","Abundance..F3..Sample")
temp <- select(proteingroups, matches(paste('^.*', "Abundance", '.*$', sep = '')))
treatment_reps <- select(temp, matches(treatment))
control_reps <- select(temp, matches(control))
data <- cbind(treatment_reps,
        control_reps,
        Uniprot,
        Symbol)
##Extract required data for Limma
treatment <- c("Abundance..F13..Sample","Abundance..F14..Sample","Abundance..F15..Sample")
control <- c("Abundance..F10..Sample","Abundance..F11..Sample","Abundance..F12..Sample")
temp <- select(proteingroups, matches(paste('^.*', "Abundance", '.*$', sep = '')))
treatment_reps <- select(temp, matches(treatment))
control_reps <- select(temp, matches(control))
data <- cbind(treatment_reps,
        control_reps,
        Uniprot,
        Symbol)
#> head(data)
#  Abundance..F13..Sample Abundance..F14..Sample Abundance..F15..Sample
#1            74399257195             7824880491            2,67259E+11
#2             7002793339             2132521146            23458705782
#3             1060277580              620505532             5544368780
#4            442697991,6            613281893,8             1971535002
#5             1683646539            904648237,7             8359400640
#6            360135421,6             1339034150             1614352533
#  Abundance..F10..Sample Abundance..F11..Sample Abundance..F12..Sample Uniprot
#1             4487833586            4,47435E+11            2,23515E+11  B2RQC6
#2             2172021935            39427573076            28112131746  Q6ZQA0
#3            426743632,6             4947260656             6773100611  Q3UJB9
#4            926489052,4             1636804139             1250211974  Q8BX70
#5            797097872,6            10397417632             9384902608  Q9EP71
#6             2428231339             2254991154             6269016713  P26039
#  Symbol
#1    Cad
#2 Nbeal2
#3   Edc4
#4 Vps13c
#5  Rai14
#6   Tln1

# ---- INPUT for Susanne Project, directly read the sheet "Gene Prot (FOUND)" ----
data <- read.csv("210618_MG_Carotisartery_consensus.csv", dec=",")

#                          ,,1,1,1,1,1,1,1,1,1,1,                           2,2,         4,4,4,4,4,4
#Gene Symbol,Accession,    CA1,CA2,CA3,CA10,CA5,CA6,CA8,CA14,CA12,CA13,     CA11,CA4,    Con1,Con2,Con3,Con4,Con5,Con6
# Drop columns "CA11" and "CA4"
data <- data[, !(names(data) %in% c("CA11", "CA4"))]

## Impute data
#print(names(data))
#rep_treats <- readinteger("Enter the number of treatment replicates=")
#rep_conts <- readinteger("Enter the number of control replicates=")
#FC_Cutoff <- readfloat("Enter the log fold change cut off=")
rep_treats = 10
rep_conts = 6
FC_Cutoff = 2

## removing blank rows
#NO_BLANK_ROWS --> deactivate the following code.
#For Gunnar data: temp <- as.matrix(rowSums(apply(data[, 1:(rep_treats + rep_conts)], 2, as.numeric)))
temp <- as.matrix(rowSums(apply(data[, 3:(rep_treats + rep_conts + 2)], 2, as.numeric)))
idx <- which(temp == 0)
if (length(idx)) {
  data <- data[-idx,]
}
#NO INFINITE --> deactivate!  
#  data_limma <- log2(as.matrix( as.numeric(data[c(1:(rep_treats + rep_conts))]) ))
#data[,1] <- gsub(',', '.', data[,1])
#data[,2] <- gsub(',', '.', data[,2])
#data[,3] <- gsub(',', '.', data[,3])
#data[,4] <- gsub(',', '.', data[,4])
#data[,5] <- gsub(',', '.', data[,5])
#data[,6] <- gsub(',', '.', data[,6])

#For Gunnar data: data_limma <- log2(as.matrix( apply(data[, 1:(rep_treats + rep_conts)], 2, as.numeric) ))
data_limma <- log2(as.matrix( apply(data[, 3:(rep_treats + rep_conts + 2)], 2, as.numeric) ))
data_limma[is.infinite(data_limma)] <- NA
nan_idx <- which(is.na(data_limma))
#temp <- reshape(temp, nrow(data_limma)*ncol(data_limma), 1)
#hist(temp, na.rm = TRUE, xlab = "log2(intensity)", ylab = "Frequency", main =  "All data: before imputation")
# replace the following for the code before
temp <- as.vector(data_limma)
temp <- temp[!is.na(temp)]
png("Freq_log2_before_imputation.png", width=800, height=800)
hist(temp, xlab = "log2(intensity)", ylab = "Frequency", main = "All data: before imputation")
dev.off()
fit <- fitdistr(c(na.exclude(data_limma)), "normal")
mu <- as.double(fit$estimate[1])
sigma <- as.double(fit$estimate[2])
sigma_cutoff <- 6
new_width_cutoff <- 0.3
downshift <- 1.8
width <- sigma_cutoff * sigma
new_width <- width * new_width_cutoff
new_sigma <- new_width / sigma_cutoff
new_mean <- mu - downshift * sigma
imputed_vals_my = rnorm(length(nan_idx), new_mean, new_sigma)
# scaling_factor <- readfloat_0_1("Enter a number > 0 and <=1 to scale imputed values = ")
scaling_factor = 1.0
data_limma[nan_idx] <- imputed_vals_my*scaling_factor
#data_limma[nan_idx] <- imputed_vals_my
#NOTE: [10857] values are imputed!!!

#> tail(data_limma)
#            CA1      CA2      CA3     CA10      CA5      CA6      CA8     CA14
#[1785,] 24.04443 23.32959 22.77167 23.65635 21.63752 24.01719 22.81896 23.32914
#[1786,]       NA       NA       NA       NA       NA 22.37408 22.64151       NA
#[1787,]       NA 22.91260 22.48334       NA       NA 22.02580 21.90541       NA
#[1788,] 24.19555 22.50627       NA 22.73289 23.85307 23.26974 22.76822       NA
#[1789,]       NA 23.84345 23.01388 22.02657       NA       NA 22.55753 21.60069
#[1790,] 24.50819       NA       NA 25.07117 22.63140 21.78845 22.61968 20.99800
#            CA12     CA13     Con1     Con2     Con3     Con4     Con5     Con6
#[1785,] 22.64312 23.35157 22.65619 22.88652 23.56822 22.20197 22.85973 21.98199
#[1786,]       NA       NA       NA       NA       NA       NA       NA       NA
#[1787,]       NA       NA       NA       NA       NA       NA       NA       NA
#[1788,]       NA       NA       NA       NA       NA 21.93513       NA 23.16174
#[1789,] 23.15220 21.70746 20.82720       NA       NA       NA       NA       NA
#[1790,]       NA 23.65803       NA 24.47523       NA 23.56952 22.89298       NA
#----impute---->
#[1785,] 22.64312 23.35157 22.65619 22.88652 23.56822 22.20197
#[1786,] 23.01317 23.12527 21.11763 20.60838 20.00608 20.90938
#[1787,] 22.20547 22.69346 23.68541 22.05089 21.50437 21.32925
#[1788,] 21.47150 22.78137 22.29056 20.30675 21.50243 21.93513
#[1789,] 23.15220 21.70746 20.82720 20.68821 22.90624 22.58886
#[1790,] 21.16174 23.65803 21.56143 24.47523 21.91832 23.56952
#            CA1      CA2      CA3     CA10      CA5      CA6      CA8     CA14
#[1785,] 24.04443 23.32959 22.77167 23.65635 21.63752 24.01719 22.81896 23.32914
#[1786,] 23.37888 21.17247 21.03109 21.87904 21.97014 22.37408 22.64151 22.80035
#[1787,] 21.57987 22.91260 22.48334 21.98497 22.34736 22.02580 21.90541 20.12150
#[1788,] 24.19555 22.50627 21.69512 22.73289 23.85307 23.26974 22.76822 21.56332
#[1789,] 20.48388 23.84345 23.01388 22.02657 23.42263 21.80419 22.55753 21.60069
#[1790,] 24.50819 22.03749 21.90290 25.07117 22.63140 21.78845 22.61968 20.99800
#            CA12     CA13     Con1     Con2     Con3     Con4
#[1785,] 22.64312 23.35157 22.65619 22.88652 23.56822 22.20197
#[1786,] 20.89429 23.68783 22.26621 21.77314 22.70896 22.14385
#[1787,] 22.28350 22.87021 21.09328 23.33020 21.88887 20.88960
#[1788,] 22.25577 21.17894 21.69301 22.28463 22.30470 21.93513
#[1789,] 23.15220 21.70746 20.82720 20.97298 19.93577 23.94133
#[1790,] 21.19956 23.65803 21.75923 24.47523 22.53592 23.56952
# temp <- reshape(temp, nrow(data_limma)*ncol(data_limma), 1)
temp <- as.vector(data_limma)
png("Freq_log2_after_imputation.png", width=800, height=800)
#na.rm = TRUE, 
hist(temp, xlab = "log2(intensity)", ylab = "Frequency", main =  "All data: after imputation")
dev.off()

## Median Normalization Module
#want_normalization <- as.integer(readline(
#  cat(
#    'Enter 1: if you want to normalize the protein intensities in each experiemnt by substrating the median of the corresponding experiment\n',
#    '\bEnter 2: if you want to perform column wise median normalization of the data matrix\n',
#    '\b(for definition of "column wise median normalization" see README)= '
#  )
#))
#  if (want_normalization == 1) {
  # browser()
  # boxplot(data_limma[,1:rep_treats], main = paste(treatment, "replicates before normalization"))
  # boxplot(data_limma[,(rep_treats+1):(rep_treats+rep_conts)], main = paste(control, "replicates before normalization"))

  # Before normalization
  png("before_normalization.png", width=800, height=800)
  boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data before median substract normalization")
  dev.off()

  # Subtracting the column medians
  #Subtracting Median of Columns (samples): When you subtract the median of columns (samples), you are ensuring that every sample has a median expression level set to zero (or another reference level if not using median). This is useful when you want to compare expression levels across different samples and want to correct for any systematic bias that might be present in a given sample. This type of normalization is particularly common when dealing with samples that may have been processed or sequenced at different times, by different technicians, or with slightly different protocols, leading to varying baseline levels.
  #Subtracting Median of Rows (genes): When you subtract the median of rows (genes), you are normalizing expression profiles of each gene across different samples. This might be useful in cases where you want to compare the expression profiles of different genes and you are interested in relative changes in gene expression across samples, not the absolute expression level of a gene.
  #In most gene expression studies, normalization by subtracting the median (or mean) of columns (samples) is the typical approach to account for variations between samples.
  col_med <- matrixStats::colMedians(data_limma)
  med_mat <- matlab::repmat(col_med, nrow(data_limma), 1)
  data_limma <- data_limma - med_mat
  #(optional) using sweep instead of matlab::repmat
  #col_med <- apply(data_limma, 2, median, na.rm = TRUE)
  #data_limma <- sweep(data_limma, 2, col_med)

  # After normalization
  png("after_normalization.png", width=800, height=800)
  boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data after median substract normalization")
  dev.off()
  # browser()
  # boxplot(data_limma[,1:rep_treats], main = paste(treatment, "replicates after normalization"))
  # boxplot(data_limma[,(rep_treats+1):(rep_treats+rep_conts)], main = paste(control, "replicates after normalization"))
#  } else if (want_normalization == 2) {
#    boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data before column wise median normalization")
#    data_limma <- median_normalization(data_limma)
#    boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data after column wise median normalization")
#    # browser()
#    # boxplot(data_limma[,1:rep_treats], main = paste(treatment, "replicates after normalization"))
#    # boxplot(data_limma[,(rep_treats+1):(rep_treats+rep_conts)], main = paste(control, "replicates after normalization"))
#  }

#For Gunnar's data
#Symbol <- data$Symbol
#Uniprot <- data$Uniprot
#For Susanne's data
Symbol <- data$Gene.Symbol
Uniprot <- data$Accession
##Limma main code
design <- model.matrix( ~ factor(c(rep(2, rep_treats), rep(1, rep_conts))))
colnames(design) <- c("Intercept", "Diff")
res.eb <- eb.fit(data_limma, design, Symbol)
Sig_FC_idx <- union(which(res.eb$logFC < (-FC_Cutoff)), which(res.eb$logFC > FC_Cutoff))
Sig_Pval_mod_idx <- which(res.eb$p.mod < 0.05)
Sig_Pval_ord_idx <- which(res.eb$p.ord < 0.05)
Sig_mod_idx <-  intersect(Sig_FC_idx, Sig_Pval_mod_idx)
Sig_ord_idx <-  intersect(Sig_FC_idx, Sig_Pval_ord_idx)
categ_Ord <- rep(c("Not Significant"), times = length(Symbol))
categ_Mod <- categ_Ord
categ_Mod[Sig_mod_idx] <- "Significant"
categ_Ord[Sig_ord_idx] <- "Significant"
dat <-
  cbind(
    res.eb,
    categ_Ord,
    categ_Mod,
    NegLogPvalMod = (-log10(res.eb$p.mod)),
    NegLogPvalOrd = (-log10(res.eb$p.ord))
  )

##Save the data file
setwd("/home/jhuang/DATA/Data_Susanne_MS/LIMMA-pipeline-proteomics")
final_data <-
  cbind(#DELETED select(data, matches("^id$")),
        Uniprot,
        Symbol,
        #Descriptions,
        data_limma,
        dat)
#DELETED final_data <- select(final_data, -matches("^gene$"))
filename_final_data <-
  paste0(format(Sys.time(), "%Y%m%d_%H%M%S"), '_final_data')
# readline('Enter a filename for final data= ')

##Create plotly object and save plot as html
filename_mod <-
  paste0(format(Sys.time(), "%Y%m%d_%H%M%S"), '_limma_plot')
# readline('Enter a filename for limma plot= ')
filename_ord <-
  paste0(format(Sys.time(), "%Y%m%d_%H%M%S"), '_ord_plot')
# readline('Enter a filename for ordinary t-test plot= ')

display_plotly_figs(final_data, FC_Cutoff, filename_mod, filename_ord)

write.table(
  final_data,
  paste(filename_final_data, '.tsv', sep = ''),
  sep = '\t',
  row.names = FALSE,
  col.names = TRUE
)
#setwd(cur_dir)

#sorting, add description.
#delete t.ord .. s2.post
# senden auch die Method-part!
# check if Symbol==gene  -log10(p.mod)
#cut -f2-2 20220221_173330_final_data.tsv > Symbol.txt
#cut -f14-14 20220221_173330_final_data.tsv > gene.txt

#https://bioconductor.org/packages/devel/bioc/vignettes/GSVA/inst/doc/GSVA.html#6_Example_applications
#Input of GSVA should log and normalized or not normalized?

https://www.mcponline.org/article/S1535-9476(20)34997-5/fulltext DEqMS: A Method for Accurate Variance Estimation in Differential Protein Expression Analysis* https://genome.duke.edu/sites/default/files/2013Lecture3-DifferentialExpressionProteomicsforweb.pdf DESeq2 https://bioconductor.statistik.tu-dortmund.de/packages/3.5/bioc/vignettes/DESeq2/inst/doc/DESeq2.html Analyzing RNA-seq data with DESeq2 Comparative analysis of statistical methods used for detecting differential expression in label-free mass spectrometry proteomics

labeled mass spectrometry counts The only type of MS-based quantitation that is amenable to DESeq2 (or similar approaches) is spectral counting. Other approaches, whether labelled of unlabelled, are continuous data; I would also recommend limma for their analysis. https://www.researchgate.net/publication/221925844_Labeling_Methods_in_Mass_Spectrometry_Based_Quantitative_Proteomics https://de.wikipedia.org/wiki/Markierungsfreie_massenspektrometrische_Quantifizierung Label-free quantification is a method in mass spectrometry that aims to determine the relative amount of proteins in two or more biological samples. Unlike other methods for protein quantification, label-free quantification does not use a stable isotope containing compound to chemically bind to and thus label the protein.[1][2] https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-6-179 https://www.agilent.com/en/product/software-informatics/mass-spectrometry-software/data-analysis https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6412084/ https://bioinformatics.stackexchange.com/questions/16227/help-with-dia-mass-spectrometry-data-analysis-with-several-conditions-limma https://www.biostars.org/p/95405/#google_vignette https://www.sciencedirect.com/science/article/pii/S1570963913001866 https://github.com/wasimaftab/LIMMA-pipeline-proteomics/blob/master/limma_main.R

Statistical analysis on LC-MS data

  • To detect significant changes between two experimental groups we performed statistical analysis on LC-MS data.

Some comments to the analysis:

  • The LIMMA moderated t-test statistics are chosen for two-group comparison in a proteomic experiment.
  • Normalise by subtracting median: This method normalizes the protein intensities in each experiment by substrating the median of the corresponding experiment. We implemented statistical analysis in R and codes are freely available at the following GitHub repository https://github.com/wasimaftab/LIMMA-pipeline– proteomics/tree/master/Mitochondrial_Loop. The final volcano plots only feature proteins that are part of the high-confidence mito- chondrial proteome (Morgenstern et al., 2017).
    1. Normalise by subtracting median: This method normalizes the protein intensities in each experiemnt by substrating the median of the corresponding experiment.
    2. Column wise median normalization of the data matrix: This method calculates for each sample the median change (i.e. the difference between the observed value and the row average) and subtracts it from each row. Missing values are ignored in the procedure. The method is based on the assumption that a majority of the rows did not change. By default, all the rows are used for normalization but if we assume that the first 3 proteins are spike-ins then the call to median_normalization function needs to be modified as: median_normalization(data, spike_in_rows = 1:3)
  • The imputation algorithm proposed in Tyanova et al. [1] was used to impute missing values before the performance of statistical methods: 1. Tyanova, S., Temu, T., Sinitcyn, P., Carlson, A., Hein, M.Y., Geiger, T., Mann,M., and Cox, J. (2016). The Perseus computational platform for comprehensive analysis of (prote)omics data. Nat. Methods 13, 731–740.

域名预订类型

snapnames、namejet、dropcatch、youdot、mediaon、namecatch、name、domainmonster、hexonet、xz、epik、pool、asiaregister、dynadot、pheenix2、backorder、godaddy、ename、flappy、hupo、hooyoo、Juming、west263、yijie、cndns、bizcn、zgsj、17ex、66cn、aliy-pre、ename-pre、xw、22cn、dcpre、epre、Other-Pre、Gname-Pre、190

  1. 前言

    过期删除域名中,会看到域名有多种预订类型:Delete、GD-Pre、Ename-Pre、West-Pre、Aliy-Pre、Xw-Pre、22CN-Pre、Xz-Pre、Drop-Pre、SN-Pre、E-Pre、Other-pre、Gname-Pre。

  2. 预订类型:删除Delete

    介绍说明:“Delete”是正常过期删除域名,域名因过期后未续费,从而进入删除列表;

    注册时间:等同于新注册,注册时间按抢注成功后重新计算;

    域名过户:得标后0-3天内会完成域名过户,特殊情况除外;

    得标后是否会被赎回:不会,正常删除后原所有者并无权限赎回。

  3. 预订类型:Gd-Pre

    介绍说明:“Gd-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要10-20天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  4. 预订类型:Ename-Pre

    介绍说明:“Ename-Pre”过期提前释放的域名,属于易名平台内由于域名过期后但注册者未续费,注册商提前开放预订;

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要24-28天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  5. 预订类型:West-Pre

    介绍说明:“West-Pre”过期提前释放的域名,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要20-30天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  6. 预订类型:Aliy-Pre

    介绍说明:“Aliy-Pre”过期提前释放的域名,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:32-75天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  7. 预订类型:Xw-Pre

    介绍说明:“Xw-Pre”过期提前释放的域名,是属于某国内注册商,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要35-42天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  8. 预订类型:22CN-Pre

    介绍说明:“22CN-Pre”提前释放拍卖域名,由用户提交或其它合作商过期提前释放拍卖的域名。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:付款完毕后30-35天完成过户,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  9. 预订类型:Xz-Pre

    介绍说明:“Xz-Pre”过期提前释放的域名,是国内某注册商,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要15天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  10. 预订类型:Drop-Pre

    介绍说明:“Drop-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。 注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要10天,特殊情况除外; 得标后是否会被赎回:被赎回的概率较小。

  11. 预订类型:SN-Pre

    介绍说明:“SN-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要10天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  12. 预订类型:E-Pre

    介绍说明:E-PRE(易释放)易释放是易名平台用户自主提交预释放交易的交易类型,用户发布后,域名将同步到抢注平台进行预订竞价。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:付款后将会过户; 得标后是否会被赎回:被赎回的概率较小。

  13. 预订类型:Other-Pre

    介绍说明:“Other-Pre”过期提前释放的域名,是属于国内外多个注册商,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:注册的时间不变,为得标前域名注册成功的时间;

    得标过户:需要15-42天,特殊情况除外;

    得标后是否会被赎回:被赎回的概率较小。

  14. 预订类型:Gname-Pre

    介绍说明:“Gname-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。

    注册时间:域名注册时间与以前的不变;

    得标过户:需要0-20天,特殊情况除外; 得标后是否会被赎回:得标后30天内可能被赎回,概率比较小。

SnapNames 是一家域名回购服务公司,帮助个人和企业在域名过期并再次可供注册时获取域名。以下是 SnapNames 回购过程的一般操作方法:

域名回购下单:
    访问 SnapNames 网站。
    搜索所需的域名。
    如果该域名已被注册,但您预期它可能很快会变得可用(由于过期或其他原因),则可以对其进行回购下单。

监控并尝试注册:
    一旦您下了回购订单,SnapNames 会监控该域名。
    如果域名到期并被当前所有者释放,SnapNames 会尝试代您注册。这通常涉及快速、自动化的过程,以确保在其他人之前有更高的机会获取域名。

拍卖流程:
    如果多个 SnapNames 用户回购同一域名,且 SnapNames 成功注册了该域名,则该域名将进入拍卖。
    用户可以对该域名进行出价,拍卖结束时出价最高的用户将赢得该域名。

    在 SnapNames 的系统中,当一个域名被成功捕获后,如果有多个用户对同一域名进行了回购下单,该域名将进入拍卖。通常情况下,只有那些对该域名进行了回购下单的用户有资格参与该域名的拍卖。这意味着,如果您没有提前进行回购下单,您通常无法参与那个域名的拍卖。

    但是,请注意,每个域名后购服务的具体规定可能会有所不同,因此建议您直接查看 SnapNames 或其他域名后购服务的官方文档或与他们联系,以获取确切和最新的信息。

域名转移:
    如果 SnapNames 成功地获得了域名(您是唯一的回购者或拍卖中的获胜者),该域名将转移到您的账户。
    然后,您可以管理它、将其转移到另一个注册商,或设置网络托管和电子邮件等。

费用:
    通常只有在 SnapNames 成功为您获取域名时才会收费。如果 SnapNames 无法获取域名,您通常不会收到费用。

注意事项:
    成功并不是保证的。即使使用回购服务,也存在获取理想域名的竞争。其他的回购服务、个人或当前域名所有者(如果他们选择续约)可能在您之前获得域名。

如果您考虑使用 SnapNames 或类似的服务,请始终查看他们的服务条款和任何相关费用,以确保您了解过程和潜在成本。

SPANDx Genomic Profiling: Verifying LTtr and K331A RNASeq Mutations

  1. Set up the directory for raw data.

    #Replace "p600" with "control", "p602" with "LT", "p605" with "LTtr", and "p783" with "K331A". Please note that the RNAseq data from the LT_K331A_d8 replicates were sequenced alongside the ChIPseq batch.
    ln -s /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/Raw_Data_ChIPseq/230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf857/1_NHDF_Donor_1_p783_S1_R1_001.fastq.gz LT_K331A_d8_DonorI.fastq.gz 
    ln -s /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/Raw_Data_ChIPseq/230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf858/2_NHDF_Donor_2_p783_S2_R1_001.fastq.gz LT_K331A_d8_DonorII.fastq.gz
    mv V_8_2_4_p600_d8_DonorI.fastq.gz    control_d8_DonorI.fastq.gz
    mv V_8_2_3_p600_d8_DonorII.fastq.gz   control_d8_DonorII.fastq.gz
    mv V_8_4_2_p602_d8_DonorI.fastq.gz    LT_d8_DonorI.fastq.gz
    mv V_8_4_1_p602_d8_DonorII.fastq.gz   LT_d8_DonorII.fastq.gz
    mv V_8_2_4_p605_d8_DonorI.fastq.gz    LTtr_d8_DonorI.fastq.gz
    mv V_8_2_3_p605_d8_DonorII.fastq.gz   LTtr_d8_DonorII.fastq.gz
  2. Execute SPANDx to produce the mapping profile. For the GenBank file used, refer to the provided link.

    LT_wt.gbk

    conda activate spandx
    [genbank copying]
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/LT_wildtype
    cp LT_wt.gbk ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/LT_wildtype/genes.gbk
    vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank LT_wildtype      -d
    
    #The mutation in LTtr is located approximately at position 780, while the K331A mutation in LT is found near position 993 out of a total of 2454 nt.
    MDLVLNRKEREALCKLLEIAPNCYGNIPLMKAAFKRSCLKHHPDKGGNPVIMMELNTLWSKFQQNIHKLRSDFSMFDEVDEAPIYGTTKFKEWWRSGGFSFGKAYEYGPNPHGTNSRSRKPSSNASRGAPSGSSPPHSQSSSSGYGSFSASQASDSQSRGPDIPPEHHEEPTSSSGSSSREETTNSGRESSTPNGTSVPRNSSRTDGTWEDLFCDESLSSPEPPSSSEEPEEPPSSRSSPRQPPSSSAEEASSSQFTDEEYRSSSFTTPKTPPPFSRKRKFGGSRSSASSASSASFTSTPPKPKKNRETPVPTDFPIDLSDYLSHAVYSN
    TVSCFAIYTTSDKAIELYDKIEKFKVDFKSRHACELGCILLFITLSKHRVSAIKNFCSTFCTISFLICKGVNKMPEMYNNLCKPPYKLLQENKPLLNYEFQEKEKEASCNWNLVAEFACEYELDDHFIILAHYLDFAKPFPCQKCENRSRLKPHKAHEAHHSNAKLFYESKSQKTICQQAADTVLAKRRLEMLEMTRTEMLCKKFKKHLERLRDLDTIDLLYYMGGVAWYCCLFEEFEKKLQKIIQLLTENIPKYRNIWFKGPINSGKTSFAAALIDLLEGKALNINCPSDKLPFELGCALDKFMVVFEDVKGQNSLNKDLQPGQGINNLDNLRDHLDGAVAVSLEKKHVNKKHQIFPPCIVTANDYFIPKTLIARFSYTLHFSPKANLRDSLDQNMEIRKRRILQSGTTLLLCLIWCLPDTTFKPCLQEEIKNWKQILQSEISYGKFCQMIENVEAGQDPLLNILIEEEGPEETEETQDSGTFSQ* nextflow run spandx/main.nf –fastq “Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz” –ref LT_wt.fasta –annotation –database LT_wildtype –pairing SE -resume
  3. Open the BAM files created in the previous step using IGV.

mutations_on_LT_IGV

Native elongating transcript sequencing (NET-seq)

2023-10-10 16:18:07 星期二

Net-seq(原位延伸转录测序)是一种用于以核苷酸分辨率跨基因组分析RNA聚合酶(Pol)活性的方法。以下是Net-seq的简要概述:

目的: Net-seq旨在捕获并测序仍与RNA聚合酶结合在一起的活跃转录过程中的RNA的3’末端。这有助于创建跨基因组的活跃转录位点的高分辨率图。

工作原理:

  • 从细胞中提取RNA聚合酶及其绑定的RNA。
  • 使用特定的方法仅选择与RNA聚合酶结合的RNA的3’端。
  • 对这些3’端进行测序,从而获得在转录过程中的RNA的精确位置。

这种技术为研究基因的精确转录动态提供了一个强大的工具。

BRD4, or Bromodomain-containing protein 4, is a member of the bromodomain and extraterminal (BET) family of proteins. Here’s a brief overview:

Function: BRD4 is involved in several crucial cellular processes:

  • Transcriptional Regulation: BRD4 plays a vital role in promoting the transcription of many genes by associating with acetylated chromatin. It’s particularly significant in the regulation of genes associated with the cell cycle and can recruit positive transcription elongation factor b (P-TEFb) to promote transcriptional elongation.
  • Cell Cycle Progression: BRD4 is essential for progressing through various cell cycle stages, especially the G1/S transition.

Medical Relevance: Given its role in transcription and cell cycle regulation, it’s not surprising that BRD4 has been implicated in cancer biology. Inhibitors targeting the bromodomains of BRD4, commonly known as BET inhibitors, are being explored as potential therapeutic agents for various cancers.

Structure: BRD4 contains two bromodomains. Bromodomains can recognize and bind acetylated lysine residues, which are common post-translational modifications on histones and other proteins.

Associated Diseases: Alterations, mutations, or aberrant expression of BRD4 has been implicated in several cancers, including acute myeloid leukemia (AML), midline carcinoma, and others.

Inhibitors: Several small molecules that inhibit BRD4 have been developed as potential therapeutic agents. Notably, drugs like JQ1 and I-BET762 bind to the bromodomains of BRD4, inhibiting its interaction with acetylated proteins.

BRD4 is an active area of research, and new findings related to its function, regulation, and potential as a therapeutic target are regularly published.

HiS-NET-seq profiles the active enhancer landscape Putative enhancers show enhancer activity HiS-NET-seq reveals 11,007 new transcribed putative enhancers in K562 cells providing a more complete view on active enhancers. ChIP-Rx vs HiS-NET-seq: BRD4 is required for genome-wide enhancer transcription. HiChIP (H3K27Ac) is for discovering Enhancer connecting Promoter Enhancer and traget gene transcription is regulated by BRD4 Identify other post-initiation transcription regulators ablation: 切除,风化,消融,除去,切除 Acute BRD4-loss impairs transcription at the 3′-end genes. Acute BRD4-loss impairs transcription at the 5′ and 3′-end genes. BET & BRD4 degradation prompt (激励,鼓动,提示) 3′-processing defects Readthrough gene How do the distinct molecular interactions mediated by BRD4’s bromodomains contribute to its diverse roles in cellular processes, and how can this knowledge be leveraged to develop more targeted and effective therapeutic strategies for BRD4-associated diseases?

RNA-seq processing for control-LT-LTtr-K331A

#nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Denise_tx_epi_MCPyV/Raw_Data_RNAseq_K331A/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name

#under jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq/Raw_Data_RNAseq_K331A_d8_SPANDx$
#Replace "p600" with "control", "p602" with "LT", "p605" with "LTtr", and "p783" with "K331A". Please note that the RNAseq data from the LT_K331A_d8 replicates were sequenced alongside the ChIPseq batch.
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf928/1_NHDF_Donor_2_p600_S23_R1_001.fastq.gz control-d8-DII_re.fastq.gz
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf929/2_NHDF_Donor_2_p783_S24_R1_001.fastq.gz LT-K331A-d8-DII_re.fastq.gz

(rnaseq_2021) jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq$ nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name

nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --star_index /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/ --bed12 /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name

rsync -a -P jhuang@hamm:~/REFs/Homo_sapiens/UCSC/hg38 .

#cut -f2- merged_gene_counts.txt > merged_gene_counts_2.txt

library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("/media/jhuang/Elements1/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/featureCounts_together_d8_2")

d.raw<- read.delim2("merged_gene_counts_2.txt",sep="\t", header=TRUE, row.names=1)

 [1] K331A_DonorIAligned.sortedByCoord.out.bam           
 [2] V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam 
 [3] V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam
 [4] V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
 [5] V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam 
 [6] V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam 
 [7] K331A_DonorIIAligned.sortedByCoord.out.bam          
 [8] V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam 
 [9] V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam 
[10] V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
[11] V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam
[12] V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam
[13] V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
[14] V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam 

#replace p600 to control, p602 to LT, p605 to LTtr

colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "LT_d8","LT_d8",  "LTtr_d8","LTtr_d8",  "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII"))
#NOTE_3D_PCA (IMPORTANT): should always keep the name 'DI' and 'DII' as donor names; they are important for 3D PCA drawing!
#NOTE2_3D_PCA: correct the condition names: control_d8-->control d8, K331A_d8-->K331A d8, LT_d8-->LT d8, LTtr_d8-->LTtr d8.
donor = as.factor(c("DI","DII",  "DI","DII",  "DI","DII",  "DI","DII"))
#Note that we need selected_reordered.raw
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

> sizeFactors(dds)
NULL
> dds <- estimateSizeFactors(dds)
> sizeFactors(dds)
 ctrl_d3_DonorI ctrl_d3_DonorII  *ctrl_d8_DonorI *ctrl_d8_DonorII    LT_d3_DonorI 
      1.2221026       0.8126614       1.0631329       0.6271887       0.8553245 
  LT_d3_DonorII    *LT_d8_DonorI   *LT_d8_DonorII  LTtr_d3_DonorI LTtr_d3_DonorII 
      1.0386420       1.0324488       1.3568818       1.3070621       0.8283129 
 *LTtr_d8_DonorI *LTtr_d8_DonorII    *K331A_DonorI   *K331A_DonorII 
      1.6206130       0.8352236       1.6908424       0.6012788 

ctrl_d8_DonorI ctrl_d8_DonorII    LT_d8_DonorI   LT_d8_DonorII  LTtr_d8_DonorI   LTtr_d8_DonorII    K331A_DonorI   K331A_DonorII 
     1.0429778       0.6131069       1.0162992       1.3406000       1.5984592       0.8268460       1.6634451       0.5943143 
> sizeFactors(dds)
  ctrl_d8_DonorI  ctrl_d8_DonorII     LT_d8_DonorI    LT_d8_DonorII 
       1.0429778        0.6131069        1.0162992        1.3406000 
  LTtr_d8_DonorI  LTtr_d8_DonorII  K331A_d8_DonorI K331A_d8_DonorII 
       1.5984592        0.8268460        1.6634451        0.5943143

./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam

./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam

./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam

./STAR/K331A_DonorIAligned.sortedByCoord.out.bam
./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam

bamCoverage --bam ./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.8182619037059573 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.230524791752137 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9406161731990421 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.5944164810367278 --effectiveGenomeSize 2913022398

bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorI_norm.bw --binSize 10 --scaleFactor 1.1691469144166922 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorII_norm.bw --binSize 10 --scaleFactor 0.9627956504743693 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9685710322875091 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorII_norm.bw --binSize 10 --scaleFactor 0.7369838699288324 --effectiveGenomeSize 2913022398

bamCoverage --bam ./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.7650745897995206 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.2072732417906324 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.617050461769713 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.1972841763570858 --effectiveGenomeSize 2913022398

bamCoverage --bam ./STAR/K331A_DonorIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorI_norm.bw --binSize 10 --scaleFactor 0.5914211756222816 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorII_norm.bw --binSize 10 --scaleFactor 1.6631219993121327 --effectiveGenomeSize 2913022398
#END

rld <- rlogTransformation(dds)

library(ggplot2)
svg("pca6.svg")
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
  geom_point(size=3) +
  scale_color_manual(values = c("control" = "grey",
                                "K331A"="#1f78b4")) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed()
dev.off()

png("pca.png", width=800, height=800)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
  geom_point(size=3) +
  scale_color_manual(values = c("control" = "grey",
                                "K331A"="#1f78b4")) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed()
dev.off()

png("pca2.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()

library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
## define the desired order of row names
#desired_order <- rownames(data)
## sort the data frame by the desired order of row names
#df <- df[match(desired_order, rownames(df_pc)), ]
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL  # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
#results in 26PCs: merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
write.csv(merged_df, file="merged_df_8PCs.csv")
#> summary(pc)    #0.5657 0.2195 0.1347  --> 0.57 0.22 0.13
#Importance of components:
#                           PC1     PC2    PC3    PC4     PC5     PC6     PC7
#Standard deviation     19.4051 12.0878 9.4683 4.5569 4.01016 3.00610 2.71918
#Proportion of Variance  0.5657  0.2195 0.1347 0.0312 0.02416 0.01358 0.01111
#Cumulative Proportion   0.5657  0.7853 0.9200 0.9512 0.97531 0.98889 1.00000

#---- * to untreated ----
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8", "LT_d8_vs_control_d8", "LTtr_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#cd ${comp}_output; \
#~/Tools/csv2xls-0.4/csv_to_xls.py upregulated_filtered downregulated_filtered background -d$',' -o ../${comp}_degenes.xls; \
#cd ..; \

~/Tools/csv2xls-0.4/csv_to_xls.py \
K331A_d8_vs_control_d8-all.txt \
K331A_d8_vs_control_d8-up.txt \
K331A_d8_vs_control_d8-down.txt \
-d$',' -o K331A_d8_vs_control_d8.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
LT_d8_vs_control_d8-all.txt \
LT_d8_vs_control_d8-up.txt \
LT_d8_vs_control_d8-down.txt \
-d$',' -o LT_d8_vs_control_d8.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
LTtr_d8_vs_control_d8-all.txt \
LTtr_d8_vs_control_d8-up.txt \
LTtr_d8_vs_control_d8-down.txt \
-d$',' -o LTtr_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt
#   371 LT_d8_vs_control_d8-up.txt
#   292 LTtr_d8_vs_control_d8-up.txt
#  1382 total
#  323 K331A_d8_vs_control_d8-down.txt
#  141 LT_d8_vs_control_d8-down.txt
#   80 LTtr_d8_vs_control_d8-down.txt
#  544 total

library(ggrepel) 

geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()

up1 <-read.csv(file="./K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="./LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="./LTtr_d8_vs_control_d8-up.txt")

down1 <-read.csv(file="./K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="./LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="./LTtr_d8_vs_control_d8-down.txt")

RNASeq.NoCellLine <- assay(rld)
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL,  up3$SYMBOL,down3$SYMBOL,  up5$SYMBOL,down5$SYMBOL)  #1920
all_genes <- unique(all_genes)   #1223
RNASeq.NoCellLine_  <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")

##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.4) #1.08: 3 clusters; 
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75), 
            RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()

#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
write.csv(names(subset(mycl, mycl == '4')),file='cluster4_DARKMAGENTA.txt')  
write.csv(names(subset(mycl, mycl == '5')),file='cluster5_DARKCYAN.txt')  

~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
cluster4_DARKMAGENTA.txt \
cluster5_DARKCYAN.txt \
-d$',' -o gene_clusters.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;

# ------------  LT (p602) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "LT_d8","LT_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d8_DonorI","LT_d8_DonorII"))
donor = as.factor(c("DI","DII",  "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII    LT_d8_DonorI   LT_d8_DonorII 
#      1.0879047       0.6411992       1.0519079       1.3937541 

rld <- rlogTransformation(dds)

setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LT_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LT_d8_vs_control_d8-all.txt \
#LT_d8_vs_control_d8-up.txt \
#LT_d8_vs_control_d8-down.txt \
#-d$',' -o LT_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt
#   371 LT_d8_vs_control_d8-up.txt --> 399
#   292 LTtr_d8_vs_control_d8-up.txt
#  323 K331A_d8_vs_control_d8-down.txt
#  141 LT_d8_vs_control_d8-down.txt --> 184
#   80 LTtr_d8_vs_control_d8-down.txt

library(ggrepel) 

geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()

# ------------ LTtr (p605) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "LTtr_d8","LTtr_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "LTtr_d8_DonorI","LTtr_d8_DonorII"))
donor = as.factor(c("DI","DII",  "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII  LTtr_d8_DonorI LTtr_d8_DonorII 
#      1.0990996       0.6476647       1.6536010       0.8632049

rld <- rlogTransformation(dds)

setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LTtr_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LTtr_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LTtr_d8_vs_control_d8-all.txt \
#LTtr_d8_vs_control_d8-up.txt \
#LTtr_d8_vs_control_d8-down.txt \
#-d$',' -o LTtr_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt
#   371 LT_d8_vs_control_d8-up.txt --> 399
#   292 LTtr_d8_vs_control_d8-up.txt --> 380
#  323 K331A_d8_vs_control_d8-down.txt
#  141 LT_d8_vs_control_d8-down.txt --> 184
#   80 LTtr_d8_vs_control_d8-down.txt --> 136

library(ggrepel) 

geness_res <- read.csv(file = paste("LTtr_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("LTtr_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()

# ------------ LT K331A (p783) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI")  #26485

col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII",  "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII",    "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII",    "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]

#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII")]

condition = as.factor(c("control_d8","control_d8",  "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII",  "K331A_d8_DonorI","K331A_d8_DonorII"))
donor = as.factor(c("DI","DII",  "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition)  #batch+

#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
#  ctrl_d8_DonorI  ctrl_d8_DonorII  K331A_d8_DonorI K331A_d8_DonorII 
#       1.1747544        0.6871283        1.8656332        0.6817289

rld <- rlogTransformation(dds)

setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_K331A_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
  geness <- geness[!duplicated(geness$SYMBOL), ]
  res$SYMBOL = rownames(res)
  rownames(geness) <- geness$SYMBOL
  identical(rownames(res), rownames(geness))
  res_df <- as.data.frame(res)
  geness_res <- merge(geness, res_df)
  dim(geness_res)
  geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
  geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  #write.csv(res_df, file = paste(i, "background.txt", sep="_"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#~/Tools/csv2xls-0.4/csv_to_xls.py \
#K331A_d8_vs_control_d8-all.txt \
#K331A_d8_vs_control_d8-up.txt \
#K331A_d8_vs_control_d8-down.txt \
#-d$',' -o K331A_d8_vs_control_d8.xls;

#   719 K331A_d8_vs_control_d8-up.txt --> 315
#   371 LT_d8_vs_control_d8-up.txt --> 399
#   292 LTtr_d8_vs_control_d8-up.txt --> 380
#  323 K331A_d8_vs_control_d8-down.txt --> 277
#  141 LT_d8_vs_control_d8-down.txt --> 184
#   80 LTtr_d8_vs_control_d8-down.txt --> 136

library(ggrepel) 

geness_res <- read.csv(file = paste("K331A_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
                        levels = c("NS or log2FC < 2.0", "P < 0.05",
                                   "P-adj < 0.05", "P-adj < 0.001"))

geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)]  #remove invert_P from matrix

png("K331A_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
       aes(x = log2FoldChange, y = -log10(pvalue),
           color = Color, label = SYMBOL)) +
  geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
  geom_hline(yintercept = -log10(0.05), lty = "dashed") +
  geom_point() +
  labs(x = "log2(FC)",
       y = "Significance, -log10(P)",
       color = "Significance") +
  scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
                                `P-adj < 0.05` = "lightblue",
                                `P < 0.05` = "orange2",
                                `NS or log2FC < 2.0` = "gray"),
                     guide = guide_legend(override.aes = list(size = 4))) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
                  size = 4, point.padding = 0.15, color = "black",
                  min.segment.length = .1, box.padding = .2, lwd = 2) +
  theme_bw(base_size = 16) +
  theme(legend.position = "bottom")
dev.off()

# ------------------- heatmap for separate analyses ----------------
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/heatmap_separate_analyses")
up1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-up.txt")

down1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-down.txt")

RNASeq.NoCellLine <- assay(rld)  # using the overview analysis, only choose the gene ids between conditions seperately. NOT DONE!
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL,  up3$SYMBOL,down3$SYMBOL,  up5$SYMBOL,down5$SYMBOL)  #1685
all_genes <- unique(all_genes)   #1685-->1016
RNASeq.NoCellLine_  <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")

##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.08)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2",  "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75), 
            RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()

#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  

~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
-d$',' -o gene_culsters.xls;

~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;

Nextflow RNAseq

  1. Merge re-sequenced FastQ files (cat)
  2. Sub-sample FastQ files and auto-infer strandedness (fq, Salmon)
  3. Read QC (FastQC)
  4. UMI extraction (UMI-tools)
  5. Adapter and quality trimming (Trim Galore!)
  6. Removal of genome contaminants (BBSplit)
  7. Removal of ribosomal RNA (SortMeRNA)
  8. Choice of multiple alignment and quantification routes: STAR -> Salmon STAR -> RSEM HiSAT2 -> NO QUANTIFICATION
  9. Sort and index alignments (SAMtools)
  10. UMI-based deduplication (UMI-tools)
  11. Duplicate read marking (picard MarkDuplicates)
  12. Transcript assembly and quantification (StringTie)
  13. Create bigWig coverage files (BEDTools, bedGraphToBigWig)
  14. Extensive quality control: RSeQC Qualimap dupRadar Preseq DESeq2
  15. Pseudo-alignment and quantification (Salmon; optional)
  16. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks (MultiQC, R)

umi-tools extract:

Flexible removal of UMI sequences from fastq reads.
UMIs are removed and appended to the read name. Any other barcode, for example a library barcode, is left on the read. Can also filter reads by quality or against a whitelist (see above)

The remaining commands, group, dedup and count/count_tab, are used to identify PCR duplicates using the UMIs and perform different levels of analysis depending on the needs of the user. A number of different UMI deduplication schemes are enabled - The recommended method is directional.

umi-tools dedup:

Groups PCR duplicates and deduplicates reads to yield one read per group
Use this when you want to remove the PCR duplicates prior to any downstream analysis

Introducing BBSplit: Read Binning Tool for Metagenomes and Contaminated Libraries

Removal of genome contaminants (BBSplit)
Removal of ribosomal RNA

StringTie for Transcript assembly and quantification

Extensive quality control:

The preseq package is aimed at predicting and estimating the complexity of a genomic sequencing library,

Herpes-virus is double-stranded RNA.

Herpes-simplex-Viren (humane Herpesviren Typ 1 und 2) sind häufige Ursachen rezidivierender Infektionen mit Beteiligung von Haut, Mund, Lippen, Augen und …

Salmon used expectation–maximization (EM) algorithm to assign reads if two copy of genes occurs.