#------------------------------
#---- 1. prepare gene sets ----
library(org.Hs.eg.db)
library(dplyr)
library(AnnotationDbi)
#1. Platelets
data <- data.frame(
geneSymbol = c("GP1BA","GP5","GP6","GP9","LY6G6D","MMRN1","PEAR1","PF4","PF4V1","PPBP","SLC35D3"),
geneEntrezID = c("2811","2814","51206","2815","58530","22915","375033","5196","5197","5473","340146"),
GeneSet = rep("Platelets", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Platelets.csv", row.names = FALSE)
#2. Granulocytes
data <- data.frame(
geneSymbol = c("CXCR2","CD177","CLC","CTSS","DEFA1","FUT7","LTB4R","MMP25","OSM","RETN"),
geneEntrezID = c("3579","57126","1178","1520","1667","2529","1241","64386","5008","56729"),
GeneSet = rep("Granulocytes", 10)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Granulocytes.csv", row.names = FALSE)
#3. LDG
data <- data.frame(
geneSymbol = c("AZU1","CAMP","CEACAM6","CEACAM8","CTSG","DEFA4","ELANE","LCN2","LTF","MPO","OLFM4","RNASE3"),
geneEntrezID = c("566","820","4680","1088","1511","1669","1991","3934","4057","4353","10562","6037"),
GeneSet = rep("LDG", 12)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "LDG.csv", row.names = FALSE)
#4. pDC
data <- data.frame(
geneSymbol = c("CLEC4C","NRP1"),
geneEntrezID = c("170482","8829"),
GeneSet = rep("pDC", 2)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "pDC.csv", row.names = FALSE)
#5. Anti-inflammation
data <- data.frame(
geneSymbol = c("IL1RN","TNFAIP3","SOCS3"),
geneEntrezID = c("3557","7128","9021"),
GeneSet = rep("Anti-inflammation", 3)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Anti-inflammation.csv", row.names = FALSE)
#6. Pro-inflam. IL-1
data <- data.frame(
geneSymbol = c("IL1A","IL1B","IL18","IL36A","IL36B","IL36G","IL33","IL1R1","IL1RAP","IL18R1","IL18RAP"),
geneEntrezID = c("3552","3553","3606","27179","27177","56300","90865","3554","3556","8809","8807"),
GeneSet = rep("Pro-inflam. IL-1", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Pro-inflam._IL-1.csv", row.names = FALSE)
#7. Dendritic cells
data <- data.frame(
geneSymbol = c("CLEC12A","CLEC10A","CLEC9A","CSF1R","IGIP","LILRA4","LY75","XCR1"),
geneEntrezID = c("160364","10462","283420","1436","492311","23547","4065","2829"),
GeneSet = rep("Dendritic cells", 8)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Dendritic_cells.csv", row.names = FALSE)
#8. MHC II
data <- data.frame(
geneSymbol = c("HLA-DMA","HLA-DMB","HLA-DPA1","HLA-DPB1","HLA-DPB2","HLA-DQA1","HLA-DQA2","HLA-DQB1","HLA-DQB2","HLA-DRA","HLA-DRB1","HLA-DRB3","HLA-DRB4","HLA-DRB5","HLA-DRB6"),
geneEntrezID = c("3108","3109","3113","3115","3116","3117","3118","3119","3120","3122","3123","3125","3126","3127","3128"),
GeneSet = rep("MHC II", 15)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "MHC_II.csv", row.names = FALSE)
#9. Alt. complement
data <- data.frame(
geneSymbol = c("C3","C5","C6","C7","C8A","C9","CFB","CFD","CFH","CFHR5","CFP","CR1","GZMM","MIR520B","MIR520E","VSIG4"),
geneEntrezID = c("718","727","729","730","731","735","629","1675","3075","81494","5199","1378","3004","574473","574461","11326"),
GeneSet = rep("Alt. complement", 16)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Alt._complement.csv", row.names = FALSE)
#10. TNF
data <- data.frame(
geneSymbol = c("BRCA1","CASP1","CXCL1","CXCL2","GBP1","GP1BA","IFI44","IL18","IL1B","IL1RN","MX1","NAMPT","NR3C1","OAS3","PATJ","SH3BP5","TAP2","TNF","TNFAIP3","TNFRSF11A","WT1","ACLY","ACSL1","ADGRE2","AK3","AKAP10","AMPD3","APOL3","ARID3A","ARSE","ASAP1","B4GALT5","BCL2A1","BHLHE41","BHMT","BIRC3","CALD1","CASP10","CCL15","CCL20","CCL23","CCL3L1","CD37","CD38","CD83","CDKN3","CKB","CR2","CTNND2","CXCL3","CXCL8","CYP27B1","DAB2","EBI3","EGR1","EGR2","EPB41","EREG","ETAA1","F3","FABP1","FBXL2","FCER2","FCGR2A","FLJ11129","FLNA","G0S2","GCH1","GJB2","GLS","GMIP","GRK3","HCAR3","HHEX","HOMER2","HP","ICAM1","IDO1","IKBKG","IL16","IL1A","IL6","INHBA","INSIG1","ITGA6","KITLG","KLF1","KMO","LGALS3BP","MAP3K4","MARCKS","MGLL","MMP19","MN1","MRPS15","MSC","MTF1","NELL2","NFKB1","NFKB2","NFKBIA","NFKBIZ","NKX3-2","PDE4DIP","PDPN","PIAS4","PLAUR","PTGES","PTGS2","RELB","RPGR","RPS9","SDC4","SERPIND1","SFRP1","SLAMF1","SLC30A4","SOD2","SPI1","SSPN","STAT4","TAF15","TBX3","TFF1","TNFAIP2","TRAF1","TSC22D1","TYROBP","UBE2C","VEGFA"),
geneEntrezID = c("672","834","2919","2920","2633","2811","10561","3606","3553","3557","4599","10135","2908","4940","10207","9467","6891","7124","7128","8792","7490","47","2180","30817","50808","11216","272","80833","1820","415","50807","9334","597","79365","635","330","800","843","6359","6364","6368","6349","951","952","9308","1033","1152","1380","1501","2921","3576","1594","1601","10148","1958","1959","2035","2069","54465","2152","2168","25827","2208","2212","54674","2316","50486","2643","2706","2744","51291","157","8843","3087","9455","3240","3383","3620","8517","3603","3552","3569","3624","3638","3655","4254","10661","8564","3959","4216","4082","11343","4327","4330","64960","9242","4520","4753","4790","4791","4792","64332","579","9659","10630","51588","5329","9536","5743","5971","6103","6203","6385","3053","6422","6504","7782","6648","6688","8082","6775","8148","6926","7031","7127","7185","8848","7305","11065","7422"),
GeneSet = rep("TNF", 130)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TNF.csv", row.names = FALSE)
#11. NLRP3 inflammasome
data <- data.frame(
geneSymbol = c("APP","ATAT1","CARD8","CASP1","CASP4","CD36","CPTP","DHX33","EIF2AK2","GBP5","GSDMD","HSP90AB1","MEFV","NFKB1","NFKB2","NLRC3","NLRP3","P2RX7","PANX1","PSTPIP1","PYCARD","PYDC2","RELA","SIRT2","SUGT1","TLR4","TLR6","TXN","TXNIP","USP50"),
geneEntrezID = c("351","79969","22900","834","837","948","80772","56919","5610","115362","79792","3326","4210","4790","4791","197358","114548","5027","24145","9051","29108","152138","5970","22933","10910","7099","10333","7295","10628","373509"),
GeneSet = rep("NLRP3 inflammasome", 30)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "NLRP3_inflammasome.csv", row.names = FALSE)
#12. Unfolded protein
data <- data.frame(
geneSymbol = c("B4GALT3","CALR","CALU","CANX","CDS2","CHST12","CHST2","DERL1","DERL2","DNAJC3","EDEM2","EDEM3","EMC9","ERAP1","ERGIC2","ERO1L","EXT1","GALNT2","GOLT1B","HERPUD1","HYOU1","IER3IP1","IMPAD1","KDELC1","KDELR2","LMAN2","LPGAT1","MAN1A1","MANEA","MANF","NUCB2","PDIA4","PDIA6","PIGK","PPIB","SEC24D","SEC61G","SPCS3","SSR1","SSR3","TRAM1","TRAM2","UGGT1","XBP1"),
geneEntrezID = c("8703","811","813","821","8760","55501","9435","79139","51009","5611","55741","80267","51016","51752","51290","30001","2131","2590","51026","9709","10525","51124","54928","79070","11014","10960","9926","4121","79694","7873","4925","9601","10130","10026","5479","9871","23480","60559","6745","6747","23471","9697","56886","7494"),
GeneSet = rep("Unfolded protein", 44)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Unfolded_protein.csv", row.names = FALSE)
#13. B cells
data <- data.frame(
geneSymbol = c("IGHD","SH3BP5","BANK1","BLK","BLNK","CD19","CD22","CD79A","CD79B","DAPP1","FCRL1","FCRL2","FCRL3","FCRLA","GON4L","GPR183","IGHM","KLHL6","MS4A1","PAX5","PLCL2","VPREB1","ZNF318"),
geneEntrezID = c("3495","9467","55024","640","29760","930","933","973","974","27071","115350","79368","115352","84824","54856","1880","3507","89857","931","5079","23228","7441","24149"),
GeneSet = rep("B cells", 23)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "B_cells.csv", row.names = FALSE)
#14. Monocyte cell surface
data <- data.frame(
geneSymbol = c("CD14","CD300C","CD33","CD68","CLEC12A","CLEC4D","CLEC4E","FCGR1A","FCGR1B","FCGR3B","LILRA5","LILRA6","LILRB2","LILRB3","OSCAR","SEMA4A","SIGLEC1"),
geneEntrezID = c("929","10871","945","968","160364","338339","26253","2209","2210","2215","353514","79168","10288","11025","126014","64218","6614"),
GeneSet = rep("Monocyte cell surface", 17)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocyte_cell_surface.csv", row.names = FALSE)
#15. Inflammasome
data <- data.frame(
geneSymbol = c("CASP1","AIM2","CASP5","CTSB","GSDMB","GSDMD","NAIP","NEK7","NLRC4","NLRP1","NLRP3","NOD2","P2RX7","PANX1","PYCARD","RIPK1"),
geneEntrezID = c("834","9447","838","1508","55876","79792","4671","140609","58484","22861","114548","64127","5027","24145","29108","8737"),
GeneSet = rep("Inflammasome", 16)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Inflammasome.csv", row.names = FALSE)
#16. Monocytes_40
data <- data.frame(
geneSymbol = c("C1QA","C1QB","C1QC","C1RL","CCL2","CCL8","CD14","CD300C","CD33","CD68","CLEC12A","CLEC4D","CLEC4E","CXCL1","CXCL2","CXCR2","FCGR1A","FCGR1B","FCGR3B","GRN","IK","IL18RAP","IL1B","IL1RN","LILRA5","LILRA6","LILRB2","LILRB3","OSCAR","S100A8","SEMA4A","SIGLEC1","THBD","BST1","C1R","CD163","CSF2","FUT4","MNDA","MRC1"),
geneEntrezID = c("712","713","714","51279","6347","6355","929","10871","945","968","160364","338339","26253","2919","2920","3579","2209","2210","2215","2896","3550","8807","3553","3557","353514","79168","10288","11025","126014","6279","64218","6614","7056","683","715","9332","1437","2526","4332","4360"),
GeneSet = rep("Monocytes_40", 40)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocytes_40.csv", row.names = FALSE)
#17. Monocyte secreted
data <- data.frame(
geneSymbol = c("C1QA","C1QB","C1QC","C1RL","CCL2","CCL8","CXCL1","CXCL2","GRN","IK","IL18RAP","IL1B","IL1RN","S100A8","THBD","TNF","CXCL10"),
geneEntrezID = c("712","713","714","51279","6347","6355","2919","2920","2896","3550","8807","3553","3557","6279","7056","7124","3627"),
GeneSet = rep("Monocyte secreted", 17)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocyte_secreted.csv", row.names = FALSE)
#18. IL-1 cytokines
data <- data.frame(
geneSymbol = c("IL18","IL1B"),
geneEntrezID = c("3606","3553"),
GeneSet = rep("IL-1 cytokines", 2)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IL-1_cytokines.csv", row.names = FALSE)
#19. SNOR low UP
data <- data.frame(
geneSymbol = c("FCGR1A","CEACAM1","LGALS1","SNORD24","SNORD44","SNORD47","SNORD80"),
geneEntrezID = c("2209","634","3956","26820","26806","26802","26774"),
GeneSet = rep("SNOR low UP", 7)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "SNOR_low_UP.csv", row.names = FALSE)
#20. CD40 activated
data <- data.frame(
geneSymbol = c("BUB1B","CCL22","CD58","DBI","DUSP4","FABP5","FDPS","FEN1","GAPDH","GARS","GRHPR","H2AFX","H2AFZ","HMGB2","HMMR","HPRT1","IMPDH2","LDHA","LDHB","LMNB1","LPXN","MCM2","MCM3","MCM6","MSH6","MTHFD2","MYBL2","NDC80","NME1","PAICS","PCNA","PKM","PRDX3","RFTN1","RGS10","SLAMF1","TK1","TOP2A","TPX2","TRAF1","TUBA1B","TXN","TYMS","UBE2S","VDAC1","WEE1","WSB2","ZWINT"),
geneEntrezID = c("701","6367","965","1622","1846","2171","2224","2237","2597","2617","9380","3014","3015","3148","3161","3251","3615","3939","3945","4001","9404","4171","4172","4175","2956","10797","4605","10403","4830","10606","5111","5315","10935","23180","6001","6504","7083","7153","22974","7185","10376","7295","7298","27338","7416","7465","55884","11130"),
GeneSet = rep("CD40 activated", 48)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "CD40_activated.csv", row.names = FALSE)
#21. Lectin complement
data <- data.frame(
geneSymbol = c("A2M","C2","C3","C4A","C4B","C4B_2","C5","C6","C7","C8A","C9","COLEC10","COLEC11","FCN1","FCN2","FCN3","KRT1","MASP1","MASP2","MBL2","SERPING1"),
geneEntrezID = c("2","717","718","720","721","100293534","727","729","730","731","735","10584","78989","2219","2220","8547","3848","5648","10747","4153","710"),
GeneSet = rep("Lectin complement", 21)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Lectin_complement.csv", row.names = FALSE)
#22. Classical complement
data <- data.frame(
geneSymbol = c("C1QA","CIQB","C1QC","C1R","C1S","C2","C3","C4A","C4B","C4B_2","C5","C6","C7","C8A","C9"),
geneEntrezID = c("712","713","714","715","716","717","718","720","721","100293534","727","729","730","731","735"),
GeneSet = rep("Classical complement", 15)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Classical_complement.csv", row.names = FALSE)
#23. Cell cycle
data <- data.frame(
geneSymbol = c("BRCA1","ASPM","AURKA","AURKB","CCNB1","CCNB2","CCNE1","CDC20","CENPM","CEP55","E2F3","GINS2","MCM10","MCM2","MKI67","NCAPG","NDC80","PTTG1","TYMS"),
geneEntrezID = c("672","259266","6790","9212","891","9133","898","991","79019","55165","1871","51659","55388","4171","4288","64151","10403","9232","7298"),
GeneSet = rep("Cell cycle", 19)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Cell_cycle.csv", row.names = FALSE)
#24. Plasma cells
data <- data.frame(
geneSymbol = c("IGHV4-28","IGHV4-34","IGHD","IGHG1","IGHV2-5","IGHV3-20","IGHV3-23","IGKC","IGLV1-40","IGLV3-1","IGLV3-19","IGLV3-25","IGLV4-3","IGLV4-60","IGLV5-45","IGLV6-57","IGLVI-70","C19orf10","IGH","IGHMBP2","IGHV4-31","IGK","IGL","IGLJ3","IGLL1","IGLV@","IGLV1-44","IGLV2-14","IGLV2-5","MZB1","PRDM1","SDC1","THEMIS2","TNFRSF17"),
geneEntrezID = c("28400","28395","3495","3500","28457","28445","28442","3514","28825","28809","28797","28793","28786","28785","28781","28778","28763","56005","3492","3508","28396","50802","3535","28831","3543","3546","28823","28815","28818","51237","639","6382","9473","608"),
GeneSet = rep("Plasma cells", 34)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Plasma_cells.csv", row.names = FALSE)
#25. IG chains
data <- data.frame(
geneSymbol = c("IGHG1","IGHV2-5","IGHV3-20","IGHV3-23","IGHV4-28","IGHV4-34","IGKC","IGLV1-40","IGLV3-1","IGLV3-19","IGLV3-25","IGLV4-3","IGLV4-60","IGLV5-45","IGLV6-57","IGLVI-70","IGHA1","IGHA2","IGHD2-15","IGHD2-2","IGHD2-21","IGHD3-10","IGHD3-16","IGHD3-3","IGHD3-9","IGHG2","IGHG3","IGHG4","IGHJ1","IGHJ2","IGHJ3","IGHJ4","IGHJ5","IGHJ6","IGHV1-18","IGHV1-2","IGHV1-24","IGHV1-3","IGHV1-45","IGHV1-46","IGHV1-58","IGHV1-69-2","IGHV2-26","IGHV2-70","IGHV3-13","IGHV3-15","IGHV3-21","IGHV3-33","IGHV3-43","IGHV3-48","IGHV3-49","IGHV3-53","IGHV3-62","IGHV3-64","IGHV3-7","IGHV3-72","IGHV3-73","IGHV3-74","IGHV4-30-2","IGHV4-39","IGHV4-59","IGHV4-61","IGHV5-51","IGHV6-1","IGHV7-81","IGKJ1","IGKJ2","IGKJ3","IGKJ4","IGKJ5","IGKV1-16","IGKV1-17","IGKV1-27","IGKV1-5","IGKV1-6","IGKV1-9","IGKV1D-16","IGKV1D-17","IGKV1D-43","IGKV1D-8","IGKV2-24","IGKV2D-26","IGKV2D-29","IGKV2D-30","IGKV3-20","IGKV3D-20","IGKV3D-7","IGKV4-1","IGKV5-2","IGLC2","IGLC7","IGLJ6","IGLV10-54","IGLV1-36","IGLV1-47","IGLV2-11","IGLV2-18","IGLV2-23","IGLV2-33","IGLV2-8","IGLV3-10","IGLV3-12","IGLV3-16","IGLV3-21","IGLV3-27","IGLV3-32","IGLV4-69","IGLV5-37","IGLV7-43","IGLV8-61","IGLV9-49"),
geneEntrezID = c("3500","28457","28445","28442","28400","28395","3514","28825","28809","28797","28793","28786","28785","28781","28778","28763","3493","3494","28503","28505","28502","28499","28498","28501","28500","3501","3502","3503","28483","28481","28479","28477","28476","28475","28468","28474","28467","28473","28466","28465","28464","28458","28455","28454","28449","28448","28444","28434","28426","28424","28423","28420","28416","28414","28452","28410","28409","28408","28398","28394","28392","28391","28388","28385","28378","28950","28949","28948","28947","28946","28938","28937","28935","28299","28943","28941","28901","28900","28891","28904","28923","28884","28882","28881","28912","28874","28877","28908","28907","3538","28834","28828","28772","28826","28822","28816","28814","28813","28811","28817","28803","28802","28799","28796","28791","28787","28784","28783","28776","28774","28773"),
GeneSet = rep("IG chains", 111)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IG_chains.csv", row.names = FALSE)
#26. Erythrocytes
data <- data.frame(
geneSymbol = c("EPO","GFI1B","GYPA","GYPB","GYPE","ICAM4","NFE2","SLC4A1","TRIM10","TSPO2","ZNF367"),
geneEntrezID = c("2056","8328","2993","2994","2996","3386","4778","6521","10107","222642","195828"),
GeneSet = rep("Erythrocytes", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Erythrocytes.csv", row.names = FALSE)
#27. IL-6R complex
data <- data.frame(
geneSymbol = c("IL6","IL6R","IL6ST"),
geneEntrezID = c("3569","3570","3572"),
GeneSet = rep("IL-6R complex", 3)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IL-6R_complex.csv", row.names = FALSE)
#28. IFN
data <- data.frame(
geneSymbol = c("EIF2AK2","GBP1","GBP2","GBP4","HERC5","HERC6","IFI27","IFI30","IFI35","IFI44","IFI44L","IFI6","IFIT1","IFIT2","IFIT3","IFIT5","IFITM1","IFITM2","IFITM3","ISG15","ISG20","MX1","MX2","OAS1","OAS2","OAS3","OASL","RSAD2","SAMD9","SAMD9L","SP100","SP110"),
geneEntrezID = c("5610","2633","2634","115361","51191","55008","3429","10437","3430","10561","10964","2537","3434","3433","3437","24138","8519","10581","10410","9636","3669","4599","4600","4938","4939","4940","8638","91543","54809","219285","6672","3431"),
GeneSet = rep("IFN", 32)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "IFN.csv", row.names = FALSE)
#29. TCRB
data <- data.frame(
geneSymbol = c("TRBC2","TRBJ2-1","TRBJ2-2","TRBJ2-2P","TRBJ2-3","TRBJ2-4","TRBJ2-5","TRBJ2-6","TRBJ2-7","TRBV1","TRBV10-1","TRBV10-2","TRBV11-1","TRBV11-2","TRBV19","TRBV2","TRBV20-1","TRBV21-1","TRBV23-1","TRBV24-1","TRBV25-1","TRBV27","TRBV28","TRBV3-1","TRBV4-1","TRBV4-2","TRBV5-1","TRBV5-3","TRBV5-4","TRBV5-5","TRBV5-6","TRBV5-7","TRBV6-1","TRBV6-4","TRBV6-5","TRBV6-6","TRBV6-7","TRBV6-8","TRBV7-1","TRBV7-3","TRBV7-4","TRBV7-5","TRBV7-6","TRBV7-7","TRBV9"),
geneEntrezID = c("28638","28629","28628","28627","28626","28625","28624","28623","28622","28621","28585","28584","28582","28581","28568","28620","28567","28566","28564","28563","28562","28560","28559","28619","28617","28616","28614","28612","28611","28610","28609","28608","28606","28603","28602","28601","28600","28599","28597","28595","28594","28593","28592","28591","28586"),
GeneSet = rep("TCRB", 45)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRB.csv", row.names = FALSE)
#30. TCRA
data <- data.frame(
geneSymbol = c("TRAV10","TRAV1-1","TRAV1-2","TRAV12-1","TRAV12-2","TRAV12-3","TRAV13-1","TRAV13-2","TRAV14DV4","TRAV16","TRAV17","TRAV18","TRAV19","TRAV2","TRAV20","TRAV21","TRAV22","TRAV23DV6","TRAV24","TRAV25","TRAV26-1","TRAV26-2","TRAV27","TRAV29DV5","TRAV3","TRAV30","TRAV34","TRAV35","TRAV36DV7","TRAV38-1","TRAV38-2DV8","TRAV39","TRAV4","TRAV40","TRAV41","TRAV5","TRAV7","TRAV8-1","TRAV8-2","TRAV8-3","TRAV8-4","TRAV8-6","TRAV8-7","TRAV9-1","TRAV9-2"),
geneEntrezID = c("28676","28693","28692","28674","28673","28672","28671","28670","28669","28667","28666","28665","28664","28691","28663","28662","28661","28660","28659","28658","28657","28656","28655","28653","28690","28652","28648","28647","28646","28644","28643","28642","28689","28641","28640","28688","28686","28685","28684","28683","28682","28680","28679","28678","28677"),
GeneSet = rep("TCRA", 45)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRA.csv", row.names = FALSE)
#31. Cyt. act. T cells
data <- data.frame(
geneSymbol = c("GZMB","TAGAP","CD69","EOMES","GZMH","IFNG","IL2RB","PRF1","SGK1","TBX21","TFRC","ZNF683"),
geneEntrezID = c("3002","117289","969","8320","2999","3458","3560","5551","6446","30009","7037","257101"),
GeneSet = rep("Cyt. act. T cells", 12)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Cyt._act._T_cells.csv", row.names = FALSE)
#32. TCRG
data <- data.frame(
geneSymbol = c("TRGC2","TRGV1","TRGV10","TRGV11","TRGV2","TRGV3","TRGV4","TRGV5","TRGV7","TRGV8","TRGV9"),
geneEntrezID = c("6967","6973","6984","6985","6974","6976","6977","6978","6981","6982","6983"),
GeneSet = rep("TCRG", 11)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRG.csv", row.names = FALSE)
#33. T cells
data <- data.frame(
geneSymbol = c("CD8A","CD8B","TRDC","CCR3","CD226","CD247","CD28","CD3D","CD3E","CD3G","CD4","CD5","ETS1","GATA3","GRAP2","LEF1","SH2D1A","TRAC","TRBC1"),
geneEntrezID = c("925","926","28526","1232","10666","919","940","915","916","917","920","921","2113","2625","9402","51176","4068","28755","28639"),
GeneSet = rep("T cells", 19)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "T_cells.csv", row.names = FALSE)
#34. CD8T-NK-NKT
data <- data.frame(
geneSymbol = c("CD8A","CD8B","GZMB","NKTR","CD2","CD7","CRTAM","GNLY","GZMA","GZMK","GZMM","HCST","KIR2DL3","KIR3DL1","KIR3DL2","KLRB1","KLRC3","KLRC4","KLRD1","NKG7","RASAL3","TIA1","TXK"),
geneEntrezID = c("925","926","3002","4820","914","924","56253","10578","3001","3003","3004","10870","3804","3811","3812","3820","3823","8302","3824","4818","64926","7072","7294"),
GeneSet = rep("CD8T-NK-NKT", 23)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "CD8T-NK-NKT.csv", row.names = FALSE)
#35. Anergic or act. T cells
data <- data.frame(
geneSymbol = c("CD160","CD244","CTLA4","HAVCR2","ICOS","KLRG1","LAG3","PDCD1"),
geneEntrezID = c("11126","51744","1493","84868","29851","10219","3902","5133"),
GeneSet = rep("Anergic or act. T cells", 8)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Anergic_or_act._T_cells.csv", row.names = FALSE)
#36. T activated
data <- data.frame(
geneSymbol = c("FASLG","TAGAP","CD40LG","CREM","IL17A","IL17F","IL23R","JAKMIP1","KCNA3","KCNN4","P2RX5","PRKCQ","RELT","RNF125","SATB1","TNFRSF4","TNFRSF9"),
geneEntrezID = c("356","117289","959","1390","3605","112744","149233","152789","3738","3783","5026","5588","84957","54941","6304","7293","3604"),
GeneSet = rep("T activated", 17)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "T_activated.csv", row.names = FALSE)
#37. NK cells
data <- data.frame(
geneSymbol = c("KLRF1","NCAM1","NCR1","NCR3","SH2D1B"),
geneEntrezID = c("51348","4684","9437","259197","117157"),
GeneSet = rep("NK cells", 5)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "NK_cells.csv", row.names = FALSE)
#38. TCRD
data <- data.frame(
geneSymbol = c("TRDC","TRDJ1","TRDJ2","TRDJ3","TRDJ4","TRDV1","TRDV2","TRDV3"),
geneEntrezID = c("28526","28522","28521","28520","28519","28518","28517","28516"),
GeneSet = rep("TCRD", 8)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "TCRD.csv", row.names = FALSE)
#39. T regs
data <- data.frame(
geneSymbol = c("FOXP3","IKZF2"),
geneEntrezID = c("50943","22807"),
GeneSet = rep("T regs", 2)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "T_regs.csv", row.names = FALSE)
#40. SNOR low DOWN
data <- data.frame(
geneSymbol = c("RNU4ATAC","SNORA11","SNORA16A","SNORA23","SNORA38B","SNORA73A","SNORA73B"),
geneEntrezID = c("100151683","677799","692073","677808","100124536","6080","26768"),
GeneSet = rep("SNOR low DOWN", 7)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "SNOR_low_DOWN.csv", row.names = FALSE)
#--
#41. Monocytes
data <- data.frame(
geneSymbol = c("ACE","ADAM8","ADAMDEC1","ADGRE1","ADGRE2","APOBEC3B","APOBEC3G","APOBR","ART4","C1QA","C1QC","C2","C4A","C4B","C4BPA","C4BPB","C5","C6","C8A","C9","CCL17","CCL18","CCL22","CCL28","CCL7","CCL8","CD14","CD163","CD209","CD300C","CD300E","CD5L","CD80","CFB","CFD","CFP","CHI3L1","CHIT1","CLEC12A","CLEC12B","CLEC4D","CLEC4E","CLEC5A","CLEC7A","CLEC9A","CSF1R","CXCL1","CXCL10","CXCL11","CXCL13","CXCL8","CXCL9","CYBB","F12","FCGR3B","FFAR2","HMMR","HVCN1","IGSF6","IL10RA","IL12A","IL12B","IL1RAP","IL20","IL23A","IL27","IL31RA","LAMP3","LGALS12","LGALS4","LGALS9","LGALS9B","LGALS9C","LILRA2","LILRA5","LILRA6","LILRB5","LMNB1","LY86","LYZ","MARCO","MEGF10","MERTK","MFGE8","MPEG1","MRC1","MS4A4A","MSR1","NTSR1","OCM","OLR1","OSCAR","OTOF","PDCD1LG2","PILRA","PLA2G2D","PLA2G5","PYHIN1","S100A8","S100A9","S1PR5","SCARB1","SCARF2","SECTM1","SEMA4A","SERPINB9","SERPING1","SIGLEC1","SIGLEC14","SIGLEC5","SIGLEC7","SLC11A1","SLITRK4","SMPDL3B","SPIC","STAB2","STAP2","TEK","TGM2","TIMD4","TLR2","TLR8","TNF","TNFAIP8L2","TNFRSF1B","TNIP3","TULP1","UBD","VENTX","VSTM1"),
geneEntrezID = c("1636","101","27299","2015","30817","9582","60489","55911","420","712","714","717","720","721","722","725","727","729","731","735","6361","6362","6367","56477","6354","6355","929","9332","30835","10871","342510","922","941","629","1675","5199","1116","1118","160364","387837","338339","26253","23601","64581","283420","1436","2919","3627","6373","10563","3576","4283","1536","2161","2215","2867","3161","84329","10261","3587","3592","3593","3556","50604","51561","246778","133396","27074","85329","3960","3965","284194","654346","11027","353514","79168","10990","4001","9450","4069","8685","84466","10461","4240","219972","4360","51338","4481","4923","654231","4973","126014","9381","80380","29992","26279","5322","149628","6279","6280","53637","949","91179","6398","64218","5272","710","6614","100049587","8778","27036","6556","139065","27293","121599","55576","55620","7010","7052","91937","7097","51311","7124","79626","7133","79931","7287","10537","27287","284415"),
GeneSet = rep("Monocytes", 130)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Monocytes.csv", row.names = FALSE)
#42. Myeloid cells
data <- data.frame(
geneSymbol = c("ADGRE3","AIF1","AOAH","APOC1","BMX","C1QB","CD101","CD300LF","CD33","CD68","CLEC10A","CLEC16A","CLEC1A","CLEC2B","CLEC4A","CLEC4G","CLEC6A","CSF2RA","CSF2RB","CSF3R","CST3","CYBA","FCER1A","FCER1G","FCGR1A","FCGR1B","FCGR2A","FCGR2B","FCGR2C","FCGR3A","FGR","FLT3","FOLR2","IFI30","IFNL1","IFNL2","IFNL3","IL18","IL1A","IL1B","IL1F10","IL1RN","IL26","IL37","ITGAX","LILRA1","LILRA3","LILRB1","LILRB2","LILRB3","LILRB4","LYVE1","MEFV","MNDA","MS4A6A","MS4A7","NLRP12","NLRP3","NOD2","PECAM1","PLEK","PRAM1","RNASE3","RNASE6","RNASE7","SIGLEC10","SKAP2","SLAMF8","SLPI","SPI1","TNFSF13B","TNFSF14","TREM1","TREML2","TREML4","TWIST1","TYROBP","UNC93B1","VSIG4"),
geneEntrezID = c("84658","199","313","341","660","713","9398","146722","945","968","10462","23274","51267","9976","50856","339390","93978","1438","1439","1441","1471","1535","2205","2207","2209","2210","2212","2213","9103","2214","2268","2322","2350","10437","282618","282616","282617","3606","3552","3553","84639","3557","55801","27178","3687","11024","11026","10859","10288","11025","11006","10894","4210","4332","64231","58475","91662","114548","64127","5175","5341","84106","6037","6039","84659","89790","8935","56833","6590","6688","10673","8740","54210","79865","285852","7291","7305","81622","11326"),
GeneSet = rep("Myeloid cells", 79)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Myeloid_cells.csv", row.names = FALSE)
#43. Neutrophils
data <- data.frame(
geneSymbol = c("ABHD3","ARG1","BPI","CAMP","CD177","CD83","CRISP3","DEFA1","DEFA1B","DEFA3","DEFB103A","DEFB103B","DEFB106B","DEFB136","DEFB4A","FUT4","LY6E","MMP8","OLFM4","OR1J2","PGLYRP1","PRTN3","S100A6"),
geneEntrezID = c("171586","383","671","820","57126","9308","10321","1667","728358","1668","414325","55894","503841","613210","1673","2526","4061","4317","10562","26740","8993","5657","6277"),
GeneSet = rep("Neutrophils", 23)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Neutrophils.csv", row.names = FALSE)
#44. Neutrophils_
data <- data.frame(
geneSymbol = c("EMR2","C5AR2","PLEKHG3","DPEP2","PADI4","FAM212B","TREML2","REPS2","MAK","STEAP4","PGLYRP1","MXD1","MEFV","BTNL8","CREB5","ALOX5","BST1","CEACAM3","VNN3","LILRA2","HSPA6","NFE2","HAL","FFAR2","MMP25","QPCT","CDA","P2RY13","CHST15","TNFRSF10C","FPR2","MGAM","APOBEC3A","TLR2","CXCR1","FAM65B","LST1","TREM1","CSF3R","C5AR1","SELL","AQP9","CXCR2","VNN2","MNDA","FPR1","FCGR3B"),
GeneSet = rep("Neutrophils_", 47)
)
data$geneSymbol <- sub("\\..*", "", data$geneSymbol)
# Fetch ENSEMBL IDs for your gene symbols
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneSymbol,
keytype="SYMBOL",
columns=c("ENSEMBL", "ENTREZID"))
# Join the original data with the fetched IDs
result <- left_join(data, ensembl_ids, by = c("geneSymbol"="SYMBOL"))
# Arrange columns as desired
sorted_result <- result[, c("geneSymbol", "ENTREZID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Neutrophils_.csv", row.names = FALSE)
#45. Inflammatory_neutrophils
data <- data.frame(
geneSymbol = c("CD177.1","ZDHHC19.1","PLAC8.1","IFI6.2","MT2A.2","SERPING1.2","RSAD2.2","GBP1.2","ISG15.2","LY6E.2","S100A12.2","GBP5.2","XAF1.2","IFI44L.2","IFIT3.2","IFITM3.2","SELL.2","TRIM22.2","EPSTI1.2","FCER1G.2","IFI44.2","CST7.1","OAS1.2","TNFSF13B.2","SLFN5","IFIT2.2","MX1.2","PLSCR1.2","MMP9.2","LAP3","OAS2.2","IFIT1.2","TAP1.2","SAMD9L.2","HIST1H2AC.1","APOL6.2","OASL.1","DDX58.1","WSB1.1","RPL28.1","IL1RN.2","MCEMP1.1","HERC5.1","IRF7.2","CKAP4.1","UBE2L6.1","RNF213.2","GBP2.2","CLEC4D.1","LILRA6.1","SHISA5.2","PHF11.1","IFI16.2","OAS3.1","PIM1.1","SERPINB1.1","GYG1.1","IFITM1.1","PSTPIP2","TMSB10.1","GBP4","NFKBIA.2","EIF2AK2.1","FFAR2.2","PARP9.1","NT5C3A","TNFSF10.2","MYL12A.1","PARP14.1","DDX60","DDX60L.2","STAT1.1","XRN1","SAMD9.2","SNX3","HIST1H2BD","PGD.1","FCGR1A","JUN","CD44","PLP2.1","CARD16.2","LIMK2.2","CYSTM1.1","HMGB2.1","IFIH1.2","C1orf162","KLF4","STAT2","ZBP1","ALPL.1","GSTK1","SP110.2","ANXA1","ANXA3.1","METTL9","PML","NUCB1","CEACAM1","SECTM1","PSMB9.1","SAT1","RNF10.1","MAPK14.1","LILRA5","C3AR1","ZCCHC2","SAMHD1","LGALS9","CR1","C4orf3","IRF1.1","CD63","GRINA","TXN.1","MX2.1","GAPDH.1","RBCK1","NBN.2","NMI.1","CAST","HIST2H2BE","NTNG2","SP100.1","CASP1.1","TNFAIP6","PLEK.1","LMNB1","NFIL3","APOL2","BST1","NUB1","PIK3AP1","CCR1.2","ALOX5AP.1","EMB","ISG20","TMEM123.1","ADAR","GIMAP4","SPTLC2","SAMSN1.1","SPI1","GRN.1","APOBEC3A","GLRX","CD53.1","TRIM38","CD82","SH3GLB1","VIM.1","ADM.1","CASP4.1","S100A6.1","RAC2.1","BAZ1A","ADD3","ITGAM","LY96.2","MSRB1.1","DYSF.1","MOB1A","TPM3.1","FGR","ACSL1","IL2RG.1","CDKN2D","FYB1.1","CLEC4E.1","HLA-F","LILRB3","RBMS1","PLBD1","FLOT1","GNG5","PTEN.1","PROK2.1","UBE2J1","CD37.1","KCNJ15.1","BRI3.1","HIF1A","PRR13.1","LRG1","MAX","RHOG","NFE2","STXBP2","B4GALT5","CAPZA1","SERPINA1","ZYX","RGS19","FKBP5","GCA.1","FKBP1A","MTPN","VNN2","AC245128.3","CD55","CFL1.1"), # shortened for clarity
GeneSet = rep("Inflammatory neutrophils", 201)
)
data$geneSymbol <- sub("\\..*", "", data$geneSymbol)
# Fetch ENSEMBL IDs for your gene symbols
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneSymbol,
keytype="SYMBOL",
columns=c("ENSEMBL", "ENTREZID"))
# Join the original data with the fetched IDs
result <- left_join(data, ensembl_ids, by = c("geneSymbol"="SYMBOL"))
# Arrange columns as desired
sorted_result <- result[, c("geneSymbol", "ENTREZID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Inflammatory_neutrophils.csv", row.names = FALSE)
#46. Suppressive_neutrophils
data <- data.frame(
geneSymbol = c("LY6E","XAF1","RSAD2","ISG15","IFI44L","IFITM3","MT2A","IFIT3","EPSTI1","IFIT1","IFI6","IFIT2","MX1","GBP5","PLSCR1","SELL","SERPING1","SHISA5","TRIM22","GBP1","IFI44","IRF7","FFAR2","OAS1","OAS2","IL1RN","IFI16","CCR1","SP110","APOL6","NFKBIA","TNFSF10","FCER1G","IFIH1","TNFSF13B","SAMD9","SAMD9L","DDX60L","TAP1","NBN","RNF213","CARD16","GBP2","LY96","CLEC2B","LIMK2"),
GeneSet = rep("Suppressive neutrophils", 46)
)
data$geneSymbol <- sub("\\..*", "", data$geneSymbol)
# Fetch ENSEMBL IDs for your gene symbols
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneSymbol,
keytype="SYMBOL",
columns=c("ENSEMBL", "ENTREZID"))
# Join the original data with the fetched IDs
result <- left_join(data, ensembl_ids, by = c("geneSymbol"="SYMBOL"))
# Arrange columns as desired
sorted_result <- result[, c("geneSymbol", "ENTREZID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Suppressive_neutrophils.csv", row.names = FALSE)
#47. Apoptosis
data <- data.frame(
geneSymbol = c("AIFM1","APAF1","BAD","BAK1","BAX","BCL2L11","BID","CASP10","CASP2","CASP3","CASP6","CASP7","CASP8","CASP9","DIABLO","ENDOG","FAS","FASLG","HTRA2","TNFRSF1A","TP53"),
geneEntrezID = c("9131","317","572","578","581","10018","637","843","835","836","839","840","841","842","56616","2021","355","356","27429","7132","7157"),
GeneSet = rep("Apoptosis", 21)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "Apoptosis.csv", row.names = FALSE)
#48. NFkB complex
data <- data.frame(
geneSymbol = c("IKBKB","IKBKG","NFKB1","NFKB2","RELA","RELB","REL"),
geneEntrezID = c("3551","8517","4790","4791","5970","5971","5966"),
GeneSet = rep("NFkB complex", 7)
)
ensembl_ids <- AnnotationDbi::select(org.Hs.eg.db,
keys=data$geneEntrezID,
keytype="ENTREZID",
columns="ENSEMBL")
result <- left_join(data, ensembl_ids, by = c("geneEntrezID"="ENTREZID"))
sorted_result <- result[, c("geneSymbol", "geneEntrezID", "ENSEMBL", "GeneSet")]
write.csv(sorted_result, file = "NFkB_complex.csv", row.names = FALSE)
#~/Tools/csv2xls-0.4/csv_to_xls.py Alt._complement.csv Anergic_or_act._T_cells.csv Anti-inflammation.csv B_cells.csv CD40_activated.csv CD8T-NK-NKT.csv Cell_cycle.csv Classical_complement.csv Cyt._act._T_cells.csv Dendritic_cells.csv Erythrocytes.csv Granulocytes.csv IFN.csv IG_chains.csv IL-1_cytokines.csv IL-6R_complex.csv Inflammasome.csv LDG.csv Lectin_complement.csv MHC_II.csv Monocytes.csv Monocytes_40.csv Monocyte_cell_surface.csv Monocyte_secreted.csv Myeloid_cells.csv Neutrophils.csv NK_cells.csv NLRP3_inflammasome.csv pDC.csv Plasma_cells.csv Platelets.csv Pro-inflam._IL-1.csv SNOR_low_DOWN.csv SNOR_low_UP.csv TCRA.csv TCRB.csv TCRD.csv TCRG.csv TNF.csv T_activated.csv T_cells.csv T_regs.csv Unfolded_protein.csv Neutrophils_.csv Inflammatory_neutrophils.csv Suppressive_neutrophils.csv Apoptosis.csv NFkB_complex.csv -d',' -o Signatures.xls
#---------------------------------------------------------------------
#---- 2. Prepare gene expression matrix: calculate DESeq2 results ----
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_RNASeq/results/featureCounts_2023")
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
#BiocManager::install("org.Hs.eg.db")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1)
colnames(d.raw)<-c("gene_name", "leer_mock_2h_r2", "Ace2_mock_2h_r2", "leer_inf_24h_r1", "Ace2_inf_2h_r1", "leer_inf_24h_r2", "leer_inf_2h_r1", "leer_mock_2h_r1", "leer_inf_2h_r2", "Ace2_inf_2h_r2", "Ace2_mock_2h_r1", "Ace2_inf_24h_r2", "Ace2_inf_24h_r1")
col_order <- c("gene_name", "leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2")
reordered.raw <- d.raw[,col_order]
reordered.raw$gene_name <- NULL
#d <- d.raw[rowSums(reordered.raw>3)>2,]
condition = as.factor(c("leer_mock_2h","leer_mock_2h","leer_inf_2h","leer_inf_2h","leer_inf_24h","leer_inf_24h","Ace2_mock_2h","Ace2_mock_2h","Ace2_inf_2h","Ace2_inf_2h","Ace2_inf_24h","Ace2_inf_24h"))
ids = as.factor(c("leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2"))
#cData = data.frame(row.names=colnames(reordered.raw), condition=condition, batch=batch, ids=ids)
#dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~batch+condition)
cData = data.frame(row.names=colnames(reordered.raw), condition=condition, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~condition)
#----more detailed and specific with the following code!----
dds$condition <- relevel(dds$condition, "Ace2_mock_2h")
dds = DESeq(dds, betaPrior=FALSE) # betaPrior default value is FALSE
resultsNames(dds)
#--------------------------------------
#---- 3. draw violin on signatures ----
library(GSVA)
library(ggplot2)
#install.packages("readxl")
library(readxl)
# Path to the Excel file
file_path <- "Signatures.xls"
#example of a signature:
#geneSymbol geneEntrezID ENSEMBL GeneSet
#CD160 11126 ENSG00000117281 Anergic or act. T cells
#CD244 51744 ENSG00000122223 Anergic or act. T cells
#CTLA4 1493 ENSG00000163599 Anergic or act. T cells
#HAVCR2 84868 ENSG00000135077 Anergic or act. T cells
#ICOS 29851 ENSG00000163600 Anergic or act. T cells
#KLRG1 10219 ENSG00000139187 Anergic or act. T cells
#LAG3 3902 ENSG00000089692 Anergic or act. T cells
#PDCD1 5133 ENSG00000188389 Anergic or act. T cells
#PDCD1 5133 ENSG00000276977 Anergic or act. T cells
# Get the names of the sheets
sheet_names <- excel_sheets(file_path)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
# Read the sheet
data <- read_excel(file_path, sheet = sheet)
# Process the GeneSet names (replacing spaces with underscores, for example)
gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
# Add ENSEMBL IDs to the list
geneSets[[gene_set_name]] <- as.character(data$ENSEMBL)
}
# Print the result to check
print(geneSets)
# 0. for Nanostring, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
#exprs <- exprs(filtered_or_neg_target_m666Data)
# 0. for RNAseq, the GSVA input requires a gene expression matrix where rows are genes and columns are samples.
exprs <- counts(dds, normalized=TRUE)
# 1. Compute GSVA scores:
library(GSVA)
gsva_scores <- gsva(exprs, geneSets, method="gsva")
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
gsva_df$Condition <- dds$condition
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "mock", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "infection", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("mock", "infection"))
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "mock", gene_name]
condition2_data <- data[data$Condition == "infection", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
geom_violin(linewidth=1.2) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5)
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
return(p)
}
# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation", "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF", "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome", "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement", "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes", "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells", "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}
# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
# 9. Save the plots to a PNG:
png("All_Violin_Plots_RNAseq.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()
# ------------------------ Heatmap Presentation --------------------
#-- NanoString GSVA scores --
exprs <- exprs(target_m666Data)
#exprs <- exprs(filtered_or_neg_target_m666Data)
# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
# Filter the data for "Group1" samples
group1_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1", genes]
# Set the median values for each column in the data.frame
# Calculate median values for each column
group1_medians <- apply(group1_data, 2, median)
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "1b", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("1b", "Group1b", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group1b", "Group3"))
# Filter the data for "Group1a" samples
group1a_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1a", genes]
group1a_medians <- apply(group1a_data, 2, median)
# Filter the data for "Group1b" samples
group1b_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1b", genes]
group1b_medians <- apply(group1b_data, 2, median)
# Filter the data for "Group3" samples
group3_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group3", genes]
group3_medians <- apply(group3_data, 2, median)
#-- RNAseq GSVA scores --
exprs <- counts(dds, normalized=TRUE)
# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
gsva_df$Condition <- dds$condition
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "mock", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "infection", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("mock", "infection"))
# Filter the data for "mock" samples
mock_data <- gsva_df_filtered[gsva_df_filtered$Condition == "mock", genes]
mock_medians <- apply(mock_data, 2, median)
# Filter the data for "infection" samples
infection_data <- gsva_df_filtered[gsva_df_filtered$Condition == "infection", genes]
infection_medians <- apply(infection_data, 2, median)
# Create a new data frame with the middle values for both groups
heatmap_data_rnaseq <- data.frame(mock = mock_medians, infection = infection_medians)
# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)
# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)
# Specify heatmap colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(50)
#TODO: adjust the color, so that it is not so dark and light!!!!, add RNAseq data for mock and infection!!
# Create the heatmap without hierarchical clustering
library(gplots)
png("Heatmap.png", width=1000, height=1000)
heatmap.2(
heatmap_matrix,
Colv = FALSE, # Turn off column dendrogram
Rowv = FALSE, # Turn off row dendrogram
trace = "none", # Turn off row and column dendrograms
scale = "row", # Scale the rows
col = color_scheme,
main = "GSVA Heatmap", # Title of the heatmap
key = TRUE, # Show color key
key.title = "GSVA Score", # Color key title
key.xlab = NULL, # Remove x-axis label from color key
density.info = "none", # Remove density plot
cexRow = 1.2, # Increase font size of row names
cexCol = 1.2, # Increase font size of column names
srtCol = 40, keysize = 0.72, margins = c(5, 14)
)
dev.off()
Author Archives: gene_x
GSVA calculation for NanoString data
library("rmarkdown")
library("tidyverse")
library(rmarkdown)
library("GeomxTools")
library("GeoMxWorkflows")
library("NanoStringNCTools")
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023")
render("run.Rmd", c("html_document"))
# ------------------------ Violin Presentation 1 --------------------
#identical(exprs(target_m666Data), assay(target_m666Data, "exprs")): contains the gene expression values that have been both normalized and log-transformed, which are often used for downstream analysis and visualization in many genomic studies.
#assay(target_m666Data, "log_q"): These are likely log2-transformed values of the quantile-normalized data. Quantile normalization ensures that the distributions of gene expression values across samples are the same, and taking the log2 of these normalized values is a common step in the analysis of high-throughput gene expression data. It's a way to make the data more suitable for downstream statistical analysis and visualization.
#assay(target_m666Data, "q_norm"): This typically represents the quantile-normalized gene expression values. Quantile normalization adjusts the expression values in each sample so that they have the same distribution. This normalization method helps remove systematic biases and makes the data more comparable across samples.
library(readxl)
# Path to the Excel file
file_path <- "Signatures.xls"
#example of a signature:
#geneSymbol geneEntrezID ENSEMBL GeneSet
#CD160 11126 ENSG00000117281 Anergic or act. T cells
#CD244 51744 ENSG00000122223 Anergic or act. T cells
#CTLA4 1493 ENSG00000163599 Anergic or act. T cells
#HAVCR2 84868 ENSG00000135077 Anergic or act. T cells
#ICOS 29851 ENSG00000163600 Anergic or act. T cells
#KLRG1 10219 ENSG00000139187 Anergic or act. T cells
#LAG3 3902 ENSG00000089692 Anergic or act. T cells
#PDCD1 5133 ENSG00000188389 Anergic or act. T cells
#PDCD1 5133 ENSG00000276977 Anergic or act. T cells
# Get the names of the sheets
sheet_names <- excel_sheets(file_path)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
# Read the sheet
data <- read_excel(file_path, sheet = sheet)
# Process the GeneSet names (replacing spaces with underscores, for example)
gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
# Add ENSEMBL IDs to the list
geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}
# Print the result to check
print(geneSets)
library(gridExtra)
library(ggplot2)
library(GSVA)
# 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
exprs <- exprs(target_m666Data)
#exprs <- exprs(filtered_or_neg_target_m666Data)
# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, geneSets, method="gsva")
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "Group1", gene_name]
condition2_data <- data[data$Condition == "Group3", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
geom_violin(linewidth=1.2) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5)
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
return(p)
}
# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation", "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF", "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome", "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement", "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes", "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells", "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}
# 7. Pad the list of plots:
#plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
#remaining_plots <- plots_needed_for_full_grid - length(plots_list)
#plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
# 9. Save the plots to a PNG:
png("All_Violin_Plots_NanoString_3_vs_1.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()
# ------------------------ Violin Presentation 2 --------------------
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "Group1a", gene_name]
condition2_data <- data[data$Condition == "Group3", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
geom_violin(linewidth=1.2) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5)
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
return(p)
}
# 6. Generate the list of plots:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# 6. Generate the list of plots in a predefined order:
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation", "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF", "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome", "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement", "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes", "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells", "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells","Neutrophils")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
plots_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}
# 7. Pad the list of plots:
#plots_needed_for_full_grid <- ceiling(length(plots_list) / 6) * 6
#remaining_plots <- plots_needed_for_full_grid - length(plots_list)
#plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
# 9. Save the plots to a PNG:
png("All_Violin_Plots_NanoString_3_vs_1a.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()
# ------------------------ Heatmap Presentation --------------------
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
# Filter the data for "Group1" samples
group1_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1", genes]
# Set the median values for each column in the data.frame
# Calculate median values for each column
group1_medians <- apply(group1_data, 2, median)
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "1b", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("1b", "Group1b", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group1b", "Group3"))
# Filter the data for "Group1a" samples
group1a_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1a", genes]
group1a_medians <- apply(group1a_data, 2, median)
# Filter the data for "Group1b" samples
group1b_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group1b", genes]
group1b_medians <- apply(group1b_data, 2, median)
# Filter the data for "Group3" samples
group3_data <- gsva_df_filtered[gsva_df_filtered$Condition == "Group3", genes]
group3_medians <- apply(group3_data, 2, median)
# Create a new data frame with the middle values for both groups
heatmap_data_nanostring <- data.frame(Group1 = group1_medians, Group1a = group1a_medians, Group1b = group1b_medians, Group3 = group3_medians)
# Merge two heatmap_data_* to a matrix
heatmap_data <- merge(heatmap_data_nanostring, heatmap_data_rnaseq, by="row.names", all=TRUE)
rownames(heatmap_data) <- heatmap_data$Row.names
heatmap_data$Row.names <- NULL
# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)
# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)
# Transpose the data matrix to switch rows and columns
#heatmap_data <- t(heatmap_data)
# Convert the data frame to a numeric matrix
heatmap_matrix <- as.matrix(heatmap_data)
# Specify heatmap colors
color_scheme <- colorRampPalette(c("blue", "white", "red"))(50)
#TODO: adjust the color, so that it is not so dark and light!!!!, add RNAseq data for mock and infection!!
# Create the heatmap without hierarchical clustering
library(gplots)
png("Heatmap.png", width=1000, height=1000)
heatmap.2(
heatmap_matrix,
Colv = FALSE, # Turn off column dendrogram
Rowv = FALSE, # Turn off row dendrogram
trace = "none", # Turn off row and column dendrograms
scale = "row", # Scale the rows
col = color_scheme,
main = "GSVA Heatmap", # Title of the heatmap
key = TRUE, # Show color key
key.title = "GSVA Score", # Color key title
key.xlab = NULL, # Remove x-axis label from color key
density.info = "none", # Remove density plot
cexRow = 1.2, # Increase font size of row names
cexCol = 1.2, # Increase font size of column names
srtCol = 40, keysize = 0.72, margins = c(5, 14)
)
dev.off()
# Write the table to Excel-file
library(writexl)
# Specify the file path where you want to save the Excel file
excel_file <- "heatmap_data.xlsx"
# Save the data frame to the Excel file
write_xlsx(heatmap_data, excel_file)
# Verify that the Excel file has been saved
if (file.exists(excel_file)) {
cat("Data frame has been saved to", excel_file, "\n")
} else {
cat("Error: Failed to save the data frame to", excel_file, "\n")
}
Ordinary vs. Moderated P-values: A Key Comparison in limma Differential Expression Analysis
In the context of differential expression analysis, limma is a popular R package that originally was designed for microarray data but has since been adapted for RNA-seq data (using voom transformation). One of the unique features of limma is its use of moderated statistics. Here’s a breakdown of the difference between ordinary p-values and moderated p-values in limma:
-
Ordinary P-values:
- These are the p-values you would get if you were to do standard hypothesis testing on each gene individually without borrowing information from other genes.
- Calculated based on the ordinary standard errors.
-
Moderated P-values:
- Limma borrows information across genes to get more precise estimates of the variability for each gene. This is especially useful when the number of samples (replicates) is small.
- The process involves “shrinking” the gene-wise sample variances towards a pooled estimate, resulting in moderated t-statistics, which are more stable than ordinary t-statistics.
- Moderated p-values are calculated based on these moderated t-statistics.
- As a result, these moderated p-values tend to be more reliable, especially in experiments with small sample sizes.
-
Why is Moderation Necessary?:
- In many genomics experiments, there’s a challenge: While there are thousands of genes (or more), there might be a relatively small number of replicates or samples. This can make estimates of variance for each gene unreliable.
- By borrowing strength from the ensemble of genes, limma can stabilize these variance estimates, which, in turn, makes the resulting p-values more reliable.
-
Empirical Bayes Method:
- The moderation in limma is achieved through an empirical Bayes method. This doesn’t mean it’s a fully Bayesian approach but rather that it borrows some concepts from Bayesian statistics to stabilize variance estimates across genes.
In practice, when using limma, researchers often focus on the moderated p-values because of their enhanced reliability, especially in the context of multiple hypothesis testing in genomics. The moderated statistics help reduce the number of false positives that might arise from genes with unusually low variance estimates due to chance alone.
GSVA calculation for Mass spectrometry (MS)-based proteomics
library("rmarkdown")
library("tidyverse")
library(rmarkdown)
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_MS/LIMMA-pipeline-proteomics/Results_20231006_165122")
# -1. prepare and read signatures (namely gene sets)
#Monocytes.csv #66
#Monocytes2.csv #201
#Neutrophils.csv #40
#Neutrophils2.csv #61
#~/Tools/csv2xls-0.4/csv_to_xls.py Alt._complement.csv Anergic_or_act._T_cells.csv Anti-inflammation.csv B_cells.csv CD40_activated.csv CD8T-NK-NKT.csv Cell_cycle.csv Classical_complement.csv Cyt._act._T_cells.csv Dendritic_cells.csv Erythrocytes.csv Granulocytes.csv IFN.csv IG_chains.csv IL-1_cytokines.csv IL-6R_complex.csv Inflammasome.csv LDG.csv Lectin_complement.csv MHC_II.csv Monocytes.csv Monocyte_cell_surface.csv Monocyte_secreted.csv Myeloid_cells.csv Neutrophils.csv NK_cells.csv NLRP3_inflammasome.csv pDC.csv Plasma_cells.csv Platelets.csv Pro-inflam._IL-1.csv SNOR_low_DOWN.csv SNOR_low_UP.csv TCRA.csv TCRB.csv TCRD.csv TCRG.csv TNF.csv T_activated.csv T_cells.csv T_regs.csv Unfolded_protein.csv Inflammatory_neutrophils.csv Suppressive_neutrophils.csv Apoptosis.csv NFkB_complex.csv -d',' -o Signatures_46.xls
library(readxl)
file_path <- "../../Signatures_46.xls"
# Get the names of the sheets
sheet_names <- excel_sheets(file_path)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
# Read the sheet
data <- read_excel(file_path, sheet = sheet)
# Process the GeneSet names (replacing spaces with underscores, for example)
gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
# Add ENSEMBL IDs to the list
geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}
# Print the result to check
print(geneSets)
length(geneSets)
# 0. for the following calculation, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples.
library(gridExtra)
library(ggplot2)
library(GSVA)
exprs_ <- read.csv(sep="\t", file="20231006_165122_final_data.tsv")
# List of desired columns
desired_columns <- c("CA1", "CA2", "CA3", "CA10", "CA5", "CA6", "CA8", "CA14", "CA12", "CA13", "Con1", "Con2", "Con3", "Con4", "Con5", "Con6")
# # Find duplicated symbols
# duplicated_symbols <- exprs_$Symbol[duplicated(exprs_$Symbol) | duplicated(exprs_$Symbol, fromLast = TRUE)]
# print(duplicated_symbols)
# Filter out rows with empty 'Symbol' values
exprs_ <- exprs_[exprs_$Symbol != "", ]
# Subset the dataframe
exprs <- exprs_[, desired_columns]
# Set the 'Symbol' column as row names
rownames(exprs) <- exprs_$Symbol
exprs_matrix <- as.matrix(exprs)
# 1. Compute GSVA scores:
library(GSVA)
gsva_scores <- gsva(exprs_matrix, geneSets, method="gsva", useNames=TRUE)
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
gsva_df$Condition <- c("COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","COVID","control","control","control","control","control","control")
# 4. Filter the gsva_df to retain only the desired conditions:
# 5. Define a function to plot violin plots:
gsva_df$Condition <- factor(gsva_df$Condition, levels = c("control", "COVID"))
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "control", gene_name]
condition2_data <- data[data$Condition == "COVID", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
geom_violin(linewidth=1.2) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5)
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
return(p)
}
# 6. Generate the list of plots in a predefined order:
# [1] "Platelets" "Granulocytes"
# [3] "LDG" "pDC"
# [5] "Anti-inflammation" "Pro-inflam._IL-1"
# [7] "Dendritic_cells" "MHC_II"
# [9] "Alt._complement" "TNF"
#[11] "NLRP3_inflammasome" "Unfolded_protein"
#[13] "B_cells" "Monocyte_cell_surface"
#[15] "Inflammasome" "Monocyte_secreted"
#[17] "IL-1_cytokines" "SNOR_low_UP"
#[19] "CD40_activated" "Lectin_complement"
#[21] "Classical_complement" "Cell_cycle"
#[23] "Plasma_cells" "IG_chains"
#[25] "Erythrocytes" "IL-6R_complex"
#[27] "IFN" "TCRB"
#[29] "TCRA" "Cyt._act._T_cells"
#[31] "TCRG" "T_cells"
#[33] "CD8T-NK-NKT" "Anergic_or_act._T_cells"
#[35] "T_activated" "Neutrophils"
#[37] "NK_cells" "TCRD"
#[39] "T_regs" "SNOR_low_DOWN"
#[41] "Monocytes" "Myeloid_cells"
#[43] "Inflammatory_neutrophils" "Suppressive_neutrophils"
#[45] "Apoptosis" "NFkB_complex"
desired_order <- c("Platelets","Granulocytes","LDG","pDC","Anti-inflammation", "Pro-inflam._IL-1","Dendritic_cells","MHC_II","Alt._complement","TNF", "NLRP3_inflammasome","Unfolded_protein","B_cells","Monocyte_cell_surface","Inflammasome", "Monocyte_secreted","IL-1_cytokines","SNOR_low_UP","CD40_activated","Lectin_complement", "Classical_complement","Cell_cycle","Plasma_cells","IG_chains","Erythrocytes", "IL-6R_complex","IFN","TCRB","TCRA","Cyt._act._T_cells", "TCRG","T_cells","CD8T-NK-NKT","Anergic_or_act._T_cells","T_activated", "Neutrophils", "NK_cells","TCRD","T_regs","SNOR_low_DOWN","Monocytes", "Myeloid_cells", "Inflammatory_neutrophils", "Suppressive_neutrophils", "Apoptosis", "NFkB_complex")
genes <- colnames(gsva_df)[!colnames(gsva_df) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
filtered_genes <- genes[!is.na(genes)]
print(filtered_genes)
plots_list <- lapply(filtered_genes, function(filtered_genes) plot_violin(gsva_df, filtered_genes))
# 7. Pad the list of plots:
remaining_plots <- 6 - (length(plots_list) %% 6)
if (remaining_plots != 6) {
plots_list <- c(plots_list, rep(list(NULL), remaining_plots))
}
# 8. Create the plots and arrange them in a grid:
library(gridExtra)
plots_matrix <- matrix(plots_list, ncol=6, byrow=T)
#do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
# 9. Save the plots to a PNG:
png("Violin_Plots_GSVA_MS.png", width=1000, height=1000)
do.call("grid.arrange", c(plots_matrix, list(ncol=6)))
dev.off()
# 10. Save GSVA scores as Excel-file.
library(writexl)
write_xlsx(gsva_df, "GSVA_Scores_MS.xlsx")
LIMMA pipeline processing proteomics (MS)
Statistical analysis on LC-MS data
In order to detect significant changes between two experimental groups we performed statistical analysis on LC-MS data. We have chosen LIMMA moderated t test statistics as proposed in Kammers et al. (2015) over standard t test, since it utilizes full data to shrink the observed sample variance toward a pooled estimate, resulting in far more stable and powerful inference compared to ordinary t test. Missing values in proteomic data are a common problem and it is widely used in practice to impute missing information before applying statistical methods. We used the imputation algorithm proposed in Tyanova et al. (2016) because it allows us to randomly draw values from a distribution meant to simulate values below the detection limit of the MS instrument. We implemented statistical analysis in R and codes are freely available at the following GitHub repository https://github.com/wasimaftab/LIMMA-pipeline– proteomics/tree/master/Mitochondrial_Loop. The final volcano plots only feature proteins that are part of the high-confidence mito- chondrial proteome (Morgenstern et al., 2017).
DATA AND CODE AVAILABILITY
The codes generated during this study to perform statistical analysis on the BioID LC-MS data are freely available at GitHub repository https://github.com/wasimaftab/LIMMA-pipeline-proteomics/tree/master/Mitochondrial_Loop. The data supporting this study have been uploaded on the Mendeley Dataset public repository at https://doi.org/10.17632/9symckg9wn.
#This is a pipeline to analyze proteiomic data in a proteinGroups.txt (MaxQuant output) file for two group comparision
#install.packages("matlab")
library(dplyr)
library(stringr)
library(MASS)
library(matlab)
library(plotly)
library(limma)
library(qvalue)
library(htmlwidgets)
library(rstudioapi)
source("limma_helper_functions.R")
# ---- INPUT for Gunnar Project, directly read the sheet "Gene Prot (FOUND)" ----
## Load the proteingroups file
proteingroups_ <- read.csv("210618_MG_Carotisartery_consensus.csv", sep=",", header=TRUE)
proteingroups <- proteingroups_ %>% filter(Checked == "WAHR")
#Gene Symbol Accession CA1 CA2 CA3 CA10 CA5 CA6 CA8 CA14 CA12 CA13 CA11 CA4 Con1 Con2 Con3 Con4 Con5 Con6
#IGKV3-7 A0A075B6H7 195532822 276145022 75324344 179451608 93312672 188741058 244072994 540080672 133307714 102293349.5 133798065 166553494.5 138312451 45263876 361462136 185500736 357708246 156721253
### Kill the code if proteingroups does not contain crap columns
#temp <-
# select(
# proteingroups,
# matches("(Reverse|Potential.contaminant|Only.identified.by.site)")
# )
#if (!nrow(temp) * ncol(temp)) {
# stop("File error, It does not contain crap...enter another file with crap")
#}
## Display data to faciliate choice of treatment and control
temp <- select(proteingroups, matches("Abundance"))
print(names(temp))
### Remove "+" identifeid rows from proteingroups
#idx <- NULL
#temp <-
# select(
# proteingroups,
# matches("(Only.identified.by.site|Reverse|Potential.contaminant)")
# )
#for (i in 1:ncol(temp)) {
# index <- which(unlist(!is.na(match(temp[, i], "+"))))
# idx <- union(idx, index)
#}
#proteingroups <- proteingroups[-idx, ] # removing indexed rows
### Remove proteins with Sequence coverage [%] < 5 [%] and Unique peptides == 1
#temp <-
# select(proteingroups,
# matches("(^Sequence.coverage(.){4}$|^Unique.peptides$)"))
#proteingroups <-
# proteingroups[-union(which(temp[, 1] == 1), which(temp[, 2] < 5)) , ] # removing indexed rows
## Extract Uniprot and gene symbols
#splits <- unlist(strsplit("CAD protein OS=Mus musculus OX=10090 GN=Cad PE=1 SV=1", '\\|'))
#splits <- unlist(str_match("CAD protein OS=Mus musculus OX=10090 GN=Cad PE=1 SV=1", "GN=(.*?) PE="))
Uniprot <- character(length = nrow(proteingroups))
Symbol <- character(length = nrow(proteingroups))
Descriptions <- character(length = nrow(proteingroups))
for (i in 1:nrow(proteingroups)) {
temp <- as.character(proteingroups$Description[i])
Descriptions[i] <- temp
Uniprot[i] <- as.character(proteingroups$Accession[i])
splits <- unlist(str_match(temp, "GN=(.*?) PE="))
Symbol[i] <- splits[2]
}
# WAP_1, WAP_2, WAP_3,
# D-ALPHA_1, D-ALPHA_2, D-ALPHA_3
# Q-ALPHA_1, Q-ALPHA_2, Q-ALPHA_3
# pTTSS+Q_1, pTTSS+Q_2, pTTSS+Q_3
# pTTSS+Q-ALPHA_1, pTTSS+Q-ALPHA_2, pTTSS+Q-ALPHA_3
##Extract required data for Limma
#treatment <- readline('Enter treatment name(case insensitive) as it appeared in the iBAQ/LFQ column= ')
#control <- readline('Enter control name(case insensitive) as it appeared in the iBAQ/LFQ column= ')
#ibaq <- readinteger_binary('Enter 1 for iBAQ or 0 for LFQ= ')
treatment <- c("Abundance..F4..Sample","Abundance..F5..Sample","Abundance..F6..Sample")
control <- c("Abundance..F1..Sample","Abundance..F2..Sample","Abundance..F3..Sample")
temp <- select(proteingroups, matches(paste('^.*', "Abundance", '.*$', sep = ''))) #LFQ
treatment_reps <- select(temp, matches(treatment))
#treatment_reps <- data_sanity_check(temp, 'treatment', treatment)
control_reps <- select(temp, matches(control))
#control_reps <- data_sanity_check(temp, 'control', control)
data <- cbind(treatment_reps,
control_reps,
#select(proteingroups, matches("^id$")), #maybe problematic.
Uniprot,
Symbol)
##Extract required data for Limma
treatment <- c("Abundance..F7..Sample","Abundance..F8..Sample","Abundance..F9..Sample")
control <- c("Abundance..F1..Sample","Abundance..F2..Sample","Abundance..F3..Sample")
temp <- select(proteingroups, matches(paste('^.*', "Abundance", '.*$', sep = '')))
treatment_reps <- select(temp, matches(treatment))
control_reps <- select(temp, matches(control))
data <- cbind(treatment_reps,
control_reps,
Uniprot,
Symbol)
##Extract required data for Limma
treatment <- c("Abundance..F13..Sample","Abundance..F14..Sample","Abundance..F15..Sample")
control <- c("Abundance..F10..Sample","Abundance..F11..Sample","Abundance..F12..Sample")
temp <- select(proteingroups, matches(paste('^.*', "Abundance", '.*$', sep = '')))
treatment_reps <- select(temp, matches(treatment))
control_reps <- select(temp, matches(control))
data <- cbind(treatment_reps,
control_reps,
Uniprot,
Symbol)
#> head(data)
# Abundance..F13..Sample Abundance..F14..Sample Abundance..F15..Sample
#1 74399257195 7824880491 2,67259E+11
#2 7002793339 2132521146 23458705782
#3 1060277580 620505532 5544368780
#4 442697991,6 613281893,8 1971535002
#5 1683646539 904648237,7 8359400640
#6 360135421,6 1339034150 1614352533
# Abundance..F10..Sample Abundance..F11..Sample Abundance..F12..Sample Uniprot
#1 4487833586 4,47435E+11 2,23515E+11 B2RQC6
#2 2172021935 39427573076 28112131746 Q6ZQA0
#3 426743632,6 4947260656 6773100611 Q3UJB9
#4 926489052,4 1636804139 1250211974 Q8BX70
#5 797097872,6 10397417632 9384902608 Q9EP71
#6 2428231339 2254991154 6269016713 P26039
# Symbol
#1 Cad
#2 Nbeal2
#3 Edc4
#4 Vps13c
#5 Rai14
#6 Tln1
# ---- INPUT for Susanne Project, directly read the sheet "Gene Prot (FOUND)" ----
data <- read.csv("210618_MG_Carotisartery_consensus.csv", dec=",")
# ,,1,1,1,1,1,1,1,1,1,1, 2,2, 4,4,4,4,4,4
#Gene Symbol,Accession, CA1,CA2,CA3,CA10,CA5,CA6,CA8,CA14,CA12,CA13, CA11,CA4, Con1,Con2,Con3,Con4,Con5,Con6
# Drop columns "CA11" and "CA4"
data <- data[, !(names(data) %in% c("CA11", "CA4"))]
## Impute data
#print(names(data))
#rep_treats <- readinteger("Enter the number of treatment replicates=")
#rep_conts <- readinteger("Enter the number of control replicates=")
#FC_Cutoff <- readfloat("Enter the log fold change cut off=")
rep_treats = 10
rep_conts = 6
FC_Cutoff = 2
## removing blank rows
#NO_BLANK_ROWS --> deactivate the following code.
#For Gunnar data: temp <- as.matrix(rowSums(apply(data[, 1:(rep_treats + rep_conts)], 2, as.numeric)))
temp <- as.matrix(rowSums(apply(data[, 3:(rep_treats + rep_conts + 2)], 2, as.numeric)))
idx <- which(temp == 0)
if (length(idx)) {
data <- data[-idx,]
}
#NO INFINITE --> deactivate!
# data_limma <- log2(as.matrix( as.numeric(data[c(1:(rep_treats + rep_conts))]) ))
#data[,1] <- gsub(',', '.', data[,1])
#data[,2] <- gsub(',', '.', data[,2])
#data[,3] <- gsub(',', '.', data[,3])
#data[,4] <- gsub(',', '.', data[,4])
#data[,5] <- gsub(',', '.', data[,5])
#data[,6] <- gsub(',', '.', data[,6])
#For Gunnar data: data_limma <- log2(as.matrix( apply(data[, 1:(rep_treats + rep_conts)], 2, as.numeric) ))
data_limma <- log2(as.matrix( apply(data[, 3:(rep_treats + rep_conts + 2)], 2, as.numeric) ))
data_limma[is.infinite(data_limma)] <- NA
nan_idx <- which(is.na(data_limma))
#temp <- reshape(temp, nrow(data_limma)*ncol(data_limma), 1)
#hist(temp, na.rm = TRUE, xlab = "log2(intensity)", ylab = "Frequency", main = "All data: before imputation")
# replace the following for the code before
temp <- as.vector(data_limma)
temp <- temp[!is.na(temp)]
png("Freq_log2_before_imputation.png", width=800, height=800)
hist(temp, xlab = "log2(intensity)", ylab = "Frequency", main = "All data: before imputation")
dev.off()
fit <- fitdistr(c(na.exclude(data_limma)), "normal")
mu <- as.double(fit$estimate[1])
sigma <- as.double(fit$estimate[2])
sigma_cutoff <- 6
new_width_cutoff <- 0.3
downshift <- 1.8
width <- sigma_cutoff * sigma
new_width <- width * new_width_cutoff
new_sigma <- new_width / sigma_cutoff
new_mean <- mu - downshift * sigma
imputed_vals_my = rnorm(length(nan_idx), new_mean, new_sigma)
# scaling_factor <- readfloat_0_1("Enter a number > 0 and <=1 to scale imputed values = ")
scaling_factor = 1.0
data_limma[nan_idx] <- imputed_vals_my*scaling_factor
#data_limma[nan_idx] <- imputed_vals_my
#NOTE: [10857] values are imputed!!!
#> tail(data_limma)
# CA1 CA2 CA3 CA10 CA5 CA6 CA8 CA14
#[1785,] 24.04443 23.32959 22.77167 23.65635 21.63752 24.01719 22.81896 23.32914
#[1786,] NA NA NA NA NA 22.37408 22.64151 NA
#[1787,] NA 22.91260 22.48334 NA NA 22.02580 21.90541 NA
#[1788,] 24.19555 22.50627 NA 22.73289 23.85307 23.26974 22.76822 NA
#[1789,] NA 23.84345 23.01388 22.02657 NA NA 22.55753 21.60069
#[1790,] 24.50819 NA NA 25.07117 22.63140 21.78845 22.61968 20.99800
# CA12 CA13 Con1 Con2 Con3 Con4 Con5 Con6
#[1785,] 22.64312 23.35157 22.65619 22.88652 23.56822 22.20197 22.85973 21.98199
#[1786,] NA NA NA NA NA NA NA NA
#[1787,] NA NA NA NA NA NA NA NA
#[1788,] NA NA NA NA NA 21.93513 NA 23.16174
#[1789,] 23.15220 21.70746 20.82720 NA NA NA NA NA
#[1790,] NA 23.65803 NA 24.47523 NA 23.56952 22.89298 NA
#----impute---->
#[1785,] 22.64312 23.35157 22.65619 22.88652 23.56822 22.20197
#[1786,] 23.01317 23.12527 21.11763 20.60838 20.00608 20.90938
#[1787,] 22.20547 22.69346 23.68541 22.05089 21.50437 21.32925
#[1788,] 21.47150 22.78137 22.29056 20.30675 21.50243 21.93513
#[1789,] 23.15220 21.70746 20.82720 20.68821 22.90624 22.58886
#[1790,] 21.16174 23.65803 21.56143 24.47523 21.91832 23.56952
# CA1 CA2 CA3 CA10 CA5 CA6 CA8 CA14
#[1785,] 24.04443 23.32959 22.77167 23.65635 21.63752 24.01719 22.81896 23.32914
#[1786,] 23.37888 21.17247 21.03109 21.87904 21.97014 22.37408 22.64151 22.80035
#[1787,] 21.57987 22.91260 22.48334 21.98497 22.34736 22.02580 21.90541 20.12150
#[1788,] 24.19555 22.50627 21.69512 22.73289 23.85307 23.26974 22.76822 21.56332
#[1789,] 20.48388 23.84345 23.01388 22.02657 23.42263 21.80419 22.55753 21.60069
#[1790,] 24.50819 22.03749 21.90290 25.07117 22.63140 21.78845 22.61968 20.99800
# CA12 CA13 Con1 Con2 Con3 Con4
#[1785,] 22.64312 23.35157 22.65619 22.88652 23.56822 22.20197
#[1786,] 20.89429 23.68783 22.26621 21.77314 22.70896 22.14385
#[1787,] 22.28350 22.87021 21.09328 23.33020 21.88887 20.88960
#[1788,] 22.25577 21.17894 21.69301 22.28463 22.30470 21.93513
#[1789,] 23.15220 21.70746 20.82720 20.97298 19.93577 23.94133
#[1790,] 21.19956 23.65803 21.75923 24.47523 22.53592 23.56952
# temp <- reshape(temp, nrow(data_limma)*ncol(data_limma), 1)
temp <- as.vector(data_limma)
png("Freq_log2_after_imputation.png", width=800, height=800)
#na.rm = TRUE,
hist(temp, xlab = "log2(intensity)", ylab = "Frequency", main = "All data: after imputation")
dev.off()
## Median Normalization Module
#want_normalization <- as.integer(readline(
# cat(
# 'Enter 1: if you want to normalize the protein intensities in each experiemnt by substrating the median of the corresponding experiment\n',
# '\bEnter 2: if you want to perform column wise median normalization of the data matrix\n',
# '\b(for definition of "column wise median normalization" see README)= '
# )
#))
# if (want_normalization == 1) {
# browser()
# boxplot(data_limma[,1:rep_treats], main = paste(treatment, "replicates before normalization"))
# boxplot(data_limma[,(rep_treats+1):(rep_treats+rep_conts)], main = paste(control, "replicates before normalization"))
# Before normalization
png("before_normalization.png", width=800, height=800)
boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data before median substract normalization")
dev.off()
# Subtracting the column medians
#Subtracting Median of Columns (samples): When you subtract the median of columns (samples), you are ensuring that every sample has a median expression level set to zero (or another reference level if not using median). This is useful when you want to compare expression levels across different samples and want to correct for any systematic bias that might be present in a given sample. This type of normalization is particularly common when dealing with samples that may have been processed or sequenced at different times, by different technicians, or with slightly different protocols, leading to varying baseline levels.
#Subtracting Median of Rows (genes): When you subtract the median of rows (genes), you are normalizing expression profiles of each gene across different samples. This might be useful in cases where you want to compare the expression profiles of different genes and you are interested in relative changes in gene expression across samples, not the absolute expression level of a gene.
#In most gene expression studies, normalization by subtracting the median (or mean) of columns (samples) is the typical approach to account for variations between samples.
col_med <- matrixStats::colMedians(data_limma)
med_mat <- matlab::repmat(col_med, nrow(data_limma), 1)
data_limma <- data_limma - med_mat
#(optional) using sweep instead of matlab::repmat
#col_med <- apply(data_limma, 2, median, na.rm = TRUE)
#data_limma <- sweep(data_limma, 2, col_med)
# After normalization
png("after_normalization.png", width=800, height=800)
boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data after median substract normalization")
dev.off()
# browser()
# boxplot(data_limma[,1:rep_treats], main = paste(treatment, "replicates after normalization"))
# boxplot(data_limma[,(rep_treats+1):(rep_treats+rep_conts)], main = paste(control, "replicates after normalization"))
# } else if (want_normalization == 2) {
# boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data before column wise median normalization")
# data_limma <- median_normalization(data_limma)
# boxplot(data_limma[, 1:(rep_treats + rep_conts)], main = "data after column wise median normalization")
# # browser()
# # boxplot(data_limma[,1:rep_treats], main = paste(treatment, "replicates after normalization"))
# # boxplot(data_limma[,(rep_treats+1):(rep_treats+rep_conts)], main = paste(control, "replicates after normalization"))
# }
#For Gunnar's data
#Symbol <- data$Symbol
#Uniprot <- data$Uniprot
#For Susanne's data
Symbol <- data$Gene.Symbol
Uniprot <- data$Accession
##Limma main code
design <- model.matrix( ~ factor(c(rep(2, rep_treats), rep(1, rep_conts))))
colnames(design) <- c("Intercept", "Diff")
res.eb <- eb.fit(data_limma, design, Symbol)
Sig_FC_idx <- union(which(res.eb$logFC < (-FC_Cutoff)), which(res.eb$logFC > FC_Cutoff))
Sig_Pval_mod_idx <- which(res.eb$p.mod < 0.05)
Sig_Pval_ord_idx <- which(res.eb$p.ord < 0.05)
Sig_mod_idx <- intersect(Sig_FC_idx, Sig_Pval_mod_idx)
Sig_ord_idx <- intersect(Sig_FC_idx, Sig_Pval_ord_idx)
categ_Ord <- rep(c("Not Significant"), times = length(Symbol))
categ_Mod <- categ_Ord
categ_Mod[Sig_mod_idx] <- "Significant"
categ_Ord[Sig_ord_idx] <- "Significant"
dat <-
cbind(
res.eb,
categ_Ord,
categ_Mod,
NegLogPvalMod = (-log10(res.eb$p.mod)),
NegLogPvalOrd = (-log10(res.eb$p.ord))
)
##Save the data file
setwd("/home/jhuang/DATA/Data_Susanne_MS/LIMMA-pipeline-proteomics")
final_data <-
cbind(#DELETED select(data, matches("^id$")),
Uniprot,
Symbol,
#Descriptions,
data_limma,
dat)
#DELETED final_data <- select(final_data, -matches("^gene$"))
filename_final_data <-
paste0(format(Sys.time(), "%Y%m%d_%H%M%S"), '_final_data')
# readline('Enter a filename for final data= ')
##Create plotly object and save plot as html
filename_mod <-
paste0(format(Sys.time(), "%Y%m%d_%H%M%S"), '_limma_plot')
# readline('Enter a filename for limma plot= ')
filename_ord <-
paste0(format(Sys.time(), "%Y%m%d_%H%M%S"), '_ord_plot')
# readline('Enter a filename for ordinary t-test plot= ')
display_plotly_figs(final_data, FC_Cutoff, filename_mod, filename_ord)
write.table(
final_data,
paste(filename_final_data, '.tsv', sep = ''),
sep = '\t',
row.names = FALSE,
col.names = TRUE
)
#setwd(cur_dir)
#sorting, add description.
#delete t.ord .. s2.post
# senden auch die Method-part!
# check if Symbol==gene -log10(p.mod)
#cut -f2-2 20220221_173330_final_data.tsv > Symbol.txt
#cut -f14-14 20220221_173330_final_data.tsv > gene.txt
#https://bioconductor.org/packages/devel/bioc/vignettes/GSVA/inst/doc/GSVA.html#6_Example_applications
#Input of GSVA should log and normalized or not normalized?
https://www.mcponline.org/article/S1535-9476(20)34997-5/fulltext DEqMS: A Method for Accurate Variance Estimation in Differential Protein Expression Analysis* https://genome.duke.edu/sites/default/files/2013Lecture3-DifferentialExpressionProteomicsforweb.pdf DESeq2 https://bioconductor.statistik.tu-dortmund.de/packages/3.5/bioc/vignettes/DESeq2/inst/doc/DESeq2.html Analyzing RNA-seq data with DESeq2 Comparative analysis of statistical methods used for detecting differential expression in label-free mass spectrometry proteomics
labeled mass spectrometry counts The only type of MS-based quantitation that is amenable to DESeq2 (or similar approaches) is spectral counting. Other approaches, whether labelled of unlabelled, are continuous data; I would also recommend limma for their analysis. https://www.researchgate.net/publication/221925844_Labeling_Methods_in_Mass_Spectrometry_Based_Quantitative_Proteomics https://de.wikipedia.org/wiki/Markierungsfreie_massenspektrometrische_Quantifizierung Label-free quantification is a method in mass spectrometry that aims to determine the relative amount of proteins in two or more biological samples. Unlike other methods for protein quantification, label-free quantification does not use a stable isotope containing compound to chemically bind to and thus label the protein.[1][2] https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-6-179 https://www.agilent.com/en/product/software-informatics/mass-spectrometry-software/data-analysis https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6412084/ https://bioinformatics.stackexchange.com/questions/16227/help-with-dia-mass-spectrometry-data-analysis-with-several-conditions-limma https://www.biostars.org/p/95405/#google_vignette https://www.sciencedirect.com/science/article/pii/S1570963913001866 https://github.com/wasimaftab/LIMMA-pipeline-proteomics/blob/master/limma_main.R
Statistical analysis on LC-MS data
- To detect significant changes between two experimental groups we performed statistical analysis on LC-MS data.
Some comments to the analysis:
- The LIMMA moderated t-test statistics are chosen for two-group comparison in a proteomic experiment.
- Normalise by subtracting median: This method normalizes the protein intensities in each experiment by substrating the median of the corresponding experiment. We implemented statistical analysis in R and codes are freely available at the following GitHub repository https://github.com/wasimaftab/LIMMA-pipeline– proteomics/tree/master/Mitochondrial_Loop. The final volcano plots only feature proteins that are part of the high-confidence mito- chondrial proteome (Morgenstern et al., 2017).
- Normalise by subtracting median: This method normalizes the protein intensities in each experiemnt by substrating the median of the corresponding experiment.
- Column wise median normalization of the data matrix: This method calculates for each sample the median change (i.e. the difference between the observed value and the row average) and subtracts it from each row. Missing values are ignored in the procedure. The method is based on the assumption that a majority of the rows did not change. By default, all the rows are used for normalization but if we assume that the first 3 proteins are spike-ins then the call to median_normalization function needs to be modified as: median_normalization(data, spike_in_rows = 1:3)
- The imputation algorithm proposed in Tyanova et al. [1] was used to impute missing values before the performance of statistical methods: 1. Tyanova, S., Temu, T., Sinitcyn, P., Carlson, A., Hein, M.Y., Geiger, T., Mann,M., and Cox, J. (2016). The Perseus computational platform for comprehensive analysis of (prote)omics data. Nat. Methods 13, 731–740.
域名预订类型
snapnames、namejet、dropcatch、youdot、mediaon、namecatch、name、domainmonster、hexonet、xz、epik、pool、asiaregister、dynadot、pheenix2、backorder、godaddy、ename、flappy、hupo、hooyoo、Juming、west263、yijie、cndns、bizcn、zgsj、17ex、66cn、aliy-pre、ename-pre、xw、22cn、dcpre、epre、Other-Pre、Gname-Pre、190
-
前言
过期删除域名中,会看到域名有多种预订类型:Delete、GD-Pre、Ename-Pre、West-Pre、Aliy-Pre、Xw-Pre、22CN-Pre、Xz-Pre、Drop-Pre、SN-Pre、E-Pre、Other-pre、Gname-Pre。
-
预订类型:删除Delete
介绍说明:“Delete”是正常过期删除域名,域名因过期后未续费,从而进入删除列表;
注册时间:等同于新注册,注册时间按抢注成功后重新计算;
域名过户:得标后0-3天内会完成域名过户,特殊情况除外;
得标后是否会被赎回:不会,正常删除后原所有者并无权限赎回。
-
预订类型:Gd-Pre
介绍说明:“Gd-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要10-20天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Ename-Pre
介绍说明:“Ename-Pre”过期提前释放的域名,属于易名平台内由于域名过期后但注册者未续费,注册商提前开放预订;
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要24-28天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:West-Pre
介绍说明:“West-Pre”过期提前释放的域名,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要20-30天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Aliy-Pre
介绍说明:“Aliy-Pre”过期提前释放的域名,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:32-75天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Xw-Pre
介绍说明:“Xw-Pre”过期提前释放的域名,是属于某国内注册商,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要35-42天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:22CN-Pre
介绍说明:“22CN-Pre”提前释放拍卖域名,由用户提交或其它合作商过期提前释放拍卖的域名。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:付款完毕后30-35天完成过户,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Xz-Pre
介绍说明:“Xz-Pre”过期提前释放的域名,是国内某注册商,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要15天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Drop-Pre
介绍说明:“Drop-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。 注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要10天,特殊情况除外; 得标后是否会被赎回:被赎回的概率较小。
-
预订类型:SN-Pre
介绍说明:“SN-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要10天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:E-Pre
介绍说明:E-PRE(易释放)易释放是易名平台用户自主提交预释放交易的交易类型,用户发布后,域名将同步到抢注平台进行预订竞价。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:付款后将会过户; 得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Other-Pre
介绍说明:“Other-Pre”过期提前释放的域名,是属于国内外多个注册商,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:注册的时间不变,为得标前域名注册成功的时间;
得标过户:需要15-42天,特殊情况除外;
得标后是否会被赎回:被赎回的概率较小。
-
预订类型:Gname-Pre
介绍说明:“Gname-Pre”过期提前释放的域名,是属于某国外注册商,由于域名过期后但注册者未续费,注册商提前开放预订。
注册时间:域名注册时间与以前的不变;
得标过户:需要0-20天,特殊情况除外; 得标后是否会被赎回:得标后30天内可能被赎回,概率比较小。
SnapNames 是一家域名回购服务公司,帮助个人和企业在域名过期并再次可供注册时获取域名。以下是 SnapNames 回购过程的一般操作方法:
域名回购下单:
访问 SnapNames 网站。
搜索所需的域名。
如果该域名已被注册,但您预期它可能很快会变得可用(由于过期或其他原因),则可以对其进行回购下单。
监控并尝试注册:
一旦您下了回购订单,SnapNames 会监控该域名。
如果域名到期并被当前所有者释放,SnapNames 会尝试代您注册。这通常涉及快速、自动化的过程,以确保在其他人之前有更高的机会获取域名。
拍卖流程:
如果多个 SnapNames 用户回购同一域名,且 SnapNames 成功注册了该域名,则该域名将进入拍卖。
用户可以对该域名进行出价,拍卖结束时出价最高的用户将赢得该域名。
在 SnapNames 的系统中,当一个域名被成功捕获后,如果有多个用户对同一域名进行了回购下单,该域名将进入拍卖。通常情况下,只有那些对该域名进行了回购下单的用户有资格参与该域名的拍卖。这意味着,如果您没有提前进行回购下单,您通常无法参与那个域名的拍卖。
但是,请注意,每个域名后购服务的具体规定可能会有所不同,因此建议您直接查看 SnapNames 或其他域名后购服务的官方文档或与他们联系,以获取确切和最新的信息。
域名转移:
如果 SnapNames 成功地获得了域名(您是唯一的回购者或拍卖中的获胜者),该域名将转移到您的账户。
然后,您可以管理它、将其转移到另一个注册商,或设置网络托管和电子邮件等。
费用:
通常只有在 SnapNames 成功为您获取域名时才会收费。如果 SnapNames 无法获取域名,您通常不会收到费用。
注意事项:
成功并不是保证的。即使使用回购服务,也存在获取理想域名的竞争。其他的回购服务、个人或当前域名所有者(如果他们选择续约)可能在您之前获得域名。
如果您考虑使用 SnapNames 或类似的服务,请始终查看他们的服务条款和任何相关费用,以确保您了解过程和潜在成本。
SPANDx Genomic Profiling: Verifying LTtr and K331A RNASeq Mutations
-
Set up the directory for raw data.
#Replace "p600" with "control", "p602" with "LT", "p605" with "LTtr", and "p783" with "K331A". Please note that the RNAseq data from the LT_K331A_d8 replicates were sequenced alongside the ChIPseq batch. ln -s /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/Raw_Data_ChIPseq/230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf857/1_NHDF_Donor_1_p783_S1_R1_001.fastq.gz LT_K331A_d8_DonorI.fastq.gz ln -s /home/jhuang/DATA/Data_Denise_LT_DNA_Bindung/Raw_Data_ChIPseq/230306_NB501882_0417_AHMVHHBGXN/2023_022_nf_denise/nf858/2_NHDF_Donor_2_p783_S2_R1_001.fastq.gz LT_K331A_d8_DonorII.fastq.gz mv V_8_2_4_p600_d8_DonorI.fastq.gz control_d8_DonorI.fastq.gz mv V_8_2_3_p600_d8_DonorII.fastq.gz control_d8_DonorII.fastq.gz mv V_8_4_2_p602_d8_DonorI.fastq.gz LT_d8_DonorI.fastq.gz mv V_8_4_1_p602_d8_DonorII.fastq.gz LT_d8_DonorII.fastq.gz mv V_8_2_4_p605_d8_DonorI.fastq.gz LTtr_d8_DonorI.fastq.gz mv V_8_2_3_p605_d8_DonorII.fastq.gz LTtr_d8_DonorII.fastq.gz
-
Execute SPANDx to produce the mapping profile. For the GenBank file used, refer to the provided link.
conda activate spandx [genbank copying] mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/LT_wildtype cp LT_wt.gbk ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/LT_wildtype/genes.gbk vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank LT_wildtype -d #The mutation in LTtr is located approximately at position 780, while the K331A mutation in LT is found near position 993 out of a total of 2454 nt. MDLVLNRKEREALCKLLEIAPNCYGNIPLMKAAFKRSCLKHHPDKGGNPVIMMELNTLWSKFQQNIHKLRSDFSMFDEVDEAPIYGTTKFKEWWRSGGFSFGKAYEYGPNPHGTNSRSRKPSSNASRGAPSGSSPPHSQSSSSGYGSFSASQASDSQSRGPDIPPEHHEEPTSSSGSSSREETTNSGRESSTPNGTSVPRNSSRTDGTWEDLFCDESLSSPEPPSSSEEPEEPPSSRSSPRQPPSSSAEEASSSQFTDEEYRSSSFTTPKTPPPFSRKRKFGGSRSSASSASSASFTSTPPKPKKNRETPVPTDFPIDLSDYLSHAVYSN
TVSCFAIYTTSDKAIELYDKIEKFKVDFKSRHACELGCILLFITLSKHRVSAIKNFCSTFCTISFLICKGVNKMPEMYNNLCKPPYKLLQENKPLLNYEFQEKEKEASCNWNLVAEFACEYELDDHFIILAHYLDFAKPFPCQKCENRSRLKPHKAHEAHHSNAKLFYESKSQKTICQQAADTVLAKRRLEMLEMTRTEMLCKKFKKHLERLRDLDTIDLLYYMGGVAWYCCLFEEFEKKLQKIIQLLTENIPKYRNIWFKGPINSGKTSFAAALIDLLEGKALNINCPSDKLPFELGCALDKFMVVFEDVKGQNSLNKDLQPGQGINNLDNLRDHLDGAVAVSLEKKHVNKKHQIFPPCIVTANDYFIPKTLIARFSYTLHFSPKANLRDSLDQNMEIRKRRILQSGTTLLLCLIWCLPDTTFKPCLQEEIKNWKQILQSEISYGKFCQMIENVEAGQDPLLNILIEEEGPEETEETQDSGTFSQ* nextflow run spandx/main.nf –fastq “Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz” –ref LT_wt.fasta –annotation –database LT_wildtype –pairing SE -resume -
Open the BAM files created in the previous step using IGV.
Native elongating transcript sequencing (NET-seq)
2023-10-10 16:18:07 星期二
Net-seq(原位延伸转录测序)是一种用于以核苷酸分辨率跨基因组分析RNA聚合酶(Pol)活性的方法。以下是Net-seq的简要概述:
目的: Net-seq旨在捕获并测序仍与RNA聚合酶结合在一起的活跃转录过程中的RNA的3’末端。这有助于创建跨基因组的活跃转录位点的高分辨率图。
工作原理:
- 从细胞中提取RNA聚合酶及其绑定的RNA。
- 使用特定的方法仅选择与RNA聚合酶结合的RNA的3’端。
- 对这些3’端进行测序,从而获得在转录过程中的RNA的精确位置。
这种技术为研究基因的精确转录动态提供了一个强大的工具。
BRD4, or Bromodomain-containing protein 4, is a member of the bromodomain and extraterminal (BET) family of proteins. Here’s a brief overview:
Function: BRD4 is involved in several crucial cellular processes:
- Transcriptional Regulation: BRD4 plays a vital role in promoting the transcription of many genes by associating with acetylated chromatin. It’s particularly significant in the regulation of genes associated with the cell cycle and can recruit positive transcription elongation factor b (P-TEFb) to promote transcriptional elongation.
- Cell Cycle Progression: BRD4 is essential for progressing through various cell cycle stages, especially the G1/S transition.
Medical Relevance: Given its role in transcription and cell cycle regulation, it’s not surprising that BRD4 has been implicated in cancer biology. Inhibitors targeting the bromodomains of BRD4, commonly known as BET inhibitors, are being explored as potential therapeutic agents for various cancers.
Structure: BRD4 contains two bromodomains. Bromodomains can recognize and bind acetylated lysine residues, which are common post-translational modifications on histones and other proteins.
Associated Diseases: Alterations, mutations, or aberrant expression of BRD4 has been implicated in several cancers, including acute myeloid leukemia (AML), midline carcinoma, and others.
Inhibitors: Several small molecules that inhibit BRD4 have been developed as potential therapeutic agents. Notably, drugs like JQ1 and I-BET762 bind to the bromodomains of BRD4, inhibiting its interaction with acetylated proteins.
BRD4 is an active area of research, and new findings related to its function, regulation, and potential as a therapeutic target are regularly published.
HiS-NET-seq profiles the active enhancer landscape Putative enhancers show enhancer activity HiS-NET-seq reveals 11,007 new transcribed putative enhancers in K562 cells providing a more complete view on active enhancers. ChIP-Rx vs HiS-NET-seq: BRD4 is required for genome-wide enhancer transcription. HiChIP (H3K27Ac) is for discovering Enhancer connecting Promoter Enhancer and traget gene transcription is regulated by BRD4 Identify other post-initiation transcription regulators ablation: 切除,风化,消融,除去,切除 Acute BRD4-loss impairs transcription at the 3′-end genes. Acute BRD4-loss impairs transcription at the 5′ and 3′-end genes. BET & BRD4 degradation prompt (激励,鼓动,提示) 3′-processing defects Readthrough gene How do the distinct molecular interactions mediated by BRD4’s bromodomains contribute to its diverse roles in cellular processes, and how can this knowledge be leveraged to develop more targeted and effective therapeutic strategies for BRD4-associated diseases?
RNA-seq processing for control-LT-LTtr-K331A
#nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Denise_tx_epi_MCPyV/Raw_Data_RNAseq_K331A/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
#under jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq/Raw_Data_RNAseq_K331A_d8_SPANDx$
#Replace "p600" with "control", "p602" with "LT", "p605" with "LTtr", and "p783" with "K331A". Please note that the RNAseq data from the LT_K331A_d8 replicates were sequenced alongside the ChIPseq batch.
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf928/1_NHDF_Donor_2_p600_S23_R1_001.fastq.gz control-d8-DII_re.fastq.gz
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf929/2_NHDF_Donor_2_p783_S24_R1_001.fastq.gz LT-K331A-d8-DII_re.fastq.gz
(rnaseq_2021) jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq$ nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --star_index /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/ --bed12 /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
rsync -a -P jhuang@hamm:~/REFs/Homo_sapiens/UCSC/hg38 .
#cut -f2- merged_gene_counts.txt > merged_gene_counts_2.txt
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("/media/jhuang/Elements1/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/featureCounts_together_d8_2")
d.raw<- read.delim2("merged_gene_counts_2.txt",sep="\t", header=TRUE, row.names=1)
[1] K331A_DonorIAligned.sortedByCoord.out.bam
[2] V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam
[3] V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam
[4] V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
[5] V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam
[6] V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam
[7] K331A_DonorIIAligned.sortedByCoord.out.bam
[8] V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam
[9] V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam
[10] V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
[11] V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam
[12] V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam
[13] V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
[14] V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam
#replace p600 to control, p602 to LT, p605 to LTtr
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "LT_d8","LT_d8", "LTtr_d8","LTtr_d8", "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII"))
#NOTE_3D_PCA (IMPORTANT): should always keep the name 'DI' and 'DII' as donor names; they are important for 3D PCA drawing!
#NOTE2_3D_PCA: correct the condition names: control_d8-->control d8, K331A_d8-->K331A d8, LT_d8-->LT d8, LTtr_d8-->LTtr d8.
donor = as.factor(c("DI","DII", "DI","DII", "DI","DII", "DI","DII"))
#Note that we need selected_reordered.raw
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
> sizeFactors(dds)
NULL
> dds <- estimateSizeFactors(dds)
> sizeFactors(dds)
ctrl_d3_DonorI ctrl_d3_DonorII *ctrl_d8_DonorI *ctrl_d8_DonorII LT_d3_DonorI
1.2221026 0.8126614 1.0631329 0.6271887 0.8553245
LT_d3_DonorII *LT_d8_DonorI *LT_d8_DonorII LTtr_d3_DonorI LTtr_d3_DonorII
1.0386420 1.0324488 1.3568818 1.3070621 0.8283129
*LTtr_d8_DonorI *LTtr_d8_DonorII *K331A_DonorI *K331A_DonorII
1.6206130 0.8352236 1.6908424 0.6012788
ctrl_d8_DonorI ctrl_d8_DonorII LT_d8_DonorI LT_d8_DonorII LTtr_d8_DonorI LTtr_d8_DonorII K331A_DonorI K331A_DonorII
1.0429778 0.6131069 1.0162992 1.3406000 1.5984592 0.8268460 1.6634451 0.5943143
> sizeFactors(dds)
ctrl_d8_DonorI ctrl_d8_DonorII LT_d8_DonorI LT_d8_DonorII
1.0429778 0.6131069 1.0162992 1.3406000
LTtr_d8_DonorI LTtr_d8_DonorII K331A_d8_DonorI K331A_d8_DonorII
1.5984592 0.8268460 1.6634451 0.5943143
./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam
./STAR/K331A_DonorIAligned.sortedByCoord.out.bam
./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam
bamCoverage --bam ./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.8182619037059573 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.230524791752137 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9406161731990421 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.5944164810367278 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorI_norm.bw --binSize 10 --scaleFactor 1.1691469144166922 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorII_norm.bw --binSize 10 --scaleFactor 0.9627956504743693 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9685710322875091 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorII_norm.bw --binSize 10 --scaleFactor 0.7369838699288324 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.7650745897995206 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.2072732417906324 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.617050461769713 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.1972841763570858 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/K331A_DonorIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorI_norm.bw --binSize 10 --scaleFactor 0.5914211756222816 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorII_norm.bw --binSize 10 --scaleFactor 1.6631219993121327 --effectiveGenomeSize 2913022398
#END
rld <- rlogTransformation(dds)
library(ggplot2)
svg("pca6.svg")
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
geom_point(size=3) +
scale_color_manual(values = c("control" = "grey",
"K331A"="#1f78b4")) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
dev.off()
png("pca.png", width=800, height=800)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
geom_point(size=3) +
scale_color_manual(values = c("control" = "grey",
"K331A"="#1f78b4")) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
dev.off()
png("pca2.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
## define the desired order of row names
#desired_order <- rownames(data)
## sort the data frame by the desired order of row names
#df <- df[match(desired_order, rownames(df_pc)), ]
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
#results in 26PCs: merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
write.csv(merged_df, file="merged_df_8PCs.csv")
#> summary(pc) #0.5657 0.2195 0.1347 --> 0.57 0.22 0.13
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 19.4051 12.0878 9.4683 4.5569 4.01016 3.00610 2.71918
#Proportion of Variance 0.5657 0.2195 0.1347 0.0312 0.02416 0.01358 0.01111
#Cumulative Proportion 0.5657 0.7853 0.9200 0.9512 0.97531 0.98889 1.00000
#---- * to untreated ----
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8", "LT_d8_vs_control_d8", "LTtr_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#cd ${comp}_output; \
#~/Tools/csv2xls-0.4/csv_to_xls.py upregulated_filtered downregulated_filtered background -d$',' -o ../${comp}_degenes.xls; \
#cd ..; \
~/Tools/csv2xls-0.4/csv_to_xls.py \
K331A_d8_vs_control_d8-all.txt \
K331A_d8_vs_control_d8-up.txt \
K331A_d8_vs_control_d8-down.txt \
-d$',' -o K331A_d8_vs_control_d8.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
LT_d8_vs_control_d8-all.txt \
LT_d8_vs_control_d8-up.txt \
LT_d8_vs_control_d8-down.txt \
-d$',' -o LT_d8_vs_control_d8.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
LTtr_d8_vs_control_d8-all.txt \
LTtr_d8_vs_control_d8-up.txt \
LTtr_d8_vs_control_d8-down.txt \
-d$',' -o LTtr_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt
# 371 LT_d8_vs_control_d8-up.txt
# 292 LTtr_d8_vs_control_d8-up.txt
# 1382 total
# 323 K331A_d8_vs_control_d8-down.txt
# 141 LT_d8_vs_control_d8-down.txt
# 80 LTtr_d8_vs_control_d8-down.txt
# 544 total
library(ggrepel)
geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
up1 <-read.csv(file="./K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="./LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="./LTtr_d8_vs_control_d8-up.txt")
down1 <-read.csv(file="./K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="./LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="./LTtr_d8_vs_control_d8-down.txt")
RNASeq.NoCellLine <- assay(rld)
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL, up3$SYMBOL,down3$SYMBOL, up5$SYMBOL,down5$SYMBOL) #1920
all_genes <- unique(all_genes) #1223
RNASeq.NoCellLine_ <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")
##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.4) #1.08: 3 clusters;
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2", "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
write.csv(names(subset(mycl, mycl == '4')),file='cluster4_DARKMAGENTA.txt')
write.csv(names(subset(mycl, mycl == '5')),file='cluster5_DARKCYAN.txt')
~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
cluster4_DARKMAGENTA.txt \
cluster5_DARKCYAN.txt \
-d$',' -o gene_clusters.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;
# ------------ LT (p602) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "LT_d8","LT_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII"))
donor = as.factor(c("DI","DII", "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII LT_d8_DonorI LT_d8_DonorII
# 1.0879047 0.6411992 1.0519079 1.3937541
rld <- rlogTransformation(dds)
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LT_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LT_d8_vs_control_d8-all.txt \
#LT_d8_vs_control_d8-up.txt \
#LT_d8_vs_control_d8-down.txt \
#-d$',' -o LT_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt
# 371 LT_d8_vs_control_d8-up.txt --> 399
# 292 LTtr_d8_vs_control_d8-up.txt
# 323 K331A_d8_vs_control_d8-down.txt
# 141 LT_d8_vs_control_d8-down.txt --> 184
# 80 LTtr_d8_vs_control_d8-down.txt
library(ggrepel)
geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
# ------------ LTtr (p605) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "LTtr_d8","LTtr_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII"))
donor = as.factor(c("DI","DII", "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII LTtr_d8_DonorI LTtr_d8_DonorII
# 1.0990996 0.6476647 1.6536010 0.8632049
rld <- rlogTransformation(dds)
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LTtr_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LTtr_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LTtr_d8_vs_control_d8-all.txt \
#LTtr_d8_vs_control_d8-up.txt \
#LTtr_d8_vs_control_d8-down.txt \
#-d$',' -o LTtr_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt
# 371 LT_d8_vs_control_d8-up.txt --> 399
# 292 LTtr_d8_vs_control_d8-up.txt --> 380
# 323 K331A_d8_vs_control_d8-down.txt
# 141 LT_d8_vs_control_d8-down.txt --> 184
# 80 LTtr_d8_vs_control_d8-down.txt --> 136
library(ggrepel)
geness_res <- read.csv(file = paste("LTtr_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("LTtr_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
# ------------ LT K331A (p783) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII"))
donor = as.factor(c("DI","DII", "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII K331A_d8_DonorI K331A_d8_DonorII
# 1.1747544 0.6871283 1.8656332 0.6817289
rld <- rlogTransformation(dds)
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_K331A_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#~/Tools/csv2xls-0.4/csv_to_xls.py \
#K331A_d8_vs_control_d8-all.txt \
#K331A_d8_vs_control_d8-up.txt \
#K331A_d8_vs_control_d8-down.txt \
#-d$',' -o K331A_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt --> 315
# 371 LT_d8_vs_control_d8-up.txt --> 399
# 292 LTtr_d8_vs_control_d8-up.txt --> 380
# 323 K331A_d8_vs_control_d8-down.txt --> 277
# 141 LT_d8_vs_control_d8-down.txt --> 184
# 80 LTtr_d8_vs_control_d8-down.txt --> 136
library(ggrepel)
geness_res <- read.csv(file = paste("K331A_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("K331A_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
# ------------------- heatmap for separate analyses ----------------
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/heatmap_separate_analyses")
up1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-up.txt")
down1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-down.txt")
RNASeq.NoCellLine <- assay(rld) # using the overview analysis, only choose the gene ids between conditions seperately. NOT DONE!
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL, up3$SYMBOL,down3$SYMBOL, up5$SYMBOL,down5$SYMBOL) #1685
all_genes <- unique(all_genes) #1685-->1016
RNASeq.NoCellLine_ <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")
##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.08)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2", "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
-d$',' -o gene_culsters.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;
Nextflow RNAseq
- Merge re-sequenced FastQ files (cat)
- Sub-sample FastQ files and auto-infer strandedness (fq, Salmon)
- Read QC (FastQC)
- UMI extraction (UMI-tools)
- Adapter and quality trimming (Trim Galore!)
- Removal of genome contaminants (BBSplit)
- Removal of ribosomal RNA (SortMeRNA)
- Choice of multiple alignment and quantification routes: STAR -> Salmon STAR -> RSEM HiSAT2 -> NO QUANTIFICATION
- Sort and index alignments (SAMtools)
- UMI-based deduplication (UMI-tools)
- Duplicate read marking (picard MarkDuplicates)
- Transcript assembly and quantification (StringTie)
- Create bigWig coverage files (BEDTools, bedGraphToBigWig)
- Extensive quality control: RSeQC Qualimap dupRadar Preseq DESeq2
- Pseudo-alignment and quantification (Salmon; optional)
- Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks (MultiQC, R)
umi-tools extract:
Flexible removal of UMI sequences from fastq reads.
UMIs are removed and appended to the read name. Any other barcode, for example a library barcode, is left on the read. Can also filter reads by quality or against a whitelist (see above)
The remaining commands, group, dedup and count/count_tab, are used to identify PCR duplicates using the UMIs and perform different levels of analysis depending on the needs of the user. A number of different UMI deduplication schemes are enabled - The recommended method is directional.
umi-tools dedup:
Groups PCR duplicates and deduplicates reads to yield one read per group
Use this when you want to remove the PCR duplicates prior to any downstream analysis
Introducing BBSplit: Read Binning Tool for Metagenomes and Contaminated Libraries
Removal of genome contaminants (BBSplit)
Removal of ribosomal RNA
StringTie for Transcript assembly and quantification
Extensive quality control:
The preseq package is aimed at predicting and estimating the complexity of a genomic sequencing library,
Herpes-virus is double-stranded RNA.
Herpes-simplex-Viren (humane Herpesviren Typ 1 und 2) sind häufige Ursachen rezidivierender Infektionen mit Beteiligung von Haut, Mund, Lippen, Augen und …
Salmon used expectation–maximization (EM) algorithm to assign reads if two copy of genes occurs.