rMAUPS - 0.1.1
This vignette assumes that you have already properly installed rMAUPS.
First, load the rMAUPS package.
library(rMAUPS)
Next, read in the protein sequences obtained from UniProt.
# Locate the example UniProt sequence table that ships with rMAUPS.
# mustWork = TRUE makes a missing file fail right here with a clear error,
# instead of returning "" and failing later inside read.delim().
uniprot_path <- system.file(
  "extdata", "human_uniprot_seq.txt",
  package = "rMAUPS", mustWork = TRUE
)
# Read the tab-delimited table and preview its first rows.
uniprot <- read.delim(uniprot_path, sep = "\t")
head(uniprot)
## gene ID
## 1 KRAS P01116
## 2 CYP4F11 Q9HBI6
## 3 CDH5 P33151
## 4 CCDC8 Q9H0W5
## 5 CEP76 Q8TAP6
## 6 CEP72 Q9P209
## protein_sequence
## 1 MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM
## 2 MPQLSLSWLGLGPVAASPWLLLLLVGGSWLLARVLAWTYTFYDNCRRLQCFPQPPKQNWFWGHQGLVTPTEEGMKTLTQLVTTYPQGFKLWLGPTFPLLILCHPDIIRPITSASAAVAPKDMIFYGFLKPWLGDGLLLSGGDKWSRHRRMLTPAFHFNILKPYMKIFNKSVNIMHDKWQRLASEGSARLDMFEHISLMTLDSLQKCVFSFESNCQEKPSEYIAAILELSAFVEKRNQQILLHTDFLYYLTPDGQRFRRACHLVHDFTDAVIQERRCTLPTQGIDDFLKNKAKSKTLDFIDVLLLSKDEDGKELSDEDIRAEADTFMFEGHDTTASGLSWVLYHLAKHPEYQEQCRQEVQELLKDREPIEIEWDDLAQLPFLTMCIKESLRLHPPVPVISRCCTQDFVLPDGRVIPKGIVCLINIIGIHYNPTVWPDPEVYDPFRFDQENIKERSPLAFIPFSAGPRNCIGQAFAMAEMKVVLALTLLHFRILPTHTEPRRKPELILRAEGGLWLRVEPLGANSQ
## 3 MQRLMMLLATSGACLGLLAVAAVAAAGANPAQRDTHSLLPTHRRQKRDWIWNQMHIDEEKNTSLPHHVGKIKSSVSRKNAKYLLKGEYVGKVFRVDAETGDVFAIERLDRENISEYHLTAVIVDKDTGENLETPSSFTIKVHDVNDNWPVFTHRLFNASVPESSAVGTSVISVTAVDADDPTVGDHASVMYQILKGKEYFAIDNSGRIITITKSLDREKQARYEIVVEARDAQGLRGDSGTATVLVTLQDINDNFPFFTQTKYTFVVPEDTRVGTSVGSLFVEDPDEPQNRMTKYSILRGDYQDAFTIETNPAHNEGIIKPMKPLDYEYIQQYSFIVEATDPTIDLRYMSPPAGNRAQVIINITDVDEPPIFQQPFYHFQLKENQKKPLIGTVLAMDPDAARHSIGYSIRRTSDKGQFFRVTKKGDIYNEKELDREVYPWYNLTVEAKELDSTGTPTGKESIVQVHIEVLDENDNAPEFAKPYQPKVCENAVHGQLVLQISAIDKDITPRNVKFKFILNTENNFTLTDNHDNTANITVKYGQFDREHTKVHFLPVVISDNGMPSRTGTSTLTVAVCKCNEQGEFTFCEDMAAQVGVSIQAVVAILLCILTITVITLLIFLRRRLRKQARAHGKSVPEIHEQLVTYDEEGGGEMDTTSYDVSVLNSVRRGGAKPPRPALDARPSLYAQVQKPPRHAPGAHGGPGEMAAMIEVKKDEADHDGDGPPYDTLHIYGYEGSESIAESLSSLGTDSSDSDVDYDFLNDWGPRFKMLAELYGSDPREELLY
## 4 MLQIGEDVDYLLIPREVRLAGGVWRVISKPATKEAEFRERLTQFLEEEGRTLEDVARIMEKSTPHPPQPPKKPKEPRVRRRVQQMVTPPPRLVVGTYDSSNASDSEFSDFETSRDKSRQGPRRGKKVRKMPVSYLGSKFLGSDLESEDDEELVEAFLRRQEKQPSAPPARRRVNLPVPMFEDNLGPQLSKADRWREYVSQVSWGKLKRRVKGWAPRAGPGVGEARLASTAVESAGVSSAPEGTSPGDRLGNAGDVCVPQASPRRWRPKINWASFRRRRKEQTAPTGQGADIEADQGGEAADSQREEAIADQREGAAGNQRAGAPADQGAEAADNQREEAADNQRAGAPAEEGAEAADNQREEAADNQRAEAPADQRSQGTDNHREEAADNQRAEAPADQGSEVTDNQREEAVHDQRERAPAVQGADNQRAQARAGQRAEAAHNQRAGAPGIQEAEVSAAQGTTGTAPGARARKQVKTVRFQTPGRFSWFCKRRRAFWHTPRLPTLPKRVPRAGEARNLRVLRAEARAEAEQGEQEDQL
## 5 MSLPPEKASELKQLIHQQLSKMDVHGRIREILAETIREELAPDQQHLSTEDLIKALRRRGIIDDVMKELNFVTDSVEQELPSSPKQPICFDRQSTLKKTNIDPTRRYLYLQVLGGKAFLEHLQEPEPLPGQVCSTFTLCLHYRNQRFRSKPVPCACEPDFHDGFLLEVHRESLGDGTRMADSTTMLSISDPIHMVLIKTDIFGETTLVASYFLEWRSVLGSENGVTSLTVELMGVGTESKVSVGILNIKLEMYPPLNQTLSQEVVNTQLALERQKTAEKERLFLVYAKQWWREYLQIRPSHNSRLVKIFAQDENGINRPVCSYVKPLRAGRLLDTPRQAARFVNVLGYERAPVIGGGGKQEQWCTLLAFLCRNKGDCEDHANLLCSLLLGYGLEAFVCVGTKAKGVPHAWVMTCGTDGAITFWESLTGHRYIHKPTNPDEPPVAEQPKPLYPYRTIGCVFNHQMFLGNCQPSDAVETCVFDLNDESKWKPMSEEAIKSVCAPGATTSLPPFPPLCASTIDASVTSNEIEMQLRLLVSEHRKDLGLTTVWEDQLSYLLSPALASYEFERTTSISAGNEEFQDAIRRAVPDGHTFKGFPIHFVYRNARRAFATCLRSPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL
## 6 MARAGPRLVLSEEAVRAKSGLGPHRDLAELQSLSIPGTYQEKITHLGHSLMSLTGLKSLDLSRNSLVSLEGIQYLTALESLNLYYNCISSLAEVFRLHALTELVDVDFRLNPVVKVEPDYRLFVVHLLPKLQQLDDRPVRASERKASRLHFASEDSLDSKESVPASLKEGRPHHPRAKCTEALAKQSLVMDADDEAVLNLIAECEWDLGRPPGSTSFSQKGREADSRGSQESRHLLSPQLVQYQCGDSGKQGRETRRSSCRGCCLEKMPWSQLCGELPPLYGAEPEASRAPRPHTYFTPHPDSMDTEDSASSQKLDLSGEMVPGPLPAPGKCRKRRMPVGRFQTFSDQEGLGCPERTHGSSVPKESLSRQDSSESRNGRTLSQPEASETEEQRSRGVTDTREPSPGSHSALPGKKTALQAALLETLLDLVDRSWGGCRSLHSNEAFLAQARHILSSVEEFTAAQDSSAMVGEDVGSLALESKSLQSRLAEQQQQHAREMSEVTAELHHTHKELDDLRQHLDKSLEENSRLKSLLLSMKKEVKSADTAATLNLQIAGLQTSVKRLCGEIVELKQHLEHYDKIQELTQMLQESHSSLVSTNEHLLQELSQVRAQHRAEVEQMHWSYQELKKTMALFPHSSASHGGCQAC
Next, search the protein sequences for lysine residues (“K”).
# Search only the first six sequences for the pattern "K" (lysine), then
# preview the first six hits.
lysine_pos <- searchProtSeq(head(uniprot, n = 6L), "K")
head(lysine_pos, n = 6L)
## UniprotId start end
## 1 P01116 5 5
## 2 P01116 16 16
## 3 P01116 42 42
## 4 P01116 88 88
## 5 P01116 101 101
## 6 P01116 104 104
Note that, for the sake of a quick example, the above command only searched the first few protein sequences. Additionally, although not used here, the searchProtSeq function can also search for sequence patterns using regular expressions.
First, we’ll view all of the lysine residues for KRAS.
# get hits for P01116
id <- "P01116"
# Build the row mask once from the column vector: `[[` yields a plain vector,
# whereas `[` yields a one-column data frame and therefore a logical matrix.
# `%in%` never returns NA, so rows with a missing ID are simply excluded.
is_target <- lysine_pos[["UniprotId"]] %in% id
start <- lysine_pos[is_target, "start"]
end <- lysine_pos[is_target, "end"]
# view on protein structure
browseProtStructure(id, start, end)
## https://mupit.icm.jhu.edu/MuPIT_Interactive/?gm=P01116:5,P01116:16,P01116:42,P01116:88,P01116:101,P01116:104,P01116:117,P01116:128,P01116:147,P01116:169,P01116:170,P01116:173,P01116:176,P01116:182,P01116:184,P01116:185&protquery=y
Next, we’ll compare those to the lysines with reported ubiquitination sites.
# Here, we manually create a data frame of the KRAS ubiquitination sites.
# In a realistic scenario, you should load the "Ubiquitination_site_dataset" file
# from the PhosphoSitePlus database using the readPSPUbiquitin function. You'll
# need to download it yourself due to copy protection.
id <- "P01116"
ub <- data.frame(gene = "KRAS", ID = id, position = c(117, 128, 147))
# get ub sites for P01116
# Build the row mask once from the column vector (`[[` gives a plain vector;
# `%in%` never returns NA).
is_target <- ub[["ID"]] %in% id
start <- ub[is_target, "position"]
# Each site is a single residue, so every range ends where it starts.
end <- start
# view on protein structure
browseProtStructure(id, start, end)
## https://mupit.icm.jhu.edu/MuPIT_Interactive/?gm=P01116:117,P01116:128,P01116:147&protquery=y
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rMAUPS_0.1.1 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.10
## [3] BiocFileCache_1.12.1 sn_1.6-2
## [5] plyr_1.8.6 GSEABase_1.50.1
## [7] splines_4.0.2 BiocParallel_1.22.0
## [9] TH.data_1.0-10 GenomeInfoDb_1.24.2
## [11] ggplot2_3.3.2 digest_0.6.26
## [13] foreach_1.5.1 htmltools_0.5.0
## [15] magrittr_1.5 memoise_1.1.0
## [17] doParallel_1.0.16 openxlsx_4.2.2
## [19] limma_3.44.3 annotate_1.66.0
## [21] msmsTests_1.26.0 matrixStats_0.57.0
## [23] sandwich_3.0-0 askpass_1.1
## [25] prettyunits_1.1.1 colorspace_1.4-1
## [27] msmsEDA_1.26.0 blob_1.2.1
## [29] rappdirs_0.3.1 ggrepel_0.8.2
## [31] haven_2.3.1 rbibutils_1.3
## [33] xfun_0.18 dplyr_1.0.2
## [35] jsonlite_1.7.2 crayon_1.3.4
## [37] RCurl_1.98-1.2 graph_1.66.0
## [39] genefilter_1.70.0 impute_1.62.0
## [41] zoo_1.8-8 survival_3.2-7
## [43] iterators_1.0.13 glue_1.4.2
## [45] gtable_0.3.0 zlibbioc_1.34.0
## [47] XVector_0.28.0 DelayedArray_0.14.1
## [49] car_3.0-10 BiocGenerics_0.34.0
## [51] msigdbr_7.2.1 abind_1.4-5
## [53] scales_1.1.1 vsn_3.56.0
## [55] mvtnorm_1.1-1 DBI_1.1.0
## [57] edgeR_3.30.3 rstatix_0.6.0
## [59] Rcpp_1.0.5 plotrix_3.7-8
## [61] metap_1.4 mzR_2.22.0
## [63] xtable_1.8-4 progress_1.2.2
## [65] tmvnsim_1.0-2 foreign_0.8-80
## [67] bit_4.0.4 preprocessCore_1.50.0
## [69] stats4_4.0.2 GSVA_1.36.3
## [71] httr_1.4.2 gplots_3.1.0
## [73] RColorBrewer_1.1-2 TFisher_0.2.0
## [75] ellipsis_0.3.1 pkgconfig_2.0.3
## [77] XML_3.99-0.5 dbplyr_1.4.4
## [79] locfit_1.5-9.4 reshape2_1.4.4
## [81] tidyselect_1.1.0 rlang_0.4.8
## [83] later_1.1.0.1 AnnotationDbi_1.50.3
## [85] munsell_0.5.0 cellranger_1.1.0
## [87] tools_4.0.2 generics_0.1.0
## [89] RSQLite_2.2.1 mathjaxr_1.0-1
## [91] broom_0.7.2 evaluate_0.14
## [93] stringr_1.4.0 fastmap_1.0.1
## [95] mzID_1.26.0 yaml_2.2.1
## [97] knitr_1.30 bit64_4.0.5
## [99] zip_2.1.1 caTools_1.18.0
## [101] purrr_0.3.4 ncdf4_1.17
## [103] mime_0.9 xml2_1.3.2
## [105] biomaRt_2.44.4 compiler_4.0.2
## [107] shinythemes_1.1.2 curl_4.3
## [109] affyio_1.58.0 ggsignif_0.6.0
## [111] tibble_3.0.4 geneplotter_1.66.0
## [113] stringi_1.5.3 forcats_0.5.0
## [115] MSnbase_2.14.2 lattice_0.20-41
## [117] ProtGenerics_1.20.0 Matrix_1.2-18
## [119] multtest_2.44.0 vctrs_0.3.4
## [121] mutoss_0.1-12 pillar_1.4.6
## [123] lifecycle_0.2.0 BiocManager_1.30.10
## [125] Rdpack_2.0 MALDIquant_1.19.3
## [127] data.table_1.13.0 bitops_1.0-6
## [129] gbRd_0.4-11 qvalue_2.20.0
## [131] httpuv_1.5.4 GenomicRanges_1.40.0
## [133] R6_2.4.1 pcaMethods_1.80.0
## [135] affy_1.66.0 bookdown_0.21
## [137] promises_1.1.1 KernSmooth_2.23-17
## [139] rio_0.5.16 IRanges_2.22.2
## [141] codetools_0.2-16 gtools_3.8.2
## [143] MASS_7.3-53 assertthat_0.2.1
## [145] SummarizedExperiment_1.18.2 openssl_1.4.3
## [147] DESeq2_1.28.1 mnormt_2.0.2
## [149] multcomp_1.4-14 S4Vectors_0.26.1
## [151] GenomeInfoDbData_1.2.3 parallel_4.0.2
## [153] hms_0.5.3 grid_4.0.2
## [155] tidyr_1.1.2 rmarkdown_2.4
## [157] carData_3.0-4 ggpubr_0.4.0
## [159] numDeriv_2016.8-1.1 Biobase_2.48.0
## [161] shiny_1.5.0