Contents

This vignette assumes that you have already properly installed rMAUPS.

1 Searching protein sequence

First, load the rMAUPS package.

library(rMAUPS)

Next, read in protein sequences from UniProt.

# Locate the example UniProt table bundled with rMAUPS, load it as a
# tab-delimited data frame, and preview the first rows.
uniprot_path <- system.file(
  "extdata", "human_uniprot_seq.txt",
  package = "rMAUPS"
)
uniprot <- read.delim(uniprot_path, sep = "\t")
head(uniprot)
##      gene     ID
## 1    KRAS P01116
## 2 CYP4F11 Q9HBI6
## 3    CDH5 P33151
## 4   CCDC8 Q9H0W5
## 5   CEP76 Q8TAP6
## 6   CEP72 Q9P209
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   protein_sequence
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM
## 2                                                                                                                                                                                                                                                                     MPQLSLSWLGLGPVAASPWLLLLLVGGSWLLARVLAWTYTFYDNCRRLQCFPQPPKQNWFWGHQGLVTPTEEGMKTLTQLVTTYPQGFKLWLGPTFPLLILCHPDIIRPITSASAAVAPKDMIFYGFLKPWLGDGLLLSGGDKWSRHRRMLTPAFHFNILKPYMKIFNKSVNIMHDKWQRLASEGSARLDMFEHISLMTLDSLQKCVFSFESNCQEKPSEYIAAILELSAFVEKRNQQILLHTDFLYYLTPDGQRFRRACHLVHDFTDAVIQERRCTLPTQGIDDFLKNKAKSKTLDFIDVLLLSKDEDGKELSDEDIRAEADTFMFEGHDTTASGLSWVLYHLAKHPEYQEQCRQEVQELLKDREPIEIEWDDLAQLPFLTMCIKESLRLHPPVPVISRCCTQDFVLPDGRVIPKGIVCLINIIGIHYNPTVWPDPEVYDPFRFDQENIKERSPLAFIPFSAGPRNCIGQAFAMAEMKVVLALTLLHFRILPTHTEPRRKPELILRAEGGLWLRVEPLGANSQ
## 3 MQRLMMLLATSGACLGLLAVAAVAAAGANPAQRDTHSLLPTHRRQKRDWIWNQMHIDEEKNTSLPHHVGKIKSSVSRKNAKYLLKGEYVGKVFRVDAETGDVFAIERLDRENISEYHLTAVIVDKDTGENLETPSSFTIKVHDVNDNWPVFTHRLFNASVPESSAVGTSVISVTAVDADDPTVGDHASVMYQILKGKEYFAIDNSGRIITITKSLDREKQARYEIVVEARDAQGLRGDSGTATVLVTLQDINDNFPFFTQTKYTFVVPEDTRVGTSVGSLFVEDPDEPQNRMTKYSILRGDYQDAFTIETNPAHNEGIIKPMKPLDYEYIQQYSFIVEATDPTIDLRYMSPPAGNRAQVIINITDVDEPPIFQQPFYHFQLKENQKKPLIGTVLAMDPDAARHSIGYSIRRTSDKGQFFRVTKKGDIYNEKELDREVYPWYNLTVEAKELDSTGTPTGKESIVQVHIEVLDENDNAPEFAKPYQPKVCENAVHGQLVLQISAIDKDITPRNVKFKFILNTENNFTLTDNHDNTANITVKYGQFDREHTKVHFLPVVISDNGMPSRTGTSTLTVAVCKCNEQGEFTFCEDMAAQVGVSIQAVVAILLCILTITVITLLIFLRRRLRKQARAHGKSVPEIHEQLVTYDEEGGGEMDTTSYDVSVLNSVRRGGAKPPRPALDARPSLYAQVQKPPRHAPGAHGGPGEMAAMIEVKKDEADHDGDGPPYDTLHIYGYEGSESIAESLSSLGTDSSDSDVDYDFLNDWGPRFKMLAELYGSDPREELLY
## 4                                                                                                                                                                                                                                                       MLQIGEDVDYLLIPREVRLAGGVWRVISKPATKEAEFRERLTQFLEEEGRTLEDVARIMEKSTPHPPQPPKKPKEPRVRRRVQQMVTPPPRLVVGTYDSSNASDSEFSDFETSRDKSRQGPRRGKKVRKMPVSYLGSKFLGSDLESEDDEELVEAFLRRQEKQPSAPPARRRVNLPVPMFEDNLGPQLSKADRWREYVSQVSWGKLKRRVKGWAPRAGPGVGEARLASTAVESAGVSSAPEGTSPGDRLGNAGDVCVPQASPRRWRPKINWASFRRRRKEQTAPTGQGADIEADQGGEAADSQREEAIADQREGAAGNQRAGAPADQGAEAADNQREEAADNQRAGAPAEEGAEAADNQREEAADNQRAEAPADQRSQGTDNHREEAADNQRAEAPADQGSEVTDNQREEAVHDQRERAPAVQGADNQRAQARAGQRAEAAHNQRAGAPGIQEAEVSAAQGTTGTAPGARARKQVKTVRFQTPGRFSWFCKRRRAFWHTPRLPTLPKRVPRAGEARNLRVLRAEARAEAEQGEQEDQL
## 5                                                                                                                              MSLPPEKASELKQLIHQQLSKMDVHGRIREILAETIREELAPDQQHLSTEDLIKALRRRGIIDDVMKELNFVTDSVEQELPSSPKQPICFDRQSTLKKTNIDPTRRYLYLQVLGGKAFLEHLQEPEPLPGQVCSTFTLCLHYRNQRFRSKPVPCACEPDFHDGFLLEVHRESLGDGTRMADSTTMLSISDPIHMVLIKTDIFGETTLVASYFLEWRSVLGSENGVTSLTVELMGVGTESKVSVGILNIKLEMYPPLNQTLSQEVVNTQLALERQKTAEKERLFLVYAKQWWREYLQIRPSHNSRLVKIFAQDENGINRPVCSYVKPLRAGRLLDTPRQAARFVNVLGYERAPVIGGGGKQEQWCTLLAFLCRNKGDCEDHANLLCSLLLGYGLEAFVCVGTKAKGVPHAWVMTCGTDGAITFWESLTGHRYIHKPTNPDEPPVAEQPKPLYPYRTIGCVFNHQMFLGNCQPSDAVETCVFDLNDESKWKPMSEEAIKSVCAPGATTSLPPFPPLCASTIDASVTSNEIEMQLRLLVSEHRKDLGLTTVWEDQLSYLLSPALASYEFERTTSISAGNEEFQDAIRRAVPDGHTFKGFPIHFVYRNARRAFATCLRSPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL
## 6                                                                                                                                          MARAGPRLVLSEEAVRAKSGLGPHRDLAELQSLSIPGTYQEKITHLGHSLMSLTGLKSLDLSRNSLVSLEGIQYLTALESLNLYYNCISSLAEVFRLHALTELVDVDFRLNPVVKVEPDYRLFVVHLLPKLQQLDDRPVRASERKASRLHFASEDSLDSKESVPASLKEGRPHHPRAKCTEALAKQSLVMDADDEAVLNLIAECEWDLGRPPGSTSFSQKGREADSRGSQESRHLLSPQLVQYQCGDSGKQGRETRRSSCRGCCLEKMPWSQLCGELPPLYGAEPEASRAPRPHTYFTPHPDSMDTEDSASSQKLDLSGEMVPGPLPAPGKCRKRRMPVGRFQTFSDQEGLGCPERTHGSSVPKESLSRQDSSESRNGRTLSQPEASETEEQRSRGVTDTREPSPGSHSALPGKKTALQAALLETLLDLVDRSWGGCRSLHSNEAFLAQARHILSSVEEFTAAQDSSAMVGEDVGSLALESKSLQSRLAEQQQQHAREMSEVTAELHHTHKELDDLRQHLDKSLEENSRLKSLLLSMKKEVKSADTAATLNLQIAGLQTSVKRLCGEIVELKQHLEHYDKIQELTQMLQESHSSLVSTNEHLLQELSQVRAQHRAEVEQMHWSYQELKKTMALFPHSSASHGGCQAC

Next, search the protein sequences for lysine residues (“K”).

# Search only the rows returned by head() so the example stays quick.
first_proteins <- head(uniprot)
lysine_pos <- searchProtSeq(first_proteins, "K")
head(lysine_pos)
##   UniprotId start end
## 1    P01116     5   5
## 2    P01116    16  16
## 3    P01116    42  42
## 4    P01116    88  88
## 5    P01116   101 101
## 6    P01116   104 104

Note that, to keep this example quick, the command above only searched the first few protein sequences (the rows returned by head()). Additionally, although not used here, the searchProtSeq function can also search for sequences using regular expressions.

2 Viewing protein structure

First, we’ll view all of the lysine residues for KRAS.

# Keep only the search hits that belong to P01116; the row filter is
# computed once and reused for both the start and end columns.
id <- "P01116"
is_target <- lysine_pos[["UniprotId"]] == id
start <- lysine_pos[is_target, "start"]
end <- lysine_pos[is_target, "end"]

# View the selected residues on the protein structure: pass the UniProt ID
# together with the start and end positions collected above.
browseProtStructure(id, start, end)
## https://mupit.icm.jhu.edu/MuPIT_Interactive/?gm=P01116:5,P01116:16,P01116:42,P01116:88,P01116:101,P01116:104,P01116:117,P01116:128,P01116:147,P01116:169,P01116:170,P01116:173,P01116:176,P01116:182,P01116:184,P01116:185&protquery=y

Next, we’ll compare those with the lysines that are reported ubiquitination sites.

# Here, we manually create a data frame of the KRAS ubiquitination sites.
# In a realistic scenario, you should load the "Ubiquitination_site_dataset"
# file from the PhosphoSitePlus DB using the readPSPUbiquitin function.
# You'll need to download it yourself due to copy protection.
id <- "P01116"
ub <- data.frame(gene = "KRAS", ID = id, position = c(117, 128, 147))

# Keep the positions recorded for P01116. Each site is a single residue, so
# the same positions are used as both start and end.
ub_positions <- ub[ub[["ID"]] == id, "position"]
start <- ub_positions
end <- ub_positions

# View the ubiquitination sites on the protein structure: pass the UniProt ID
# together with the start and end positions selected above.
browseProtStructure(id, start, end)
## https://mupit.icm.jhu.edu/MuPIT_Interactive/?gm=P01116:117,P01116:128,P01116:147&protquery=y

3 Session info

## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS  10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] rMAUPS_0.1.1     BiocStyle_2.16.1
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.10           
##   [3] BiocFileCache_1.12.1        sn_1.6-2                   
##   [5] plyr_1.8.6                  GSEABase_1.50.1            
##   [7] splines_4.0.2               BiocParallel_1.22.0        
##   [9] TH.data_1.0-10              GenomeInfoDb_1.24.2        
##  [11] ggplot2_3.3.2               digest_0.6.26              
##  [13] foreach_1.5.1               htmltools_0.5.0            
##  [15] magrittr_1.5                memoise_1.1.0              
##  [17] doParallel_1.0.16           openxlsx_4.2.2             
##  [19] limma_3.44.3                annotate_1.66.0            
##  [21] msmsTests_1.26.0            matrixStats_0.57.0         
##  [23] sandwich_3.0-0              askpass_1.1                
##  [25] prettyunits_1.1.1           colorspace_1.4-1           
##  [27] msmsEDA_1.26.0              blob_1.2.1                 
##  [29] rappdirs_0.3.1              ggrepel_0.8.2              
##  [31] haven_2.3.1                 rbibutils_1.3              
##  [33] xfun_0.18                   dplyr_1.0.2                
##  [35] jsonlite_1.7.2              crayon_1.3.4               
##  [37] RCurl_1.98-1.2              graph_1.66.0               
##  [39] genefilter_1.70.0           impute_1.62.0              
##  [41] zoo_1.8-8                   survival_3.2-7             
##  [43] iterators_1.0.13            glue_1.4.2                 
##  [45] gtable_0.3.0                zlibbioc_1.34.0            
##  [47] XVector_0.28.0              DelayedArray_0.14.1        
##  [49] car_3.0-10                  BiocGenerics_0.34.0        
##  [51] msigdbr_7.2.1               abind_1.4-5                
##  [53] scales_1.1.1                vsn_3.56.0                 
##  [55] mvtnorm_1.1-1               DBI_1.1.0                  
##  [57] edgeR_3.30.3                rstatix_0.6.0              
##  [59] Rcpp_1.0.5                  plotrix_3.7-8              
##  [61] metap_1.4                   mzR_2.22.0                 
##  [63] xtable_1.8-4                progress_1.2.2             
##  [65] tmvnsim_1.0-2               foreign_0.8-80             
##  [67] bit_4.0.4                   preprocessCore_1.50.0      
##  [69] stats4_4.0.2                GSVA_1.36.3                
##  [71] httr_1.4.2                  gplots_3.1.0               
##  [73] RColorBrewer_1.1-2          TFisher_0.2.0              
##  [75] ellipsis_0.3.1              pkgconfig_2.0.3            
##  [77] XML_3.99-0.5                dbplyr_1.4.4               
##  [79] locfit_1.5-9.4              reshape2_1.4.4             
##  [81] tidyselect_1.1.0            rlang_0.4.8                
##  [83] later_1.1.0.1               AnnotationDbi_1.50.3       
##  [85] munsell_0.5.0               cellranger_1.1.0           
##  [87] tools_4.0.2                 generics_0.1.0             
##  [89] RSQLite_2.2.1               mathjaxr_1.0-1             
##  [91] broom_0.7.2                 evaluate_0.14              
##  [93] stringr_1.4.0               fastmap_1.0.1              
##  [95] mzID_1.26.0                 yaml_2.2.1                 
##  [97] knitr_1.30                  bit64_4.0.5                
##  [99] zip_2.1.1                   caTools_1.18.0             
## [101] purrr_0.3.4                 ncdf4_1.17                 
## [103] mime_0.9                    xml2_1.3.2                 
## [105] biomaRt_2.44.4              compiler_4.0.2             
## [107] shinythemes_1.1.2           curl_4.3                   
## [109] affyio_1.58.0               ggsignif_0.6.0             
## [111] tibble_3.0.4                geneplotter_1.66.0         
## [113] stringi_1.5.3               forcats_0.5.0              
## [115] MSnbase_2.14.2              lattice_0.20-41            
## [117] ProtGenerics_1.20.0         Matrix_1.2-18              
## [119] multtest_2.44.0             vctrs_0.3.4                
## [121] mutoss_0.1-12               pillar_1.4.6               
## [123] lifecycle_0.2.0             BiocManager_1.30.10        
## [125] Rdpack_2.0                  MALDIquant_1.19.3          
## [127] data.table_1.13.0           bitops_1.0-6               
## [129] gbRd_0.4-11                 qvalue_2.20.0              
## [131] httpuv_1.5.4                GenomicRanges_1.40.0       
## [133] R6_2.4.1                    pcaMethods_1.80.0          
## [135] affy_1.66.0                 bookdown_0.21              
## [137] promises_1.1.1              KernSmooth_2.23-17         
## [139] rio_0.5.16                  IRanges_2.22.2             
## [141] codetools_0.2-16            gtools_3.8.2               
## [143] MASS_7.3-53                 assertthat_0.2.1           
## [145] SummarizedExperiment_1.18.2 openssl_1.4.3              
## [147] DESeq2_1.28.1               mnormt_2.0.2               
## [149] multcomp_1.4-14             S4Vectors_0.26.1           
## [151] GenomeInfoDbData_1.2.3      parallel_4.0.2             
## [153] hms_0.5.3                   grid_4.0.2                 
## [155] tidyr_1.1.2                 rmarkdown_2.4              
## [157] carData_3.0-4               ggpubr_0.4.0               
## [159] numDeriv_2016.8-1.1         Biobase_2.48.0             
## [161] shiny_1.5.0