class: title-slide, center, bottom # Build a model ## Tidymodels, virtually — Session 01 ### Alison Hill --- class: center, middle, inverse # What is Machine Learning? --- class: middle # .center[Alzheimer's disease data] Data from a clinical trial of individuals with well-characterized cognitive impairment, and age-matched control participants. ```r # install.packages("modeldata") library(modeldata) data("ad_data") alz <- ad_data ``` --- class: middle ```r glimpse(alz) # Rows: 333 # Columns: 131 # $ ACE_CD143_Angiotensin_Converti <dbl> 2.0031003, 1.561… # $ ACTH_Adrenocorticotropic_Hormon <dbl> -1.3862944, -1.3… # $ AXL <dbl> 1.09838668, 0.68… # $ Adiponectin <dbl> -5.360193, -5.02… # $ Alpha_1_Antichymotrypsin <dbl> 1.7404662, 1.458… # $ Alpha_1_Antitrypsin <dbl> -12.631361, -11.… # $ Alpha_1_Microglobulin <dbl> -2.577022, -3.24… # $ Alpha_2_Macroglobulin <dbl> -72.65029, -154.… # $ Angiopoietin_2_ANG_2 <dbl> 1.06471074, 0.74… # $ Angiotensinogen <dbl> 2.510547, 2.4572… # $ Apolipoprotein_A_IV <dbl> -1.427116, -1.66… # $ Apolipoprotein_A1 <dbl> -7.402052, -7.04… # $ Apolipoprotein_A2 <dbl> -0.26136476, -0.… # $ Apolipoprotein_B <dbl> -4.624044, -6.74… # $ Apolipoprotein_CI <dbl> -1.2729657, -1.2… # $ Apolipoprotein_CIII <dbl> -2.312635, -2.34… # $ Apolipoprotein_D <dbl> 2.0794415, 1.335… # $ Apolipoprotein_E <dbl> 3.7545215, 3.097… # $ Apolipoprotein_H <dbl> -0.15734908, -0.… # $ B_Lymphocyte_Chemoattractant_BL <dbl> 2.296982, 1.6731… # $ BMP_6 <dbl> -2.200744, -1.72… # $ Beta_2_Microglobulin <dbl> 0.69314718, 0.47… # $ Betacellulin <int> 34, 53, 49, 52, … # $ C_Reactive_Protein <dbl> -4.074542, -6.64… # $ CD40 <dbl> -0.7964147, -1.2… # $ CD5L <dbl> 0.09531018, -0.6… # $ Calbindin <dbl> 33.21363, 25.276… # $ Calcitonin <dbl> 1.3862944, 3.610… # $ CgA <dbl> 397.6536, 465.67… # $ Clusterin_Apo_J <dbl> 3.555348, 3.0445… # $ Complement_3 <dbl> -10.36305, -16.1… # $ Complement_Factor_H <dbl> 3.573725, 3.6000… # $ Connective_Tissue_Growth_Factor <dbl> 0.5306283, 0.587… 
# $ Cortisol <dbl> 10.0, 12.0, 10.0… # $ Creatine_Kinase_MB <dbl> -1.710172, -1.75… # $ Cystatin_C <dbl> 9.041922, 9.0676… # $ EGF_R <dbl> -0.1354543, -0.3… # $ EN_RAGE <dbl> -3.688879, -3.81… # $ ENA_78 <dbl> -1.349543, -1.35… # $ Eotaxin_3 <int> 53, 62, 62, 44, … # $ FAS <dbl> -0.08338161, -0.… # $ FSH_Follicle_Stimulation_Hormon <dbl> -0.6516715, -1.6… # $ Fas_Ligand <dbl> 3.1014922, 2.978… # $ Fatty_Acid_Binding_Protein <dbl> 2.5208712, 2.247… # $ Ferritin <dbl> 3.329165, 3.9329… # $ Fetuin_A <dbl> 1.2809338, 1.193… # $ Fibrinogen <dbl> -7.035589, -8.04… # $ GRO_alpha <dbl> 1.381830, 1.3724… # $ Gamma_Interferon_induced_Monokin <dbl> 2.949822, 2.7217… # $ Glutathione_S_Transferase_alpha <dbl> 1.0641271, 0.867… # $ HB_EGF <dbl> 6.559746, 8.7545… # $ HCC_4 <dbl> -3.036554, -4.07… # $ Hepatocyte_Growth_Factor_HGF <dbl> 0.58778666, 0.53… # $ I_309 <dbl> 3.433987, 3.1354… # $ ICAM_1 <dbl> -0.1907787, -0.4… # $ IGF_BP_2 <dbl> 5.609472, 5.3471… # $ IL_11 <dbl> 5.121987, 4.9367… # $ IL_13 <dbl> 1.282549, 1.2694… # $ IL_16 <dbl> 4.192081, 2.8763… # $ IL_17E <dbl> 5.731246, 6.7058… # $ IL_1alpha <dbl> -6.571283, -8.04… # $ IL_3 <dbl> -3.244194, -3.91… # $ IL_4 <dbl> 2.484907, 2.3978… # $ IL_5 <dbl> 1.09861229, 0.69… # $ IL_6 <dbl> 0.26936976, 0.09… # $ IL_6_Receptor <dbl> 0.64279595, 0.43… # $ IL_7 <dbl> 4.8050453, 3.705… # $ IL_8 <dbl> 1.711325, 1.6755… # $ IP_10_Inducible_Protein_10 <dbl> 6.242223, 5.6869… # $ IgA <dbl> -6.812445, -6.37… # $ Insulin <dbl> -0.6258253, -0.9… # $ Kidney_Injury_Molecule_1_KIM_1 <dbl> -1.204295, -1.19… # $ LOX_1 <dbl> 1.7047481, 1.526… # $ Leptin <dbl> -1.5290628, -1.4… # $ Lipoprotein_a <dbl> -4.268698, -4.93… # $ MCP_1 <dbl> 6.740519, 6.8490… # $ MCP_2 <dbl> 1.9805094, 1.808… # $ MIF <dbl> -1.237874, -1.89… # $ MIP_1alpha <dbl> 4.968453, 3.6901… # $ MIP_1beta <dbl> 3.258097, 3.1354… # $ MMP_2 <dbl> 4.478566, 3.7814… # $ MMP_3 <dbl> -2.207275, -2.46… # $ MMP10 <dbl> -3.270169, -3.64… # $ MMP7 <dbl> -3.7735027, -5.9… # $ Myoglobin <dbl> 
-1.89711998, -0.… # $ NT_proBNP <dbl> 4.553877, 4.2195… # $ NrCAM <dbl> 5.003946, 5.2094… # $ Osteopontin <dbl> 5.356586, 6.0038… # $ PAI_1 <dbl> 1.00350156, -0.0… # $ PAPP_A <dbl> -2.902226, -2.81… # $ PLGF <dbl> 4.442651, 4.0253… # $ PYY <dbl> 3.218876, 3.1354… # $ Pancreatic_polypeptide <dbl> 0.5787808, 0.336… # $ Prolactin <dbl> 0.00000000, -0.5… # $ Prostatic_Acid_Phosphatase <dbl> -1.620527, -1.73… # $ Protein_S <dbl> -1.784998, -2.46… # $ Pulmonary_and_Activation_Regulat <dbl> -0.8439701, -2.3… # $ RANTES <dbl> -6.214608, -6.93… # $ Resistin <dbl> -16.475315, -16.… # $ S100b <dbl> 1.5618560, 1.756… # $ SGOT <dbl> -0.94160854, -0.… # $ SHBG <dbl> -1.897120, -1.56… # $ SOD <dbl> 5.609472, 5.8141… # $ Serum_Amyloid_P <dbl> -5.599422, -6.11… # $ Sortilin <dbl> 4.908629, 5.4787… # $ Stem_Cell_Factor <dbl> 4.174387, 3.7135… # $ TGF_alpha <dbl> 8.649098, 11.331… # $ TIMP_1 <dbl> 15.20465, 11.266… # $ TNF_RII <dbl> -0.0618754, -0.3… # $ TRAIL_R3 <dbl> -0.1829004, -0.5… # $ TTR_prealbumin <dbl> 2.944439, 2.8332… # $ Tamm_Horsfall_Protein_THP <dbl> -3.095810, -3.11… # $ Thrombomodulin <dbl> -1.340566, -1.67… # $ Thrombopoietin <dbl> -0.1026334, -0.6… # $ Thymus_Expressed_Chemokine_TECK <dbl> 4.149327, 3.8101… # $ Thyroid_Stimulating_Hormone <dbl> -3.863233, -4.82… # $ Thyroxine_Binding_Globulin <dbl> -1.4271164, -1.6… # $ Tissue_Factor <dbl> 2.04122033, 2.02… # $ Transferrin <dbl> 3.332205, 2.8903… # $ Trefoil_Factor_3_TFF3 <dbl> -3.381395, -3.91… # $ VCAM_1 <dbl> 3.258097, 2.7080… # $ VEGF <dbl> 22.03456, 18.601… # $ Vitronectin <dbl> -0.04082199, -0.… # $ von_Willebrand_Factor <dbl> -3.146555, -3.86… # $ age <dbl> 0.9876238, 0.986… # $ tau <dbl> 6.297754, 6.6592… # $ p_tau <dbl> 4.348108, 4.8599… # $ Ab_42 <dbl> 12.019678, 11.01… # $ male <dbl> 0, 0, 1, 0, 0, 1… # $ Genotype <fct> E3E3, E3E4, E3E4… # $ Class <fct> Control, Control… ``` --- background-image: url(images/hands.jpg) background-size: contain background-position: left class: middle .pull-right[ ## 
Alzheimer's disease data + N = 333 + 1 categorical outcome: `Class` + 130 predictors + 126 protein measurements + also: `age`, `male`, `Genotype` ] --- background-image: url(images/hands.jpg) background-size: contain background-position: left class: middle .pull-right[ <img src="figs/rmed01-model/unnamed-chunk-3-1.png" width="504" style="display: block; margin: auto;" /> ] --- class: middle, center, inverse # What is the goal of machine learning? --- class: middle, center, frame # Goal -- ## Build .display[models] that -- ## generate .display[accurate predictions] -- ## for .display[future, yet-to-be-seen data]. -- .footnote[Max Kuhn & Kjell Johnson, http://www.feat.engineering/] ??? This is our whole game vision for today. This is the main goal for predictive modeling broadly, and for machine learning specifically. We'll use this goal to drive learning of 3 core tidymodels packages: - parsnip - yardstick - and rsample --- class: inverse, middle, center # 🔨 Build models -- # with parsnip ??? Enter the parsnip package --- class: middle, center, frame # parsnip <iframe src="https://parsnip.tidymodels.org" width="100%" height="400px"></iframe> --- class: middle # .center[`glm()`] ```r glm(Class ~ tau, family = binomial, data = alz) # # Call: glm(formula = Class ~ tau, family = binomial, data = alz) # # Coefficients: # (Intercept) tau # 13.664 -2.148 # # Degrees of Freedom: 332 Total (i.e. Null); 331 Residual # Null Deviance: 390.6 # Residual Deviance: 318.8 AIC: 322.8 ``` ??? So let's start with prediction. To predict, we have to have two things: a model to generate predictions, and data to predict. This type of formula interface may look familiar. How would we use parsnip to build this kind of logistic regression model? --- name: step1 background-image: url("images/predicting/predicting.001.jpeg") background-size: contain --- class: middle, frame # .center[To specify a model with parsnip] .right-column[ 1\. Pick a .display[model] 2\. Set the .display[engine] 3\.
Set the .display[mode] (if needed) ] --- class: middle, frame # .center[To specify a model with parsnip] ```r logistic_reg() %>% set_engine("glm") %>% set_mode("classification") # Logistic Regression Model Specification (classification) # # Computational engine: glm ``` --- class: middle, frame # .center[To specify a model with parsnip] ```r decision_tree() %>% set_engine("C5.0") %>% set_mode("classification") # Decision Tree Model Specification (classification) # # Computational engine: C5.0 ``` --- class: middle, frame # .center[To specify a model with parsnip] ```r nearest_neighbor() %>% set_engine("kknn") %>% set_mode("classification") # K-Nearest Neighbor Model Specification (classification) # # Computational engine: kknn ``` --- class: middle, frame .fade[ # .center[To specify a model with parsnip] ] .right-column[ 1\. Pick a .display[model] .fade[ 2\. Set the .display[engine] 3\. Set the .display[mode] (if needed) ] ] --- class: middle, center # 1\. Pick a .display[model] All available models are listed at <https://www.tidymodels.org/find/parsnip/> <iframe src="https://www.tidymodels.org/find/parsnip/" width="100%" height="400px"></iframe> --- class: middle .center[ # `logistic_reg()` Specifies a model that uses logistic regression ] ```r logistic_reg(penalty = NULL, mixture = NULL) ``` --- class: middle .center[ # `logistic_reg()` Specifies a model that uses logistic regression ] ```r logistic_reg( mode = "classification", # "default" mode, if exists penalty = NULL, # model hyper-parameter mixture = NULL # model hyper-parameter ) ``` --- class: middle, frame .fade[ # .center[To specify a model with parsnip] ] .right-column[ .fade[ 1\. Pick a .display[model] ] 2\. Set the .display[engine] .fade[ 3\. Set the .display[mode] (if needed) ] ] --- class: middle, center # `set_engine()` Adds an engine to power or implement the model. 
```r logistic_reg() %>% set_engine(engine = "glm") ``` --- class: middle, frame .fade[ # .center[To specify a model with parsnip] ] .right-column[ .fade[ 1\. Pick a .display[model] 2\. Set the .display[engine] ] 3\. Set the .display[mode] (if needed) ] --- class: middle, center # `set_mode()` Sets the class of problem the model will solve, which influences which output is collected. Not necessary if mode is set in Step 1. ```r logistic_reg() %>% set_mode(mode = "classification") ``` --- class: your-turn # Your turn 1 Run the chunk in your .Rmd and look at the output. Then, copy/paste the code and edit to create: + a decision tree model for classification + that uses the `C5.0` engine. Save it as `tree_mod` and look at the object. What is different about the output? *Hint: you'll need https://www.tidymodels.org/find/parsnip/*
03
:
00
--- ```r lr_mod # Logistic Regression Model Specification (classification) # # Computational engine: glm tree_mod <- decision_tree() %>% set_engine(engine = "C5.0") %>% set_mode("classification") tree_mod # Decision Tree Model Specification (classification) # # Computational engine: C5.0 ``` --- class: inverse, middle, center ## Now we've built a model. -- ## But, how do we *use* a model? -- ## First - what does it mean to use a model? --- class: inverse, middle, center ![](https://media.giphy.com/media/fhAwk4DnqNgw8/giphy.gif) Statistical models learn from the data. Many learn model parameters, which *can* be useful as values for inference and interpretation. --- class: center, middle # Show of hands How many people have .display[fitted] a statistical model with R? --- # A fitted model .pull-left[ ```r lr_mod %>% fit(Class ~ tau + VEGF, data = alz) %>% broom::tidy() # # A tibble: 3 x 5 # term estimate std.error statistic p.value # <chr> <dbl> <dbl> <dbl> <dbl> # 1 (Intercept) 8.97 1.98 4.54 5.61e- 6 # 2 tau -4.01 0.456 -8.79 1.55e-18 # 3 VEGF 0.934 0.130 7.19 6.38e-13 ``` ] .pull-right[ <img src="figs/rmed01-model/unnamed-chunk-18-1.png" width="504" style="display: block; margin: auto;" /> ] --- ## "All models are wrong, but some are useful" <img src="figs/rmed01-model/unnamed-chunk-20-1.png" width="504" style="display: block; margin: auto;" /> --- ## "All models are wrong, but some are useful" <img src="figs/rmed01-model/unnamed-chunk-22-1.png" width="504" style="display: block; margin: auto;" /> --- ## Predict new data ```r alz_new <- tibble(tau = c(5, 6, 7), VEGF = c(15, 15, 15), Class = c("Control", "Control", "Impaired")) %>% mutate(Class = factor(Class, levels = c("Impaired", "Control"))) alz_new # # A tibble: 3 x 3 # tau VEGF Class # <dbl> <dbl> <fct> # 1 5 15 Control # 2 6 15 Control # 3 7 15 Impaired ``` --- class: center, middle # Show of hands How many people have used a model to generate .display[predictions] with R? 
--- # Predict old data ```r tree_mod %>% fit(Class ~ tau + VEGF, data = alz) %>% predict(new_data = alz) %>% mutate(true_class = alz$Class) %>% accuracy(truth = true_class, estimate = .pred_class) # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.856 ``` --- # Predict new data .pull-left[ ## out with the old... ```r tree_mod %>% fit(Class ~ tau + VEGF, data = alz) %>% predict(new_data = alz) %>% mutate(true_class = alz$Class) %>% accuracy(truth = true_class, estimate = .pred_class) # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.856 ``` ] .pull-right[ ## in with the 🆕 ```r tree_mod %>% fit(Class ~ tau + VEGF, data = alz) %>% * predict(new_data = alz_new) %>% * mutate(true_class = alz_new$Class) %>% accuracy(truth = true_class, estimate = .pred_class) # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.667 ``` ] --- class: middle, center # `fit()` Train a model by fitting it to data. Returns a parsnip model fit. ```r fit(tree_mod, Class ~ tau + VEGF, data = alz) ``` --- class: middle .center[ # `fit()` Train a model by fitting it to data. Returns a parsnip model fit. ] ```r tree_mod %>% # parsnip model fit(Class ~ tau + VEGF, # a formula data = alz # dataframe ) ``` --- class: middle .center[ # `fit()` Train a model by fitting it to data. Returns a parsnip model fit. ] ```r tree_fit <- tree_mod %>% # parsnip model fit(Class ~ tau + VEGF, # a formula data = alz # dataframe ) ``` --- template: step1 --- name: step2 background-image: url("images/predicting/predicting.003.jpeg") background-size: contain --- class: middle, center # `predict()` Use a fitted model to predict new `y` values from data. Returns a tibble.
```r predict(tree_fit, new_data = alz_new) ``` --- ```r tree_fit %>% predict(new_data = alz_new) # # A tibble: 3 x 1 # .pred_class # <fct> # 1 Control # 2 Impaired # 3 Impaired ``` --- class: middle, center, frame # Axiom The best way to measure a model's performance at predicting new data is to .display[predict new data]. --- class: middle, center, frame # Data splitting -- <img src="figs/rmed01-model/all-split-1.png" width="864" style="display: block; margin: auto;" /> ??? We refer to the group for which we know the outcome, and which we use to develop the algorithm, as the training set. We refer to the group for which we pretend we don’t know the outcome as the test set. --- class: inverse, middle, center # ♻️ Resample models -- # with rsample ??? Enter the rsample package --- class: middle, center, frame # rsample <iframe src="https://tidymodels.github.io/rsample/" width="100%" height="400px"></iframe> --- class: center, middle # `initial_split()*` "Splits" data randomly into a single testing and a single training set. ```r initial_split(data, prop = 3/4) ``` .footnote[`*` from `rsample`] --- ```r alz_split <- initial_split(alz, strata = Class, prop = .9) alz_split # <Analysis/Assess/Total> # <300/33/333> ``` ???
data splitting --- class: center, middle # `training()` and `testing()*` Extract training and testing sets from an rsplit ```r training(alz_split) testing(alz_split) ``` .footnote[`*` from `rsample`] --- ```r alz_train <- training(alz_split) alz_train # # A tibble: 300 x 131 # ACE_CD143_Angio… ACTH_Adrenocort… AXL Adiponectin # <dbl> <dbl> <dbl> <dbl> # 1 2.00 -1.39 1.10 -5.36 # 2 1.56 -1.39 0.683 -5.02 # 3 1.52 -1.71 -0.145 -5.81 # 4 1.68 -1.61 0.683 -5.12 # 5 2.40 -0.968 0.191 -4.78 # 6 0.431 -1.27 -0.222 -5.22 # 7 0.946 -1.90 0.530 -6.12 # 8 0.708 -1.83 -0.327 -4.88 # 9 1.11 -1.97 0.191 -5.17 # 10 1.60 -1.51 0.449 -5.57 # # … with 290 more rows, and 127 more variables: # # Alpha_1_Antichymotrypsin <dbl>, # # Alpha_1_Antitrypsin <dbl>, Alpha_1_Microglobulin <dbl>, # # Alpha_2_Macroglobulin <dbl>, # # Angiopoietin_2_ANG_2 <dbl>, … ``` --- class: middle, center # Quiz Now that we have training and testing sets... -- Which dataset do you think we use for .display[fitting]? -- Which do we use for .display[predicting]? --- template: step1 --- template: step2 --- template: step3 background-image: url("images/predicting/predicting.004.jpeg") background-size: contain --- name: holdout-step2 background-image: url("images/predicting/predicting.006.jpeg") background-size: contain --- name: holdout-step3 background-image: url("images/predicting/predicting.007.jpeg") background-size: contain --- name: holdout-step4 background-image: url("images/predicting/predicting.008.jpeg") background-size: contain --- name: holdout background-image: url("images/predicting/predicting.009.jpeg") background-size: contain --- class: your-turn # Your turn 2 Fill in the blanks. Use `initial_split()`, `training()`, and `testing()` to: 1. Split **alz** into training and test sets. Save the rsplit! 2. Extract the training data and fit your classification tree model. 3. Predict the testing data, and save the true `Class` values. 4. Measure the accuracy of your model with your test set. 
Keep `set.seed(100)` at the start of your code.
04
:
00
--- ```r set.seed(100) # Important! alz_split <- initial_split(alz, strata = Class, prop = .9) alz_train <- training(alz_split) alz_test <- testing(alz_split) tree_mod %>% fit(Class ~ tau + VEGF, data = alz_train) %>% predict(new_data = alz_test) %>% mutate(true_class = alz_test$Class) %>% accuracy(truth = true_class, estimate = .pred_class) ``` --- template: predictions --- name: accurate-predictions class: middle, center, frame # Goal of Machine Learning ## 🎯 generate .display[accurate predictions] ??? Now we have predictions from our model. What can we do with them? If we already know the truth, that is, the outcome variable that was observed, we can compare them! --- class: middle, center, frame # Axiom Better Model = Better Predictions (Lower error rate) --- class: middle, center # `accuracy()*` Calculates the accuracy based on two columns in a dataframe: The .display[truth]: `\({y}_i\)` The predicted .display[estimate]: `\(\hat{y}_i\)` ```r accuracy(data, truth, estimate) ``` .footnote[`*` from `yardstick`] --- ```r tree_mod %>% fit(Class ~ tau + VEGF, data = alz_train) %>% predict(new_data = alz_test) %>% mutate(true_class = alz_test$Class) %>% * accuracy(truth = true_class, estimate = .pred_class) # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` --- template: step1 --- template: step2 --- name: step3 background-image: url("images/predicting/predicting.004.jpeg") background-size: contain --- class: your-turn # Your Turn 3 What would happen if you repeated this process? Would you get the same answers? Note your accuracy from above. Then change your seed number and rerun just the last code chunk above. Do you get the same answer? Try it a few times with a few different seeds.
02
:
00
--- .pull-left[ ``` # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ``` # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ``` # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ] -- .pull-right[ ``` # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ``` # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ``` # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ] --- class: middle, center # Quiz Why is the new estimate different? --- class: middle, center # Data Splitting -- <img src="figs/rmed01-model/unnamed-chunk-52-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-53-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-54-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-55-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-56-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-57-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-58-1.png" width="720" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-59-1.png" width="720" style="display: block; margin: auto;" /> --- <img src="figs/rmed01-model/unnamed-chunk-60-1.png" width="1080" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-61-1.png" width="1080" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-62-1.png" width="1080" style="display: block; margin: auto;" /> -- <img 
src="figs/rmed01-model/unnamed-chunk-63-1.png" width="1080" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-64-1.png" width="1080" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-65-1.png" width="1080" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-66-1.png" width="1080" style="display: block; margin: auto;" /> -- <img src="figs/rmed01-model/unnamed-chunk-67-1.png" width="1080" style="display: block; margin: auto;" /> -- .right[Mean RMSE] --- class: frame, center, middle # Resampling Let's resample 10 times then compute the mean of the results... --- ```r acc %>% tibble::enframe(name = "accuracy") # # A tibble: 10 x 2 # accuracy value # <int> <dbl> # 1 1 0.855 # 2 2 0.807 # 3 3 0.831 # 4 4 0.855 # 5 5 0.880 # 6 6 0.880 # 7 7 0.831 # 8 8 0.843 # 9 9 0.880 # 10 10 0.892 mean(acc) # [1] 0.8554217 ``` --- class: middle, center # Guess Which do you think is a better estimate? The best result or the mean of the results? Why? --- class: middle, center # But also... Fit with .display[training set] Predict with .display[testing set] -- Rinse and repeat? --- # There has to be a better way... ```r acc <- vector(length = 10, mode = "double") for (i in 1:10) { new_split <- initial_split(alz) new_train <- training(new_split) new_test <- testing(new_split) acc[i] <- lr_mod %>% fit(Class ~ tau + VEGF, data = new_train) %>% predict(new_test) %>% mutate(truth = new_test$Class) %>% accuracy(truth, .pred_class) %>% pull(.estimate) } ``` --- background-image: url(images/diamonds.jpg) background-size: contain background-position: left class: middle, center background-color: #f5f5f5 .pull-right[ ## The .display[testing set] is precious... ## we can only use it once! 
] --- background-image: url(images/diamonds.jpg) background-size: contain background-position: left class: middle, center background-color: #f5f5f5 .pull-right[ ## How can we use the training set to compare, evaluate, and tune models? ] --- background-image: url(https://www.tidymodels.org/start/resampling/img/resampling.svg) background-size: 60% --- class: middle, center, inverse # Cross-validation --- background-image: url(images/cross-validation/Slide2.png) background-size: contain --- background-image: url(images/cross-validation/Slide3.png) background-size: contain --- background-image: url(images/cross-validation/Slide4.png) background-size: contain --- background-image: url(images/cross-validation/Slide5.png) background-size: contain --- background-image: url(images/cross-validation/Slide6.png) background-size: contain --- background-image: url(images/cross-validation/Slide7.png) background-size: contain --- background-image: url(images/cross-validation/Slide8.png) background-size: contain --- background-image: url(images/cross-validation/Slide9.png) background-size: contain --- background-image: url(images/cross-validation/Slide10.png) background-size: contain --- background-image: url(images/cross-validation/Slide11.png) background-size: contain --- class: middle, center # V-fold cross-validation ```r vfold_cv(data, v = 10, ...) ``` --- exclude: true --- class: middle, center # Guess How many times does an observation/row appear in the assessment set? <img src="figs/rmed01-model/vfold-tiles-1.png" width="864" style="display: block; margin: auto;" /> --- <img src="figs/rmed01-model/unnamed-chunk-72-1.png" width="864" style="display: block; margin: auto;" /> --- class: middle, center # Quiz If we use 10 folds, which percent of our data will end up in the training set and which percent in the testing set for each fold? -- 90% - training 10% - test --- class: your-turn # Your Turn 4 Run the code below. What does it return?
```r set.seed(100) alz_folds <- vfold_cv(alz_train, v = 10, strata = Class) alz_folds ```
01
:
00
--- ```r set.seed(100) alz_folds <- vfold_cv(alz_train, v = 10, strata = Class) alz_folds # # 10-fold cross-validation using stratification # # A tibble: 10 x 2 # splits id # <list> <chr> # 1 <split [269/31]> Fold01 # 2 <split [269/31]> Fold02 # 3 <split [270/30]> Fold03 # 4 <split [270/30]> Fold04 # 5 <split [270/30]> Fold05 # 6 <split [270/30]> Fold06 # 7 <split [270/30]> Fold07 # 8 <split [270/30]> Fold08 # 9 <split [271/29]> Fold09 # 10 <split [271/29]> Fold10 ``` --- class: middle .center[ # We need a new way to fit ] ```r split1 <- alz_folds %>% pluck("splits", 1) split1_train <- training(split1) split1_test <- testing(split1) tree_mod %>% fit(Class ~ ., data = split1_train) %>% predict(split1_test) %>% mutate(truth = split1_test$Class) %>% rmse(truth, .pred_class) # rinse and repeat split2 <- ... ``` --- class: middle .center[ # `fit_resamples()` Trains and tests a resampled model. ] ```r tree_mod %>% fit_resamples( Class ~ tau + VEGF, resamples = alz_folds ) ``` --- ```r tree_mod %>% fit_resamples( Class ~ tau + VEGF, resamples = alz_folds ) # # Resampling results # # 10-fold cross-validation using stratification # # A tibble: 10 x 4 # splits id .metrics .notes # <list> <chr> <list> <list> # 1 <split [269/31]> Fold01 <tibble [2 × 3]> <tibble [0 × 1]> # 2 <split [269/31]> Fold02 <tibble [2 × 3]> <tibble [0 × 1]> # 3 <split [270/30]> Fold03 <tibble [2 × 3]> <tibble [0 × 1]> # 4 <split [270/30]> Fold04 <tibble [2 × 3]> <tibble [0 × 1]> # 5 <split [270/30]> Fold05 <tibble [2 × 3]> <tibble [0 × 1]> # 6 <split [270/30]> Fold06 <tibble [2 × 3]> <tibble [0 × 1]> # 7 <split [270/30]> Fold07 <tibble [2 × 3]> <tibble [0 × 1]> # 8 <split [270/30]> Fold08 <tibble [2 × 3]> <tibble [0 × 1]> # 9 <split [271/29]> Fold09 <tibble [2 × 3]> <tibble [0 × 1]> # 10 <split [271/29]> Fold10 <tibble [2 × 3]> <tibble [0 × 1]> ``` --- class: middle, center # `collect_metrics()` Unnest the metrics column from a tidymodels `fit_resamples()` ```r _results %>% collect_metrics(summarize = 
TRUE) ``` -- .footnote[`TRUE` is actually the default; averages across folds] --- ```r tree_mod %>% fit_resamples( Class ~ tau + VEGF, resamples = alz_folds ) %>% collect_metrics(summarize = FALSE) # # A tibble: 20 x 4 # id .metric .estimator .estimate # <chr> <chr> <chr> <dbl> # 1 Fold01 accuracy binary 0.774 # 2 Fold01 roc_auc binary 0.692 # 3 Fold02 accuracy binary 0.839 # 4 Fold02 roc_auc binary 0.848 # 5 Fold03 accuracy binary 0.867 # 6 Fold03 roc_auc binary 0.852 # 7 Fold04 accuracy binary 0.8 # 8 Fold04 roc_auc binary 0.795 # 9 Fold05 accuracy binary 0.767 # 10 Fold05 roc_auc binary 0.744 # # … with 10 more rows ``` --- class: middle, center, frame # 10-fold CV ### 10 different analysis/assessment sets ### 10 different models (trained on .display[analysis] sets) ### 10 different sets of performance statistics (on .display[assessment] sets) --- class: your-turn # Your Turn 5 Modify the code below to use `fit_resamples` and `alz_folds` to cross-validate the classification tree model. What is the ROC AUC that you collect at the end? ```r set.seed(100) tree_mod %>% fit(Class ~ tau + VEGF, data = alz_train) %>% predict(new_data = alz_test) %>% mutate(true_class = alz_test$Class) %>% accuracy(truth = true_class, estimate = .pred_class) ```
03
:
00
--- ```r set.seed(100) lr_mod %>% fit_resamples(Class ~ tau + VEGF, resamples = alz_folds) %>% collect_metrics() # # A tibble: 2 x 5 # .metric .estimator mean n std_err # <chr> <chr> <dbl> <int> <dbl> # 1 accuracy binary 0.854 10 0.0187 # 2 roc_auc binary 0.893 10 0.0120 ``` --- # How did we do? ```r tree_mod %>% fit(Class ~ tau + VEGF, data = alz_train) %>% predict(alz_test) %>% mutate(truth = alz_test$Class) %>% accuracy(truth, .pred_class) # # A tibble: 1 x 3 # .metric .estimator .estimate # <chr> <chr> <dbl> # 1 accuracy binary 0.848 ``` ``` # # A tibble: 2 x 5 # .metric .estimator mean n std_err # <chr> <chr> <dbl> <int> <dbl> # 1 accuracy binary 0.854 10 0.0187 # 2 roc_auc binary 0.893 10 0.0120 ```