My Research
  • Stars Dataset
  • Diamonds Dataset
  • Cars93 Dataset
  • Covid-19 Analysis
  • Covid-19 Analysis Continued
  • Regression Modeling
Selecting Best Model for Measuring the Students Aptitude

Selecting Best Model for Measuring the Students Aptitude

Murtaza Badshah

12/9/2020

Data Analysis

Step 1:

Here we are going to first call all the important libraries we will be using in order to perform the data analysis.

library(ggplot2)
library(dplyr)
library(tidymodels)
library(readr)
library(janitor)
library(furniture)
library(rstanarm)
library(rstan)
library(moderndive)
library(tidyverse)
library(vip)
library(leaps)  #function to search for the best model
library(faraway)
library(ISLR)
library(knitr)
library(glmnet)

Next, we are going to call our dataset and create a table from it.

students <- read_csv("~/Murtaza/students.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   DataYear = col_double(),
##   ClassGrade = col_double(),
##   Ageyears = col_double(),
##   Score_in_memory_game = col_double(),
##   Importance_reducing_pollution = col_double(),
##   Importance_recycling_rubbish = col_double(),
##   Importance_conserving_water = col_double(),
##   Importance_saving_energy = col_double(),
##   Importance_owning_computer = col_double(),
##   Importance_Internet_access = col_double()
## )
## See spec(...) for full column specifications.
students <- students %>% clean_names()

Let us take a look at the available data set!

head(students)
## # A tibble: 6 x 60
##   country region data_year class_grade gender ageyears handed height_cm
##   <chr>   <chr>      <dbl>       <dbl> <chr>     <dbl> <chr>  <chr>    
## 1 USA     MI          2013          11 Female       17 Right… 156.6    
## 2 USA     MI          2019           9 Male         15 Right… 170      
## 3 USA     MI          2014          12 Female       17 Right… 177      
## 4 USA     MI          2014          12 Female       17 Right… 162.5    
## 5 USA     MI          2017          12 Male         17 Left-… 180      
## 6 USA     MI          2018          12 Female       16 Ambid… 152.4    
## # … with 52 more variables: footlength_cm <chr>, armspan_cm <chr>,
## #   languages_spoken <chr>, travel_to_school <chr>,
## #   travel_time_to_school <chr>, reaction_time <chr>,
## #   score_in_memory_game <dbl>, favourite_physical_activity <chr>,
## #   importance_reducing_pollution <dbl>, importance_recycling_rubbish <dbl>,
## #   importance_conserving_water <dbl>, importance_saving_energy <dbl>,
## #   importance_owning_computer <dbl>, importance_internet_access <dbl>,
## #   left_footlength_cm <chr>, longer_foot <chr>, index_fingerlength_mm <chr>,
## #   ring_fingerlength_mm <chr>, longer_finger_lefthand <chr>,
## #   birth_month <chr>, favorite_season <chr>, allergies <chr>,
## #   vegetarian <chr>, favorite_food <chr>, beverage <chr>,
## #   favorite_school_subject <chr>, sleep_hours_schoolnight <chr>,
## #   sleep_hours_non_schoolnight <chr>, home_occupants <chr>,
## #   home_internet_access <chr>, communication_with_friends <chr>,
## #   text_messages_sent_yesterday <chr>, text_messages_received_yesterday <chr>,
## #   hanging_out_with_friends_hours <chr>, talking_on_phone_hours <chr>,
## #   doing_homework_hours <chr>, doing_things_with_family_hours <chr>,
## #   outdoor_activities_hours <chr>, video_games_hours <chr>,
## #   social_websites_hours <chr>, texting_messaging_hours <chr>,
## #   computer_use_hours <chr>, watching_tv_hours <chr>, paid_work_hours <chr>,
## #   work_at_home_hours <chr>, schoolwork_pressure <chr>,
## #   planned_education_level <chr>, favorite_music <chr>, superpower <chr>,
## #   preferred_status <chr>, role_model_type <chr>, charity_donation <chr>

Step 2 - Methods: Data Cleansing and Data Wrangling

In order to answer our question we will first perform some simple cleaning of the data. Some of the columns seem unnecessary for our analysis so we will be removing them in order to do the next step. We will also remove any records that are missing.

students_new <- students %>% select(gender, class_grade, ageyears, 
    height_cm, reaction_time, score_in_memory_game, sleep_hours_schoolnight, 
    sleep_hours_non_schoolnight, hanging_out_with_friends_hours, 
    doing_homework_hours, doing_things_with_family_hours, outdoor_activities_hours, 
    video_games_hours)

glimpse(students_new)
## Rows: 500
## Columns: 13
## $ gender                         <chr> "Female", "Male", "Female", "Female", …
## $ class_grade                    <dbl> 11, 9, 12, 12, 12, 12, 12, 4, 12, 9, 9…
## $ ageyears                       <dbl> 17, 15, 17, 17, 17, 16, 17, 16, 17, 13…
## $ height_cm                      <chr> "156.6", "170", "177", "162.5", "180",…
## $ reaction_time                  <chr> ".431", ".001", ".4", ".482", ".521", …
## $ score_in_memory_game           <dbl> 46, 30, NA, 31, 36, 71, 45, 33, NA, 53…
## $ sleep_hours_schoolnight        <chr> "7.5", "2", "8", "8", "4", "7", "7", "…
## $ sleep_hours_non_schoolnight    <chr> "9", NA, "12", "9", "3", "9", "10", "9…
## $ hanging_out_with_friends_hours <chr> "6", NA, "7", "6", "4", ".5", "16", "2…
## $ doing_homework_hours           <chr> "8", NA, "1", "10", "0", "5", "5", "3"…
## $ doing_things_with_family_hours <chr> "2", NA, "6", "10", "8", "7", "3", "10…
## $ outdoor_activities_hours       <chr> "3", NA, "10", "20", "1", "0", "0", "2…
## $ video_games_hours              <chr> "0", NA, "0", "0", "40", "3", "4", "70…

Now that we have our table set up, I will remove the field with NA as we don’t need them for our analysis.

students_new <- students_new %>% drop_na()

Let us now summarize and look at the dataset available

summary(students_new)
##     gender           class_grade       ageyears         height_cm        
##  Length:400         Min.   : 4.00   Min.   :    4.00   Length:400        
##  Class :character   1st Qu.:11.00   1st Qu.:   15.00   Class :character  
##  Mode  :character   Median :12.00   Median :   17.00   Mode  :character  
##                     Mean   :10.96   Mean   :   43.87                     
##                     3rd Qu.:12.00   3rd Qu.:   17.00                     
##                     Max.   :12.00   Max.   :11102.00                     
##  reaction_time      score_in_memory_game sleep_hours_schoolnight
##  Length:400         Min.   : 2.00        Length:400             
##  Class :character   1st Qu.:36.75        Class :character       
##  Mode  :character   Median :43.00        Mode  :character       
##                     Mean   :44.11                               
##                     3rd Qu.:50.00                               
##                     Max.   :91.00                               
##  sleep_hours_non_schoolnight hanging_out_with_friends_hours
##  Length:400                  Length:400                    
##  Class :character            Class :character              
##  Mode  :character            Mode  :character              
##                                                            
##                                                            
##                                                            
##  doing_homework_hours doing_things_with_family_hours outdoor_activities_hours
##  Length:400           Length:400                     Length:400              
##  Class :character     Class :character               Class :character        
##  Mode  :character     Mode  :character               Mode  :character        
##                                                                              
##                                                                              
##                                                                              
##  video_games_hours 
##  Length:400        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Note:

From a closer look at the way the variables are recognized it seems that R is not recognizing variables such as gender, height, reaction time, hanging out with friends, etc. as continuous variables but instead as character vectors. After a deeper dive into the data set I have found some minor inaccuracies in the data set which is causing R to not recognize each column appropriately. An example of an inconsistency I found is students reporting their information in Feet when CM was asked of them. There are instances of them adding special characters such as “‘" or’”’. In order to fix the data individually I will be opening up excel and formatting the information accordingly.

Now that I have fixed the data I will be re uploading the data set and running the summary to compare and see how R recognizes each variable.

students_fixed <- read_csv("~/Murtaza/students_fixed.csv")
## Parsed with column specification:
## cols(
##   gender = col_character(),
##   class_grade = col_double(),
##   ageyears = col_double(),
##   height_cm = col_double(),
##   reaction_time = col_double(),
##   score_in_memory_game = col_double(),
##   sleep_hours_schoolnight = col_double(),
##   sleep_hours_non_schoolnight = col_double(),
##   hanging_out_with_friends_hours = col_double(),
##   doing_homework_hours = col_double(),
##   doing_things_with_family_hours = col_double(),
##   outdoor_activities_hours = col_double(),
##   video_games_hours = col_double()
## )
# Data wrangling to remove any outliers and data that skews
# the overall results

students_fixed <- students_fixed %>% filter(height_cm > 120, 
    reaction_time < 10, doing_homework_hours <= 168, doing_things_with_family_hours <= 
        168, outdoor_activities_hours <= 168)

121.92
## [1] 121.92
summary(students_fixed)
##     gender           class_grade       ageyears       height_cm    
##  Length:372         Min.   : 4.00   Min.   : 4.00   Min.   :132.0  
##  Class :character   1st Qu.:11.00   1st Qu.:15.00   1st Qu.:161.0  
##  Mode  :character   Median :12.00   Median :17.00   Median :168.0  
##                     Mean   :10.99   Mean   :16.18   Mean   :168.7  
##                     3rd Qu.:12.00   3rd Qu.:17.00   3rd Qu.:177.0  
##                     Max.   :12.00   Max.   :21.00   Max.   :231.0  
##  reaction_time    score_in_memory_game sleep_hours_schoolnight
##  Min.   :0.0100   Min.   : 7.00        Min.   : 3.000         
##  1st Qu.:0.3360   1st Qu.:37.00        1st Qu.: 6.000         
##  Median :0.3985   Median :43.00        Median : 7.000         
##  Mean   :0.5118   Mean   :44.22        Mean   : 6.797         
##  3rd Qu.:0.5365   3rd Qu.:50.00        3rd Qu.: 7.625         
##  Max.   :6.6900   Max.   :91.00        Max.   :12.000         
##  sleep_hours_non_schoolnight hanging_out_with_friends_hours
##  Min.   : 0.00               Min.   :  0.00                
##  1st Qu.: 8.00               1st Qu.:  5.00                
##  Median : 9.00               Median :  9.00                
##  Mean   : 8.77               Mean   : 15.83                
##  3rd Qu.:10.00               3rd Qu.: 20.00                
##  Max.   :16.00               Max.   :144.00                
##  doing_homework_hours doing_things_with_family_hours outdoor_activities_hours
##  Min.   :  0.000      Min.   :  0.00                 Min.   :  0.000         
##  1st Qu.:  3.000      1st Qu.:  3.00                 1st Qu.:  2.000         
##  Median :  7.000      Median :  5.00                 Median :  6.000         
##  Mean   :  9.882      Mean   : 10.15                 Mean   :  9.245         
##  3rd Qu.: 12.000      3rd Qu.: 10.00                 3rd Qu.: 14.000         
##  Max.   :160.000      Max.   :168.00                 Max.   :100.000         
##  video_games_hours
##  Min.   : 0.000   
##  1st Qu.: 0.000   
##  Median : 1.000   
##  Mean   : 5.194   
##  3rd Qu.: 5.000   
##  Max.   :80.000

I also noticed that for the ageyears variable, there seems to be one entry thats 11102.00 which is going to create a huge problem for our analysis. That same student reported un realistic results which makes me doubt this data validity so I have decided to remove the record entirely from the analysis.

Now that we have the summary of the cleaned data we can see a couple of issues with how R is recognizing each field we are now going to adjust the columns appropriately below:

# Converting the gender to a factor instead of character

students_fixed$gender <- as.factor(students_fixed$gender)

# Converting the class grade to factor

students_fixed$class_grade <- as.factor(students_fixed$class_grade)

# Converting the age to factor

students_fixed$ageyears <- as.factor(students_fixed$ageyears)

summary(students_fixed)
##     gender    class_grade    ageyears     height_cm     reaction_time   
##  Female:231   4 :  9      17     :182   Min.   :132.0   Min.   :0.0100  
##  Male  :141   8 :  1      16     : 62   1st Qu.:161.0   1st Qu.:0.3360  
##               9 : 75      14     : 57   Median :168.0   Median :0.3985  
##               10:  1      15     : 33   Mean   :168.7   Mean   :0.5118  
##               11: 74      18     : 27   3rd Qu.:177.0   3rd Qu.:0.5365  
##               12:212      13     :  6   Max.   :231.0   Max.   :6.6900  
##                           (Other):  5                                   
##  score_in_memory_game sleep_hours_schoolnight sleep_hours_non_schoolnight
##  Min.   : 7.00        Min.   : 3.000          Min.   : 0.00              
##  1st Qu.:37.00        1st Qu.: 6.000          1st Qu.: 8.00              
##  Median :43.00        Median : 7.000          Median : 9.00              
##  Mean   :44.22        Mean   : 6.797          Mean   : 8.77              
##  3rd Qu.:50.00        3rd Qu.: 7.625          3rd Qu.:10.00              
##  Max.   :91.00        Max.   :12.000          Max.   :16.00              
##                                                                          
##  hanging_out_with_friends_hours doing_homework_hours
##  Min.   :  0.00                 Min.   :  0.000     
##  1st Qu.:  5.00                 1st Qu.:  3.000     
##  Median :  9.00                 Median :  7.000     
##  Mean   : 15.83                 Mean   :  9.882     
##  3rd Qu.: 20.00                 3rd Qu.: 12.000     
##  Max.   :144.00                 Max.   :160.000     
##                                                     
##  doing_things_with_family_hours outdoor_activities_hours video_games_hours
##  Min.   :  0.00                 Min.   :  0.000          Min.   : 0.000   
##  1st Qu.:  3.00                 1st Qu.:  2.000          1st Qu.: 0.000   
##  Median :  5.00                 Median :  6.000          Median : 1.000   
##  Mean   : 10.15                 Mean   :  9.245          Mean   : 5.194   
##  3rd Qu.: 10.00                 3rd Qu.: 14.000          3rd Qu.: 5.000   
##  Max.   :168.00                 Max.   :100.000          Max.   :80.000   
## 

Now that the data is sorted and cleaned let us use the furnitre package to find the estimated mean, sd and of all the variables.

table1(students_fixed, gender, class_grade, ageyears, height_cm, 
    reaction_time, score_in_memory_game, sleep_hours_schoolnight, 
    sleep_hours_non_schoolnight, hanging_out_with_friends_hours, 
    doing_homework_hours, doing_things_with_family_hours, outdoor_activities_hours, 
    video_games_hours, output = "markdown")
Mean/Count (SD/%)
n = 372
gender
Female 231 (62.1%)
Male 141 (37.9%)
class_grade
4 9 (2.4%)
8 1 (0.3%)
9 75 (20.2%)
10 1 (0.3%)
11 74 (19.9%)
12 212 (57%)
ageyears
4 1 (0.3%)
12 1 (0.3%)
13 6 (1.6%)
14 57 (15.3%)
15 33 (8.9%)
16 62 (16.7%)
17 182 (48.9%)
17.5 1 (0.3%)
18 27 (7.3%)
20 1 (0.3%)
21 1 (0.3%)
height_cm
168.7 (10.9)
reaction_time
0.5 (0.5)
score_in_memory_game
44.2 (11.2)
sleep_hours_schoolnight
6.8 (1.3)
sleep_hours_non_schoolnight
8.8 (2.2)
hanging_out_with_friends_hours
15.8 (19.2)
doing_homework_hours
9.9 (13.6)
doing_things_with_family_hours
10.1 (18.6)
outdoor_activities_hours
9.2 (11.9)
video_games_hours
5.2 (10.8)

STEP 3: Data Analysis

Let us look at the frequency of the different categorical variables in the dataset.

students_fixed %>% ggplot(aes(class_grade)) + geom_bar(aes(fill = gender)) + 
    facet_wrap(~gender)

What is the mean memory score difference between each gender?

students_fixed %>% group_by(gender) %>% summarise(mean(score_in_memory_game))
## # A tibble: 2 x 2
##   gender `mean(score_in_memory_game)`
## * <fct>                         <dbl>
## 1 Female                         43.9
## 2 Male                           44.8

Let us plot this data and see it more objectively.

students_fixed %>% ggplot(aes(x = gender, y = score_in_memory_game)) + 
    geom_boxplot() + labs(title = "Boxplot of score comparision between genders") + 
    ylab("Memory game score")

# facet_wrap( ~ class_grade )

Based on the plot above we can see that males are scoring better on the memory game when compared to females. Although there are less males than females in the data set. It could just be that since there are more females who took the survey their mean goes down.

Let us now check to see if there is a relationship between height and memory game score.

ggplot(data = students_fixed, aes(y = score_in_memory_game, x = height_cm)) + 
    geom_point() + geom_smooth(method = "lm", se = FALSE) + # geom_smooth(color = 'orange')+ ylim(0,40)+
facet_wrap(~gender)
## `geom_smooth()` using formula 'y ~ x'

Based on the results above it looks like there is some negative correlation, suggesting that taller people have a harder time in getting a good score in the memory game as compared to shorter people. But overall we see here the males are doing much better on the test than females.

ggplot(data = students_fixed, aes(x = score_in_memory_game)) + 
    geom_histogram() + # ylim(0,40)+
facet_wrap(~gender)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram suggests that the average score in females is less than the males but there are also more participants for females.

Let us now see how well each of the grades did on the memory game when we compare it to the amount of hours they spend playing video games. Is there some kind of correlation? I decided to filter out some of the grade as there was no data for it.

students_fixed %>% filter(class_grade != 6, class_grade != 8, 
    class_grade != 10) %>% ggplot(aes(y = score_in_memory_game, 
    x = video_games_hours)) + geom_point() + # geom_smooth(method = 'lm', se = FALSE)+
facet_wrap(~class_grade)

It is interesting to see that there is a negative correlation between the height and the overall score of the memory game. I was expecting a positive correlation for tall people but let us further investigate and create a base linear regression model using TidyModels to see which variables are best to create the prediction model.

lmodel <- lm(score_in_memory_game ~ height_cm, data = students_fixed)

get_regression_table(lmodel)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   55.9       9.07       6.16   0       38.0     73.7  
## 2 height_cm   -0.069     0.054     -1.29   0.198   -0.175    0.036
students_fixed %>% summarise(r = cor(score_in_memory_game, height_cm))
## # A tibble: 1 x 1
##         r
##     <dbl>
## 1 -0.0668

Our correlation coefficient is -0.06683197 and based on the above results from the table we can assume the following:

  1. When height does not increase the predicted average score is around 55.883
  2. For every increase in height by 1 we see an associated decrease in score result by -0.069 percentage points.

This model actually suggests that taller people fair poorly in cognitive based tests.

Linear regression modelling

In order to accomplish the following the code was referenced from here https://www.gmudatamining.com/lesson-10-r-tutorial.html

Data Splitting

Let us now create our test and training dataset:

set.seed(10)


# Converting these variables back to numeric in order to
# perform the linear analysis.
students_fixed$ageyears <- as.numeric(students_fixed$ageyears)
students_fixed$class_grade <- as.numeric(students_fixed$class_grade)

# Create a split object
data_split <- initial_split(students_fixed, prop = 0.67, strata = score_in_memory_game)

# Build training data set
training <- data_split %>% training()

# Build testing data set
test <- data_split %>% testing()

Model Specification

lm_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")

lm_model
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

Fitting to training Data

lm_fit <- lm_model %>% fit(score_in_memory_game ~ ., data = training)

lm_fit
## parsnip model object
## 
## Fit time:  4ms 
## 
## Call:
## stats::lm(formula = score_in_memory_game ~ ., data = data)
## 
## Coefficients:
##                    (Intercept)                      genderMale  
##                     73.1038528                       2.9387243  
##                    class_grade                        ageyears  
##                     -0.3015864                      -0.7517686  
##                      height_cm                   reaction_time  
##                     -0.1118949                       1.7083436  
##        sleep_hours_schoolnight     sleep_hours_non_schoolnight  
##                     -0.7491965                      -0.0309466  
## hanging_out_with_friends_hours            doing_homework_hours  
##                     -0.0130812                      -0.0016059  
## doing_things_with_family_hours        outdoor_activities_hours  
##                     -0.0536262                       0.0404418  
##              video_games_hours  
##                     -0.0007044

Now that we have our training model lets take a look at the training results and call the summary of the model to get the estimated coefficients, f-stats, p-values and RMSE.

names(lm_fit)
## [1] "lvl"     "spec"    "fit"     "preproc" "elapsed"
summary(lm_fit$fit)
## 
## Call:
## stats::lm(formula = score_in_memory_game ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.772  -7.126  -0.914   4.916  45.443 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    73.1038528 13.9486469   5.241 3.52e-07 ***
## genderMale                      2.9387243  1.9241911   1.527    0.128    
## class_grade                    -0.3015864  0.6821792  -0.442    0.659    
## ageyears                       -0.7517686  0.6075561  -1.237    0.217    
## height_cm                      -0.1118949  0.0799610  -1.399    0.163    
## reaction_time                   1.7083436  1.4186305   1.204    0.230    
## sleep_hours_schoolnight        -0.7491965  0.5493701  -1.364    0.174    
## sleep_hours_non_schoolnight    -0.0309466  0.3278968  -0.094    0.925    
## hanging_out_with_friends_hours -0.0130812  0.0407753  -0.321    0.749    
## doing_homework_hours           -0.0016059  0.0521176  -0.031    0.975    
## doing_things_with_family_hours -0.0536262  0.0420188  -1.276    0.203    
## outdoor_activities_hours        0.0404418  0.0626019   0.646    0.519    
## video_games_hours              -0.0007044  0.0681141  -0.010    0.992    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.84 on 238 degrees of freedom
## Multiple R-squared:  0.05882,    Adjusted R-squared:  0.01136 
## F-statistic: 1.239 on 12 and 238 DF,  p-value: 0.2567
par(mfrow = c(2, 2))

plot(lm_fit$fit, pch = 16, col = "#006EA1")

Evaluating the test set Accuracy

To obtain the detailed results from our trained linear regression model in a data frame, we can use the tidy() and glance() functions directly on our trained parsnip model, lm_fit.

tidy(lm_fit)
## # A tibble: 13 x 5
##    term                            estimate std.error statistic     p.value
##    <chr>                              <dbl>     <dbl>     <dbl>       <dbl>
##  1 (Intercept)                    73.1        13.9       5.24   0.000000352
##  2 genderMale                      2.94        1.92      1.53   0.128      
##  3 class_grade                    -0.302       0.682    -0.442  0.659      
##  4 ageyears                       -0.752       0.608    -1.24   0.217      
##  5 height_cm                      -0.112       0.0800   -1.40   0.163      
##  6 reaction_time                   1.71        1.42      1.20   0.230      
##  7 sleep_hours_schoolnight        -0.749       0.549    -1.36   0.174      
##  8 sleep_hours_non_schoolnight    -0.0309      0.328    -0.0944 0.925      
##  9 hanging_out_with_friends_hours -0.0131      0.0408   -0.321  0.749      
## 10 doing_homework_hours           -0.00161     0.0521   -0.0308 0.975      
## 11 doing_things_with_family_hours -0.0536      0.0420   -1.28   0.203      
## 12 outdoor_activities_hours        0.0404      0.0626    0.646  0.519      
## 13 video_games_hours              -0.000704    0.0681   -0.0103 0.992
glance(lm_fit)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0588        0.0114  10.8      1.24   0.257    12  -948. 1923. 1973.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Evaluate test set accuracy

predict(lm_fit, new_data = test)
## # A tibble: 121 x 1
##    .pred
##    <dbl>
##  1  45.0
##  2  44.8
##  3  43.1
##  4  45.4
##  5  45.3
##  6  43.7
##  7  41.2
##  8  42.6
##  9  42.2
## 10  41.8
## # … with 111 more rows
# In order to see the most important variables in the list.
vip(lm_fit)

This is actually a very interesting analysis. It seems like rather than time spent video gaming, gender, height_cm, sleep, doing things with family, age and reaction_time are better factors to predict the score.

Lets combine the test data set and the predictions into a single data frame. We create a data frame with the predictions on the test data and then use bind_cols to add the test data to the results.

test_results <- predict(lm_fit, new_data = test) %>% bind_cols(test)

head(test_results)
## # A tibble: 6 x 14
##   .pred gender class_grade ageyears height_cm reaction_time score_in_memory…
##   <dbl> <fct>        <dbl>    <dbl>     <dbl>         <dbl>            <dbl>
## 1  45.0 Female           6        6      152.         0.664               71
## 2  44.8 Male             5        6      176          0.669               58
## 3  43.1 Female           5        6      172          0.558               50
## 4  45.4 Female           3        4      166          0.78                48
## 5  45.3 Male             6        7      170          0.285               40
## 6  43.7 Male             5        7      192          0.692               60
## # … with 7 more variables: sleep_hours_schoolnight <dbl>,
## #   sleep_hours_non_schoolnight <dbl>, hanging_out_with_friends_hours <dbl>,
## #   doing_homework_hours <dbl>, doing_things_with_family_hours <dbl>,
## #   outdoor_activities_hours <dbl>, video_games_hours <dbl>

Calculate the RMSE and R-squared on the test data.

rmse(test_results, truth = score_in_memory_game, estimate = .pred)
## # A tibble: 1 x 3
##   .metric .estimator   .estimate
##   <chr>   <chr>            <dbl>
## 1 rmse    standard   11.86021383
rsq(test_results, truth = score_in_memory_game, estimate = .pred)
## # A tibble: 1 x 3
##   .metric .estimator     .estimate
##   <chr>   <chr>              <dbl>
## 1 rsq     standard   0.01243135209

Best way to test the accuracy is by making an R-squared plot. This can be used for any regression model.

ggplot(data = test_results, mapping = aes(x = .pred, y = score_in_memory_game)) + 
    geom_point(color = "#006EA1") + geom_abline(intercept = 0, 
    slope = 1, color = "orange") + labs(title = "Linear Regression Results - Students Test Set", 
    x = "Predicted Score", y = "Actual Score")

# xlim(35,55)

Since our RMSE is at 11.86021. Its hard to tell if this would be a best suitable model to use for our prediction at the moment. Let us run a few other and come to a proper conclusion.

let us compare the OLS results to the above RMSE and validate our results.

new_linear <- lm(score_in_memory_game ~ ., data = training)

prediction_linear <- predict(new_linear, test)

sqrt(mean((prediction_linear - test$score_in_memory_game)^2))
## [1] 11.86021

Looks like our linear regression model using Tidy Models was done correctly as the results are same.

Multi Linear regression Modelling

Best Subset Modeling

We are going to use the same training and testing model we created previously.

bsubset <- regsubsets(score_in_memory_game ~ ., data = training)

reg_summary <- summary(bsubset)

reg_summary
## Subset selection object
## Call: regsubsets.formula(score_in_memory_game ~ ., data = training)
## 12 Variables  (and intercept)
##                                Forced in Forced out
## genderMale                         FALSE      FALSE
## class_grade                        FALSE      FALSE
## ageyears                           FALSE      FALSE
## height_cm                          FALSE      FALSE
## reaction_time                      FALSE      FALSE
## sleep_hours_schoolnight            FALSE      FALSE
## sleep_hours_non_schoolnight        FALSE      FALSE
## hanging_out_with_friends_hours     FALSE      FALSE
## doing_homework_hours               FALSE      FALSE
## doing_things_with_family_hours     FALSE      FALSE
## outdoor_activities_hours           FALSE      FALSE
## video_games_hours                  FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          genderMale class_grade ageyears height_cm reaction_time
## 1  ( 1 ) " "        " "         "*"      " "       " "          
## 2  ( 1 ) " "        " "         "*"      " "       " "          
## 3  ( 1 ) " "        " "         "*"      " "       " "          
## 4  ( 1 ) " "        " "         "*"      " "       "*"          
## 5  ( 1 ) "*"        " "         "*"      "*"       " "          
## 6  ( 1 ) "*"        " "         "*"      "*"       "*"          
## 7  ( 1 ) "*"        " "         "*"      "*"       "*"          
## 8  ( 1 ) "*"        "*"         "*"      "*"       "*"          
##          sleep_hours_schoolnight sleep_hours_non_schoolnight
## 1  ( 1 ) " "                     " "                        
## 2  ( 1 ) "*"                     " "                        
## 3  ( 1 ) "*"                     " "                        
## 4  ( 1 ) "*"                     " "                        
## 5  ( 1 ) "*"                     " "                        
## 6  ( 1 ) "*"                     " "                        
## 7  ( 1 ) "*"                     " "                        
## 8  ( 1 ) "*"                     " "                        
##          hanging_out_with_friends_hours doing_homework_hours
## 1  ( 1 ) " "                            " "                 
## 2  ( 1 ) " "                            " "                 
## 3  ( 1 ) " "                            " "                 
## 4  ( 1 ) " "                            " "                 
## 5  ( 1 ) " "                            " "                 
## 6  ( 1 ) " "                            " "                 
## 7  ( 1 ) " "                            " "                 
## 8  ( 1 ) " "                            " "                 
##          doing_things_with_family_hours outdoor_activities_hours
## 1  ( 1 ) " "                            " "                     
## 2  ( 1 ) " "                            " "                     
## 3  ( 1 ) "*"                            " "                     
## 4  ( 1 ) "*"                            " "                     
## 5  ( 1 ) "*"                            " "                     
## 6  ( 1 ) "*"                            " "                     
## 7  ( 1 ) "*"                            "*"                     
## 8  ( 1 ) "*"                            "*"                     
##          video_games_hours
## 1  ( 1 ) " "              
## 2  ( 1 ) " "              
## 3  ( 1 ) " "              
## 4  ( 1 ) " "              
## 5  ( 1 ) " "              
## 6  ( 1 ) " "              
## 7  ( 1 ) " "              
## 8  ( 1 ) " "

Out of all the variables the best 4 predictors seem to be height, reaction time, age and sleep.

names(reg_summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
data.frame(Adj.R2 = which.max(reg_summary$adjr2), RSS = which.max(reg_summary$rss), 
    CP = which.min(reg_summary$cp), BIC = which.min(reg_summary$bic))
##   Adj.R2 RSS CP BIC
## 1      6   1  2   1

Here the R-square tells us the best model to use is the the one with a criteria of 6 predictors. RSS tells us 1, CP tells us to use 3 and BIC only suggests 1. Although no one model is correct.

reg_summary$rsq
## [1] 0.01942275 0.02998832 0.03706057 0.04322996 0.05029775 0.05651022 0.05779270
## [8] 0.05835763

In the RSQ model we see that the R-squared statistic is only 1% with one variable as more variables are added it increases to 5%

multi-linear regression model based on the adj.R2 model.

r2_linear <- lm(score_in_memory_game ~ ageyears + sleep_hours_schoolnight + 
    reaction_time + gender + doing_things_with_family_hours + 
    outdoor_activities_hours, data = training)

r2_pred_lin <- predict(r2_linear, test)

sqrt(mean((r2_pred_lin - test$score_in_memory_game)^2))
## [1] 11.95949
plot(bsubset, scale = "bic")

Based on the plot the 1 variable model has the lowest BIC, the 1 variable model with the largest R-square is ageyears.

Lets test the RMSE of the of the BIC model.

bic_linear <- lm(score_in_memory_game ~ ageyears, data = training)

bic_pred_lin <- predict(bic_linear, test)
bic_test_results <- bic_pred_lin %>% bind_cols(test)

rmse(bic_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
##   .metric .estimator   .estimate
##   <chr>   <chr>            <dbl>
## 1 rmse    standard   11.87164601
rsq(bic_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
##   .metric .estimator      .estimate
##   <chr>   <chr>               <dbl>
## 1 rsq     standard   0.004080157493

Ridge Regression

train.mat = model.matrix(score_in_memory_game ~ ., data = training)
test.mat = model.matrix(score_in_memory_game ~ ., data = test)


grid = 10^seq(4, -2, length = 100)
mod.ridge = cv.glmnet(train.mat, training$score_in_memory_game, 
    alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best = mod.ridge$lambda.min
lambda.best
## [1] 37.64936
plot(mod.ridge)

ridge.pred = predict(mod.ridge, newx = test.mat, s = lambda.best)

ridge_test_results <- ridge.pred %>% bind_cols(test)
# Converting the column into a numeric from a numeric matrix.
ridge_test_results$...1 <- as.numeric(ridge_test_results$...1)

rmse(ridge_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
##   .metric .estimator   .estimate
##   <chr>   <chr>            <dbl>
## 1 rmse    standard   11.82801963
rsq(ridge_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
##   .metric .estimator     .estimate
##   <chr>   <chr>              <dbl>
## 1 rsq     standard   0.01128111429

Lasso analysis

mod.lasso = cv.glmnet(train.mat, training$score_in_memory_game, 
    alpha = 1, lambda = grid, thresh = 1e-12)
lambda.best = mod.lasso$lambda.min
lambda.best
## [1] 10000
plot(mod.lasso)

lasso.pred = predict(mod.lasso, newx = test.mat, s = lambda.best)

lasso_test_results <- lasso.pred %>% bind_cols(test)
# Converting the column into a numeric from a numeric matrix.
lasso_test_results$...1 <- as.numeric(lasso_test_results$...1)

rmse(lasso_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
##   .metric .estimator   .estimate
##   <chr>   <chr>            <dbl>
## 1 rmse    standard   11.87983702
rsq(lasso_test_results, truth = score_in_memory_game, estimate = ...1)
## Warning: A correlation computation is required, but `estimate` is constant
## and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be
## returned.
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard          NA

PLS

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:rstanarm':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
pls.fit = plsr(score_in_memory_game ~ ., data = training, scale = T, 
    validation = "CV")
validationplot(pls.fit, val.type = "MSEP")

pls.pred = predict(pls.fit, test, ncomp = 10)

sqrt(mean((test$score_in_memory_game - pls.pred)^2))
## [1] 11.86021

Setting up the base RMSE to compare the other RMSE’s with. (Mean only)

meanonly_test_mse <- mean((mean(training$score_in_memory_game) - 
    test$score_in_memory_game)^2)
sqrt(meanonly_test_mse)
## [1] 11.87984