Here we are going to first call all the important libraries we will be using in order to perform the data analysis.
library(ggplot2)
library(dplyr)
library(tidymodels)
library(readr)
library(janitor)
library(furniture)
library(rstanarm)
library(rstan)
library(moderndive)
library(tidyverse)
library(vip)
library(leaps) #function to search for the best model
library(faraway)
library(ISLR)
library(knitr)
library(glmnet)
Next, we are going to call our dataset and create a table from it.
students <- read_csv("~/Murtaza/students.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## DataYear = col_double(),
## ClassGrade = col_double(),
## Ageyears = col_double(),
## Score_in_memory_game = col_double(),
## Importance_reducing_pollution = col_double(),
## Importance_recycling_rubbish = col_double(),
## Importance_conserving_water = col_double(),
## Importance_saving_energy = col_double(),
## Importance_owning_computer = col_double(),
## Importance_Internet_access = col_double()
## )
## See spec(...) for full column specifications.
students <- students %>% clean_names()
Let us take a look at the available data set!
head(students)
## # A tibble: 6 x 60
## country region data_year class_grade gender ageyears handed height_cm
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr>
## 1 USA MI 2013 11 Female 17 Right… 156.6
## 2 USA MI 2019 9 Male 15 Right… 170
## 3 USA MI 2014 12 Female 17 Right… 177
## 4 USA MI 2014 12 Female 17 Right… 162.5
## 5 USA MI 2017 12 Male 17 Left-… 180
## 6 USA MI 2018 12 Female 16 Ambid… 152.4
## # … with 52 more variables: footlength_cm <chr>, armspan_cm <chr>,
## # languages_spoken <chr>, travel_to_school <chr>,
## # travel_time_to_school <chr>, reaction_time <chr>,
## # score_in_memory_game <dbl>, favourite_physical_activity <chr>,
## # importance_reducing_pollution <dbl>, importance_recycling_rubbish <dbl>,
## # importance_conserving_water <dbl>, importance_saving_energy <dbl>,
## # importance_owning_computer <dbl>, importance_internet_access <dbl>,
## # left_footlength_cm <chr>, longer_foot <chr>, index_fingerlength_mm <chr>,
## # ring_fingerlength_mm <chr>, longer_finger_lefthand <chr>,
## # birth_month <chr>, favorite_season <chr>, allergies <chr>,
## # vegetarian <chr>, favorite_food <chr>, beverage <chr>,
## # favorite_school_subject <chr>, sleep_hours_schoolnight <chr>,
## # sleep_hours_non_schoolnight <chr>, home_occupants <chr>,
## # home_internet_access <chr>, communication_with_friends <chr>,
## # text_messages_sent_yesterday <chr>, text_messages_received_yesterday <chr>,
## # hanging_out_with_friends_hours <chr>, talking_on_phone_hours <chr>,
## # doing_homework_hours <chr>, doing_things_with_family_hours <chr>,
## # outdoor_activities_hours <chr>, video_games_hours <chr>,
## # social_websites_hours <chr>, texting_messaging_hours <chr>,
## # computer_use_hours <chr>, watching_tv_hours <chr>, paid_work_hours <chr>,
## # work_at_home_hours <chr>, schoolwork_pressure <chr>,
## # planned_education_level <chr>, favorite_music <chr>, superpower <chr>,
## # preferred_status <chr>, role_model_type <chr>, charity_donation <chr>
In order to answer our question we will first perform some simple cleaning of the data. Some of the columns seem unnecessary for our analysis so we will be removing them in order to do the next step. We will also remove any records that are missing.
students_new <- students %>% select(gender, class_grade, ageyears,
height_cm, reaction_time, score_in_memory_game, sleep_hours_schoolnight,
sleep_hours_non_schoolnight, hanging_out_with_friends_hours,
doing_homework_hours, doing_things_with_family_hours, outdoor_activities_hours,
video_games_hours)
glimpse(students_new)
## Rows: 500
## Columns: 13
## $ gender <chr> "Female", "Male", "Female", "Female", …
## $ class_grade <dbl> 11, 9, 12, 12, 12, 12, 12, 4, 12, 9, 9…
## $ ageyears <dbl> 17, 15, 17, 17, 17, 16, 17, 16, 17, 13…
## $ height_cm <chr> "156.6", "170", "177", "162.5", "180",…
## $ reaction_time <chr> ".431", ".001", ".4", ".482", ".521", …
## $ score_in_memory_game <dbl> 46, 30, NA, 31, 36, 71, 45, 33, NA, 53…
## $ sleep_hours_schoolnight <chr> "7.5", "2", "8", "8", "4", "7", "7", "…
## $ sleep_hours_non_schoolnight <chr> "9", NA, "12", "9", "3", "9", "10", "9…
## $ hanging_out_with_friends_hours <chr> "6", NA, "7", "6", "4", ".5", "16", "2…
## $ doing_homework_hours <chr> "8", NA, "1", "10", "0", "5", "5", "3"…
## $ doing_things_with_family_hours <chr> "2", NA, "6", "10", "8", "7", "3", "10…
## $ outdoor_activities_hours <chr> "3", NA, "10", "20", "1", "0", "0", "2…
## $ video_games_hours <chr> "0", NA, "0", "0", "40", "3", "4", "70…
Now that we have our table set up, I will remove the field with NA as we don’t need them for our analysis.
students_new <- students_new %>% drop_na()
Let us now summarize and look at the dataset available
summary(students_new)
## gender class_grade ageyears height_cm
## Length:400 Min. : 4.00 Min. : 4.00 Length:400
## Class :character 1st Qu.:11.00 1st Qu.: 15.00 Class :character
## Mode :character Median :12.00 Median : 17.00 Mode :character
## Mean :10.96 Mean : 43.87
## 3rd Qu.:12.00 3rd Qu.: 17.00
## Max. :12.00 Max. :11102.00
## reaction_time score_in_memory_game sleep_hours_schoolnight
## Length:400 Min. : 2.00 Length:400
## Class :character 1st Qu.:36.75 Class :character
## Mode :character Median :43.00 Mode :character
## Mean :44.11
## 3rd Qu.:50.00
## Max. :91.00
## sleep_hours_non_schoolnight hanging_out_with_friends_hours
## Length:400 Length:400
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## doing_homework_hours doing_things_with_family_hours outdoor_activities_hours
## Length:400 Length:400 Length:400
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## video_games_hours
## Length:400
## Class :character
## Mode :character
##
##
##
From a closer look at the way the variables are recognized it seems that R is not recognizing variables such as gender, height, reaction time, hanging out with friends, etc. as continuous variables but instead as character vectors. After a deeper dive into the data set I have found some minor inaccuracies in the data set which is causing R to not recognize each column appropriately. An example of an inconsistency I found is students reporting their information in Feet when CM was asked of them. There are instances of them adding special characters such as “‘" or’”’. In order to fix the data individually I will be opening up excel and formatting the information accordingly.
Now that I have fixed the data I will be re uploading the data set and running the summary to compare and see how R recognizes each variable.
students_fixed <- read_csv("~/Murtaza/students_fixed.csv")
## Parsed with column specification:
## cols(
## gender = col_character(),
## class_grade = col_double(),
## ageyears = col_double(),
## height_cm = col_double(),
## reaction_time = col_double(),
## score_in_memory_game = col_double(),
## sleep_hours_schoolnight = col_double(),
## sleep_hours_non_schoolnight = col_double(),
## hanging_out_with_friends_hours = col_double(),
## doing_homework_hours = col_double(),
## doing_things_with_family_hours = col_double(),
## outdoor_activities_hours = col_double(),
## video_games_hours = col_double()
## )
# Data wrangling to remove any outliers and data that skews
# the overall results
students_fixed <- students_fixed %>% filter(height_cm > 120,
reaction_time < 10, doing_homework_hours <= 168, doing_things_with_family_hours <=
168, outdoor_activities_hours <= 168)
121.92
## [1] 121.92
summary(students_fixed)
## gender class_grade ageyears height_cm
## Length:372 Min. : 4.00 Min. : 4.00 Min. :132.0
## Class :character 1st Qu.:11.00 1st Qu.:15.00 1st Qu.:161.0
## Mode :character Median :12.00 Median :17.00 Median :168.0
## Mean :10.99 Mean :16.18 Mean :168.7
## 3rd Qu.:12.00 3rd Qu.:17.00 3rd Qu.:177.0
## Max. :12.00 Max. :21.00 Max. :231.0
## reaction_time score_in_memory_game sleep_hours_schoolnight
## Min. :0.0100 Min. : 7.00 Min. : 3.000
## 1st Qu.:0.3360 1st Qu.:37.00 1st Qu.: 6.000
## Median :0.3985 Median :43.00 Median : 7.000
## Mean :0.5118 Mean :44.22 Mean : 6.797
## 3rd Qu.:0.5365 3rd Qu.:50.00 3rd Qu.: 7.625
## Max. :6.6900 Max. :91.00 Max. :12.000
## sleep_hours_non_schoolnight hanging_out_with_friends_hours
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.: 5.00
## Median : 9.00 Median : 9.00
## Mean : 8.77 Mean : 15.83
## 3rd Qu.:10.00 3rd Qu.: 20.00
## Max. :16.00 Max. :144.00
## doing_homework_hours doing_things_with_family_hours outdoor_activities_hours
## Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 3.00 1st Qu.: 2.000
## Median : 7.000 Median : 5.00 Median : 6.000
## Mean : 9.882 Mean : 10.15 Mean : 9.245
## 3rd Qu.: 12.000 3rd Qu.: 10.00 3rd Qu.: 14.000
## Max. :160.000 Max. :168.00 Max. :100.000
## video_games_hours
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 5.194
## 3rd Qu.: 5.000
## Max. :80.000
I also noticed that for the ageyears variable, there seems to be one entry thats 11102.00 which is going to create a huge problem for our analysis. That same student reported un realistic results which makes me doubt this data validity so I have decided to remove the record entirely from the analysis.
Now that we have the summary of the cleaned data we can see a couple of issues with how R is recognizing each field we are now going to adjust the columns appropriately below:
# Converting the gender to a factor instead of character
students_fixed$gender <- as.factor(students_fixed$gender)
# Converting the class grade to factor
students_fixed$class_grade <- as.factor(students_fixed$class_grade)
# Converting the age to factor
students_fixed$ageyears <- as.factor(students_fixed$ageyears)
summary(students_fixed)
## gender class_grade ageyears height_cm reaction_time
## Female:231 4 : 9 17 :182 Min. :132.0 Min. :0.0100
## Male :141 8 : 1 16 : 62 1st Qu.:161.0 1st Qu.:0.3360
## 9 : 75 14 : 57 Median :168.0 Median :0.3985
## 10: 1 15 : 33 Mean :168.7 Mean :0.5118
## 11: 74 18 : 27 3rd Qu.:177.0 3rd Qu.:0.5365
## 12:212 13 : 6 Max. :231.0 Max. :6.6900
## (Other): 5
## score_in_memory_game sleep_hours_schoolnight sleep_hours_non_schoolnight
## Min. : 7.00 Min. : 3.000 Min. : 0.00
## 1st Qu.:37.00 1st Qu.: 6.000 1st Qu.: 8.00
## Median :43.00 Median : 7.000 Median : 9.00
## Mean :44.22 Mean : 6.797 Mean : 8.77
## 3rd Qu.:50.00 3rd Qu.: 7.625 3rd Qu.:10.00
## Max. :91.00 Max. :12.000 Max. :16.00
##
## hanging_out_with_friends_hours doing_homework_hours
## Min. : 0.00 Min. : 0.000
## 1st Qu.: 5.00 1st Qu.: 3.000
## Median : 9.00 Median : 7.000
## Mean : 15.83 Mean : 9.882
## 3rd Qu.: 20.00 3rd Qu.: 12.000
## Max. :144.00 Max. :160.000
##
## doing_things_with_family_hours outdoor_activities_hours video_games_hours
## Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.00 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 5.00 Median : 6.000 Median : 1.000
## Mean : 10.15 Mean : 9.245 Mean : 5.194
## 3rd Qu.: 10.00 3rd Qu.: 14.000 3rd Qu.: 5.000
## Max. :168.00 Max. :100.000 Max. :80.000
##
Now that the data is sorted and cleaned let us use the furnitre package to find the estimated mean, sd and of all the variables.
table1(students_fixed, gender, class_grade, ageyears, height_cm,
reaction_time, score_in_memory_game, sleep_hours_schoolnight,
sleep_hours_non_schoolnight, hanging_out_with_friends_hours,
doing_homework_hours, doing_things_with_family_hours, outdoor_activities_hours,
video_games_hours, output = "markdown")
| Mean/Count (SD/%) | |
|---|---|
| n = 372 | |
| gender | |
| Female | 231 (62.1%) |
| Male | 141 (37.9%) |
| class_grade | |
| 4 | 9 (2.4%) |
| 8 | 1 (0.3%) |
| 9 | 75 (20.2%) |
| 10 | 1 (0.3%) |
| 11 | 74 (19.9%) |
| 12 | 212 (57%) |
| ageyears | |
| 4 | 1 (0.3%) |
| 12 | 1 (0.3%) |
| 13 | 6 (1.6%) |
| 14 | 57 (15.3%) |
| 15 | 33 (8.9%) |
| 16 | 62 (16.7%) |
| 17 | 182 (48.9%) |
| 17.5 | 1 (0.3%) |
| 18 | 27 (7.3%) |
| 20 | 1 (0.3%) |
| 21 | 1 (0.3%) |
| height_cm | |
| 168.7 (10.9) | |
| reaction_time | |
| 0.5 (0.5) | |
| score_in_memory_game | |
| 44.2 (11.2) | |
| sleep_hours_schoolnight | |
| 6.8 (1.3) | |
| sleep_hours_non_schoolnight | |
| 8.8 (2.2) | |
| hanging_out_with_friends_hours | |
| 15.8 (19.2) | |
| doing_homework_hours | |
| 9.9 (13.6) | |
| doing_things_with_family_hours | |
| 10.1 (18.6) | |
| outdoor_activities_hours | |
| 9.2 (11.9) | |
| video_games_hours | |
| 5.2 (10.8) |
students_fixed %>% ggplot(aes(class_grade)) + geom_bar(aes(fill = gender)) +
facet_wrap(~gender)
students_fixed %>% group_by(gender) %>% summarise(mean(score_in_memory_game))
## # A tibble: 2 x 2
## gender `mean(score_in_memory_game)`
## * <fct> <dbl>
## 1 Female 43.9
## 2 Male 44.8
Let us plot this data and see it more objectively.
students_fixed %>% ggplot(aes(x = gender, y = score_in_memory_game)) +
geom_boxplot() + labs(title = "Boxplot of score comparision between genders") +
ylab("Memory game score")
# facet_wrap( ~ class_grade )
Based on the plot above we can see that males are scoring better on the memory game when compared to females. Although there are less males than females in the data set. It could just be that since there are more females who took the survey their mean goes down.
ggplot(data = students_fixed, aes(y = score_in_memory_game, x = height_cm)) +
geom_point() + geom_smooth(method = "lm", se = FALSE) + # geom_smooth(color = 'orange')+ ylim(0,40)+
facet_wrap(~gender)
## `geom_smooth()` using formula 'y ~ x'
Based on the results above it looks like there is some negative correlation, suggesting that taller people have a harder time in getting a good score in the memory game as compared to shorter people. But overall we see here the males are doing much better on the test than females.
ggplot(data = students_fixed, aes(x = score_in_memory_game)) +
geom_histogram() + # ylim(0,40)+
facet_wrap(~gender)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram suggests that the average score in females is less than the males but there are also more participants for females.
Let us now see how well each of the grades did on the memory game when we compare it to the amount of hours they spend playing video games. Is there some kind of correlation? I decided to filter out some of the grade as there was no data for it.
students_fixed %>% filter(class_grade != 6, class_grade != 8,
class_grade != 10) %>% ggplot(aes(y = score_in_memory_game,
x = video_games_hours)) + geom_point() + # geom_smooth(method = 'lm', se = FALSE)+
facet_wrap(~class_grade)
It is interesting to see that there is a negative correlation between the height and the overall score of the memory game. I was expecting a positive correlation for tall people but let us further investigate and create a base linear regression model using TidyModels to see which variables are best to create the prediction model.
lmodel <- lm(score_in_memory_game ~ height_cm, data = students_fixed)
get_regression_table(lmodel)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 55.9 9.07 6.16 0 38.0 73.7
## 2 height_cm -0.069 0.054 -1.29 0.198 -0.175 0.036
students_fixed %>% summarise(r = cor(score_in_memory_game, height_cm))
## # A tibble: 1 x 1
## r
## <dbl>
## 1 -0.0668
Our correlation coefficient is -0.06683197 and based on the above results from the table we can assume the following:
This model actually suggests that taller people fair poorly in cognitive based tests.
In order to accomplish the following the code was referenced from here https://www.gmudatamining.com/lesson-10-r-tutorial.html
Let us now create our test and training dataset:
set.seed(10)
# Converting these variables back to numeric in order to
# perform the linear analysis.
students_fixed$ageyears <- as.numeric(students_fixed$ageyears)
students_fixed$class_grade <- as.numeric(students_fixed$class_grade)
# Create a split object
data_split <- initial_split(students_fixed, prop = 0.67, strata = score_in_memory_game)
# Build training data set
training <- data_split %>% training()
# Build testing data set
test <- data_split %>% testing()
lm_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")
lm_model
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
lm_fit <- lm_model %>% fit(score_in_memory_game ~ ., data = training)
lm_fit
## parsnip model object
##
## Fit time: 4ms
##
## Call:
## stats::lm(formula = score_in_memory_game ~ ., data = data)
##
## Coefficients:
## (Intercept) genderMale
## 73.1038528 2.9387243
## class_grade ageyears
## -0.3015864 -0.7517686
## height_cm reaction_time
## -0.1118949 1.7083436
## sleep_hours_schoolnight sleep_hours_non_schoolnight
## -0.7491965 -0.0309466
## hanging_out_with_friends_hours doing_homework_hours
## -0.0130812 -0.0016059
## doing_things_with_family_hours outdoor_activities_hours
## -0.0536262 0.0404418
## video_games_hours
## -0.0007044
Now that we have our training model lets take a look at the training results and call the summary of the model to get the estimated coefficients, f-stats, p-values and RMSE.
names(lm_fit)
## [1] "lvl" "spec" "fit" "preproc" "elapsed"
summary(lm_fit$fit)
##
## Call:
## stats::lm(formula = score_in_memory_game ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.772 -7.126 -0.914 4.916 45.443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.1038528 13.9486469 5.241 3.52e-07 ***
## genderMale 2.9387243 1.9241911 1.527 0.128
## class_grade -0.3015864 0.6821792 -0.442 0.659
## ageyears -0.7517686 0.6075561 -1.237 0.217
## height_cm -0.1118949 0.0799610 -1.399 0.163
## reaction_time 1.7083436 1.4186305 1.204 0.230
## sleep_hours_schoolnight -0.7491965 0.5493701 -1.364 0.174
## sleep_hours_non_schoolnight -0.0309466 0.3278968 -0.094 0.925
## hanging_out_with_friends_hours -0.0130812 0.0407753 -0.321 0.749
## doing_homework_hours -0.0016059 0.0521176 -0.031 0.975
## doing_things_with_family_hours -0.0536262 0.0420188 -1.276 0.203
## outdoor_activities_hours 0.0404418 0.0626019 0.646 0.519
## video_games_hours -0.0007044 0.0681141 -0.010 0.992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.84 on 238 degrees of freedom
## Multiple R-squared: 0.05882, Adjusted R-squared: 0.01136
## F-statistic: 1.239 on 12 and 238 DF, p-value: 0.2567
par(mfrow = c(2, 2))
plot(lm_fit$fit, pch = 16, col = "#006EA1")
To obtain the detailed results from our trained linear regression model in a data frame, we can use the tidy() and glance() functions directly on our trained parsnip model, lm_fit.
tidy(lm_fit)
## # A tibble: 13 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 73.1 13.9 5.24 0.000000352
## 2 genderMale 2.94 1.92 1.53 0.128
## 3 class_grade -0.302 0.682 -0.442 0.659
## 4 ageyears -0.752 0.608 -1.24 0.217
## 5 height_cm -0.112 0.0800 -1.40 0.163
## 6 reaction_time 1.71 1.42 1.20 0.230
## 7 sleep_hours_schoolnight -0.749 0.549 -1.36 0.174
## 8 sleep_hours_non_schoolnight -0.0309 0.328 -0.0944 0.925
## 9 hanging_out_with_friends_hours -0.0131 0.0408 -0.321 0.749
## 10 doing_homework_hours -0.00161 0.0521 -0.0308 0.975
## 11 doing_things_with_family_hours -0.0536 0.0420 -1.28 0.203
## 12 outdoor_activities_hours 0.0404 0.0626 0.646 0.519
## 13 video_games_hours -0.000704 0.0681 -0.0103 0.992
glance(lm_fit)
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0588 0.0114 10.8 1.24 0.257 12 -948. 1923. 1973.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
predict(lm_fit, new_data = test)
## # A tibble: 121 x 1
## .pred
## <dbl>
## 1 45.0
## 2 44.8
## 3 43.1
## 4 45.4
## 5 45.3
## 6 43.7
## 7 41.2
## 8 42.6
## 9 42.2
## 10 41.8
## # … with 111 more rows
# In order to see the most important variables in the list.
vip(lm_fit)
This is actually a very interesting analysis. It seems like rather than time spent video gaming, gender, height_cm, sleep, doing things with family, age and reaction_time are better factors to predict the score.
Lets combine the test data set and the predictions into a single data frame. We create a data frame with the predictions on the test data and then use bind_cols to add the test data to the results.
test_results <- predict(lm_fit, new_data = test) %>% bind_cols(test)
head(test_results)
## # A tibble: 6 x 14
## .pred gender class_grade ageyears height_cm reaction_time score_in_memory…
## <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 45.0 Female 6 6 152. 0.664 71
## 2 44.8 Male 5 6 176 0.669 58
## 3 43.1 Female 5 6 172 0.558 50
## 4 45.4 Female 3 4 166 0.78 48
## 5 45.3 Male 6 7 170 0.285 40
## 6 43.7 Male 5 7 192 0.692 60
## # … with 7 more variables: sleep_hours_schoolnight <dbl>,
## # sleep_hours_non_schoolnight <dbl>, hanging_out_with_friends_hours <dbl>,
## # doing_homework_hours <dbl>, doing_things_with_family_hours <dbl>,
## # outdoor_activities_hours <dbl>, video_games_hours <dbl>
rmse(test_results, truth = score_in_memory_game, estimate = .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 11.86021383
rsq(test_results, truth = score_in_memory_game, estimate = .pred)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.01243135209
Best way to test the accuracy is by making an R-squared plot. This can be used for any regression model.
ggplot(data = test_results, mapping = aes(x = .pred, y = score_in_memory_game)) +
geom_point(color = "#006EA1") + geom_abline(intercept = 0,
slope = 1, color = "orange") + labs(title = "Linear Regression Results - Students Test Set",
x = "Predicted Score", y = "Actual Score")
# xlim(35,55)
Since our RMSE is at 11.86021. Its hard to tell if this would be a best suitable model to use for our prediction at the moment. Let us run a few other and come to a proper conclusion.
let us compare the OLS results to the above RMSE and validate our results.
new_linear <- lm(score_in_memory_game ~ ., data = training)
prediction_linear <- predict(new_linear, test)
sqrt(mean((prediction_linear - test$score_in_memory_game)^2))
## [1] 11.86021
Looks like our linear regression model using Tidy Models was done correctly as the results are same.
We are going to use the same training and testing model we created previously.
bsubset <- regsubsets(score_in_memory_game ~ ., data = training)
reg_summary <- summary(bsubset)
reg_summary
## Subset selection object
## Call: regsubsets.formula(score_in_memory_game ~ ., data = training)
## 12 Variables (and intercept)
## Forced in Forced out
## genderMale FALSE FALSE
## class_grade FALSE FALSE
## ageyears FALSE FALSE
## height_cm FALSE FALSE
## reaction_time FALSE FALSE
## sleep_hours_schoolnight FALSE FALSE
## sleep_hours_non_schoolnight FALSE FALSE
## hanging_out_with_friends_hours FALSE FALSE
## doing_homework_hours FALSE FALSE
## doing_things_with_family_hours FALSE FALSE
## outdoor_activities_hours FALSE FALSE
## video_games_hours FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## genderMale class_grade ageyears height_cm reaction_time
## 1 ( 1 ) " " " " "*" " " " "
## 2 ( 1 ) " " " " "*" " " " "
## 3 ( 1 ) " " " " "*" " " " "
## 4 ( 1 ) " " " " "*" " " "*"
## 5 ( 1 ) "*" " " "*" "*" " "
## 6 ( 1 ) "*" " " "*" "*" "*"
## 7 ( 1 ) "*" " " "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*"
## sleep_hours_schoolnight sleep_hours_non_schoolnight
## 1 ( 1 ) " " " "
## 2 ( 1 ) "*" " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" " "
## 8 ( 1 ) "*" " "
## hanging_out_with_friends_hours doing_homework_hours
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " " "
## 7 ( 1 ) " " " "
## 8 ( 1 ) " " " "
## doing_things_with_family_hours outdoor_activities_hours
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" " "
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## video_games_hours
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) " "
Out of all the variables the best 4 predictors seem to be height, reaction time, age and sleep.
names(reg_summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
data.frame(Adj.R2 = which.max(reg_summary$adjr2), RSS = which.max(reg_summary$rss),
CP = which.min(reg_summary$cp), BIC = which.min(reg_summary$bic))
## Adj.R2 RSS CP BIC
## 1 6 1 2 1
Here the R-square tells us the best model to use is the the one with a criteria of 6 predictors. RSS tells us 1, CP tells us to use 3 and BIC only suggests 1. Although no one model is correct.
reg_summary$rsq
## [1] 0.01942275 0.02998832 0.03706057 0.04322996 0.05029775 0.05651022 0.05779270
## [8] 0.05835763
In the RSQ model we see that the R-squared statistic is only 1% with one variable as more variables are added it increases to 5%
multi-linear regression model based on the adj.R2 model.
r2_linear <- lm(score_in_memory_game ~ ageyears + sleep_hours_schoolnight +
reaction_time + gender + doing_things_with_family_hours +
outdoor_activities_hours, data = training)
r2_pred_lin <- predict(r2_linear, test)
sqrt(mean((r2_pred_lin - test$score_in_memory_game)^2))
## [1] 11.95949
plot(bsubset, scale = "bic")
Based on the plot the 1 variable model has the lowest BIC, the 1 variable model with the largest R-square is ageyears.
Lets test the RMSE of the of the BIC model.
bic_linear <- lm(score_in_memory_game ~ ageyears, data = training)
bic_pred_lin <- predict(bic_linear, test)
bic_test_results <- bic_pred_lin %>% bind_cols(test)
rmse(bic_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 11.87164601
rsq(bic_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.004080157493
train.mat = model.matrix(score_in_memory_game ~ ., data = training)
test.mat = model.matrix(score_in_memory_game ~ ., data = test)
grid = 10^seq(4, -2, length = 100)
mod.ridge = cv.glmnet(train.mat, training$score_in_memory_game,
alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best = mod.ridge$lambda.min
lambda.best
## [1] 37.64936
plot(mod.ridge)
ridge.pred = predict(mod.ridge, newx = test.mat, s = lambda.best)
ridge_test_results <- ridge.pred %>% bind_cols(test)
# Converting the column into a numeric from a numeric matrix.
ridge_test_results$...1 <- as.numeric(ridge_test_results$...1)
rmse(ridge_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 11.82801963
rsq(ridge_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.01128111429
mod.lasso = cv.glmnet(train.mat, training$score_in_memory_game,
alpha = 1, lambda = grid, thresh = 1e-12)
lambda.best = mod.lasso$lambda.min
lambda.best
## [1] 10000
plot(mod.lasso)
lasso.pred = predict(mod.lasso, newx = test.mat, s = lambda.best)
lasso_test_results <- lasso.pred %>% bind_cols(test)
# Converting the column into a numeric from a numeric matrix.
lasso_test_results$...1 <- as.numeric(lasso_test_results$...1)
rmse(lasso_test_results, truth = score_in_memory_game, estimate = ...1)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 11.87983702
rsq(lasso_test_results, truth = score_in_memory_game, estimate = ...1)
## Warning: A correlation computation is required, but `estimate` is constant
## and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be
## returned.
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard NA
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:rstanarm':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
pls.fit = plsr(score_in_memory_game ~ ., data = training, scale = T,
validation = "CV")
validationplot(pls.fit, val.type = "MSEP")
pls.pred = predict(pls.fit, test, ncomp = 10)
sqrt(mean((test$score_in_memory_game - pls.pred)^2))
## [1] 11.86021
meanonly_test_mse <- mean((mean(training$score_in_memory_game) -
test$score_in_memory_game)^2)
sqrt(meanonly_test_mse)
## [1] 11.87984