From Problem Set 2.
Let’s set our random seed:
# Fix the random number generator state so later random draws are reproducible.
set.seed(seed = 42)
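Let’s write a function that takes in a number of iterations and returns a data frame with all of the relevant information for our Monte Carlo estimate of π: the sampled points, whether each one falls inside the unit circle, the running estimate, and its error.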
library(tidyverse)
# Monte Carlo estimate of pi from `n` random points whose coordinates are
# drawn uniformly between -1 and 1.
#
# Returns a tibble with one row per draw and the columns:
#   x, y          point coordinates
#   r             squared distance from the origin (x^2 + y^2)
#   incirc        1 if the point lies inside the unit circle, otherwise 0
#   perc_inside   running fraction of points inside the circle
#   pi_est        running estimate of pi (4 * perc_inside)
#   err, abs_err  signed and absolute difference between `pi` and pi_est
mc_pi <- function(n) {
  # Draw all x values before the y values so the random stream is consumed
  # in a fixed order.
  xs <- runif(n) * 2 - 1
  ys <- runif(n) * 2 - 1

  tibble(x = xs, y = ys) %>%
    mutate(
      r = x^2 + y^2,
      incirc = as.numeric(r <= 1),
      perc_inside = cummean(incirc),
      pi_est = perc_inside * 4,
      err = pi - pi_est,
      abs_err = abs(err)
    )
}
Test this out:
# Run one million iterations and look at the last few running estimates.
test <- mc_pi(1e6)
tail(test[["pi_est"]])
## [1] 3.140900 3.140901 3.140901 3.140902 3.140903 3.140904
Graph our error:
# Plot the absolute error of the running estimate against the iteration
# number, with a log-scaled y axis.
test %>%
  # Record each row's position before thinning, so the x axis shows the real
  # iteration number instead of the index within the thinned sample.
  mutate(iteration = row_number()) %>%
  # Keep rows 1, 1001, 2001, ... (every 1000th row) to keep the plot light.
  filter((iteration - 1) %% 1000 == 0) %>%
  ggplot() +
  geom_point(aes(x = iteration, y = abs_err), size = 0.1) +
  scale_y_log10() +
  xlab("Iteration") +
  ylab("Log error")
Load our data:
# Read the HR dataset; the relative path is resolved against the working directory.
hr <- read_csv(file = "HR_comma_sep.csv")
Label factors:
# Recode columns in one pass: project count and time at the company become
# ordered factors, the others become unordered factors. `work_accident` is
# added as a factor built from the existing `Work_accident` column.
hr <- mutate(
  hr,
  number_project = ordered(number_project),
  time_spend_company = ordered(time_spend_company),
  work_accident = factor(Work_accident),
  left = factor(left),
  sales = factor(sales),
  salary = factor(salary)
)
Drop the extra column with inconsistent naming:
# Remove the original capitalised `Work_accident` column.
hr <- select(hr, -Work_accident)
Let’s shuffle the dataframe:
# Reorder the rows using a random permutation of the row indices
# (each index drawn once, without replacement).
sh_hr <- hr %>% slice(sample(nrow(hr), replace = FALSE))
# First rows of the first three columns before shuffling, for comparison.
head(select(hr, 1:3))
## # A tibble: 6 x 3
## satisfaction_level last_evaluation number_project
## <dbl> <dbl> <ord>
## 1 0.38 0.53 2
## 2 0.80 0.86 5
## 3 0.11 0.88 7
## 4 0.72 0.87 5
## 5 0.37 0.52 2
## 6 0.41 0.50 2
# First rows of the first three columns after shuffling.
head(select(sh_hr, 1:3))
## # A tibble: 6 x 3
## satisfaction_level last_evaluation number_project
## <dbl> <dbl> <ord>
## 1 0.36 0.57 2
## 2 0.09 0.79 6
## 3 0.65 0.96 2
## 4 0.56 0.79 4
## 5 0.99 0.73 3
## 6 0.78 0.89 4
Split the dataset:
# Split the shuffled rows into a training set (the first `n_train` rows, at
# most 10000) and a test set (all remaining rows).
# Capping `n_train` at the row count and building the test indices with
# seq_len() keeps the two sets disjoint when there are 10000 rows or fewer;
# a descending seq(10001, nrow(sh_hr)) would pull training rows into the
# test set in that case.
n_train <- min(10000, nrow(sh_hr))
hr_train <- slice(sh_hr, seq_len(n_train))
hr_test <- slice(sh_hr, n_train + seq_len(nrow(sh_hr) - n_train))
Examine some summary statistics:
# Per-column summary of the training set.
hr_train %>% summary()
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.0900 Min. :0.3600 2:1547 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 3:2708 1st Qu.:156.0
## Median :0.6500 Median :0.7200 4:2911 Median :201.0
## Mean :0.6147 Mean :0.7174 5:1859 Mean :201.7
## 3rd Qu.:0.8200 3rd Qu.:0.8700 6: 806 3rd Qu.:246.0
## Max. :1.0000 Max. :1.0000 7: 169 Max. :310.0
##
## time_spend_company left promotion_last_5years sales
## 3 :4304 0:7627 Min. :0.0000 sales :2740
## 2 :2116 1:2373 1st Qu.:0.0000 technical :1856
## 4 :1725 Median :0.0000 support :1489
## 5 : 983 Mean :0.0222 IT : 829
## 6 : 478 3rd Qu.:0.0000 marketing : 588
## 10 : 147 Max. :1.0000 product_mng: 577
## (Other): 247 (Other) :1921
## salary work_accident
## high : 824 0:8574
## low :4845 1:1426
## medium:4331
##
##
##
##
library(GGally)
# Build the pairwise plot matrix over every training-set column, then display it.
prs <- ggpairs(data = hr_train)
prs