## The objective of this simulation is to demonstrate the following hypothesis:
## In a regression model, when the predictors have different standard deviations, the standardized coefficients will no longer be reliable (predictors with equal true effects get unequal standardized coefficients)
# Repeat the simulation N times; each run returns the difference between
# the absolute standardized coefficients of x2 and x1.
N <- 1000
results <- replicate(N, {
  # Two predictors, both mean 0; the only difference is that x2 has a
  # 20x larger standard deviation than x1.
  x1 <- rnorm(n = 30, mean = 0, sd = 1)
  x2 <- rnorm(n = 30, mean = 0, sd = 20)
  # Create y such that the true coefficients of x1 and x2 are both 1,
  # adding a small error term to avoid a perfect model fit.
  y <- x1 + x2 + 0.01 * rnorm(30, 0, 20)
  # Standardize the variables. scale() returns a 1-column matrix, so
  # coerce back to a plain vector to keep lm()'s coefficient names simple.
  x1 <- as.numeric(scale(x1))
  x2 <- as.numeric(scale(x2))
  y <- as.numeric(scale(y))
  # Fit the linear regression on the standardized variables.
  model <- lm(y ~ x1 + x2)
  # Extract coefficients by name rather than by position, so the code
  # survives a reordering of the formula terms.
  coefs <- coef(model)
  abs(coefs[["x2"]]) - abs(coefs[["x1"]])
})
# Mean difference between the standardized coefficients of x2 and x1
# across all simulated runs.
mean(results)
## HOW TO INTERPRET THE OUTCOME
## the output is close to 1, which means that the standardized coefficient
## of x2 is much larger than that of x1 even though their true effects are
## identical — which supports the above hypothesis
## NOTE:
## try re-running this simulation without standardizing the variables
## the output will be very close to 0
## i.e. the unstandardized coefficients of x1 and x2 will both be
## approximately equal to their true value of 1