Quadratic Linear Mixed Effects Models - Private-Projects237/Statistics GitHub Wiki
Overview
The goal of this wiki is generate several datasets and test to see whether modeling predictors as quadratic can improve model fit. We will not spend much time emphasizing on creating the dataset but more on interpreting the plots and how that relates to the model that will best explain the outcome. Additionally, nested data was prioritized since using the anova()
function for model comparison can directly present the number of parameters that had to be estimated by using quadratic modeling.
Part 1: Income Dataset
The following datasets were generated with the help of Grok. They are showcasing the relationship between age and income by years of experience (1-5) and sex (m, f). Three datasets were generated, one where the relationship between age and income is linear for both sexes, another where this relationship is quadratic, and one where this relationship is quadratic but only for one sex.
Generating the datasets
set.seed(123)
n_people <- 100
obs_per_person <- 5
total_obs <- n_people * obs_per_person
library(ggplot2)
# Dataset 1: Linear relationship for both males and females
dummy_data_linear <- data.frame(
person_id = factor(rep(1:n_people, each = obs_per_person)),
age = rep(runif(n_people, 20, 60), each = obs_per_person), # Base age per person
experience = rep(1:obs_per_person, times = n_people), # 1-5 years of experience
gender = factor(rep(sample(c(0, 1), n_people, replace = TRUE), each = obs_per_person),
labels = c("Male", "Female"))
) %>%
mutate(
age_centered = age - mean(age), # Center age around its mean
# Income model: Linear effect for both genders, no quadratic terms
income = 30000 + # Base income
500 * age_centered + # Linear age effect (both genders)
2000 * experience + # Linear effect of experience (2000/year)
5000 * (gender == "Female") + # Females have higher base income
rnorm(n_people, sd = 5000)[person_id] + # Random intercept per person
rnorm(total_obs, sd = 2000) # Residual error
)
# Dataset 2: Quadratic relationship for both males and females
dummy_data_quadratic <- data.frame(
person_id = factor(rep(1:n_people, each = obs_per_person)),
age = rep(runif(n_people, 20, 60), each = obs_per_person), # Base age per person
experience = rep(1:obs_per_person, times = n_people), # 1-5 years of experience
gender = factor(rep(sample(c(0, 1), n_people, replace = TRUE), each = obs_per_person),
labels = c("Male", "Female"))
) %>%
mutate(
age_centered = age - mean(age), # Center age around its mean
# Income model: Quadratic effect for both genders
income = 30000 + # Base income
500 * age_centered + # Linear age effect (both genders)
(-40 * age_centered^2) + # Quadratic effect for both genders
2000 * experience + # Linear effect of experience (2000/year)
5000 * (gender == "Female") + # Females have higher base income
rnorm(n_people, sd = 5000)[person_id] + # Random intercept per person
rnorm(total_obs, sd = 2000) # Residual error
)
# Generate the third and final dataset
dummy_data_mixed <- data.frame(
person_id = factor(rep(1:n_people, each = obs_per_person)),
age = rep(runif(n_people, 20, 60), each = obs_per_person), # Base age per person
experience = rep(1:obs_per_person, times = n_people), # 1-5 years of experience
gender = factor(rep(sample(c(0, 1), n_people, replace = TRUE), each = obs_per_person),
labels = c("Male", "Female"))
) %>%
mutate(
age_centered = age - mean(age), # Center age around its mean
# Income model:
# - Quadratic effect for males: strong linear and quadratic terms
# - For females: minimal/no quadratic effect, similar linear effect
# - Experience: linear increase in income, same for both genders
income = 30000 + # Base income
500 * age_centered + # Linear age effect (both genders)
(-40 * age_centered^2) * (gender == "Male") + # Strong quadratic for males
(0 * age_centered^2) * (gender == "Female") + # No quadratic for females
2000 * experience + # Linear effect of experience (2000/year)
5000 * (gender == "Female") + # Females have higher base income
rnorm(n_people, sd = 5000)[person_id] + # Random intercept per person
rnorm(total_obs, sd = 2000) # Residual error
)
Visualizing the datasets
# Visualize Dataset 1 (Linear for both)
ggplot(dummy_data_linear, aes(x = age_centered, y = income, color = gender)) +
geom_point(alpha = 0.5) +
geom_smooth(se = FALSE) +
facet_wrap(~ experience) +
theme_minimal() +
labs(title = "Linear Model: Income vs. Centered Age\nby Gender and Experience",
x = "Centered Age", y = "Income")
# Visualize Dataset 2 (Quadratic for both)
ggplot(dummy_data_quadratic, aes(x = age_centered, y = income, color = gender)) +
geom_point(alpha = 0.5) +
geom_smooth( se = FALSE) +
facet_wrap(~experience) +
theme_minimal() +
labs(title = "Quadratic Model: Income vs. Centered Age\nby Gender and Experience",
x = "Centered Age", y = "Income")
# Visualize Dataset 3 (Quadratic for one and Linear for the other)
ggplot(dummy_data_mixed, aes(x = age_centered, y = income, color = gender)) +
geom_point(alpha = 0.5) +
geom_smooth( se = FALSE) +
facet_wrap(~experience) +
theme_minimal() +
labs(title = "Quadratic and Linear Model: Income vs. Centered Age\nby Gender and Experience",
x = "Centered Age", y = "Income")
Linear data | Quadratic data | Mixed data |
---|---|---|
Fitting the models Four models are being specified below for each of the three datasets above. These models were chosen on purpose to prove a point, and that is that we can to an extent know what model would best explain our data based on what the graphs show. The following models are specifying and comparing are the following:
- Model 1: Contains three main effects (experience, gender, age_centered) and the random effects (person_id)
- Model 2: Introduces the quadratic version of age_centered (I(age_centered^2))
- Model 3: Introduces the linear version of age and gender interaction
- Model 4: Introduces the quadratic version of age and gender interaction.
From the plots above we can predict that the following models will best represent the data:
- Linear data: The first model will since the relationship between age and income looks linear for both sexes
- Quadratic data: The second model will since both sexes have a quadratic relationship between age and income. An interaction will not make sense here since this relationship looks constant across the levels of sex.
- Mixed data: The fourth model since we can see that for males there is a quadratic relationship between age and income while for females it is linear. Thus, an interaction on this relationship is needed for the model to test this.
# Generate the models of interest
mod1 <- lmer(income ~ (1 | person_id) + experience + gender + age_centered, data = dummy_data_linear)
mod2 <- update(mod1, . ~ . + I(age_centered^2))
mod3 <- update(mod2, . ~ . + age_centered * gender)
mod4 <- update(mod3, . ~ . + I(age_centered^2) *gender)
# Model comparison
anova(mod1, mod2, mod3, mod4)
# Generate the models of interest
mod1 <- lmer(income ~ (1 | person_id) + experience + gender + age_centered, data = dummy_data_quadratic)
mod2 <- update(mod1, . ~ . + I(age_centered^2))
mod3 <- update(mod2, . ~ . + age_centered * gender)
mod4 <- update(mod3, . ~ . + I(age_centered^2) *gender)
# Model comparison
anova(mod1, mod2, mod3, mod4)
# Generate the models of interest
mod1 <- lmer(income ~ (1 | person_id) + experience + gender + age_centered, data = dummy_data_mixed)
mod2 <- update(mod1, . ~ . + I(age_centered^2))
mod3 <- update(mod2, . ~ . + age_centered * gender)
mod4 <- update(mod3, . ~ . + I(age_centered^2) *gender)
# Model comparison
anova(mod1, mod2, mod3, mod4)
Model Comparison for the Linear data |
---|
Model Comparison for the Quadratic data |
---|
Model Comparison for the Mixed data |
---|