Final Project Code

RESEARCH QUESTION 1

How does physical activity level influence sleep quality across different age groups?

# Load necessary libraries
library(glmnet)

Warning: package 'glmnet' was built under R version 4.3.3

Loading required package: Matrix

Loaded glmnet 4.1-8

library(plotly)

Warning: package 'plotly' was built under R version 4.3.3

Loading required package: ggplot2


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(ggplot2)
# Load the dataset (replace with the path to the original dataset)
data1 <- read.csv(
  "C:/Users/ACER/Desktop/New folder/Sai Sriram Uppada/FinalProject/Sleep_health_and_lifestyle_dataset.csv"
)

# Prepare the data
# Predictors are Age, Physical.Activity.Level and their interaction; the
# response is Quality.of.Sleep.
# NOTE(review): assumes all three columns are numeric in the CSV - confirm.
# The first column of the model matrix is the "(Intercept)" column, which is
# dropped so that only the three predictor columns are handed to glmnet.
x <- model.matrix(
  Quality.of.Sleep ~ Age * Physical.Activity.Level,
  data = data1
)[, -1]
y <- data1[["Quality.of.Sleep"]]

# Fit Lasso Regression model along the full lambda path
lasso_model <- glmnet(x, y, alpha = 1)  # alpha = 1 for Lasso

# Cross-validation for optimal lambda
# cv.glmnet assigns observations to folds at random, so the seed is fixed to
# keep lambda.min (and the coefficients / predictions derived from it)
# reproducible from run to run.
set.seed(123)
cv_lasso <- cv.glmnet(x, y, alpha = 1)

# Coefficients of the lasso fit at the lambda with the lowest CV error
coef(lasso_model, s = cv_lasso$lambda.min)

4 x 1 sparse Matrix of class "dgCMatrix"
                                      s1
(Intercept)                 -2.336480711
Age                          0.216280398
Physical.Activity.Level      0.120627719
Age:Physical.Activity.Level -0.002615155

# Coefficient paths: one labelled curve per predictor, drawn against lambda
plot(
  lasso_model,
  xvar = "lambda",
  label = TRUE
)

# Cross-validation error curve, used to read off the optimal lambda
plot(cv_lasso)

# Build a prediction grid spanning the observed Age and Physical Activity
# ranges: Age in steps of 1, activity level as 100 evenly spaced values.
age_range <- seq(min(data1$Age), max(data1$Age), by = 1)
pa_levels <- seq(
  min(data1$Physical.Activity.Level),
  max(data1$Physical.Activity.Level),
  length.out = 100
)

# Every combination of Age and Physical Activity level
grid_data <- expand.grid(Age = age_range, Physical.Activity.Level = pa_levels)

# Design matrix with the same terms used for fitting (main effects plus
# interaction); the intercept column is dropped as it was for the training x.
grid_data_matrix <- model.matrix(
  ~ Age * Physical.Activity.Level,
  data = grid_data
)[, -1]

# predict() on a glmnet fit returns a one-column matrix. Flatten it to a plain
# numeric vector so the data frame column is an ordinary numeric column rather
# than an embedded matrix when it reaches ggplot2 / plotly.
grid_data$Quality_of_Sleep_Pred <- as.numeric(
  predict(lasso_model, newx = grid_data_matrix, s = cv_lasso$lambda.min)
)

# Heat map of predicted sleep quality; the colour scale is centred (white) on
# the median prediction.
a <- ggplot(
  grid_data,
  aes(x = Age, y = Physical.Activity.Level, fill = Quality_of_Sleep_Pred)
) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = median(grid_data$Quality_of_Sleep_Pred),
    space = "Lab",
    name = "Predicted\nSleep Quality"
  ) +
  labs(
    x = "Age",
    y = "Physical Activity Level",
    title = "Predicted Sleep Quality Across Age and Physical Activity Levels"
  ) +
  theme_minimal() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Convert the static heat map to an interactive plotly widget
ggplotly(a)

RESEARCH QUESTION 2

What are the key predictors influencing sleep quality, and how can we predict sleep quality based on lifestyle and health metrics?

library(randomForest)

Warning: package 'randomForest' was built under R version 4.3.3

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin

library(ggplot2)
library(plotly)
library(dplyr)


Attaching package: 'dplyr'

The following object is masked from 'package:randomForest':

    combine

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# Load the dataset (replace with the path to the original dataset)
data1 <- read.csv(
  "C:/Users/ACER/Desktop/New folder/Sai Sriram Uppada/FinalProject/Sleep_health_and_lifestyle_dataset.csv"
)

# Split "Blood Pressure" into two integer columns: the text before the "/"
# becomes Systolic.BP and the text after it becomes Diastolic.BP.
data1$Systolic.BP <- as.integer(sub("/.*", "", data1$Blood.Pressure))
data1$Diastolic.BP <- as.integer(sub(".*/", "", data1$Blood.Pressure))

# Random forest regression of sleep quality on six age / lifestyle / health
# predictors, with variable importance recorded for later inspection.
set.seed(123)  # fixed seed so the fitted forest is repeatable
rf_model <- randomForest(
  Quality.of.Sleep ~ Age + Physical.Activity.Level + Stress.Level +
    Heart.Rate + Systolic.BP + Diastolic.BP,
  data = data1,
  importance = TRUE,
  ntree = 500
)

# Show the fitted model summary
print(rf_model)


Call:
 randomForest(formula = Quality.of.Sleep ~ Age + Physical.Activity.Level +      Stress.Level + Heart.Rate + Systolic.BP + Diastolic.BP, data = data1,      importance = TRUE, ntree = 500) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 2

          Mean of squared residuals: 0.02697864
                    % Var explained: 98.11

# Variable importance: keep the raw importance table and draw the standard plot
importance_data <- importance(rf_model)
varImpPlot(rf_model)

# Predict sleep quality.
# Predicting on the very rows the forest was trained on makes the fit look far
# better than it is. Calling predict() without newdata returns the out-of-bag
# predictions instead: each row is predicted only by trees that did not see it,
# which is the same basis as the "% Var explained" figure printed by the model.
predicted_quality <- predict(rf_model)

# Actual vs (out-of-bag) predicted sleep quality, with a linear trend line
data1$Predicted_Quality_of_Sleep <- unname(predicted_quality)
a <- ggplot(data1, aes(x = Quality.of.Sleep, y = Predicted_Quality_of_Sleep)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(
    x = "Actual Sleep Quality",
    y = "Predicted Sleep Quality",
    title = "Actual vs Predicted Sleep Quality"
  ) +
  theme_minimal()


# Convert ggplot object to an interactive plotly object
ggplotly(a)

`geom_smooth()` using formula = 'y ~ x'

RESEARCH QUESTION 3

Does BMI predict the presence of sleep apnea?

# Load necessary libraries
library(readr)

Warning: package 'readr' was built under R version 4.3.3

library(dplyr)
library(glmnet)
library(ggplot2)

# Load the dataset (adjust the file path as needed)
sleep_data <- read.csv(
  "C:/Users/ACER/Desktop/New folder/Sai Sriram Uppada/FinalProject/Sleep_health_and_lifestyle_dataset.csv"
)

# Replace the dots in the column names with underscores
# (e.g. "BMI.Category" -> "BMI_Category", "Sleep.Disorder" -> "Sleep_Disorder")
colnames(sleep_data) <- gsub("\\.", "_", colnames(sleep_data))

# Without this step "Normal Weight" gets a dummy column of its own, separate
# from the baseline category, so the normal-BMI group is split in two.
# NOTE(review): presumably the baseline label is "Normal" - confirm with
# table(sleep_data$BMI_Category) before relying on the merged category.
sleep_data$BMI_Category <- sub(
  "^Normal Weight$",
  "Normal",
  sleep_data$BMI_Category
)

# Response: 1 when the recorded sleep disorder is "Sleep Apnea", 0 otherwise
sleep_data$Sleep_Apnea <- as.factor(
  ifelse(sleep_data$Sleep_Disorder == "Sleep Apnea", 1, 0)
)

# Predictors: dummy-coded BMI category; the intercept column is dropped
predictors <- model.matrix(~ BMI_Category, sleep_data)[, -1]
response <- sleep_data$Sleep_Apnea


# Penalised logistic regression of sleep apnea on the BMI dummies, fitted
# along glmnet's whole lambda path
fit_apnea <- glmnet(
  predictors,
  response,
  family = "binomial"
)

# Print the coefficients for every lambda on the path (one column per lambda)
coef(fit_apnea)

4 x 59 sparse Matrix of class "dgCMatrix"

  [[ suppressing 59 column names 's0', 's1', 's2' ... ]]

                                                                               
(Intercept)               -1.333651 -1.4169592 -1.4967333 -1.5731075 -1.6461937
BMI_CategoryNormal Weight  .         .          .          .          .        
BMI_CategoryObese          .         .          .          .          .        
BMI_CategoryOverweight     .         0.2032473  0.3858892  0.5515479  0.7028368
                                                                             
(Intercept)               -1.7160866 -1.7828678 -1.846610 -1.907379 -1.965239
BMI_CategoryNormal Weight  .          .          .         .         .       
BMI_CategoryObese          .          .          .         .         .       
BMI_CategoryOverweight     0.8417121  0.9696839  1.087948  1.197474  1.299062
                                                                             
(Intercept)               -2.0377290 -2.112016 -2.1836814 -2.252754 -2.319240
BMI_CategoryNormal Weight  .          .         .          .         .       
BMI_CategoryObese          0.3484589  0.696250  0.9825335  1.227607  1.442582
BMI_CategoryOverweight     1.4108657  1.520571  1.6241941  1.722146  1.814758
                                                                           
(Intercept)               -2.383149 -2.444492 -2.503278 -2.558883 -2.612608
BMI_CategoryNormal Weight  .         .         .         .         .       
BMI_CategoryObese          1.634265  1.807125  1.964271  2.106844  2.238800
BMI_CategoryOverweight     1.902326  1.985111  2.063344  2.136371  2.206135
                                                                           
(Intercept)               -2.663841 -2.712656 -2.758981 -2.802901 -2.844461
BMI_CategoryNormal Weight  .         .         .         .         .       
BMI_CategoryObese          2.360350  2.472666  2.576373  2.672314  2.761120
BMI_CategoryOverweight     2.271942  2.334015  2.392380  2.447247  2.498764
                                                                           
(Intercept)               -2.883716 -2.920722 -2.955546 -2.988255 -3.018925
BMI_CategoryNormal Weight  .         .         .         .         .       
BMI_CategoryObese          2.843340  2.919461  2.989923  3.055122  3.115427
BMI_CategoryOverweight     2.547075  2.592322  2.634643  2.674177  2.711058
                                                                      
(Intercept)               -3.05478251 -3.0953852 -3.1339198 -3.1702471
BMI_CategoryNormal Weight  0.07351836  0.2018241  0.3141272  0.4137426
BMI_CategoryObese          3.17818849  3.2435523  3.3045906  3.3614665
BMI_CategoryOverweight     2.75252307  2.7982236  2.8414624  2.8820695
                                                                               
(Intercept)               -3.2044301 -3.2365384 -3.266646 -3.2948281 -3.3211657
BMI_CategoryNormal Weight  0.5026901  0.5825245  0.654468  0.7195043  0.7784408
BMI_CategoryObese          3.4144096  3.4636453  3.509389  3.5518460  3.5912145
BMI_CategoryOverweight     2.9201475  2.9558015  2.989137  3.0202591  3.0492740
                                                                              
(Intercept)               -3.345740 -3.3683699 -3.3896571 -3.4094442 -3.427803
BMI_CategoryNormal Weight  0.831952  0.8802018  0.9244895  0.9648602  1.001673
BMI_CategoryObese          3.627683  3.6610423  3.6922381  3.7210733  3.747690
BMI_CategoryOverweight     3.076287  3.1011069  3.1244170  3.1460485  3.166088
                                                                           
(Intercept)               -3.444815 -3.460560 -3.475116 -3.488557 -3.500652
BMI_CategoryNormal Weight  1.035258  1.065909  1.093890  1.119438  1.142322
BMI_CategoryObese          3.772238  3.794859  3.815688  3.834854  3.852043
BMI_CategoryOverweight     3.184631  3.201772  3.217599  3.232200  3.245318
                                                                           
(Intercept)               -3.512073 -3.522614 -3.532316 -3.541237 -3.549102
BMI_CategoryNormal Weight  1.163616  1.183095  1.200886  1.217133  1.231495
BMI_CategoryObese          3.868227  3.883120  3.896793  3.909336  3.920375
BMI_CategoryOverweight     3.257702  3.269121  3.279625  3.289276  3.297774
                                                                           
(Intercept)               -3.556610 -3.563536 -3.569894 -3.575632 -3.580222
BMI_CategoryNormal Weight  1.245015  1.257410  1.268733  1.278940  1.287317
BMI_CategoryObese          3.930889  3.940571  3.949443  3.957439  3.963839
BMI_CategoryOverweight     3.305887  3.313368  3.320233  3.326423  3.331362

# Perform cross-validation
# Fold assignment in cv.glmnet is random; fix the seed so the selected lambda
# (and the ROC / AUC computed from it) is the same on every run.
set.seed(123)
cv_fit_apnea <- cv.glmnet(predictors, response, family = "binomial")
best_lambda_apnea <- cv_fit_apnea$lambda.min


# Load necessary libraries
library(pROC)

Warning: package 'pROC' was built under R version 4.3.3

Type 'citation("pROC")' for a citation.


Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var

# Make predictions on the training set: fitted probabilities at lambda.min.
# predict() returns a one-column matrix here.
predictions_train <- predict(
  cv_fit_apnea,
  type = "response",
  s = "lambda.min",
  newx = predictors
)

# Calculate ROC curve
# - pROC expects a numeric vector as predictor; passing the matrix triggers a
#   deprecation warning and may give unexpected results, so it is flattened.
# - The 0/1 response is decoded from the factor labels rather than from the
#   internal level codes, so it does not depend on level ordering.
# - levels / direction are stated explicitly (0 = control, 1 = case, controls
#   score lower than cases) instead of being auto-detected.
roc_curve <- roc(
  response = as.integer(as.character(response)),
  predictor = as.numeric(predictions_train),
  levels = c(0, 1),
  direction = "<"
)

Setting levels: control = 0, case = 1

Warning in roc.default(as.numeric(response) - 1, predictions_train): Deprecated
use a matrix as predictor. Unexpected results may be produced, please pass a
numeric vector.

Setting direction: controls < cases

# Draw the ROC curve, then annotate it with the AUC rounded to two decimals
plot(roc_curve, main = "ROC Curve", col = "blue")
legend(
  "bottomright",
  legend = paste("AUC =", round(auc(roc_curve), 2)),
  col = "blue",
  lty = 1
)