# Load necessary libraries
library(bestNormalize)
library(e1071)
library(openxlsx)
library(dplyr)

# NOTE: this script expects a data frame named `df` to already exist in the
# calling environment; it is not created or loaded here.

# Step 0: Set invalid Backward Span scores to NA (if Forward Span was completed)
# which() drops NA comparisons, so rows with a missing score are left untouched.
invalid_backwards <- which(df$Backwards_Total == 0 & df$Forwards_Total > 0)
df$Backwards_Total[invalid_backwards] <- NA

# Step 1: Define the list of variables for transformation
variables_to_transform <- c(
  "PD_Duration", "LEDD", "MoCA_Total", "UPDRS_NonMotor", "UPDRS_Motor",
  "CFRS", "GAD", "GDS",
  "CERQ_Planning", "CERQ_Perspective", "CERQ_Catastrophising",
  "CERQ_OtherBlame", "BERQ_ActivelyApproach", "BERQ_Ignoring",
  "TMT_A", "TMT_A_RT", "TMT_B", "TMT_B_RT",
  "Forwards_Total", "Backwards_Total",
  "Go", "NoGo", "FalseAlarm", "Go_RT"
)

# Step 2: Function to calculate skewness, kurtosis, and Shapiro-Wilk p-value
#
# Args:
#   x: numeric vector; missing values are ignored.
# Returns:
#   Named numeric vector with elements `skewness`, `kurtosis`, `shapiro_p`.
#   `shapiro_p` is NA when the Shapiro-Wilk test cannot be computed.
calc_skew_kurt_shapiro <- function(x) {
  observed <- x[!is.na(x)]
  n_obs <- length(observed)

  # shapiro.test() stops with an error unless 3 <= n <= 5000 and the values
  # are not all identical; report NA instead of aborting the whole run.
  # isTRUE() keeps the condition a strict scalar TRUE/FALSE even if the range
  # itself is not a finite number.
  can_test <- n_obs >= 3L &&
    n_obs <= 5000L &&
    isTRUE(diff(range(observed)) >= 1e-10)

  c(
    skewness = e1071::skewness(x, na.rm = TRUE),
    kurtosis = e1071::kurtosis(x, na.rm = TRUE),
    shapiro_p = if (can_test) shapiro.test(x)$p.value else NA_real_
  )
}

# Step 3: Apply transformations and calculate distribution stats
#
# Args:
#   df:        data frame containing the columns named in `variables`.
#   variables: character vector of column names to transform.
# Returns:
#   List with two elements:
#     results        - named list (one entry per variable) of data frames
#                      holding skewness, kurtosis and Shapiro-Wilk p-value
#                      for the original, Yeo-Johnson, log and square-root
#                      versions of the variable.
#     transformed_df - copy of `df` in which each listed variable has been
#                      replaced by its Yeo-Johnson transform.
transformations <- function(df, variables) {
  # Fail early with a clear message rather than an obscure error from the
  # first transformation applied to a NULL column.
  missing_vars <- setdiff(variables, names(df))
  if (length(missing_vars) > 0) {
    stop(
      "Variables not found in data frame: ",
      paste(missing_vars, collapse = ", "),
      call. = FALSE
    )
  }

  results <- list()
  transformed_df <- df

  for (var in variables) {
    data <- df[[var]]

    # Coerce to numeric if needed
    if (!is.numeric(data)) {
      warning(paste("Variable", var, "is not numeric. Coercing to numeric."))
      # as.numeric() on a factor returns the internal level codes, not the
      # recorded values, so factors are converted via their labels.
      data <- if (is.factor(data)) {
        as.numeric(as.character(data))
      } else {
        as.numeric(data)
      }
    }

    # Apply Yeo-Johnson transformation
    yeojohnson_trans <- bestNormalize::yeojohnson(data)$x.t

    # Apply Log Transformation (adding a small constant to handle zeros)
    # Negative inputs yield NaN here; those are treated as missing below.
    log_trans <- log(data + 1e-6)

    # Apply Square Root Transformation (adding a small constant to handle zeros)
    sqrt_trans <- sqrt(data + 1e-6)

    # Calculate stats: one column per candidate, rows = skewness / kurtosis /
    # shapiro_p
    candidates <- list(
      "Original" = data,
      "Yeo-Johnson" = yeojohnson_trans,
      "Log" = log_trans,
      "Square Root" = sqrt_trans
    )
    stats <- vapply(candidates, calc_skew_kurt_shapiro, numeric(3))

    # Store results
    results[[var]] <- data.frame(
      Transformation = names(candidates),
      Skewness = unname(stats["skewness", ]),
      Kurtosis = unname(stats["kurtosis", ]),
      Shapiro_Wilk_p = unname(stats["shapiro_p", ])
    )

    # Store best transformation (using Yeo-Johnson for now)
    transformed_df[[var]] <- yeojohnson_trans
  }

  list(results = results, transformed_df = transformed_df)
}

# Step 4: Run transformation function
transformation_results <- transformations(df, variables_to_transform)

# Step 5: Print summary stats for each variable
for (var in names(transformation_results$results)) {
  cat("Variable:", var, "\n")
  print(transformation_results$results[[var]])
  cat("\n")
}

# Step 6: Extract the transformed data frame
transformed_df <- transformation_results$transformed_df

# Step 7: Combine transformation summaries into one table
results_df <- do.call(
  rbind,
  lapply(names(transformation_results$results), function(var) {
    # Tag each per-variable summary with its variable name before stacking.
    summary_tbl <- transformation_results$results[[var]]
    summary_tbl$Variable <- var
    summary_tbl
  })
)

# Reorder columns
results_df <- results_df[
  ,
  c("Variable", "Transformation", "Skewness", "Kurtosis", "Shapiro_Wilk_p")
]

# Step 8: Save outputs
write.xlsx(results_df, "~/Desktop/transformation_summary.xlsx")
write.xlsx(transformed_df, "~/Desktop/df_transformed.xlsx")

# Step 9: Standardise numeric columns
# This applies to every numeric column of the data frame, not only the
# variables listed in Step 1.
# NOTE(review): bestNormalize::yeojohnson() appears to standardise its output
# by default, so the transformed variables may already be z-scored - confirm.
df_standardised <- transformed_df %>%
  mutate(across(where(is.numeric), ~ as.vector(scale(.))))

# Preview and save
print(head(df_standardised))
write.xlsx(df_standardised, "~/Desktop/df_standardised.xlsx")