Insight Crafting: Sejun's Data Analysis Expedition
Analysis of Salary Drivers, Gender Wage Disparity, and Job Satisfaction in the U.S. Labor Market
In this project, the dynamics of the U.S. labor market are examined with a focus on understanding salary drivers, the gender wage gap, and levels of job satisfaction. Through the application of robust statistical methods and data visualization techniques, significant insights were obtained from three large datasets. This analysis allowed for the identification of patterns and trends, providing a comprehensive view of the current labor market conditions.

Results & Analysis
Download:
Code Appendix
Job Satisfaction & Unemployment Survey Data
library(dplyr) library(data.table) library(tables) library(knitr) library(ggplot2) library(visdat) library(caret) library(tidyr) library(car) library(GGally) library(kableExtra) library(logistf) library(MASS) library(vcd) library(nnet) library(gt) library(brant) library(ordinalNet) library(knitr) library(forcats) JU <- fread('JU Manipulation.csv') str(JU) summary(JU) data_vis_dat(JU) ## checking categories for each variable unique(JU$`1. What is your current employment status?`) unique(JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice`) unique(JU$`10. Is your main job permanent (meaning there is no pre-specified date of termination)?`) unique(JU$`12. How would you classify your work? - Selected Choice`) unique(JU$`14. Overall, how satisfied or dissatisfied are you with your current employer?`) unique(JU$`15. Which of the following benefits do you have that are paid either in full or part by your employer? (check all that apply)`) unique(JU$`16. Please estimate how many people work for your employer at all locations:`) unique(JU$`21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule? - Selected Choice`) unique(JU$`23. In addition to your contractual schedule, do you usually work extra overtime hours in your main job, for which you do not receive compensation?`) unique(JU$`32. What is your highest level of education? - Selected Choice`) unique(JU$`33. How closely is your current job related to your education? Is it...`) ## Removing redundant variable JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice` <- NULL JU$`2. If you are currently employed, was your main job....? (check all that apply) - Other (please specify) - Text` <- NULL JU$`3. If you are employed part-time in your main job (less than 30 hours), was it because you: - Selected Choice` <- NULL JU$`3. If you are employed part-time in your main job (less than 30 hours), was it because you: - Other (please specify) - Text`<- NULL JU$`4. If you are currently unemployed and actively looking for work in the last 4 weeks, are you unemployed because of: (check only one answer) - Selected Choice`<- NULL JU$`4. If you are currently unemployed and actively looking for work in the last 4 weeks, are you unemployed because of: (check only one answer) - Other (please specify) - Text`<- NULL JU$`5. If you are now unemployed, how long have you been unemployed?`<- NULL JU$`6. If you are unemployed, are you mainly looking for:`<- NULL JU$`7. Would you like to have a paying job?`<- NULL JU$`8. For what reason do you not have a paying job? (Enter a maximum of 3 reasons) - Selected Choice`<- NULL JU$`8. For what reason do you not have a paying job? (Enter a maximum of 3 reasons) - Other? (please specify) - Text`<- NULL JU$`9. If you want a job, what do you think the chances are that you will find one in the next 3 months?`<- NULL JU$`10. Is your main job permanent (meaning there is no pre-specified date of termination)?`<- NULL JU$`11. Why is your job not permanent? (Please check only the one answer that best describes your job.) - Selected Choice`<- NULL JU$`11. Why is your job not permanent? (Please check only the one answer that best describes your job.) - Other (please specify) - Text`<- NULL JU$`12. How would you classify your work? - Selected Choice`<- NULL JU$`12. How would you classify your work? - Other (please specify) - Text`<- NULL JU$`13. What is your occupation? - Selected Choice`<- NULL JU$`13. What is your occupation? - Other (please specify) - Text`<- NULL JU$`16. Please estimate how many people work for your employer at all locations:`<- NULL JU$`17. Do you currently have more than one job?`<- NULL JU$`18. How many jobs do you currently have?`<- NULL JU$`19. What is the main reason you work at more than one job? (check one only): - Selected Choice`<- NULL JU$`19. What is the main reason you work at more than one job? (check one only): - Other (please specify) - Text`<- NULL JU$`20. Are you on a flexible schedule that allows you to choose the time to begin and end your work day?`<- NULL JU$`21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule… - Other (please specify) - Text`<- NULL JU$`22. What is the main reason you work this schedule? (Choose main reason - check only one answer) - Selected Choice`<- NULL JU$`22. What is the main reason you work this schedule? (Choose main reason - check only one answer) - Other (please specify) - Text`<- NULL JU$`24. Have you had paid work in the past 12 months?`<- NULL JU$`25. Have any of the following situations in your work environment caused you excess worry or stress in the past 12 months? (Check all that apply). - Selected Choice`<- NULL JU$`25. Have any of the following situations in your work environment caused you excess worry or stress in the past 12 months? (Check all that apply). - Other (please specify) - Text`<- NULL JU$`26. Do you think it is likely you will lose your job or be laid off in the next year? Would you say it is...`<- NULL JU$`27. If you think it is likely you will lose your job or be laid off in the next year, why do you think this will be? (Please specify)`<- NULL JU$`28. During the last 3 years, did you lose or leave a job for any reason? (Do not count summer jobs you had while a student or completion of a contract job.)`<- NULL JU$`29. Why did you lose or leave a job? (If it happened more than once, check other columns. For each job you lost or left in the last three years, enter the most important reasons up to a maximum of 3 reasons): - Poor work performance/conflict at work`<- NULL JU$`30. During the last 3 years, how many different paid jobs have you held with one or more employers?`<- NULL JU$`31. During the last 3 years, how many times have you switched jobs from one employer to another?`<- NULL JU$`32. What is your highest level of education? - Other (please specify) - Text`<- NULL JU$`36. What is your race/ethnicity? - Other (Please specify) - Text`<- NULL JU$`37. Do you identify as a person with a disability or other chronic condition?`<- NULL #### Data Manipulation (Feature Engineering) ## See how many answered values are recorded in each column response_counts % summarise_all(function(x) sum(!is.na(x) & x != "")) print(response_counts) ## Remove any null values in job_satisfaction_score JU <- JU[JU$`14. Overall, how satisfied or dissatisfied are you with your current employer?` != "", ] ## Replace the value for consistency : Replace "Jan-49" with "1 - 49" # JU$`16. Please estimate how many people work for your employer at all locations:`[JU$`16. Please estimate how many people work for your employer at all locations:` == "Jan-49"] <- "1-49" # JU$`16. Please estimate how many people work for your employer at all locations:` Go to question 12" with "" (an empty string) JU$`1. What is your current employment status?` Go to question 4", "", JU$`1. What is your current employment status?`) JU$`1. What is your current employment status?` Go to question 7", "", JU$`1. What is your current employment status?`) # JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice` <- gsub(" \\(please specify\\)", "", JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice`) # JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice` <- gsub(" \\(less than 30 hours\\)", "", JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice`) # JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice` <- gsub(" \\(30 or more hours\\)", "", JU$`2. If you are currently employed, was your main job....? (check all that apply) - Selected Choice`) # JU$`10. Is your main job permanent (meaning there is no pre-specified date of termination)?` Go to question 12", "", JU$`10. Is your main job permanent (meaning there is no pre-specified date of termination)?`) # JU$`12. How would you classify your work? - Selected Choice` <- gsub(" \\(please specify\\)", "", JU$`12. How would you classify your work? - Selected Choice`) JU$`15. Which of the following benefits do you have that are paid either in full or part by your employer? (check all that apply)` <- gsub(" \\(other than mandatory Canada Pension Plan\\)", "", JU$`15. Which of the following benefits do you have that are paid either in full or part by your employer? (check all that apply)`) JU$`15. Which of the following benefits do you have that are paid either in full or part by your employer? (check all that apply)` <- gsub(" \\(other than provincial Medicare\\)", "", JU$`15. Which of the following benefits do you have that are paid either in full or part by your employer? (check all that apply)`) JU$`21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule? - Selected Choice` <- gsub("\\?", "", JU$`21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule? - Selected Choice`) JU$`21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule? - Selected Choice` <- gsub(" \\(please specify\\)", "", JU$`21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule? - Selected Choice`) JU$`32. What is your highest level of education? - Selected Choice` <- gsub(" \\(please specify\\)", "", JU$`32. What is your highest level of education? - Selected Choice`) # Filter out only employed people JU <- JU[JU$`1. What is your current employment status?` == "Employed"] ## Fill the blank with values JU$`23. In addition to your contractual schedule, do you usually work extra overtime hours in your main job, for which you do not receive compensation?`[JU$`23. In addition to your contractual schedule, do you usually work extra overtime hours in your main job, for which you do not receive compensation?` == ""] <- "Yes" JU$`32. What is your highest level of education? - Selected Choice`[JU$`32. What is your highest level of education? - Selected Choice` == ""] <- "Associate degree" JU$`33. How closely is your current job related to your education? Is it...`[JU$`33. How closely is your current job related to your education? Is it...` == ""] <- "Somewhat related" JU$`34. What is your age?`[JU$`34. What is your age?` == ""] <- "21 - 24" JU$`35. How do you identify?`[JU$`35. How do you identify?` == ""] <-"Male" JU$`36. What is your race/ethnicity? - Selected Choice`[JU$`36. What is your race/ethnicity? - Selected Choice` == ""] <- "Hispanic or Latino" ## Changing Variable Names setnames(JU, c("1. What is your current employment status?", "14. Overall, how satisfied or dissatisfied are you with your current employer?", "15. Which of the following benefits do you have that are paid either in full or part by your employer? (check all that apply)", "21. Which of the following best describes your work schedule at your main job? (Check one answer only). Is your schedule? - Selected Choice", "23. In addition to your contractual schedule, do you usually work extra overtime hours in your main job, for which you do not receive compensation?", "32. What is your highest level of education? - Selected Choice", "33. How closely is your current job related to your education? Is it...", "34. What is your age?", "35. How do you identify?", "36. What is your race/ethnicity? - Selected Choice"), c("employment_status_1", "job_satisfaction_14", "employee_benefits_15", "work_schedule_type_21", "uncompensated_overtime_23", "education_level_32", "job_education_relation_33","age_34", "gender_35", "race_36")) ## Grouping values for "Paid Leave" JU % mutate(employee_benefits_15 = str_replace_all(employee_benefits_15, c("Other Paid Personal Leave" = "Paid Leave"))) ## Remove Duplicates of "Paid Leave" JU % mutate(employee_benefits_15 = sapply(strsplit(employee_benefits_15, ","), function(x) { paste(unique(x), collapse = ",") })) # Convert categorical variables as factors JU$employment_status_1 <-as.factor(JU$employment_status_1) JU$job_satisfaction_14 <- as.factor(JU$job_satisfaction_14) JU$employee_benefits_15 <- as.factor(JU$employee_benefits_15) JU$work_schedule_type_21 <- as.factor(JU$work_schedule_type_21) JU$uncompensated_overtime_23 <- as.factor(JU$uncompensated_overtime_23) JU$education_level_32 <- as.factor(JU$education_level_32) JU$job_education_relation_33 <- as.factor(JU$job_education_relation_33) JU$age_34 <-as.factor(JU$age_34) JU$gender_35 <-as.factor(JU$gender_35) JU$race_36 <-as.factor(JU$race_36) # fwrite(JU, "C:\\Users\\yss06\\Desktop\\SURE - Reserach\\Glassdoor\\Job+Satisfaction+&+Unemployment_July+16,+2023_22.08\\JU_test.csv", row.names=FALSE) ## Summary of the data - Frequency Aanalysis (EDA) ##Q14 JU_employer_satisfaction <- table(JU$job_satisfaction_14) freq_employer_satisfaction <- as.data.frame(JU_employer_satisfaction) freq_employer_satisfaction <- rename(freq_employer_satisfaction, employer_satisfaction = Var1) hist(freq_employer_satisfaction$Freq) ##Q21 JU_work_schedule_type <- table(JU$work_schedule_type_21) freq_work_schedule_type <- as.data.frame(JU_work_schedule_type) freq_work_schedule_type <- rename(freq_work_schedule_type, work_schedule = Var1) hist(freq_work_schedule_type$Freq) ##Q23 JU_uncompensated_overtime <- table(JU$uncompensated_overtime_23) freq_uncompensated_overtime <- as.data.frame(JU_uncompensated_overtime) freq_employer_size <- rename(freq_uncompensated_overtime,uncompensated_overtime = Var1) hist(freq_uncompensated_overtime$Freq) ##Q32 JU_education_level <- table(JU$education_level_32) freq_education_level <-as.data.frame(JU_education_level) freq_education_level <- rename(freq_education_level, education_level = Var1) ##33 JU_job_education_relation <- table(JU$job_education_relation_33) freq_job_education_relation <- as.data.frame(JU_job_education_relation) freq_job_education_relation <- rename(freq_job_education_relation, job_education_relation = Var1) ## Demographic Information of Survey Dataset ## Age Table JU_age % mutate(age_34 = paste0(age_34, " years old")) %>% # Add "years old" to age group_by(age_34) %>% summarise(n = n()) %>% mutate(percentage = n / sum(n) * 100) # Create a pie chart for age distribution ggplot(JU_age, aes(x="", y=percentage, fill=age_34)) + geom_bar(width = 1, stat = "identity", color="black", size=1) + coord_polar("y", start=0) + geom_text(aes(label = paste0(round(percentage, 1), "%")), position = position_stack(vjust = 0.5), color="black", size=8) + labs(title="Age Distribution", x=NULL, y=NULL, fill="age_34", position = "center") + theme_void() + theme(legend.position="right", plot.title = element_text(hjust = 0.5, size=20, face="bold"), legend.title=element_text(size=10), legend.text=element_text(size=14.8)) ## Gender Table JU_gender % group_by(gender_35) %>% summarise(n = n()) %>% mutate(percentage = n / sum(n) * 100) # Create a pie chart for age distribution ggplot(JU_gender, aes(x="", y=percentage, fill=gender_35)) + geom_bar(width = 1, stat = "identity", color="black", size=1) + coord_polar("y", start=0) + geom_text(aes(label = paste0(round(percentage, 1), "%")), position = position_stack(vjust = 0.5), color="black", size=8) + labs(title="Gender Distribution", x=NULL, y=NULL, fill="gender_35") + theme_void() + theme(legend.position="right", plot.title = element_text(hjust = 0.5, size=20, face="bold"), legend.title=element_text(size=10), legend.text=element_text(size=14.8)) ## Race Table JU_race % group_by(race_36) %>% summarise(n = n()) %>% mutate(percentage = n / sum(n) * 100) # Create a pie chart for race distribution ggplot(JU_race, aes(x="", y=percentage, fill=race_36)) + geom_bar(width = 1, stat = "identity", color="black", size=1) + coord_polar("y", start=0) + geom_text(aes(label = paste0(round(percentage, 1), "%")), position = position_stack(vjust = 0.5), color="black", size=7) + labs(title="Race/Ethnicity Distribution", x=NULL, y=NULL, fill="race_36") + theme_void() + theme(legend.position="right", plot.title = element_text(hjust = 0.5, size=20, face="bold"), legend.title=element_text(size=10), legend.text=element_text(size=14.5)) ## Chi-square test to check multicollinearity # list of categorical variables variables <- c("job_satisfaction_14", "employee_benefits_15", "work_schedule_type_21", "uncompensated_overtime_23", "education_level_32", "job_education_relation_33","age_34","gender_35","race_36") # empty list to store results results <- list() # loop over all pairs of variables for (i in 1:(length(variables)-1)) { for (j in (i+1):length(variables)) { # get the two variables var1 <- JU[[variables[i]]] var2 <- JU[[variables[j]]] # perform chi-square test chi_sq_test <- chisq.test(var1, var2) # calculate Cramer's V cramers_v <- sqrt(chi_sq_test$statistic / (sum(chi_sq_test$observed) * (min(length(levels(var1)), length(levels(var2))) - 1))) # store results results[[paste(variables[i], "vs", variables[j])]] <- list(chi_sq_test = chi_sq_test, cramers_v = cramers_v) } } # print the results results ## Interpretation, most pairs of variables have the p-values greater than 0.05, meaning that these pairs of variables are not signficantly assocaited ## No Multicollinearity checked. #--------------------------------------------------------------- # Research Question 1: How Demographics and one of job-related factor, work schedule, affect job satisfaction in the U.S. labor market # Reducing the level of groups for the further analysis JU$job_satisfaction_14 <- fct_collapse(JU$job_satisfaction_14, "Satisfied" = c("Extremely satisfied", "Moderately satisfied", "Slightly satisfied"), "Dissatisfied" = c("Extremely dissatisfied", "Moderately dissatisfied", "Slightly dissatisfied")) JU$age_34 <- fct_collapse(JU$age_34, "16 - 24" = c("16 - 20", "21 - 24")) JU$work_schedule_type_21 <- fct_collapse(JU$work_schedule_type_21, "An irregular schedule" = c("A rotating shift (that changes from days to evenings)", "An irregular schedule")) ### Create dummies for ordinal variable JU[, job_satisfaction_14 := factor(job_satisfaction_14, levels = (c('Satisfied', 'Dissatisfied', 'Neither satisfied nor dissatisfied')), ordered = TRUE)] # JU$job_satisfaction_14 <- as.numeric(JU$job_satisfaction_14) JU[, job_education_relation_33 := factor(job_education_relation_33, levels = c('Not at all related','Somewhat related','Closely related'), ordered = TRUE)] # JU$job_education_relation_33_dummy <- as.numeric(JU$job_education_relation_33) JU[, age_34 := factor(age_34, levels = c('16 - 24','25 - 34', "35 - 54", "55 and older"), ordered = TRUE)] ## Save Job_satisfaction variable job_satisfaction_14 <- JU$job_satisfaction_14 ## Create dummies for demographic variable and work schedule variables dummy_var <- dummyVars(~ education_level_32 + age_34 + gender_35 + race_36 + work_schedule_type_21 + uncompensated_overtime_23, JU) dummy_data <- data.frame(predict(dummy_var, newdata = JU)) ## add job satisfaction column in the dummy data frame dummy_data$job_satisfaction_14 <- job_satisfaction_14 ## Fit the ordinal logistic regression model model_Q1 <- polr(job_satisfaction_14 ~ ., data = dummy_data, Hess=TRUE) summary(model_Q1) ctable <- coef(summary(model_Q1)) ## calculate and store p values p_value <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2 ## combined table ctable <- cbind(ctable, "p value" = p_value) ### Making a table for the summary # Create a data frame from the coefficients table # Format the table using kable kable_table % kable_styling(full_width = FALSE) %>% add_header_above(c("Ordinal Logistic Regression Results" = 5)) %>% row_spec(0, bold = TRUE) # Print the table print(kable_table) # Run the Brant test on your model # brant(model_Q1) ## Invalidity of the Brant test since the data is dealing with categorical variables that have many levels, especially if some levels are rare. ## Alternative Way of Likelihood ratio test for the assumption of Proportional odds model_multinom <- multinom(job_satisfaction_14 ~ ., data = dummy_data) summary(model_multinom) ## Likelihood ratio test lrtest(model_Q1, model_multinom) # Create a data frame with the likelihood ratio test results lrtest_results Chisq)` = c("", 0.8598) ) # Format the table using kable kable_table % kable_styling(full_width = FALSE) %>% add_header_above(c("Likelihood Ratio Test Results" = 5)) %>% row_spec(0, bold = TRUE) # Print the table print(kable_table) ## Justification Using Likelihood ratio test instead of Brant test # In the process of assessing the assumptions of our ordinal logistic regression model, I initially intended to use the Brant test, which is a standard test for the proportional odds assumption. #However, due to technical issues in the implementation of the Brant test with our data, I opted to use the likelihood ratio test instead. #This test allowed us to compare the fit of our ordinal logistic regression model, which assumes proportional odds, to a more general multinomial logistic model that does not make this assumption. #The likelihood ratio test did not indicate a significant difference between these two models, suggesting that the proportional odds assumption for our ordinal logistic regression model was reasonable." #-------------------------------------------------------------- # Regularized Ordinal Regression # L1 regularization technique (also known as Lasso regularization) in combination with ordinal logistic regression ## Change predictor variables as numeric dummy_data$age_34.L <- as.numeric(dummy_data$age_34.L) dummy_data$age_34.Q <- as.numeric(dummy_data$age_34.Q) dummy_data$age_34.C <- as.numeric(dummy_data$age_34.C) dummy_data$gender_35.Female<- as.numeric(dummy_data$gender_35.Female) dummy_data$gender_35.Male <- as.numeric(dummy_data$gender_35.Male) dummy_data$gender_35.Prefer.not.to.say <- as.numeric(dummy_data$gender_35.Prefer.not.to.say) dummy_data$education_level_32.Associate.degree <- as.numeric(dummy_data$education_level_32.Associate.degree) dummy_data$education_level_32.Bachelor.s.degree <- as.numeric(dummy_data$education_level_32.Bachelor.s.degree) dummy_data$education_level_32.Doctorate.degree <- as.numeric(dummy_data$education_level_32.Doctorate.degree) dummy_data$education_level_32.High.school.diploma.or.equivalent <- as.numeric(dummy_data$education_level_32.High.school.diploma.or.equivalent) dummy_data$education_level_32.Master.s.degree <- as.numeric(dummy_data$education_level_32.Master.s.degree) dummy_data$education_level_32.Other <- as.numeric(dummy_data$education_level_32.Other) dummy_data$race_36.Asian.or.Asian.American <- as.numeric(dummy_data$race_36.Asian.or.Asian.American) dummy_data$race_36.Black.or.African.American <- as.numeric(dummy_data$race_36.Black.or.African.American) dummy_data$race_36.Hispanic.or.Latino <- as.numeric(dummy_data$race_36.Hispanic.or.Latino) dummy_data$race_36.Other..Please.specify. <- as.numeric(dummy_data$race_36.Other..Please.specify.) dummy_data$race_36.Prefer.not.to.state <- as.numeric(dummy_data$race_36.Prefer.not.to.state) dummy_data$race_36.White.or.Caucasian <- as.numeric(dummy_data$race_36.White.or.Caucasian) dummy_data$work_schedule_type_21.A.regular.daytime.schedule <- as.numeric(dummy_data$work_schedule_type_21.A.regular.daytime.schedule) dummy_data$work_schedule_type_21.An.irregular.schedule <- as.numeric(dummy_data$work_schedule_type_21.An.irregular.schedule) dummy_data$work_schedule_type_21.On.call...Casual <- as.numeric(dummy_data$work_schedule_type_21.On.call...Casual) dummy_data$work_schedule_type_21.Other <- as.numeric(dummy_data$work_schedule_type_21.Other) dummy_data$uncompensated_overtime_23.No <- as.numeric(dummy_data$uncompensated_overtime_23.No) dummy_data$uncompensated_overtime_23.Yes <- as.numeric(dummy_data$uncompensated_overtime_23.Yes) X <- dummy_data[, -which(names(dummy_data) == "job_satisfaction_14")] Y <- dummy_data$job_satisfaction_14 ## create a data frame and matrix that only contains predictor variables newdata <-dummy_data[, -which(names(dummy_data) == "job_satisfaction_14")] newdata <- as.matrix(newdata) X <- as.matrix(X) Y <- as.factor(dummy_data$job_satisfaction_14) model_Q1 <- ordinalNet(X, Y, family="cumulative", alpha=1) predictions <- predict(model_Q1, newdata) print(predictions) # Create a data frame with the updated labels predicted_labels <- data.frame( "Satisfied" = predictions[, 1], "Dissatisfied" = predictions[, 2], "Neither satisfied nor dissatisfied" = predictions[, 3], "Observations" = as.character(1:nrow(predictions)) ) # Add the column names to the data frame colnames(predicted_labels) <- c("Satisfied", "Dissatisfied", "Neither satisfied nor dissatisfied","Observations") # Create a nice table with the predicted probabilities and updated labels prob_table % kable_styling(full_width = FALSE) %>% add_header_above(c("Predicted Probabilities for Job Satisfaction Levels" = 4)) %>% row_spec(0, bold = TRUE) # Print the table print(prob_table) #-------------------------------------------------------------- # Research Question 2: Does the level of education influence job satisfaction, especially when the job is closely related to the respondent's education? ## Convert it to Numerical variable for the analysis JU$job_education_relation_33_dummy <- as.numeric(JU$job_education_relation_33) dummy_education_32 <- model.matrix(~education_level_32 - 1, JU) model_Q2 <- polr(as.formula(job_satisfaction_14 ~ education_level_32 + job_education_relation_33), data = JU, Hess = TRUE) summary(model_Q2) ctable <- coef(summary(model_Q2)) ## calculate and store p values p_value <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2 ## combined table ctable <- cbind(ctable, "p value" = p_value) # Format the table using kable kable_table2 % kable_styling(full_width = FALSE) %>% add_header_above(c("Ordinal Logistic Regression Results" = 5)) %>% row_spec(0, bold = TRUE) # Print the table print(kable_table2) ## Alternative Way of Likelihood ratio test model_multinom2 <- multinom(job_satisfaction_14 ~ education_level_32 + job_education_relation_33, data = JU) summary(model_multinom2) ## Likelihood ratio test lrtest(model_Q2, model_multinom2) lrtest_results2 Chisq)` = c("", 0.8367) ) # Format the table using kable lrtest_Q2 % kable_styling(full_width = FALSE) %>% add_header_above(c("Likelihood Ratio Test Results" = 5)) %>% row_spec(0, bold = TRUE) # Print the table print(lrtest_Q2) ## Check the assumptions of Ordinal logistic Regression Analysis # 1.The dependent variable are ordered. # 2.One or more of the independent variables are either continuous, categorical or ordinal. # 3.No multicollinearity. # 4.Proportional odds ## Chi-square Test for age contingency_table <- table(JU$age_34, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result_age <- chisq.test(contingency_table) # Print the results print(chi_square_result_age) ## Chi-square Test for gender contingency_table_gender <- table(JU$gender_35, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result_gender <- chisq.test(contingency_table) # Print the results print(chi_square_result_gender) ## Chi-square Test for race contingency_table <- table(JU$race_36, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table) # Print the results print(chi_square_result) ## Chi-square Test for work schedule contingency_table <- table(JU$work_schedule_type_21, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table) # Print the results print(chi_square_result) ## Chi-square Test for uncompensated work status contingency_table <- table(JU$uncompensated_overtime_23, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table) # Print the results print(chi_square_result) ## Chi-square Test for education contingency_table <- table(JU$education_level_32, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table) # Print the results print(chi_square_result) ## Chi-square Test for benefits contingency_table <- table(JU$employee_benefits_15, JU$job_satisfaction_14) # Perform the chi-square test of independence chi_square_result
Glassdoor - Gender Wage Disparity & Impact of Education Level on U.S Labor Makret
library(ggplot2) library(dplyr) library(data.table) library(tables) library(knitr) library(ggplot2) library(tidyr) library(MASS) library(plotly) # Glassdoor Data GD <- read.csv('Glassdoor Gender Pay Gap.csv') str(GD) vis_dat(GD) summary(GD) # Create five employee age bins. GD$age_bin <- 0 GD$age_bin <- ifelse(GD$Age < 25, 1, GD$age_bin) # Below age 25 GD$age_bin = 25 & GD$Age < 35, 2, GD$age_bin) # Age 25-34. GD$age_bin = 35 & GD$Age < 45, 3, GD$age_bin) # Age 35-44. GD$age_bin = 45 & GD$Age < 55, 4, GD$age_bin) # Age 45-54. GD$age_bin = 55, 5, GD$age_bin) # Age 55+. # Create total compensation variable (base pay + bonus) GD$totalPay <- GD$BasePay + GD$Bonus # Take natural logarithm of compensation variables (for percentage pay gap interpretation in regressions). GD$log_base <- log(GD$BasePay, base = exp(1)) # Base pay. GD$log_total <- log(GD$totalPay, base = exp(1)) # Total comp. GD$log_bonus <- log(GD$Bonus + 1, base = exp(1)) # Incentive pay. Add 1 to allow for log of 0 bonus values. GD$male <- ifelse(GD$Gender == "Male", 1, 0) # Male = 1, Female = 0. # Check the structure of the data after preprocessing. str(GD) vis_dat(GD) summary(GD) # Cast all categorical variable as factors for the regression analysis. GD$JobTitle <- as.factor(GD$JobTitle) GD$Gender <- as.factor(GD$Gender) GD$Education <- as.factor(GD$Education) GD$Dept <- as.factor(GD$Dept) # Change the name of a category in Education variable set(GD, which(GD$Education == "College"), "Education", "Bachelor") ## Summary stat for Base Pay summary_base % group_by(Gender) %>% summarise(avg_basepay = mean(BasePay, na.rm=TRUE), med_basepay = median(BasePay, na.rm=TRUE), count= sum(!(is.na(BasePay)))) summary_base %>% gt() %>% tab_header( title = "Summary Statistics by Gender", ) %>% fmt_number( columns = vars(avg_basepay, med_basepay), decimals = 2 # The number of decimal places ) # Create a bar plot for average base pay by gender ggplot(summary_base, aes(x = Gender, y = avg_basepay, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Average Base Pay") + geom_text(aes(label = paste0("$",round(avg_basepay,3))), vjust = -0.5, fontface="bold")+ theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + ggtitle("Average Base Pay by Gender") # Create a bar plot for median base pay by gender ggplot(summary_base, aes(x = Gender, y = med_basepay, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Median Base Pay") + geom_text(aes(label = paste0("$",round(med_basepay,3))), vjust = -0.5, fontface="bold")+ theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + ggtitle("Median Base Pay by Gender") ## Summary stat for Bonus Pay summary_bonus % group_by(Gender) %>% summarise(avg_bonus = mean(Bonus, na.rm=TRUE), med_Bonus = median(Bonus, na.rm=TRUE), count=sum(!(is.na(Bonus)))) summary_bonus %>% gt() %>% tab_header( title = "Bonus Summary by Gender" ) %>% cols_label( Gender = "Gender", avg_bonus = "Average Bonus", med_Bonus = "Median Bonus" ) %>% fmt_number( columns = vars(avg_bonus, med_Bonus), decimals = 2 ) # Create a bar plot for average bonus pay by gender ggplot(summary_bonus, aes(x = Gender, y = avg_bonus, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Average Bonus Pay") + ggtitle("Average Bonus Pay by Gender")+ theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label = paste0("$",round(avg_bonus,3))), vjust = -0.5 , fontface="bold") # Create a bar plot for median bonus pay by gender ggplot(summary_bonus, aes(x = Gender, y = med_Bonus, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Median Bonus Pay") + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label = paste0("$",med_Bonus)), vjust = -0.5, fontface="bold")+ ggtitle("Median Bonus Pay by Gender") ## Summary stat for Total Pay summary_totalpay % group_by(Gender) %>% summarise(avg_totalpay = mean(totalPay, na.rm=TRUE), med_totalpay = median(totalPay, na.rm=TRUE), cnt=sum(!(is.na(totalPay)))) summary_totalpay %>% gt() %>% tab_header( title = "Total Pay Summary by Gender" ) %>% cols_label( Gender = "Gender", avg_totalpay = "Average Total Pay", med_totalpay = "Median Total Pay", cnt = "Count" ) %>% fmt_number( columns = vars(avg_totalpay, med_totalpay, cnt), decimals = 2 ) # Create a bar plot for average total pay by gender ggplot(summary_totalpay, aes(x = Gender, y = avg_totalpay, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Average Total Pay") + ggtitle("Average Total Pay by Gender") + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label = paste0("$",round(avg_totalpay,2))), vjust = -0.5, fontface="bold") # Create a bar plot for median total pay by gender ggplot(summary_totalpay, aes(x = Gender, y = med_totalpay, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Median Total Pay") + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label = paste0("$",round(med_totalpay,2))), vjust = -0.5 , fontface="bold")+ ggtitle("Median Total Pay by Gender") #------------------------------------------ # Summary stat for Performance Evaluation by Gender summary_perfeval % group_by(Gender) %>% summarise(avg_perfeval = mean(PerfEval, na.rm=TRUE), med_perfeval = median(PerfEval, na.rm=TRUE), cnt=sum(!(is.na(PerfEval)))) summary_perfeval %>% gt() %>% tab_header( title = "Performance Evaluation Summary by Gender" ) %>% cols_label( Gender = "Gender", avg_perfeval = "Average Performance Evaluation", med_perfeval = "Median Performance Evaluation", cnt = "Count" ) %>% fmt_number( columns = vars(avg_perfeval, med_perfeval, cnt), decimals = 2 ) # Create a bar plot for Performance Evaluation by gender ggplot(summary_perfeval, aes(x = Gender, y = avg_perfeval, fill=Gender)) + geom_bar(stat = "identity", width = 0.5) + labs(x = "Gender", y = "Average Performance Evaluation") + ggtitle("Performance Evaluation by Gender") + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label = round(avg_perfeval,2)), vjust = -0.5, fontface="bold") ## Model Estimation # Direct relationship between Gender and pay model1 <- lm(totalPay ~ male, data=GD) model1_summary <- summary(model1) # Create a data frame model1_df % tab_header( title = "Linear Model Summary: Total Pay by Gender" ) %>% cols_label( Term = "Term", Estimate = "Estimate", Std_Error = "Std. Error", t_value = "t value", Pr = "Pr(>|t|)" ) %>% fmt_number( columns = c("Estimate", "Std_Error", "t_value", "Pr"), decimals = 8 ) %>% fmt_scientific ( columns = "Pr", decimals = 3 ) # Check residuals vs fitted values par(mfrow = c(2, 2)) plot(model1, which = 1:4) # With the p-value of close to 0, there is strong evidence that there is significant relationship between gender and pay # On average, being male is associated with an increase of 8.813 percent in total pay. # Adding controls model2 <- lm(totalPay ~ Gender+age_bin+PerfEval+Education+Seniority+Dept, data=GD) model2_summary <- summary(model2) #Check Assumption plot(model2, which = 1:4) ## Create a dataframe model2_df |t|)"] ) model2_df % mutate(significance = case_when( Pr < 0.001 ~ "***", Pr < 0.01 ~ "**", Pr % tab_header( title = "Linear Model Summary: Total Pay by Gender, Age, Performance, Education, Seniority, and Department" ) %>% cols_label( Term = "Term", Estimate = "Estimate", Std_Error = "Std. Error", t_value = "t value", Pr = "Pr(>|t|)" ) %>% fmt_number( columns = c("Estimate", "Std_Error", "t_value"), decimals = 4 ) %>% fmt_scientific( columns = "Pr", decimals = 4 ) # Overall, when considering the relationship between gender and pay, we find that gender (being male) has a significant association with pay even after accounting for other variables such as age, performance evaluation, education, seniority, and department. # This suggests that gender plays a role in the pay differences observed in the labor market. #### Research Question 2: Is there a significant correlation between levels of educational attainment and salary within the labor market? #### Do higher-educated individuals tend to earn more? # Summary Statistics by Education and Gender summary_edu % group_by(Education,Dept) %>% summarise(avg_total = mean(totalPay), med_total = median(totalPay), cnt = sum(!(is.na(Education)))) ggplot(summary_edu, aes(x=reorder(Education, -avg_total), y= avg_total, fill=Education)) + geom_bar(stat="identity", width=0.5) + labs(x = "Education Level", y = "Total Salary") + ggtitle("Total Salary Vs Education by Department") + theme (axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label = paste0("$",round(avg_total,2))), size = 6.5, hjust = 1.2, fontface="bold")+ coord_flip() + facet_wrap(~ Dept) # Perform regression analysis without adding any controls to examine the relationship between education and total pay model_edu <- lm(totalPay ~ Education, data = GD) model3_edu <- summary(model_edu) plot(model_edu, which = 1:4) model3_df |t|)"] ) model3_df % mutate(significance = case_when( Pr < 0.001 ~ "***", Pr < 0.01 ~ "**", Pr % tab_header( title = "Linear Model Summary: Impact of Education on Total Pay" ) %>% cols_label( Term = "Term", Estimate = "Estimate", Std_Error = "Std. Error", t_value = "t value", Pr = "Pr(>|t|)" ) %>% fmt_number( columns = c("Estimate", "Std_Error", "t_value"), decimals = 2 ) %>% fmt_scientific( columns = "Pr", decimals = 3 ) ## Interpretation: # The coefficient for "EducationHigh School" (0.0405) suggests that, on average, individuals withthe Bachelor's degree have a higher total pay compared to the high school degree. However, the p-value (0.080202) indicates that this result is not statistically significant # The coefficient for "EducationMasters" (0.0990) suggests that, on average, individuals with a master's degree have a higher total pay compared to the high school degree. The p-value (0.0000153) indicates that this result is statistically significant, suggesting a positive relationship between having a master's degree and total pay. # The coefficient for "EducationPhD" (0.1215) suggests that, on average, individuals with a Ph.D. degree have a higher total pay compared to the high school degree. The p-value (0.0000002) indicates that this result is statistically significant, suggesting a positive relationship between having a Ph.D. degree and total pay. # Overall, the results suggest that higher education levels, specifically having a master's degree or Ph.D., are associated with higher total pay compared high school education
Kaggle - A dataset for data analytics jobs was identified and sourced from Kaggle
library(dplyr) library(data.table) library(tables) library(knitr) library(ggplot2) library(tidyr) library(MASS) library(plotly) library(car) library(ggthemes) library(viridis) library(gt) library(stringr) library(lmtest) library(visdat) library(PMCMRplus) library(pheatmap) DA <- fread('DataAnalyst.csv') DA$V1 <- NULL str(DA) summary(DA) vis_dat(DA) ## Removing redundant variables DA$Founded <- NULL DA$Competitors <- NULL DA$'Easy Apply'<- NULL DA$Headquarters <- NULL DA$V1 <- NULL DA$`Type of ownership` <- NULL DA$`Company Name`<- NULL ## Removing Null values DA <- DA[(Sector != '-1')] DA <- DA[!(DA$`Salary Estimate` == '-1')] DA <- DA[Rating == -1.0, Rating := NA] #changing the name of `Job Description` variable setnames(DA, "Job Description", "job_description") # Save Salary Estimate variable DA$Salary_Estimate <- DA$`Salary Estimate` # Cast all categorical variable as factors for the advanced analysis DA$`Job Title` <- as.factor(DA$`Job Title`) DA$Industry <- as.factor(DA$Industry) DA$Size <- as.factor(DA$Size) DA$Sector <- as.factor(DA$Sector) DA$`Salary Estimate`<- as.factor(DA$`Salary Estimate`) DA$Revenue <- as.factor(DA$Revenue) ## Converting Salary Estimate to Numerical value # Data cleaning and transformation DA$`Salary Estimate` <- gsub("\\$|\\(.*\\)", "", DA$'Salary Estimate') # Split salary range into lower and upper bounds DA % separate('Salary Estimate', into = c("Salary_Min", "Salary_Max"), sep = "-") DA$Salary_Min <- gsub("K", "", DA$Salary_Min) DA$Salary_Max <- gsub("K", "", DA$Salary_Max) # Convert lower and upper bounds to numeric DA$Salary_Min <- as.numeric(DA$Salary_Min) DA$Salary_Max <- as.numeric(DA$Salary_Max) DA$Salary_Min = DA$Salary_Min * 1000 DA$Salary_Max = DA$Salary_Max * 1000 # Calculate average salary for each value DA$Average_Salary <- (DA$Salary_Min + DA$Salary_Max) / 2 ## Descriptive Analysis (EDA) hist(DA$Rating) hist(DA$Average_Salary) ## Examine the distribution of observations within each category for `Salary Estimate` variable obs_per_category <- table(DA$Salary_Estimate) print(obs_per_category) ## Check the balance of observations across categories for `Salary Estimate` variable plot(obs_per_category, main = "Distribution of Observations Across Categories", xlab = "Category", ylab = "Number of Observations") # Bar Graphs for Salary Estimate binwidth = 5000 ggplot(DA, aes(x = Salary_Min)) + geom_histogram(aes(y = ..count..), binwidth = binwidth, fill = "orange", color = "black", alpha = 1) + geom_density(aes(y = ..count.. * binwidth), color = "red", size = 1) + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, face="bold", size=20), # Center the title and make it bold axis.title.x = element_text(face="bold", size=15), # Make x-axis title bold axis.title.y = element_text(face="bold", size=15), # Make y-axis title bold axis.text = element_text(size=12) # Adjust the size of axis text ) + labs(title = "Bar Graph of Minimum Salary for Data Analyst Roles", x = "Minimum Salary", y = "Count") ggplot(DA, aes(x = Salary_Max)) + geom_histogram(aes(y = ..count..), binwidth = 5000, fill = "skyblue", color = "black", alpha = 1) + geom_density(aes(y = ..count.. * binwidth), color = "steelblue", size = 1) + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, face="bold", size=20), # Center the title and make it bold axis.title.x = element_text(face="bold", size=15), # Make x-axis title bold axis.title.y = element_text(face="bold", size=15), # Make y-axis title bold axis.text = element_text(size=12) # Adjust the size of axis text ) + labs(title = "Bar Graph of Maximum Salary for Data Analyst Roles", x = "Maximum Salary", y = "Count") ggplot(DA, aes(x = Average_Salary)) + geom_histogram(aes(y = ..count..), binwidth = 5000, fill = "seagreen2", color = "black", alpha = 1) + geom_density(aes(y = ..count.. * binwidth), color = "red", size = 1) + theme_minimal() + theme( plot.title = element_text(hjust = 0.5, face="bold", size=20), # Center the title and make it bold axis.title.x = element_text(face="bold", size=15), # Make x-axis title bold axis.title.y = element_text(face="bold", size=15), # Make y-axis title bold axis.text = element_text(size=12) # Adjust the size of axis text ) + labs(title = "Bar Graph of Average Salary for Data Analyst Roles", x = "Average Salary", y = "Count") # Table for the Frequency analysis of Job Title JT <-table(DA$`Job Title`) freq_jt <- as.data.frame(JT) category <- unique(DA$JT) num_cate <- length(category) # Subset to get only the top 20 jobs job_top20 % count(`Job Title`) %>% arrange(desc(n)) %>% top_n(20) job_plot <- ggplot(job_top20, aes(x = reorder(`Job Title`, -n), y = n)) + geom_segment(aes(y = 0, x = reorder(`Job Title`, -n), yend = n, xend = reorder(`Job Title`, -n)), color = "green", linetype = "dashed") + geom_point(color = "blue", size = 3) + geom_text(aes(label=n), vjust=-1, color="red",fontface ="bold") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(title = "Top 20 Data Analyst Job Openings by Job Title", x = "Job Title", y = "Count of Job Openings") job_plot # Table for the Frequency analysis of Size table1 <-table(DA$Size) freq_size <- as.data.frame(table1) category <- unique(DA$Size) num_cate <- length(category) freq_size$Var1 <- factor(freq_size$Var1, levels = freq_size$Var1[order(freq_size$Freq, decreasing = TRUE)]) # Create a bar chart ggplot(freq_size, aes(x = Var1, y = Freq)) + geom_bar(stat = "summary", fill = "brown", color = "black") + xlab("Company Size") + ylab("Frequency") + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + ggtitle("Frequency Analysis of Company Size") + theme(axis.text.x = element_text(angle = 45, hjust = 1, face="bold"),axis.text.y = element_text(face = "bold")) + geom_text(aes(label = Freq), vjust = -0.5, hjust = 0.5, colour = "Black", fontface="bold") # Table for the Frequency analysis of Industry table2 <- table(DA$Industry) freq_industry <- as.data.frame(table2) category <- unique(DA$Industry) num_cate <- length(category) # Create a bar chart with all categories of industry ggplot(freq_industry, aes(x = Var1, y = Freq)) + geom_bar(stat = "summary", fill = "brown", color = "black") + xlab("Industry") + ylab("Frequency") + ggtitle("Frequency Analysis of Industry") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Create a bar chart with industries that have more than 100 frequencies in the data set # Filter categories with more than 100 frequencies filtered_freq 100, ] # Reorder the categories in descending order by frequency filtered_freq$Var1 <- factor(filtered_freq$Var1, levels = filtered_freq$Var1[order(filtered_freq$Freq, decreasing = TRUE)]) ggplot(filtered_freq, aes(x = Var1, y = Freq, fill = Freq)) + geom_bar(stat = "summary", color = "black") + geom_text(aes(label = Freq), vjust = -0.5, hjust = 0.5, color = "white", size = 4) + xlab("Industry") + ylab("Frequency") + ggtitle("Frequency Analysis of Industry that have more than 100 frequency") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 12), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label=Freq),vjust=-0.5,hjust=0.5, fontface ="bold")+ scale_fill_gradient(low = "#1f78b4", high = "#a6cee3") # Table for the Frequency analysis of Sector table3 <- table(DA$Sector) freq_sector <- as.data.frame(table3) category <- unique(DA$Sector) num_cate <- length(category) # Create a bar chart freq_sector$Var1 <- factor(freq_sector$Var1, levels = freq_sector$Var1[order(freq_sector$Freq, decreasing = TRUE)]) ggplot(freq_sector, aes(x = Var1, y = Freq, fill = Freq)) + geom_bar(stat = "summary", color = "black") + geom_text(aes(label = Freq), vjust = -0.5, hjust = 0.5, color = "white", size = 4) + xlab("Sector") + ylab("Frequency") + ggtitle("Frequency Analysis of Sector") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 8), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 14), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label=Freq),vjust=-0.5,hjust=0.5, fontface ="bold", size = 3)+ scale_fill_gradient(low = "#1f78b4", high = "#a6cee3") # Table for the Frequency analysis of Revenue table4 <- table(DA$Revenue) freq_revenue <- as.data.frame(table4) category <- unique(DA$Revenue) num_cate <- length(category) # Create a bar chart freq_revenue$Var1 <- factor(freq_revenue$Var1, levels = freq_revenue$Var1[order(freq_revenue$Freq, decreasing = TRUE)]) ggplot(freq_revenue, aes(x = Var1, y = Freq, fill = Freq)) + geom_bar(stat = "summary", color = "black") + geom_text(aes(label = Freq), vjust = -0.5, hjust = 0.5, color = "white", size = 4) + xlab("Revenue") + ylab("Frequency") + ggtitle("Frequency Analysis of Revenue") + theme_minimal() + theme(axis.text.x = element_text(angle = 50, hjust = 1, face = "bold", size = 9), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 14), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + geom_text(aes(label=Freq),vjust=-0.5,hjust=0.5, fontface ="bold", size = 3)+ scale_fill_gradient(low = "#FF0000", high = "#FFFF00") # ================================================================================================================== # ================================================================================================================== #### Research Question 1. How do salaries for data analyst roles vary across different sectors? ## SECTOR sector_summary % group_by(Sector)%>% summarise( count = n(), Average_Salary = mean(Average_Salary, na.rm = TRUE), Salary_Min = mean(Salary_Min, na.rm = TRUE), Salary_Max = mean(Salary_Max, na.rm = TRUE), .groups = "drop" ) print(sector_summary) # Sort the sectors by mean salary in descending order sector_summary % arrange(desc(Average_Salary)) # Select the top 10 sectors top_10_sectors <- head(sector_summary, 10) top_10_sectors # Visualize the top 10 sectors by mean salary ggplot(top_10_sectors, aes(x = reorder(Sector, -Average_Salary), y = Average_Salary, fill=Sector)) + geom_bar(stat = "identity", color = "black") + geom_text(aes(label=paste("$", round(Average_Salary,0), sep="")), vjust=-0.3, size=6, fontface = 'bold') + xlab("Sector") + ylab("Average Salary") + ggtitle("Top 10 Sectors with Highest Salary") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 14), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + scale_fill_viridis(discrete = TRUE) # Create a subset data frame with the selected 5 sectors with high frequency selected_sectors <- c("Information Technology", "Business Services", "Finance", "Health Care", "Education") subset_DA % filter(Sector %in% selected_sectors) unique(subset_DA$Sector) # Log Transformation on Dependent Variable DA$Average_Salary <- log(DA$Average_Salary) DA$Salary_Min <- log(DA$Salary_Min) DA$Salary_Max <- log(DA$Salary_Max) # Perform ANOVA anova_sector <- aov(Average_Salary ~ Sector, data = DA) anova_summary <- summary(anova_sector) # Extract the relevant information term <- rownames(anova_summary[[1]]) df <- anova_summary[[1]][, "Df"] sum_sq <- anova_summary[[1]][, "Sum Sq"] mean_sq <- anova_summary[[1]][, "Mean Sq"] f_value <- anova_summary[[1]][, "F value"] p_value F)"] # Create the data frame anova_summary_df <- data.frame( Term = term, DF = df, Sum_Sq = sum_sq, Mean_Sq = mean_sq, F_value = f_value, Pr_value = p_value ) # Create the gt table gt_table % tab_header(title = "Summary Statistics for Average Salary by Sector") %>% cols_align(align = "center") %>% cols_label( Term = "Term", DF = "DF", Sum_Sq = "Sum Sq", Mean_Sq = "Mean Sq", F_value = "F value", Pr_value = "Pr(>F)" ) # Print the table print(gt_table) # Check assumption ## 1. Gaussian -> residuals follow normal distribution # QQ plot qq_plot same amount of variation in the residuals # Range of Residuals should be randomly and vertically spread (not more than twice) : Strip chart ggplot( data = data.frame( residuals = anova_sector$residuals, fitted = anova_sector$fitted.values ), mapping = aes(x = fitted, y = residuals) ) + geom_point(size = 2) + theme_bw() + labs(title = "Homoscedasticity of Residuals", x = "Fitted values (points)", y = "Residuals (points)") + theme(plot.title = element_text(hjust = 0.5)) # LeveneTest 함수를 사용하여 등분산성 검정을 수행합니다 : H0: All variance equal H1: not H0 -> if p assumption fail levene_result <- leveneTest(Average_Salary ~ Sector, data = DA) # Create a data frame for the test result levene_df F)`[1], stringsAsFactors = FALSE ) # Create the gt table levene_table % tab_header(title = "Levene's Test for Homogeneity of Variance") %>% cols_label(Df = "Degrees of Freedom", F_value = "F value", Pr_greater_F = "Pr(>F)") %>% fmt_number(columns = c(F_value, Pr_greater_F), decimals = 4) %>% fmt_missing(columns = everything(), missing_text = "") %>% tab_style(style = list(cell_text(weight = "bold")), locations = cells_body()) # Print the table levene_table ## 3. Independence of observation -> not autocorrelated # index plot + DW test statistic ( DW < 1.5 or 2.5 < DW) ggplot( data = data.frame( residuals = anova_sector$residuals, index = 1:length(anova_sector$residuals) ), mapping = aes(x = index, y = residuals) ) + geom_point(size = 1.5) + geom_line() + theme_bw() + geom_hline( yintercept = 0, linetype = "dashed", color = "red" ) + xlab("Measurement order") + ylab("Residuals") + labs(title = "Independence of Observations")+ theme(plot.title = element_text(hjust = 0.5)) durbinWatsonTest(anova_sector) # 1.5 ~ 2.5 사이면 패스 dw_results <- data.frame( lag = 1, autocorrelation = 0.9484165, statistic = 0.1024746, p_value = 0 ) # Create a gt table dw_table % tab_header(title = "Durbin-Watson Test Results") %>% fmt_number(columns = vars(lag, autocorrelation, statistic, p_value), decimals = 4) %>% cols_label(lag = "Lag", autocorrelation = "Autocorrelation", statistic = "D-W Statistic", p_value = "p-value") %>% cols_align(autocorrelation:p_value, align = "center") # Print the table dw_table ## Normality and Independence of observation assumptions are violated # Perform Non-parametric Test with original dataset kruskal_result <- kruskal.test(Average_Salary ~ Sector, data = DA) kruskal_df <- data.frame( Test = "Kruskal-Wallis", Chi_Squared = kruskal_result$statistic, Degrees_of_Freedom = kruskal_result$parameter, p_value = kruskal_result$p.value ) # Format the p-value kruskal_df$p_value <- format(kruskal_df$p_value, digits = 4) # Create the gt table kruskal_table % tab_header( title = "Kruskal-Wallis Test Result", subtitle = "" ) # Print the table print(kruskal_table) # Perform pairwise Wilcoxon rank-sum tests with Bonferroni correction pairwise_result <- pairwise.wilcox.test(DA$Average_Salary, DA$Sector, p.adjust.method = "bonferroni") # Display significant pairwise comparisons (Bonferroni corrected p-value < 0.05) significant_pairs <- pairwise_result$p.adjusted < 0.05 if (any(significant_pairs)) { cat("Significant pairwise comparisons (Bonferroni corrected p-value < 0.05):\n") for (i in which(significant_pairs)) { cat(pairwise_result$method[i], "\n") } } else { cat("No significant pairwise comparisons.\n") } ## Interpretation: ## it indicates that there is evidence of a significant difference in the "Average_Salary" variable across the different sectors. ## In other words, the salaries of data analysts vary significantly depending on the sector in which they work. ##This suggests that there are significant differences in average salaries among the sectors you examined. # Calculate effect size using Epsilon-squared (ε²) num_groups <- length(unique(DA$Sector)) epsilon_squared <- (kruskal_result$statistic - (num_groups - 1)) / (length(DA$Sector) - num_groups) effect_size_epsilon_squared <- epsilon_squared / (1 + epsilon_squared) # Calculate effect size using Kendall's W kendall_w <- kruskal_result$statistic / (length(DA$Sector) * (length(DA$Sector) - 1)) effect_size_kendall_w <- 1 - (1 / (1 + kendall_w)) # Create a data frame to store the effect size measures effect_size_df <- data.frame( Measure = c("Epsilon-squared (ε²)", "Kendall's W"), Effect_Size = c(effect_size_epsilon_squared, effect_size_kendall_w) ) # Print the effect size measures print(effect_size_df) # Create the gt table gt_table % tab_header(title = "Effect Size Measures") %>% cols_label( Measure = "Measure", Effect_Size = "Effect Size" ) %>% fmt_number (columns = vars(Effect_Size), decimals = 8) gt_table # Perform Dunn's Test to check which group is signficantly different than other groups dunn_test_result <- kwAllPairsDunnTest(Average_Salary ~ Sector, data = DA, method="bh") # Print the results print(dunn_test_result) # Boxplot for Dunn's Test ggplot(DA, aes(x=Sector, y=Average_Salary)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90))+ # rotate x-axis labels for readability labs(title = "Average Salary by Sector") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 14), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) # Create a data frame with the result table_dunn <- data.frame( Comparison = rownames(dunn_test_result$p.value), `Arts, Entertainment & Recreation` = dunn_test_result$p.value[, "Arts, Entertainment & Recreation"], `Biotech & Pharmaceuticals` = dunn_test_result$p.value[, "Biotech & Pharmaceuticals"] ) colnames(table_dunn)[2:3] <- c("Arts, Entertainment & Recreation", "Biotech & Pharmaceuticals") colnames(table_dunn) <- gsub("\\.+", "&", colnames(table_dunn)) # Create a gt table table_gt <- gt(table_dunn) # Set table headers table_gt % tab_header( title = "Pairwise comparisons using Dunn's all-pairs test", subtitle = "Data: Average_Salary by Sector" ) # Print the table print(table_gt) ## Interpretation: ## 1. There is some evidence to suggest a difference in average salaries between the "Biotech & Pharmaceuticals" and "Restaurants, Bars & Food Services" sectors ## 2. This suggests that there is evidence to support a difference in average salaries between the "Finance" and "Biotech & Pharmaceuticals" sectors. ### Conduct Chi-Square test using `Salary Estimate` variable as if it's categorical variable and 'Sector' variable ## Hypothesis: H0:There is no association between the sector and salary estimate for data analyst roles. | Ha: There is an association between the sector and salary estimate for data analyst roles. # Create the contingency table that cross-tabulates the frequency or count of observations for each combination of sector and salary estimate category contingency_table <- table(DA$Sector, DA$Salary_Estimate) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table) # Print the results print(chi_square_result) #Interpretation: # 1. Since the p-value (3.706e-14) is less than the significance level of 0.05, we reject the null hypothesis. # 2. This indicates that the sector in which a data analyst works is associated with differences in salary estimates. # ================================================================================================================== # ================================================================================================================== ## Research Question 2. How do salaries for data analyst roles vary across different industries? DA$Average_Salary <- exp(DA$Average_Salary) DA$Salary_Min <- exp(DA$Salary_Min) DA$Salary_Max <- exp(DA$Salary_Max) ## Industry industry_summary % group_by(Industry) %>% summarise( count = n(), Average_Salary = mean(Average_Salary, na.rm = TRUE), Salary_Min = mean(Salary_Min, na.rm = TRUE), Salary_Max = mean(Salary_Max, na.rm = TRUE), ) # Sort the industries by mean salary in descending order industry_summary % arrange(desc(Average_Salary)) # Select the top 10 industries top_10_industries <- head(industry_summary, 10) # Visualize the top 10 industries by mean salary ggplot(top_10_industries, aes(x = reorder(Industry, -Average_Salary), y = Average_Salary, fill=Industry)) + geom_bar(stat = "identity", color = "black") + geom_text(aes(label=paste("$", round(Average_Salary), sep="")), vjust=-0.3, size=3.5, fontface = 'bold') + xlab("Industry") + ylab("Mean Salary") + ggtitle("Top 10 Industries with Highest Salary") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 12), axis.text.y = element_text(face = "bold", size = 12), axis.title = element_text(face = "bold", size = 16), plot.title = element_text(face = "bold", size = 16, hjust = 0.5)) + scale_fill_viridis(discrete = TRUE) # Create a subset data frame with selected industries selected_Indus <- c("Staffing & Outsourcing", "IT Services", "Health Care Services & Hospitals", "Computer Hardware & Software", "Consulting", "Investment Banking & Asset Management", "Enterprise Software & Network Solutions", "Internet", "Advertising & Marketing", "Banks & Credit Unions", "Insurance Carriers") subset_DA_Indus % filter(Industry %in% selected_Indus) # Frequency of Industries table_indus <- table(subset_DA_Indus$Industry) freq_indus <- as.data.frame(table_indus) category <- unique(DA$Revenue) num_cate <- length(category) # Log Transformation on Dependent Variable DA$Average_Salary <- log(DA$Average_Salary) DA$Salary_Min <- log(DA$Salary_Min) DA$Salary_Max <- log(DA$Salary_Max) # Perform ANOVA anova_ind <- aov(Average_Salary ~ Industry, data = DA) summary(anova_ind) anova_summary <- summary(anova_ind) # Create a data frame for the ANOVA summary anova_summary_df F)"] ) # Format the data frame anova_summary_df$DF <- format(round(anova_summary_df$DF, 4), nsmall = 4) anova_summary_df$Sum_Sq <- format(round(anova_summary_df$Sum_Sq, 4), nsmall = 4) anova_summary_df$Mean_Sq <- format(round(anova_summary_df$Mean_Sq, 4), nsmall = 4) anova_summary_df$F_value <- format(round(anova_summary_df$F_value, 4), nsmall = 4) anova_summary_df$Pr_value <- format(round(anova_summary_df$Pr_value, 4), nsmall = 4) # Create the gt table gt_table % tab_header(title = "Summary Statistics for Average Salary by Industry") %>% cols_align(align = "center") %>% cols_label( Term = "Term", DF = "DF", Sum_Sq = "Sum Sq", Mean_Sq = "Mean Sq", F_value = "F value", Pr_value = "Pr(>F)" ) gt_table ## Checking Assumptions ## 1. Gaussian -> residuals follow normal distribution car::qqPlot( x = anova_ind$residuals, distribution = "norm", envelope = 0.90, id = FALSE, pch = 20, ylab = "Residuals" ) title(main = "QQ Plot: Gaussian Assumption") ## 2. Homoscedasticity of residuals ggplot( data = data.frame( residuals = anova_ind$residuals, fitted = anova_ind$fitted.values ), mapping = aes(x = fitted, y = residuals) ) + geom_point(size = 2) + theme_bw() + labs(title = "Homoscedasticity of Residuals", x = "Fitted values (points)", y = "Residuals (points)") + theme(plot.title = element_text(hjust = 0.5)) # LeveneTest 함수를 사용하여 등분산성 검정을 수행합니다 : H0: All variance equal H1: not H0 -> if p assumption fail Lev2 <- leveneTest(Average_Salary ~ Industry, data = DA) # Create a data frame for the test result levene_df F)`[1], stringsAsFactors = FALSE ) # Create the gt table levene_table % tab_header(title = "Levene's Test for Homogeneity of Variance") %>% cols_label(Df = "Degrees of Freedom", F_value = "F value", Pr_greater_F = "Pr(>F)") %>% fmt_number(columns = c(F_value, Pr_greater_F), decimals = 4) %>% fmt_missing(columns = everything(), missing_text = "") %>% tab_style(style = list(cell_text(weight = "bold")), locations = cells_body()) levene_table ## 3. Independence of observation -> not autocorrelated # index plot + DW test statistic ( DW < 1.5 or 2.5 < DW) ggplot( data = data.frame( residuals = anova_ind$residuals, index = 1:length(anova_ind$residuals) ), mapping = aes(x = index, y = residuals) ) + geom_point(size = 1.5) + geom_line() + theme_bw() + geom_hline( yintercept = 0, linetype = "dashed", color = "red" ) + labs(title = "Independence of Observations")+ xlab("Measurement order") + ylab("Residuals")+ theme(plot.title = element_text(hjust = 0.5)) durbinWatsonTest(anova_ind) # 1.5 ~ 2.5 사이면 패스 dw_results <- data.frame( lag = 1, autocorrelation = 0.912026, statistic = 0.1752381, p_value = 0 ) # Create a gt table dw_table % tab_header(title = "Durbin-Watson Test Results") %>% fmt_number(columns = vars(lag, autocorrelation, statistic, p_value), decimals = 4) %>% cols_label(lag = "Lag", autocorrelation = "Autocorrelation", statistic = "D-W Statistic", p_value = "p-value") %>% cols_align(autocorrelation:p_value, align = "center") # Print the table dw_table #Perform Non-parametric test kruskal.test(Average_Salary ~ Industry, data = DA) kruskal_result_Indus <- kruskal.test(Average_Salary ~ Industry, data = DA) kruskal_df <- data.frame( Test = "Kruskal-Wallis", Chi_Squared = kruskal_result_Indus$statistic, Degrees_of_Freedom = kruskal_result_Indus$parameter, p_value = kruskal_result_Indus$p.value ) kruskal_df$p_value <- format(kruskal_df$p_value, digits = 4) kruskal_table % tab_header( title = "Kruskal-Wallis Test Result", subtitle = "" ) %>% cols_label( Test = "Test", Chi_Squared = "Chi-Squared", Degrees_of_Freedom = "Degrees of Freedom", p_value = "p-value" ) kruskal_table ## Interpretation: ## This indicates that there is a significant difference in the "Average_Salary" among the different industries. ## The p-value of 0.04089 suggests that the observed differences in average salaries across industries are unlikely to have occurred by chance alone, ## assuming the null hypothesis of equal population medians # Calculate effect size using Epsilon-squared (ε²) num_groups <- length(unique(DA$Industry)) epsilon_squared <- (kruskal_result_Indus$statistic - (num_groups - 1)) / (length(DA$Industry) - num_groups) effect_size_epsilon_squared <- epsilon_squared / (1 + epsilon_squared) # Calculate effect size using Kendall's W kendall_w <- kruskal_result_Indus$statistic / (length(DA$Industry) * (length(DA$Industry) - 1)) effect_size_kendall_w <- 1 - (1 / (1 + kendall_w)) # Create a data frame to store the effect size measures effect_size_indus <- data.frame( Measure = c("Epsilon-squared (ε²)", "Kendall's W"), Effect_Size = c(effect_size_epsilon_squared, effect_size_kendall_w) ) gt_table % tab_header(title = "Effect Size Measures") %>% cols_label( Measure = "Measure", Effect_Size = "Effect Size" ) gt_table # Print the effect size measures print(effect_size_df) ### Conduct Chi-Square test using `Salary Estimate` variable as if it's categorical variable and 'Industry' variable ## Hypothesis: H0:There is no association between the industry and salary estimate for data analyst roles. | Ha: There is an association between the industry and salary estimate for data analyst roles. # Create the contingency table that cross-tabulates the frequency or count of observations for each combination of industries and salary estimate category contingency_table <- table(DA$Industry, DA$Salary_Estimate) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table) # Print the results print(chi_square_result) #Interpretation: # 1. Since the p-value (< 2.2e-16) is well below the significance level of 0.05, we reject the null hypothesis. # 2. This indicates that the industry in which a data analyst works is associated with differences in salary estimates. # ================================================================================================================== # ================================================================================================================== # --------------------------------------------------------------------------------------------------------------- ## Research Question 3: Is there an association between company size and job satisfaction for data analyst roles? #Do larger companies tend to have higher job satisfaction rates among data analysts? hist(DA$Rating) DA <- DA[!is.na(DA$Rating),] # Fit an ANOVA model model_aov <- aov(Rating ~ Size, data = DA) model3 <- summary(model_aov) anova_summary_3 F)"] ) # Format the data frame anova_summary_3$DF <- format(round(anova_summary_3$DF, 4), nsmall = 4) anova_summary_3$Sum_Sq <- format(round(anova_summary_3$Sum_Sq, 4), nsmall = 4) anova_summary_3$Mean_Sq <- format(round(anova_summary_3$Mean_Sq, 4), nsmall = 4) anova_summary_3$F_value <- format(round(anova_summary_3$F_value, 4), nsmall = 4) anova_summary_3$Pr_value <- format(round(anova_summary_3$Pr_value, 4), nsmall = 4) gt_table % tab_header(title = "Summary Statistics for Company Rate by Company Size") %>% cols_align(align = "center") %>% cols_label( Term = "Term", DF = "DF", Sum_Sq = "Sum Sq", Mean_Sq = "Mean Sq", F_value = "F value", Pr_value = "Pr(>F)" ) gt_table # Create QQ Plot for Gaussian assumption qqPlot( x = model_aov$residuals, distribution = "norm", envelope = 0.90, id = FALSE, pch = 20, ylab = "Residuals" ) title(main = "QQ Plot: Gaussian Assumption") # Create plot for Homoscedasticity of residuals ggplot( data = data.frame( residuals = model_aov$residuals, fitted = model_aov$fitted.values ), mapping = aes(x = fitted, y = residuals) ) + geom_point(size = 2) + theme_bw() + labs(title = "Homoscedasticity of Residuals", x = "Fitted values (points)", y = "Residuals (points)") + theme(plot.title = element_text(hjust = 0.5)) # Perform Levene's test for Homogeneity of Variance Lev3 <- leveneTest(Rating ~ Size, data = DA) # Create a data frame for the test result levene_df F)`[1], stringsAsFactors = FALSE ) # Create the gt table for Levene's Test levene_table % tab_header(title = "Levene's Test for Homogeneity of Variance") %>% cols_label(Df = "Degrees of Freedom", F_value = "F value", Pr_greater_F = "Pr(>F)") %>% fmt_number(columns = c(F_value, Pr_greater_F), decimals = 4) %>% fmt_missing(columns = everything(), missing_text = "") %>% tab_style(style = list(cell_text(weight = "bold")), locations = cells_body()) # Create plot for Independence of Observations ggplot( data = data.frame( residuals = model_aov$residuals, index = 1:length(model_aov$residuals) ), mapping = aes(x = index, y = residuals) ) + geom_point(size = 1.5) + geom_line() + theme_bw() + geom_hline( yintercept = 0, linetype = "dashed", color = "red" ) + labs(title = "Independence of Observations") + xlab("Measurement order") + ylab("Residuals") + theme(plot.title = element_text(hjust = 0.5)) #Perform Durbin Watson Test durbinWatsonTest(model_aov) dw_results <- data.frame( lag = 1, autocorrelation = 0.04392941, statistic = 1.911718, p_value = 0.048 ) # Create the gt table dw_table % gt() %>% tab_header(title = "Durbin-Watson Test Results") %>% cols_label(lag = "Lag", autocorrelation = "Autocorrelation", statistic = "D-W Statistic", p_value = "p-value") %>% fmt_number(columns = vars(autocorrelation, statistic, p_value), decimals = 3) %>% tab_style(style = list(cell_text(weight = "bold")), locations = cells_body()) # Print the table print(dw_table) # Perform the Kruskal-Wallis test kruskal_test_RS <- kruskal.test(Rating ~ Size, data = DA) print(kruskal_test_RS) kruskal_df <- data.frame( Test = "Kruskal-Wallis", Chi_Squared = kruskal_test_RS$statistic, Degrees_of_Freedom = kruskal_test_RS$parameter, p_value = kruskal_test_RS$p.value ) kruskal_df$p_value <- format(kruskal_df$p_value, digits = 4) kruskal_table % tab_header( title = "Kruskal-Wallis Test Result", subtitle = "" ) %>% cols_label( Test = "Test", Chi_Squared = "Chi-Squared", Degrees_of_Freedom = "Degrees of Freedom", p_value = "p-value" ) kruskal_table # Interpretation # There are significant differences in the ratings across different company sizes. #### Perform Dunn's Test to check which group is significantly different than other groups dunn_test_rating <- kwAllPairsDunnTest(Rating ~ Size, data = DA, method="bh") # Print the results print(dunn_test_rating) # Boxplot for Dunn's Test ggplot(DA, aes(x=Size, y=Rating)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90))+ # rotate x-axis labels for readability labs(title = "Rating by Company Size") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 14), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) # Create a data frame with the result table_dunn_rating <- data.frame( Comparison = rownames(dunn_test_rating$p.value), `1 to 50 employees` = round(dunn_test_rating$p.value[, "1 to 50 employees"], 3), `10000+ employees` = round(dunn_test_rating$p.value[, "10000+ employees"], 3), `1001 to 5000 employees` = round(dunn_test_rating$p.value[, "1001 to 5000 employees"], 3), `201 to 500 employees` = round(dunn_test_rating$p.value[, "201 to 500 employees"], 3), `5001 to 10000 employees` = round(dunn_test_rating$p.value[, "5001 to 10000 employees"], 3), `501 to 1000 employees` = round(dunn_test_rating$p.value[, "501 to 1000 employees"], 3), `51 to 200 employees` = round(dunn_test_rating$p.value[, "51 to 200 employees"], 3), `Unknown` = round(dunn_test_rating$p.value[, "Unknown"], 3) ) # Remove the "X" prefix from column names colnames(table_dunn_rating) <- c("Comparison", "1 to 50 employees", "10000+ employees", "1001 to 5000 employees", "201 to 500 employees", "5001 to 10000 employees", "501 to 1000 employees", "51 to 200 employees") # Create a gt table table_gt <- gt(table_dunn_rating) # Set table headers table_gt % tab_header( title = "Pairwise comparisons using Dunn's all-pairs test", subtitle = "Data: Rating by Size" ) # Print the table print(table_gt) # Calculate effect sizes using Eta-squared eta_squared <- kruskal_test_RS$statistic / (kruskal_test_RS$statistic + length(unique(DA$Size)) - 1) # Calculate effect sizes using Kendall's W num_groups_RS <- length(unique(DA$Size)) kendall_w <- (kruskal_test_RS$statistic - (num_groups_RS - 1)) / (length(DA$Size) * (num_groups_RS - 1)) result_effectsize <- data.frame( Test = "Kruskal-Wallis", Chi_Squared = kruskal_test_RS$statistic, Degrees_of_Freedom = kruskal_test_RS$parameter, p_value = kruskal_test_RS$p.value, Eta_squared = eta_squared, Kendall_W = kendall_w ) effect_size_table % tab_header(title = "Effect Sizes") %>% cols_label( Measure = "Measure", Effect_Size = "Effect Size" ) effect_size_table # ================================================================================================================== # ================================================================================================================== ############################################################################################################################## ## Research Question 4: Is there an association between a company's revenue and the salaries they offer for data analyst roles? ## Do companies with higher revenues tend to offer higher salaries for these positions? DA$Average_Salary <- exp(DA$Average_Salary) DA$Salary_Min <- exp(DA$Salary_Min) DA$Salary_Max <- exp(DA$Salary_Max) # Produce values of descriptive statistics revenue_stats % tibble::remove_rownames() %>% dplyr::select( group1, n, min, Q0.25, median, Q0.75, max, mad, mean, sd, skew, kurtosis ) %>% knitr::kable( caption = "Summary Statistics for Salaries by Company Revenue", digits = 2, format.args = list(big.mark = ","), align = rep(c("l", "c"), times = c(2, 11)), col.names = c("Revenue", "n", "Min", "Q1", "Median", "Q3", "Max", "MAD", "Mean", "SD", "Sample Skew", "Sample Ex. Kurtosis"), booktabs = TRUE ) %>% kableExtra::kable_styling( font_size = 12, latex_options = c("HOLD_position", "scale_down") ) # Skewness = 0: If the skewness is around 0, the data are fairly symmetrical. Although being symmetrical doesn't mean that the data are normally distributed, many statistical techniques assume that the data are at least symmetrically distributed. # # -0.5 < Skewness < 0.5: The data are approximately symmetrical. # # -1 < Skewness < -0.5 or 0.5 < Skewness < 1: The data are moderately skewed. # # Skewness 1: The data are highly skewed. # Create side-by-side box plots of the distances ---- ggplot( data = DA, mapping = aes( x = Revenue, y = Average_Salary ) ) + geom_boxplot() + theme_bw() + xlab("Company Revenue") + ylab("Average Salary for Data Analysts") + theme( text = element_text(size = 12) ) # Log Transformation DA$Average_Salary <- exp(DA$Average_Salary) DA$Salary_Min <- exp(DA$Salary_Min) DA$Salary_Max <- exp(DA$Salary_Max) # Perform Chi-square test contingency_table_revenue <- table(DA$Revenue, DA$Salary_Estimate) # Perform the chi-square test of independence chi_square_result <- chisq.test(contingency_table_revenue) # Print the results print(chi_square_result) chi_squared = 1258.2 degrees_of_freedom = 1044 p_value = 5.045e-06 # Create a data frame with the results result_df <- data.frame( Measure = c("Chi-Squared", "Degrees of Freedom", "p-value"), Value = c(chi_squared, degrees_of_freedom, p_value) ) # Create a table with gt result_table % tab_header(title = "Chi-Square Test Results") %>% cols_label( Measure = "Measure", Value = "Value" ) # Print the table result_table ## Interpretation ## There is a significant association between a company's revenue and the salaries they offer for data analyst roles ## Perform ANOVA anova_result <- aov(Average_Salary ~ Revenue, data = DA) summary(anova_result) # Create a data frame with the ANOVA results anova_df <- data.frame( Df = c(12, 1886), Sum_Sq = c(1.69, 194.59), Mean_Sq = c(0.1406, 0.1032), F_value = c(1.363, " "), Pr_F = c(0.177, " ") ) rownames(anova_df) % tab_header( title = "ANOVA Result for Average Salary by Company Revenue" ) ## Conclusion: There is no statistically significant difference in the salary estimate for data analysts across the company revenue. # ================================================================================================================== # ================================================================================================================== # ================================================================================================================== ## Research Question 5: Is there an association between company ratings and salaries for these roles? ## Do companies with higher ratings tend to offer higher salaries for data analyst roles? DA <- DA[!is.na(DA$Rating),] # Create a scatter plot ggplot(DA, aes(x = Rating, y = Average_Salary)) + geom_point(alpha = 0.5) + geom_smooth(method = "lm", col = "red") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 10), axis.text.y = element_text(face = "bold", size = 10), axis.title = element_text(face = "bold", size = 14), plot.title = element_text(face = "bold", size = 14, hjust = 0.5)) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + labs(title = "Salary vs Rating", x = "Rating", y = "Salary") ### Perform a linear regression without adding any controls model <- lm(Average_Salary ~ Rating, data = DA) summary(model) model_summary % tab_header( title = "Summary of Linear Regression Model") %>% fmt_number(columns = vars(estimate, std.error, statistic, p.value), decimals = 4) # Check residuals vs fitted values par(mfrow = c(2, 2)) plot(model, which = 1:4) # Fit the regression model with interaction effects model_control <- lm(Average_Salary ~ Rating+Size, data = DA) summary(model_control) model_summary <- broom::tidy(summary(model_control)) model_summary % mutate(significance = case_when( p.value < .001 ~ "***", p.value < .01 ~ "**", p.value % tab_header( title = "Summary of Linear Regression Model") %>% fmt_number(columns = vars(estimate, std.error, statistic, p.value), decimals = 4) # Check Assumptions plot(model_control, which = 1:4) #Check for multicollinearity vif(model_control) ## The result indicates that there is no strong linear relationship between these variables and the average salary # ================================================================================================================== # ================================================================================================================== # ================================================================================================================== ## Research Question 6: What are the most commonly requested skills for data analyst roles? preprocessed_text % str_replace_all("[^[:alpha:]]", " ") %>% str_replace_all("machine learning", "machine_learning") %>% str_replace_all("no degree", "no_degree") %>% str_replace_all("high school", "high_school") %>% str_split("\\s+") %>% unlist() # Create a frequency table freq_table <- table(preprocessed_text) # Relevant skills list skills <- c("sql", "python", "r", "tableau", "powerbi", "excel", "java", "aws", "spark", "powerpoint", "snowflake", "hadoop", "vba", "azure", "scala", "kafka", "machine_learning", "looker", "word", "matlab") # Filter the frequency table for relevant skills filtered_table <- freq_table[names(freq_table) %in% skills] # Sort the filtered table in descending order sorted_table <- sort(filtered_table, decreasing = TRUE) # Print the top 20 most commonly mentioned skills topskills <- head(sorted_table, n = 20) print(topskills) topskills <- data.frame(Skill = names(topskills), Frequency = as.vector(topskills)) topskills$Skill % gt() %>% tab_header( title = "Top 20 Skills" ) %>% cols_label( Skill = "Skill", Frequency = "Frequency" ) %>% fmt_number( columns = vars(Frequency), decimals = 0 ) # Skills vs Frequency plot ggplot(data = topskills, aes(x = Frequency, y = reorder(Skill, Frequency), fill = reorder(Skill, Frequency))) + geom_col() + geom_text(aes(label = Frequency), hjust = 0,size = 3, colour = "Black", fontface="bold") + theme(legend.position="none") + labs(x = "Count", y = "Skills", title ="Skills vs Frequency", subtitle = "Which skills are described the most in the job description?") # Skills vs Percentage of Occurrence # Calculate the total frequency total_frequency <- sum(topskills$Frequency) topskills$Percentage <- (topskills$Frequency / total_frequency) * 100 print(topskills) ## Bar Graph : Skills vs Percentage of Occurrence ggplot(topskills, aes(x = Percentage, y = reorder(Skill,Percentage), fill = Skill)) + geom_bar(stat = "identity") + labs(title = "Skills vs Percentage of Occurrence", x = "Percentage (%)", y = "Skills", subtitle = "Which skills are requested most?") + geom_text(aes(label = paste0(round(Percentage,2),"%")), size= 2.8, hjust = 0, colour = "Black", fontface="bold") + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + scale_fill_discrete(na.value = "gray") + theme(legend.position = "top", legend.direction = "horizontal", legend.box = "vertical") ## Creating Degree Indices ## Extract degrees from job_description variable degrees <- c("bachelor", "master", "no_degree", "phd", "high_school") # Filter the frequency table for relevant degrees filtered_table <- freq_table[names(freq_table) %in% degrees] # Sort the filtered table in descending order sorted_table <- sort(filtered_table, decreasing = TRUE) # Print the top most commonly mentioned degrees degrees <- head(sorted_table) print(degrees) degrees <- data.frame(degrees) setnames(degrees, c("preprocessed_text", "Freq"), c("Degree", "Frequency")) # Upper Case for degrees degrees$Degree % gt() %>% tab_header( title = "Top Degree" ) %>% cols_label( Degree = "Degree", Frequency = "Frequency" ) %>% fmt_number( columns = vars(Frequency), decimals = 0 ) # Degrees vs Frequency plot ggplot(data = degrees, aes(x = Frequency, y = reorder(Degree, Frequency), fill = reorder(Degree, Frequency))) + geom_col() + geom_text(aes(label = Frequency), hjust = -0.2, colour = "Black", fontface="bold") + theme(legend.position="none") + labs(x = "Count", y = "Degree", title ="Degree vs Frequency", subtitle = "Which degrees are described the most in the job description?") # Degree vs Percentage of Occurrence total_frequency <- sum(degrees$Frequency) degrees$Percentage