|
| 1 | +### ETAPE 2 |
| 2 | + |
| 3 | + |
| 4 | +## Data import: read the CSV (first row holds the column names), then preview rows, structure and dimensions |
| 5 | +data <- read.csv("/Users/keliane/Downloads/StudentDepressionDataset.csv", header = TRUE) |
| 6 | +head(data, 15) |
| 7 | +str(data) |
| 8 | +print(dim(data)) |
| 9 | + |
| 10 | +## Data cleaning: count the missing cells, then build a copy without the rows that contain NA |
| 11 | +sum(is.na(data)) |
| 12 | +data_clean <- na.omit(data) |
| 13 | + |
| 14 | + # Turn each categorical column into a factor and display it right after it is built |
| 15 | +ID_fct <- as.factor(data$id) |
| 16 | +ID_fct |
| 17 | +gender_fct <- as.factor(data$Gender) |
| 18 | +gender_fct |
| 19 | +city_fct <- as.factor(data$City) |
| 20 | +city_fct |
| 21 | +profession_fct <- as.factor(data$Profession) |
| 22 | +profession_fct |
| 23 | +Sleep_duration_fct <- as.factor(data$Sleep.Duration) |
| 24 | +Sleep_duration_fct |
| 25 | +dietary_habits_fct <- as.factor(data$Dietary.Habits) |
| 26 | +dietary_habits_fct |
| 27 | +family_history_fct <- as.factor(data$Family.History.of.Mental.Illness) |
| 28 | +family_history_fct |
| 29 | +suicidal_thoughts_fct <- as.factor(data$Have.you.ever.had.suicidal.thoughts) |
| 30 | +suicidal_thoughts_fct |
| 31 | +degree_fct <- as.factor(data$Degree) |
| 32 | +degree_fct |
| 33 | +depression_status_fct <- as.factor(data$Depression) |
| 34 | +depression_status_fct |
| 35 | + |
| 36 | + |
| 37 | + |
| 38 | + |
| 39 | +### ETAPE 3 : Analyse et exploration des donnees |
| 40 | + |
| 41 | +##Statistiques Descriptives |
| 42 | + |
| 43 | + # Age column: extremes and quartiles, then centre, spread and the overall summary |
| 44 | +min_age <- min(data$Age, na.rm = TRUE) |
| 45 | +max_age <- max(data$Age, na.rm = TRUE) |
| 46 | +quartiles_age <- quantile(data$Age, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 47 | +mean_age <- mean(data$Age, na.rm = TRUE) |
| 48 | +median_age <- median(data$Age, na.rm = TRUE) |
| 49 | +variance_age <- var(data$Age, na.rm = TRUE) |
| 50 | +sd_age <- sd(data$Age, na.rm = TRUE) |
| 51 | +summary_stats_age <- summary(data$Age) |
| 52 | + |
| 53 | +print(summary_stats_age) |
| 54 | + |
| 55 | + |
| 56 | + # CGPA column: extremes and quartiles, then centre, spread and the overall summary |
| 57 | +min_CGPA <- min(data$CGPA, na.rm = TRUE) |
| 58 | +max_CGPA <- max(data$CGPA, na.rm = TRUE) |
| 59 | +quartiles_CGPA <- quantile(data$CGPA, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 60 | +mean_cgpa <- mean(data$CGPA, na.rm = TRUE) |
| 61 | +median_CGPA <- median(data$CGPA, na.rm = TRUE) |
| 62 | +variance_CGPA <- var(data$CGPA, na.rm = TRUE) |
| 63 | +sd_CGPA <- sd(data$CGPA, na.rm = TRUE) |
| 64 | +summary_stats_CGPA <- summary(data$CGPA) |
| 65 | + |
| 66 | +print(summary_stats_CGPA) |
| 67 | + |
| 68 | + |
| 69 | + |
| 70 | + |
| 71 | + |
| 72 | + # Study Satisfaction column: extremes and quartiles, then centre, spread and the overall summary |
| 73 | +min_Study_satisfaction <- min(data$Study.Satisfaction, na.rm = TRUE) |
| 74 | +max_Study_satisfaction <- max(data$Study.Satisfaction, na.rm = TRUE) |
| 75 | +quartiles_Study_satisfaction <- quantile(data$Study.Satisfaction, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 76 | +mean_Study_satisfaction <- mean(data$Study.Satisfaction, na.rm = TRUE) |
| 77 | +median_Study_satisfaction <- median(data$Study.Satisfaction, na.rm = TRUE) |
| 78 | +variance_Study_satisfaction <- var(data$Study.Satisfaction, na.rm = TRUE) |
| 79 | +sd_Study_satisfaction <- sd(data$Study.Satisfaction, na.rm = TRUE) |
| 80 | +summary_stats_Study_satisfaction <- summary(data$Study.Satisfaction) |
| 81 | + |
| 82 | +print(summary_stats_Study_satisfaction) |
| 83 | + |
| 84 | + |
| 85 | + # Job Satisfaction column: extremes and quartiles, then centre, spread and the overall summary |
| 86 | +min_Job_Satisfaction <- min(data$Job.Satisfaction, na.rm = TRUE) |
| 87 | +max_Job_Satisfaction <- max(data$Job.Satisfaction, na.rm = TRUE) |
| 88 | +quartiles_Job_Satisfaction <- quantile(data$Job.Satisfaction, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 89 | +mean_Job_Satisfaction <- mean(data$Job.Satisfaction, na.rm = TRUE) |
| 90 | +median_Job_Satisfaction <- median(data$Job.Satisfaction, na.rm = TRUE) |
| 91 | +variance_Job_Satisfaction <- var(data$Job.Satisfaction, na.rm = TRUE) |
| 92 | +sd_Job_Satisfaction <- sd(data$Job.Satisfaction, na.rm = TRUE) |
| 93 | +summary_stats_Job_Satisfaction <- summary(data$Job.Satisfaction) |
| 94 | + |
| 95 | +print(summary_stats_Job_Satisfaction) |
| 96 | + |
| 97 | + |
| 98 | + |
| 99 | + |
| 100 | + |
| 101 | + # Academic Pressure column: extremes and quartiles, then centre, spread and the overall summary |
| 102 | +min_Academic_Pressure <- min(data$Academic.Pressure, na.rm = TRUE) |
| 103 | +max_Academic_Pressure <- max(data$Academic.Pressure, na.rm = TRUE) |
| 104 | +quartiles_Academic_Pressure <- quantile(data$Academic.Pressure, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 105 | +mean_Academic_Pressure <- mean(data$Academic.Pressure, na.rm = TRUE) |
| 106 | +median_Academic_Pressure <- median(data$Academic.Pressure, na.rm = TRUE) |
| 107 | +variance_Academic_Pressure <- var(data$Academic.Pressure, na.rm = TRUE) |
| 108 | +sd_Academic_Pressure <- sd(data$Academic.Pressure, na.rm = TRUE) |
| 109 | +summary_stats_Academic_Pressure <- summary(data$Academic.Pressure) |
| 110 | + |
| 111 | +print(summary_stats_Academic_Pressure) |
| 112 | + |
| 113 | + |
| 114 | + # Work Pressure column: extremes and quartiles, then centre, spread and the overall summary |
| 115 | +min_Work_Pressure <- min(data$Work.Pressure, na.rm = TRUE) |
| 116 | +max_Work_Pressure <- max(data$Work.Pressure, na.rm = TRUE) |
| 117 | +quartiles_Work_Pressure <- quantile(data$Work.Pressure, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 118 | +mean_Work_Pressure <- mean(data$Work.Pressure, na.rm = TRUE) |
| 119 | +median_Work_Pressure <- median(data$Work.Pressure, na.rm = TRUE) |
| 120 | +variance_Work_Pressure <- var(data$Work.Pressure, na.rm = TRUE) |
| 121 | +sd_Work_Pressure <- sd(data$Work.Pressure, na.rm = TRUE) |
| 122 | +summary_stats_Work_Pressure <- summary(data$Work.Pressure) |
| 123 | + |
| 124 | +print(summary_stats_Work_Pressure) |
| 125 | + |
| 126 | + |
| 127 | + |
| 128 | + # Financial Stress column: extremes and quartiles, then centre, spread and the overall summary |
| 129 | +min_Financial_stress <- min(data$Financial.Stress, na.rm = TRUE) |
| 130 | +max_Financial_stress <- max(data$Financial.Stress, na.rm = TRUE) |
| 131 | +quartiles_Financial_stress <- quantile(data$Financial.Stress, probs = seq(0, 1, by = 0.25), na.rm = TRUE) |
| 132 | +mean_Financial_stress <- mean(data$Financial.Stress, na.rm = TRUE) |
| 133 | +median_Financial_stress <- median(data$Financial.Stress, na.rm = TRUE) |
| 134 | +variance_Financial_stress <- var(data$Financial.Stress, na.rm = TRUE) |
| 135 | +sd_Financial_stress <- sd(data$Financial.Stress, na.rm = TRUE) |
| 136 | +summary_stats_Financial_stress <- summary(data$Financial.Stress) |
| 137 | + |
| 138 | +print(summary_stats_Financial_stress) |
| 139 | + |
| 140 | + # Correlation matrix of the seven numeric columns, restricted to complete observations |
| 141 | +correlation_matrix <- cor( |
| 142 | + data[c("Age", "CGPA", "Study.Satisfaction", "Job.Satisfaction", "Academic.Pressure", "Work.Pressure", "Financial.Stress")], |
| 143 | + use = "complete.obs") |
| 144 | +print(correlation_matrix) |
| 145 | + |
| 146 | + |
| 147 | +## Visualisation |
| 148 | + |
| 149 | + # Categorical data: one bar chart (geom_bar()) per factor built earlier in the script |
| 150 | +library(ggplot2) |
| 151 | +ggplot(data = data, mapping = aes(x = gender_fct)) + |
| 152 | + geom_bar(fill = "blue") + |
| 153 | + labs(x = "Gender", y = "Count", title = "Distribution of Gender") |
| 154 | + |
| 155 | +ggplot(data = data, mapping = aes(x = city_fct)) + |
| 156 | + geom_bar(fill = "green") + |
| 157 | + labs(x = "City", y = "Count", title = "Distribution by City") + |
| 158 | + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate the axis labels so they stay readable |
| 159 | + |
| 160 | +ggplot(data = data, mapping = aes(x = profession_fct)) + |
| 161 | + geom_bar(fill = "red") + |
| 162 | + labs(x = "Profession", y = "Count", title = "Distribution of Profession") |
| 163 | + |
| 164 | +# Sleep duration |
| 165 | +ggplot(data = data, mapping = aes(x = Sleep_duration_fct)) + |
| 166 | + geom_bar(fill = "purple") + |
| 167 | + labs(x = "Sleep Duration", y = "Count", title = "Distribution of Sleep Duration") |
| 168 | + |
| 169 | +# Dietary habits |
| 170 | +ggplot(data = data, mapping = aes(x = dietary_habits_fct)) + |
| 171 | + geom_bar(fill = "orange") + |
| 172 | + labs(x = "Dietary Habits", y = "Count", title = "Distribution of Dietary Habits") |
| 173 | + |
| 174 | +# Family history of mental illness |
| 175 | +ggplot(data = data, mapping = aes(x = family_history_fct)) + |
| 176 | + geom_bar(fill = "cyan") + |
| 177 | + labs(x = "Family History", y = "Count", title = "Family History of Mental Illness") |
| 178 | + |
| 179 | +# Suicidal thoughts |
| 180 | +ggplot(data = data, mapping = aes(x = suicidal_thoughts_fct)) + |
| 181 | + geom_bar(fill = "magenta") + |
| 182 | + labs(x = "Suicidal Thoughts", y = "Count", title = "Have you ever had suicidal thoughts?") |
| 183 | + |
| 184 | +# Degree |
| 185 | +ggplot(data = data, mapping = aes(x = degree_fct)) + |
| 186 | + geom_bar(fill = "yellow") + |
| 187 | + labs(x = "Degree", y = "Count", title = "Distribution of Degrees") |
| 188 | + |
| 189 | +# Depression status |
| 190 | +ggplot(data = data, mapping = aes(x = depression_status_fct)) + |
| 191 | + geom_bar(fill = "lightblue") + |
| 192 | + labs(x = "Depression", y = "Count", title = "Depression Status") |
| 193 | + |
| 194 | + |
| 195 | + # Continuous data: one 30-bin histogram (geom_histogram()) per numeric column |
| 196 | + |
| 197 | +library(ggplot2) |
| 198 | + |
| 199 | +# Age |
| 200 | +ggplot(data = data, mapping = aes(x = Age)) + |
| 201 | + geom_histogram(fill = "cornflowerblue", color = "black", bins = 30) + |
| 202 | + labs(x = "Age", y = "Frequency", title = "Histogram of Age") |
| 203 | + |
| 204 | +# CGPA |
| 205 | +ggplot(data = data, mapping = aes(x = CGPA)) + |
| 206 | + geom_histogram(fill = "steelblue", color = "black", bins = 30) + |
| 207 | + labs(x = "CGPA", y = "Frequency", title = "Histogram of CGPA") |
| 208 | + |
| 209 | +# Study Satisfaction |
| 210 | +ggplot(data = data, mapping = aes(x = Study.Satisfaction)) + |
| 211 | + geom_histogram(fill = "lightblue", color = "black", bins = 30) + |
| 212 | + labs(x = "Study Satisfaction", y = "Frequency", title = "Histogram of Study Satisfaction") |
| 213 | + |
| 214 | +# Job Satisfaction |
| 215 | +ggplot(data = data, mapping = aes(x = Job.Satisfaction)) + |
| 216 | + geom_histogram(fill = "gray", color = "black", bins = 30) + |
| 217 | + labs(x = "Job Satisfaction", y = "Frequency", title = "Histogram of Job Satisfaction") |
| 218 | + |
| 219 | +# Academic Pressure |
| 220 | +ggplot(data = data, mapping = aes(x = Academic.Pressure)) + |
| 221 | + geom_histogram(fill = "darkblue", color = "black", bins = 30) + |
| 222 | + labs(x = "Academic Pressure", y = "Frequency", title = "Histogram of Academic Pressure") |
| 223 | + |
| 224 | +# Work Pressure |
| 225 | +ggplot(data = data, mapping = aes(x = Work.Pressure)) + |
| 226 | + geom_histogram(fill = "purple", color = "black", bins = 30) + |
| 227 | + labs(x = "Work Pressure", y = "Frequency", title = "Histogram of Work Pressure") |
| 228 | + |
| 229 | +# Financial Stress |
| 230 | +ggplot(data = data, mapping = aes(x = Financial.Stress)) + |
| 231 | + geom_histogram(fill = "red", color = "black", bins = 30) + |
| 232 | + labs(x = "Financial Stress", y = "Frequency", title = "Histogram of Financial Stress") |
| 233 | + |
| 234 | + |
| 235 | + |
| 236 | + |
| 237 | + # Relationships between variables: scatter plots (geom_point()) on small two-column data frames |
| 238 | +data_frame <- data.frame(Variable1 = data$Age, Variable2 = data$CGPA) |
| 239 | +ggplot(data = data_frame, mapping = aes(x = Variable1, y = Variable2)) + |
| 240 | + geom_point(color = "darkorange") + |
| 241 | + labs(x = "Variable 1", y = "Variable 2", title = " Age vs CGPA") + |
| 242 | + theme_minimal() |
| 243 | + |
| 244 | + |
| 245 | +data_frame <- data.frame(Variable3 = data$Study.Satisfaction, Variable4 = data$Job.Satisfaction) |
| 246 | +ggplot(data = data_frame, mapping = aes(x = Variable3, y = Variable4)) + |
| 247 | + geom_point(color = "darkorange") + |
| 248 | + labs(x = "Variable 3", y = "Variable 4", title = " Study Satisfaction vs Job Satisfaction") + |
| 249 | + theme_minimal() |
| 250 | + |
| 251 | + |
| 252 | +data_frame <- data.frame(Variable5 = data$Academic.Pressure, Variable6 = data$Work.Pressure) |
| 253 | +ggplot(data = data_frame, mapping = aes(x = Variable5, y = Variable6)) + |
| 254 | + geom_point(color = "darkorange") + |
| 255 | + labs(x = "Variable 5", y = "Variable 6", title = " Academic Pressure vs Work Pressure") + |
| 256 | + theme_minimal() |
| 257 | + |
| 258 | + |
| 259 | +# Academic Pressure vs Financial Stress, coloured by Study Satisfaction and sized by Job Satisfaction |
| 260 | +data_frame <- data.frame(Variable5 = data$Academic.Pressure, Variable7 = data$Financial.Stress, Variable3 = data$Study.Satisfaction, Variable4 = data$Job.Satisfaction) |
| 261 | +ggplot(data_frame, aes(x = Variable5, y = Variable7, color = factor(Variable3), size = Variable4)) + |
| 262 | + geom_point() + # no constant colour here: it would override the colour mapping and make the legend meaningless |
| 263 | + theme_minimal() + |
| 264 | + scale_color_brewer(palette = "Set1") + # palette sized to the data; a fixed 4-colour manual scale fails with more than 4 levels (TODO confirm level count) |
| 265 | + labs(title = "Academic Pressure vs Financial Stress with Study Satisfaction and Job Satisfaction", |
| 266 | + x = "Variable 5", |
| 267 | + y = "Variable 7", |
| 268 | + color = "Study Satisfaction", |
| 269 | + size = "Job Satisfaction") |
| 270 | + |
| 271 | + |
| 272 | + |
| 273 | + |
| 274 | + # Outlier detection: side-by-side base-R box plots (boxplot()) of the numeric columns, outliers drawn in red |
| 275 | +data_frame <- data.frame(Variable1 = data$Age, Variable2 = data$CGPA, Variable3 = data$Study.Satisfaction, Variable4 = data$Job.Satisfaction, Variable5 = data$Academic.Pressure, Variable6 = data$Financial.Stress, Variable7 = data$Work.Pressure) |
| 276 | +boxplot(data_frame, main = "Graphiques en boîte comparatifs des variables numériques", ylab = "Valeur", |
| 277 | + outpch = 18, outcex = 1.7, outcol = "red", |
| 278 | + col = "lightblue") |
| 279 | + |
| 280 | + |
| 281 | + |
| 282 | + |
| 283 | +### ETAPE 4 : ANALYSE STATISTIQUE |
| 284 | + |
| 285 | +## Tendances centrales |
| 286 | + |
| 287 | + # Central tendency, grouped by statistic: the means of the numeric columns first |
| 288 | +mean_age <- mean(data$Age, na.rm = TRUE) |
| 289 | +mean_CGPA <- mean(data$CGPA, na.rm = TRUE) |
| 290 | +mean_Study_satisfaction <- mean(data$Study.Satisfaction, na.rm = TRUE) |
| 291 | +mean_Job_Satisfaction <- mean(data$Job.Satisfaction, na.rm = TRUE) |
| 292 | +mean_Academic_Pressure <- mean(data$Academic.Pressure, na.rm = TRUE) |
| 293 | +mean_Work_Pressure <- mean(data$Work.Pressure, na.rm = TRUE) |
| 294 | +mean_Financial_stress <- mean(data$Financial.Stress, na.rm = TRUE) |
| 295 | + |
| 296 | + # ... then the medians of the same columns |
| 297 | +median_age <- median(data$Age, na.rm = TRUE) |
| 298 | +median_CGPA <- median(data$CGPA, na.rm = TRUE) |
| 299 | +median_Study_satisfaction <- median(data$Study.Satisfaction, na.rm = TRUE) |
| 300 | +median_Job_Satisfaction <- median(data$Job.Satisfaction, na.rm = TRUE) |
| 301 | +median_Academic_Pressure <- median(data$Academic.Pressure, na.rm = TRUE) |
| 302 | +median_Work_Pressure <- median(data$Work.Pressure, na.rm = TRUE) |
| 303 | +median_Financial_stress <- median(data$Financial.Stress, na.rm = TRUE) |
| 304 | + |
| 305 | + |
| 306 | + |
| 307 | + |
| 308 | + |
| 309 | + |
| 310 | + # Dispersion measures and outliers, all computed over the same set of numeric columns |
| 311 | +variables <- c("Age", "CGPA", "Study.Satisfaction", "Job.Satisfaction", "Academic.Pressure", "Work.Pressure", "Financial.Stress") |
| 312 | + |
| 313 | + # Variance and standard deviation |
| 314 | +variance_values <- sapply(data[variables], var, na.rm = TRUE) |
| 315 | +print(variance_values) |
| 316 | +sd_values <- sapply(data[variables], sd, na.rm = TRUE) |
| 317 | +print(sd_values) |
| 318 | + |
| 319 | + # Interquartile range |
| 320 | +iqr_values <- sapply(data[variables], IQR, na.rm = TRUE) |
| 321 | +print(iqr_values) |
| 322 | + |
| 323 | + # Outliers: the points boxplot.stats() reports in its $out component for each column |
| 324 | +outliers <- sapply(data[variables], function(col_values) boxplot.stats(col_values)$out) |
| 325 | +print(outliers) |
| 326 | + |
| 327 | + |
| 328 | + |
| 329 | + |
| 330 | + |
| 331 | +## Symmetry / asymmetry analysis: skewness of each numeric column (e1071::skewness) |
| 332 | +# na.rm = TRUE keeps a column with missing values from returning NA, consistent with the other statistics in this script |
| 333 | +library(e1071) |
| 334 | +skewness_values <- sapply(data[c("Age", "CGPA", "Study.Satisfaction", "Job.Satisfaction", "Academic.Pressure", "Work.Pressure", "Financial.Stress")], skewness, na.rm = TRUE) |
| 335 | +print(skewness_values) |
| 336 | + |
| 337 | +## *COMMENTS ON RESULTS* (author's observation) The variables Age, CGPA, Study.Satisfaction and Academic.Pressure have skewness values very close to zero, which indicates that their distributions are relatively symmetric. |
| 338 | + |
| 339 | + |
| 340 | +## Correlation analysis: correlation matrix of the numeric columns, restricted to complete observations |
| 341 | +correlation_matrix <- cor( |
| 342 | + data[, c("Age", "CGPA", "Study.Satisfaction", "Job.Satisfaction", "Academic.Pressure", "Work.Pressure", "Financial.Stress")], |
| 343 | + use = "complete.obs") |
| 344 | +print(correlation_matrix) |
| 345 | + |
| 346 | +## Visualisation avec nuage de points |
| 347 | + |
0 commit comments