{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import joblib \n",
"from IPython.display import Image"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Student Performance Dataset \n",
"\n",
""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" school \n",
" sex \n",
" age \n",
" address \n",
" famsize \n",
" Pstatus \n",
" Medu \n",
" Fedu \n",
" Mjob \n",
" Fjob \n",
" ... \n",
" famrel \n",
" freetime \n",
" goout \n",
" Dalc \n",
" Walc \n",
" health \n",
" absences \n",
" G1 \n",
" G2 \n",
" G3 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" GP \n",
" F \n",
" 18 \n",
" U \n",
" GT3 \n",
" A \n",
" 4 \n",
" 4 \n",
" at_home \n",
" teacher \n",
" ... \n",
" 4 \n",
" 3 \n",
" 4 \n",
" 1 \n",
" 1 \n",
" 3 \n",
" 4 \n",
" 0 \n",
" 11 \n",
" 11 \n",
" \n",
" \n",
" 1 \n",
" GP \n",
" F \n",
" 17 \n",
" U \n",
" GT3 \n",
" T \n",
" 1 \n",
" 1 \n",
" at_home \n",
" other \n",
" ... \n",
" 5 \n",
" 3 \n",
" 3 \n",
" 1 \n",
" 1 \n",
" 3 \n",
" 2 \n",
" 9 \n",
" 11 \n",
" 11 \n",
" \n",
" \n",
" 2 \n",
" GP \n",
" F \n",
" 15 \n",
" U \n",
" LE3 \n",
" T \n",
" 1 \n",
" 1 \n",
" at_home \n",
" other \n",
" ... \n",
" 4 \n",
" 3 \n",
" 2 \n",
" 2 \n",
" 3 \n",
" 3 \n",
" 6 \n",
" 12 \n",
" 13 \n",
" 12 \n",
" \n",
" \n",
" 3 \n",
" GP \n",
" F \n",
" 15 \n",
" U \n",
" GT3 \n",
" T \n",
" 4 \n",
" 2 \n",
" health \n",
" services \n",
" ... \n",
" 3 \n",
" 2 \n",
" 2 \n",
" 1 \n",
" 1 \n",
" 5 \n",
" 0 \n",
" 14 \n",
" 14 \n",
" 14 \n",
" \n",
" \n",
" 4 \n",
" GP \n",
" F \n",
" 16 \n",
" U \n",
" GT3 \n",
" T \n",
" 3 \n",
" 3 \n",
" other \n",
" other \n",
" ... \n",
" 4 \n",
" 3 \n",
" 2 \n",
" 1 \n",
" 2 \n",
" 5 \n",
" 0 \n",
" 11 \n",
" 13 \n",
" 13 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 644 \n",
" MS \n",
" F \n",
" 19 \n",
" R \n",
" GT3 \n",
" T \n",
" 2 \n",
" 3 \n",
" services \n",
" other \n",
" ... \n",
" 5 \n",
" 4 \n",
" 2 \n",
" 1 \n",
" 2 \n",
" 5 \n",
" 4 \n",
" 10 \n",
" 11 \n",
" 10 \n",
" \n",
" \n",
" 645 \n",
" MS \n",
" F \n",
" 18 \n",
" U \n",
" LE3 \n",
" T \n",
" 3 \n",
" 1 \n",
" teacher \n",
" services \n",
" ... \n",
" 4 \n",
" 3 \n",
" 4 \n",
" 1 \n",
" 1 \n",
" 1 \n",
" 4 \n",
" 15 \n",
" 15 \n",
" 16 \n",
" \n",
" \n",
" 646 \n",
" MS \n",
" F \n",
" 18 \n",
" U \n",
" GT3 \n",
" T \n",
" 1 \n",
" 1 \n",
" other \n",
" other \n",
" ... \n",
" 1 \n",
" 1 \n",
" 1 \n",
" 1 \n",
" 1 \n",
" 5 \n",
" 6 \n",
" 11 \n",
" 12 \n",
" 9 \n",
" \n",
" \n",
" 647 \n",
" MS \n",
" M \n",
" 17 \n",
" U \n",
" LE3 \n",
" T \n",
" 3 \n",
" 1 \n",
" services \n",
" services \n",
" ... \n",
" 2 \n",
" 4 \n",
" 5 \n",
" 3 \n",
" 4 \n",
" 2 \n",
" 6 \n",
" 10 \n",
" 10 \n",
" 10 \n",
" \n",
" \n",
" 648 \n",
" MS \n",
" M \n",
" 18 \n",
" R \n",
" LE3 \n",
" T \n",
" 3 \n",
" 2 \n",
" services \n",
" other \n",
" ... \n",
" 4 \n",
" 4 \n",
" 1 \n",
" 3 \n",
" 4 \n",
" 5 \n",
" 4 \n",
" 10 \n",
" 11 \n",
" 11 \n",
" \n",
" \n",
"
\n",
"
649 rows × 33 columns
\n",
"
"
],
"text/plain": [
" school sex age address famsize Pstatus Medu Fedu Mjob Fjob \\\n",
"0 GP F 18 U GT3 A 4 4 at_home teacher \n",
"1 GP F 17 U GT3 T 1 1 at_home other \n",
"2 GP F 15 U LE3 T 1 1 at_home other \n",
"3 GP F 15 U GT3 T 4 2 health services \n",
"4 GP F 16 U GT3 T 3 3 other other \n",
".. ... .. ... ... ... ... ... ... ... ... \n",
"644 MS F 19 R GT3 T 2 3 services other \n",
"645 MS F 18 U LE3 T 3 1 teacher services \n",
"646 MS F 18 U GT3 T 1 1 other other \n",
"647 MS M 17 U LE3 T 3 1 services services \n",
"648 MS M 18 R LE3 T 3 2 services other \n",
"\n",
" ... famrel freetime goout Dalc Walc health absences G1 G2 G3 \n",
"0 ... 4 3 4 1 1 3 4 0 11 11 \n",
"1 ... 5 3 3 1 1 3 2 9 11 11 \n",
"2 ... 4 3 2 2 3 3 6 12 13 12 \n",
"3 ... 3 2 2 1 1 5 0 14 14 14 \n",
"4 ... 4 3 2 1 2 5 0 11 13 13 \n",
".. ... ... ... ... ... ... ... ... .. .. .. \n",
"644 ... 5 4 2 1 2 5 4 10 11 10 \n",
"645 ... 4 3 4 1 1 1 4 15 15 16 \n",
"646 ... 1 1 1 1 1 5 6 11 12 9 \n",
"647 ... 2 4 5 3 4 2 6 10 10 10 \n",
"648 ... 4 4 1 3 4 5 4 10 11 11 \n",
"\n",
"[649 rows x 33 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('student-por.csv')\n",
"df"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# **Proven hypothesis**\n",
"## *The academic performance of a student depends not only on their academic capabilities but also their socio-economic status.*\n",
"\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
},
"tags": []
},
"source": [
"## **Imagine this...**\n",
"\n",
"## *You are a headmaster/headmistress, managing a school of thousands of students. How do you create the ideal environment for students to academically excel in their education?*\n",
"\n",
"\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# **ET VOILA!**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"# **The Student Forecaster Model**: \n",
"# https://student-forecaster.streamlit.app/"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# *Step 1:* Exploratory analysis \n",
"- Distribution and spread of the data \n",
"- Dropping columns that, at first glance, are not relevant\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/m4/kwmg1zjs75j99y7p7rtp4wcc0000gn/T/ipykernel_93639/626108447.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_dropped['G3_binary'] = pd.cut(x = df_dropped['G3'], bins=[df_dropped['G3'].min()-1, df_dropped['G3'].mean(), df_dropped['G3'].max()+1], labels=['fail', 'pass'])\n"
]
}
],
"source": [
"# Keep only the features retained by the sub-group analysis, plus the final grade G3.\n",
"# .copy() gives df_dropped its own data, so the column assignment below no longer\n",
"# raises pandas' SettingWithCopyWarning (it was assigning into a slice of df).\n",
"df_dropped = df[['sex', 'age', 'Medu', 'reason', 'traveltime', 'studytime', 'freetime', 'higher',\n",
"                 'failures', 'internet', 'G3']].copy()\n",
"\n",
"# Binary target: grades up to and including the mean G3 are 'fail', grades above it are 'pass'.\n",
"# The outer bin edges are padded by 1 so the minimum and maximum grades fall inside a bin.\n",
"final_grade = df_dropped['G3']\n",
"df_dropped['G3_binary'] = pd.cut(\n",
"    x=final_grade,\n",
"    bins=[final_grade.min() - 1, final_grade.mean(), final_grade.max() + 1],\n",
"    labels=['fail', 'pass'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"d1 = [\n",
" ['home address', 'not relevant to data'],\n",
" ['parents jobs', 'over 50percent of data classified as other'],\n",
" ['attended nursery', 'studies show attendance no longer relevant'],\n",
" ['1st and 2nd term grades', 'G3, the final grade is the only target variable']\n",
"]\n",
"d1_dropped = pd.DataFrame(data = d1, columns=['Columns grouping of data', 'Reason'])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": [
"hide-input"
]
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Columns grouping of data \n",
" Reason \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" home address \n",
" not relevant to data \n",
" \n",
" \n",
" 1 \n",
" parents jobs \n",
" over 50percent of data classified as other \n",
" \n",
" \n",
" 2 \n",
" attended nursery \n",
" studies show attendance no longer relevant \n",
" \n",
" \n",
" 3 \n",
" 1st and 2nd term grades \n",
" G3, the final grade is the only target variable \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Columns grouping of data Reason\n",
"0 home address not relevant to data\n",
"1 parents jobs over 50percent of data classified as other\n",
"2 attended nursery studies show attendance no longer relevant\n",
"3 1st and 2nd term grades G3, the final grade is the only target variable"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#First glance dropped columns\n",
"d1_dropped"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"# *Step 2:* Creating subgroups of data\n",
"- To map out relationships between columns of data"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"d3 = [\n",
" ['age, Dalc,Walc, health', 'alcohol intake during weekday and weekend'], \n",
" ['famsize, Pstatus, guardian, famrel', 'quality of family relationships'],\n",
" ['studytime,activities, freetime, goout, absences, romantic', 'time related to studying'],\n",
" ['schoolsup, famsup, paid,higher,failures', 'extra educational support'],\n",
" ['school, age, Medu,reason, traveltime, failures,internet', 'Impact of reason to pick school'],\n",
" ['Medu, Fedu', 'parents level of education']\n",
"\n",
" ]\n",
"\n",
"df3 = pd.DataFrame(data = d3, columns=['original columns', 'Sub-group'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" original columns \n",
" Sub-group \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" age, Dalc,Walc, health \n",
" alcohol intake during weekday and weekend \n",
" \n",
" \n",
" 1 \n",
" famsize, Pstatus, guardian, famrel \n",
" quality of family relationships \n",
" \n",
" \n",
" 2 \n",
" studytime,activities, freetime, goout, absence... \n",
" time related to studying \n",
" \n",
" \n",
" 3 \n",
" schoolsup, famsup, paid,higher,failures \n",
" extra educational support \n",
" \n",
" \n",
" 4 \n",
" school, age, Medu,reason, traveltime, failures... \n",
" Impact of reason to pick school \n",
" \n",
" \n",
" 5 \n",
" Medu, Fedu \n",
" parents level of education \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" original columns \\\n",
"0 age, Dalc,Walc, health \n",
"1 famsize, Pstatus, guardian, famrel \n",
"2 studytime,activities, freetime, goout, absence... \n",
"3 schoolsup, famsup, paid,higher,failures \n",
"4 school, age, Medu,reason, traveltime, failures... \n",
"5 Medu, Fedu \n",
"\n",
" Sub-group \n",
"0 alcohol intake during weekday and weekend \n",
"1 quality of family relationships \n",
"2 time related to studying \n",
"3 extra educational support \n",
"4 Impact of reason to pick school \n",
"5 parents level of education "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Subgrouping of data columns\n",
"df3"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"# *Step 3:* Analysis within sub-groups \n",
"- To identify the variables that best explain the target variable"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"d2 = [\n",
" ['alcohol intake during weekday and weekend', 'heatmap correlation, count by grouping', 'column removed'], \n",
" ['quality of family relationships', 'heatmap correlation, value counts', 'droped all columns as no evident relation to grade'],\n",
" ['time related to studying', 'heatmap correlation', 'dropped all columns except the study time and freetime which impacted G3'],\n",
" ['extra educational support', 'heatmap correlation and value counts', 'if a student has failed then they likely have additional support'],\n",
" ['Impact of reason to pick school', 'percentage value counts and heatmap correlation', 'choice of school has an impact on grades'],\n",
" ['parents level of education', 'value counts by each combination of parents education', 'only keep mother education']\n",
"\n",
" ]\n",
"\n",
"d4 = [\n",
" ['alcohol intake during weekday and weekend', 'column dropped'], \n",
" ['quality of family relationships', 'columns dropped'],\n",
" ['time related to studying', 'dropped all columns except the study time and freetime'],\n",
" ['extra educational support', 'all columns dropped except failure'],\n",
" ['Impact of reason to pick school', 'all columns kept'],\n",
" ['parents level of education', 'fathers education dropped']\n",
" ]\n",
"\n",
"df_analysis2 = pd.DataFrame(data = d4, columns=['Sub-group', 'Action']\n",
" )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"## **Summary of analytical methods used:**\n",
"- Heatmap correlation\n",
"- Value counts\n",
"- Grouping by bins \n",
"- Histograms\n",
"- Boxplot"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# Family-relationship sub-group: raw columns plus the final grade.\n",
"df_family = df[['famsize', 'Pstatus', 'guardian', 'famrel', 'G3']]\n",
"\n",
"# Integer codes for the categorical family columns so that .corr() can use them.\n",
"# Series.map is used instead of Series.replace: replacing strings with ints relies on\n",
"# implicit downcasting of the object column, which recent pandas versions deprecate.\n",
"# Any value missing from a mapping becomes NaN rather than being passed through.\n",
"# NOTE(review): guardian is nominal, so its codes (and any correlation computed from\n",
"# them) are arbitrary; the original 1 / 0 / 3 coding is kept unchanged.\n",
"# The encoded columns go into a new frame rather than into df, so the raw data stays\n",
"# untouched and this cell can be re-run safely.\n",
"df_family2 = df_family.assign(\n",
"    famsize2=df_family['famsize'].map({'GT3': 1, 'LE3': 0}),\n",
"    Pstatus2=df_family['Pstatus'].map({'T': 1, 'A': 0}),\n",
"    guardian2=df_family['guardian'].map({'mother': 1, 'father': 0, 'other': 3}),\n",
")[['famsize2', 'Pstatus2', 'guardian2', 'famrel', 'G3']]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"## Correlation Heatmap"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Correlation heatmap of the encoded family-relationship columns against the final grade G3.\n",
"# An explicit Axes plus a title lets the figure stand alone on a slide; the trailing\n",
"# semicolon suppresses the repr that would otherwise be echoed as the cell result.\n",
"fig, ax = plt.subplots()\n",
"sns.heatmap(df_family2.corr(), vmin=-1, vmax=1, annot=True, ax=ax)\n",
"ax.set_title('Correlation: family sub-group vs final grade (G3)');"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"## Creating bins and summarising data "
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Dalc \n",
" Walc \n",
" age \n",
" count \n",
" bins \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 16 \n",
" 68 \n",
" (14, 17] \n",
" \n",
" \n",
" 8 \n",
" 1 \n",
" 2 \n",
" 15 \n",
" 16 \n",
" (14, 17] \n",
" \n",
" \n",
" 48 \n",
" 1 \n",
" 1 \n",
" 20 \n",
" 2 \n",
" (17, 22] \n",
" \n",
" \n",
" 9 \n",
" 1 \n",
" 3 \n",
" 16 \n",
" 16 \n",
" (14, 17] \n",
" \n",
" \n",
" 7 \n",
" 1 \n",
" 3 \n",
" 17 \n",
" 21 \n",
" (14, 17] \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 62 \n",
" 5 \n",
" 1 \n",
" 18 \n",
" 1 \n",
" (17, 22] \n",
" \n",
" \n",
" 61 \n",
" 5 \n",
" 2 \n",
" 21 \n",
" 1 \n",
" (17, 22] \n",
" \n",
" \n",
" 38 \n",
" 5 \n",
" 5 \n",
" 16 \n",
" 3 \n",
" (14, 17] \n",
" \n",
" \n",
" 32 \n",
" 5 \n",
" 5 \n",
" 18 \n",
" 4 \n",
" (17, 22] \n",
" \n",
" \n",
" 81 \n",
" 5 \n",
" 5 \n",
" 22 \n",
" 1 \n",
" (17, 22] \n",
" \n",
" \n",
"
\n",
"
82 rows × 5 columns
\n",
"
"
],
"text/plain": [
" Dalc Walc age count bins\n",
"0 1 1 16 68 (14, 17]\n",
"8 1 2 15 16 (14, 17]\n",
"48 1 1 20 2 (17, 22]\n",
"9 1 3 16 16 (14, 17]\n",
"7 1 3 17 21 (14, 17]\n",
".. ... ... ... ... ...\n",
"62 5 1 18 1 (17, 22]\n",
"61 5 2 21 1 (17, 22]\n",
"38 5 5 16 3 (14, 17]\n",
"32 5 5 18 4 (17, 22]\n",
"81 5 5 22 1 (17, 22]\n",
"\n",
"[82 rows x 5 columns]"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Alcohol sub-group: age, weekday (Dalc) and weekend (Walc) drinking level, and health.\n",
"df_alcohol = df[['age', 'Dalc','Walc', 'health']]\n",
"\n",
"# Number of students for every (Dalc, Walc, age) combination. Naming the count column\n",
"# explicitly keeps it called 'count' whatever the installed pandas version's default is.\n",
"df_alcohol_counted = (\n",
"    df_alcohol[['Dalc', 'Walc', 'age']]\n",
"    .value_counts()\n",
"    .reset_index(name='count')\n",
"    .sort_values('Dalc')\n",
")\n",
"\n",
"# Two age bands, (14, 17] and (17, 22]; an age outside 15-22 would get NaN here.\n",
"df_alcohol_counted['bins'] = pd.cut(df_alcohol_counted['age'], bins=[14, 17, 22])"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Dalc \n",
" Walc \n",
" age \n",
" count \n",
" bins \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 16 \n",
" 68 \n",
" (14, 17] \n",
" \n",
" \n",
" 8 \n",
" 1 \n",
" 2 \n",
" 15 \n",
" 16 \n",
" (14, 17] \n",
" \n",
" \n",
" 48 \n",
" 1 \n",
" 1 \n",
" 20 \n",
" 2 \n",
" (17, 22] \n",
" \n",
" \n",
" 9 \n",
" 1 \n",
" 3 \n",
" 16 \n",
" 16 \n",
" (14, 17] \n",
" \n",
" \n",
" 7 \n",
" 1 \n",
" 3 \n",
" 17 \n",
" 21 \n",
" (14, 17] \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 62 \n",
" 5 \n",
" 1 \n",
" 18 \n",
" 1 \n",
" (17, 22] \n",
" \n",
" \n",
" 61 \n",
" 5 \n",
" 2 \n",
" 21 \n",
" 1 \n",
" (17, 22] \n",
" \n",
" \n",
" 38 \n",
" 5 \n",
" 5 \n",
" 16 \n",
" 3 \n",
" (14, 17] \n",
" \n",
" \n",
" 32 \n",
" 5 \n",
" 5 \n",
" 18 \n",
" 4 \n",
" (17, 22] \n",
" \n",
" \n",
" 81 \n",
" 5 \n",
" 5 \n",
" 22 \n",
" 1 \n",
" (17, 22] \n",
" \n",
" \n",
"
\n",
"
82 rows × 5 columns
\n",
"
"
],
"text/plain": [
" Dalc Walc age count bins\n",
"0 1 1 16 68 (14, 17]\n",
"8 1 2 15 16 (14, 17]\n",
"48 1 1 20 2 (17, 22]\n",
"9 1 3 16 16 (14, 17]\n",
"7 1 3 17 21 (14, 17]\n",
".. ... ... ... ... ...\n",
"62 5 1 18 1 (17, 22]\n",
"61 5 2 21 1 (17, 22]\n",
"38 5 5 16 3 (14, 17]\n",
"32 5 5 18 4 (17, 22]\n",
"81 5 5 22 1 (17, 22]\n",
"\n",
"[82 rows x 5 columns]"
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Value counts of grouping of students by age and level of drinking \n",
"# during the weekday and weekend \n",
"df_alcohol_counted.sort_values(by=['Dalc'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"## Sub-group analysis deduction"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Sub-group \n",
" Conclusion \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" alcohol intake during weekday and weekend \n",
" column dropped \n",
" \n",
" \n",
" 1 \n",
" quality of family relationships \n",
" columns dropped \n",
" \n",
" \n",
" 2 \n",
" time related to studying \n",
" dropped all columns except the study time and ... \n",
" \n",
" \n",
" 3 \n",
" extra educational support \n",
" all columns dropped except failure \n",
" \n",
" \n",
" 4 \n",
" Impact of reason to pick school \n",
" all columns kept \n",
" \n",
" \n",
" 5 \n",
" parents level of education \n",
" fathers education dropped \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Sub-group \\\n",
"0 alcohol intake during weekday and weekend \n",
"1 quality of family relationships \n",
"2 time related to studying \n",
"3 extra educational support \n",
"4 Impact of reason to pick school \n",
"5 parents level of education \n",
"\n",
" Conclusion \n",
"0 column dropped \n",
"1 columns dropped \n",
"2 dropped all columns except the study time and ... \n",
"3 all columns dropped except failure \n",
"4 all columns kept \n",
"5 fathers education dropped "
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary table: each sub-group of columns and the action taken on it\n",
"df_analysis2"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"## Conclusion from data analysis\n",
"\n",
"- The variable with the biggest impact on G3 is the number of failures\n",
"- If a student has failed then they likely have additional support\n",
"- Nearly 95% of students want to pursue higher education, so data is heavily skewed\n",
"- Over 90% of students do not have paid support but 60% of students have support at home"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" sex \n",
" age \n",
" Medu \n",
" reason \n",
" traveltime \n",
" studytime \n",
" freetime \n",
" higher \n",
" failures \n",
" internet \n",
" G3 \n",
" G3_binary \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" F \n",
" 18 \n",
" 4 \n",
" course \n",
" 2 \n",
" 2 \n",
" 3 \n",
" yes \n",
" 0 \n",
" no \n",
" 11 \n",
" fail \n",
" \n",
" \n",
" 1 \n",
" F \n",
" 17 \n",
" 1 \n",
" course \n",
" 1 \n",
" 2 \n",
" 3 \n",
" yes \n",
" 0 \n",
" yes \n",
" 11 \n",
" fail \n",
" \n",
" \n",
" 2 \n",
" F \n",
" 15 \n",
" 1 \n",
" other \n",
" 1 \n",
" 2 \n",
" 3 \n",
" yes \n",
" 0 \n",
" yes \n",
" 12 \n",
" pass \n",
" \n",
" \n",
" 3 \n",
" F \n",
" 15 \n",
" 4 \n",
" home \n",
" 1 \n",
" 3 \n",
" 2 \n",
" yes \n",
" 0 \n",
" yes \n",
" 14 \n",
" pass \n",
" \n",
" \n",
" 4 \n",
" F \n",
" 16 \n",
" 3 \n",
" home \n",
" 1 \n",
" 2 \n",
" 3 \n",
" yes \n",
" 0 \n",
" no \n",
" 13 \n",
" pass \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex age Medu reason traveltime studytime freetime higher failures \\\n",
"0 F 18 4 course 2 2 3 yes 0 \n",
"1 F 17 1 course 1 2 3 yes 0 \n",
"2 F 15 1 other 1 2 3 yes 0 \n",
"3 F 15 4 home 1 3 2 yes 0 \n",
"4 F 16 3 home 1 2 3 yes 0 \n",
"\n",
" internet G3 G3_binary \n",
"0 no 11 fail \n",
"1 yes 11 fail \n",
"2 yes 12 pass \n",
"3 yes 14 pass \n",
"4 no 13 pass "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Final dataframe of chosen variables for training the model\n",
"df_dropped.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# *Machine Learning:* \n",
"## Preprocessing, Modelling, Scoring "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"# *Machine Learning Part 1:* \n",
"## Label encoding the target variable G3"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/m4/kwmg1zjs75j99y7p7rtp4wcc0000gn/T/ipykernel_93639/3581341554.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_dropped['G3_binary_label'] = label_encoder.fit_transform(df_dropped['G3_binary'])\n"
]
}
],
"source": [
"# Label-encode the binary target.\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"label_encoder = LabelEncoder()\n",
"\n",
"# LabelEncoder numbers the classes in sorted order, so 'fail' -> 0 and 'pass' -> 1.\n",
"# assign() returns a new frame instead of writing into df_dropped in place, which avoids\n",
"# the SettingWithCopyWarning that the in-place column assignment raised.\n",
"df_dropped = df_dropped.assign(\n",
"    G3_binary_label=label_encoder.fit_transform(df_dropped['G3_binary'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" G3_binary \n",
" G3_binary_label \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" fail \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" fail \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" pass \n",
" 1 \n",
" \n",
" \n",
" 3 \n",
" pass \n",
" 1 \n",
" \n",
" \n",
" 4 \n",
" pass \n",
" 1 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" G3_binary G3_binary_label\n",
"0 fail 0\n",
"1 fail 0\n",
"2 pass 1\n",
"3 pass 1\n",
"4 pass 1"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Label encoding the target variable\n",
"df_dropped[['G3_binary',\t'G3_binary_label']].head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"# *Machine Learning Part 2:* \n",
"## Splitting the train and test data "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Feature and target column names, declared once and shared by the train and test selections\n",
"feature_columns = ['sex', 'age', 'Medu', 'reason', 'traveltime', 'studytime', 'freetime', 'higher',\n",
"                   'failures', 'internet']\n",
"target_column = 'G3_binary_label'\n",
"\n",
"# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable\n",
"train_data, test_data = train_test_split(df_dropped, test_size=0.1, random_state=42)\n",
"\n",
"# Feature matrices and target vectors for each partition\n",
"X_train, y_train = train_data[feature_columns], train_data[target_column]\n",
"X_test, y_test = test_data[feature_columns], test_data[target_column]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"# *Machine Learning Part 3:* \n",
"## The pipeline: preprocessing and model \n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"#Boosting\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import GradientBoostingClassifier"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"## **Pre-processing pipeline:**\n",
"### to *scale the numerical variables* and *One Hot Encode the categorical variables*"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# Numerical features: standardise with StandardScaler\n",
"num_transformer = Pipeline([('standard_scaler', StandardScaler())])\n",
"\n",
"# Categorical features: one-hot encode. handle_unknown='ignore' encodes a category that was\n",
"# not seen during fit as all zeros instead of raising an error at predict time.\n",
"cat_transformer = OneHotEncoder(handle_unknown='ignore')\n",
"\n",
"# Apply \"num_transformer\" and \"cat_transformer\" side by side, each to its own columns;\n",
"# columns not listed here are dropped (ColumnTransformer's default remainder behaviour).\n",
"preprocessor = ColumnTransformer([\n",
"    ('num_transformer', num_transformer, ['age', 'Medu', 'traveltime', 'studytime', 'freetime', 'failures']),\n",
"    ('cat_transformer', cat_transformer, ['sex', 'reason', 'higher', 'internet']),\n",
"])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"# Classification model used: Gradient Boosting Classifier"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# Fixed seed so that refitting the model is repeatable from run to run;\n",
"# 42 matches the seed already used for the train/test split.\n",
"model = GradientBoostingClassifier(random_state=42)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"# *Full pipeline*"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# End-to-end estimator: the preprocessing step followed by the classifier\n",
"pipeline4 = Pipeline(steps=[('preprocessor', preprocessor), ('boosting', model)])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"# *Machine Learning Part 4:* \n",
"## *Training, fitting and comparing scoring metrics across different models*\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"d_model = [\n",
" ['Lgistic Regression', '0.75', '0.72 vs 0.72', 'No Overfitting'],\n",
" ['KNN KNeighborsClassifier', '0.76', '0.67 vs 0.83','Overfitting'],\n",
" ['Random Forest Classifier', '0.81', '0.70 vs 0.98', 'Overfitting'],\n",
" ['XGB Classifier','0.82' , '0.68 vs 0.95', 'Overfitting'],\n",
" ['Gradient Boosting Classifier', '0.76', '0.70 vs 0.78', 'Slightly Overfitting']\n",
"]\n",
"df_model = pd.DataFrame(data = d_model, columns=['model used', 'precision', 'Test vs Train precision CV', 'Over/Underfitting'])"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" model used \n",
" precision \n",
" Test vs Train precision CV \n",
" Over/Underfitting \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" Lgistic Regression \n",
" 0.75 \n",
" 0.72 vs 0.72 \n",
" No Overfitting \n",
" \n",
" \n",
" 1 \n",
" KNN KNeighborsClassifier \n",
" 0.76 \n",
" 0.67 vs 0.83 \n",
" Overfitting \n",
" \n",
" \n",
" 2 \n",
" Random Forest Classifier \n",
" 0.81 \n",
" 0.70 vs 0.98 \n",
" Overfitting \n",
" \n",
" \n",
" 3 \n",
" XGB Classifier \n",
" 0.82 \n",
" 0.68 vs 0.95 \n",
" Overfitting \n",
" \n",
" \n",
" 4 \n",
" Gradient Boosting Classifier \n",
" 0.76 \n",
" 0.70 vs 0.78 \n",
" Slightly Overfitting \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" model used precision Test vs Train precision CV \\\n",
"0 Lgistic Regression 0.75 0.72 vs 0.72 \n",
"1 KNN KNeighborsClassifier 0.76 0.67 vs 0.83 \n",
"2 Random Forest Classifier 0.81 0.70 vs 0.98 \n",
"3 XGB Classifier 0.82 0.68 vs 0.95 \n",
"4 Gradient Boosting Classifier 0.76 0.70 vs 0.78 \n",
"\n",
" Over/Underfitting \n",
"0 No Overfitting \n",
"1 Overfitting \n",
"2 Overfitting \n",
"3 Overfitting \n",
"4 Slightly Overfitting "
]
},
"execution_count": 151,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Comparisons of different models tried and tested\n",
"df_model"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.7608695652173914"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import precision_score\n",
"\n",
"# Fit the full pipeline on the training split, then predict the held-out test split\n",
"pipeline4.fit(X_train, y_train)\n",
"y_predict4 = pipeline4.predict(X_test)\n",
"\n",
"# Precision on the test split, shown as the cell result\n",
"precision_score(y_test, y_predict4)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.77\n",
"Precision: 0.76\n",
"Recall: 0.9\n",
"F1 Score: 0.82\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
"\n",
"# Score the test-set predictions. Precision, recall and F1 use scikit-learn's default\n",
"# positive label 1, which is the 'pass' class of the label-encoded target.\n",
"accuracy = accuracy_score(y_test, y_predict4)\n",
"precision = precision_score(y_test, y_predict4)\n",
"recall = recall_score(y_test, y_predict4)\n",
"f1 = f1_score(y_test, y_predict4)\n",
"\n",
"# Report each score rounded to two decimals\n",
"print(\"Accuracy:\",round(accuracy,2))\n",
"print(\"Precision:\", round(precision,2))\n",
"print(\"Recall:\", round(recall,2))\n",
"print(\"F1 Score:\", round(f1,2))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"# **The classification model chosen:** \n",
"## *Gradient Boosting Classifier*\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## *Model Scoring Metrics*"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.77\n",
"Precision: 0.76\n",
"Recall: 0.9\n",
"F1 Score: 0.82\n"
]
}
],
"source": [
"# Print the four test-set scores, each rounded to two decimals\n",
"for metric_label, metric_value in [\n",
"    (\"Accuracy:\", accuracy),\n",
"    (\"Precision:\", precision),\n",
"    (\"Recall:\", recall),\n",
"    (\"F1 Score:\", f1),\n",
"]:\n",
"    print(metric_label, round(metric_value, 2))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# *Thank you for your attention*\n",
""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}