{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Lab5_exercises.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"**Import the necessary libraries**"
],
"metadata": {
"id": "xlWn8gFVYOXp",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"id": "2Ll2oo0bFKbm",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.svm import SVC\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import train_test_split, cross_validate, LeaveOneOut, GridSearchCV, RandomizedSearchCV\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.feature_selection import RFECV\n",
"from sklearn.preprocessing import StandardScaler, RobustScaler\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.compose import ColumnTransformer"
]
},
{
"cell_type": "markdown",
"source": [
"**Load the dataframe, get its general information and change the data type of the columns 'anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking' to categorical.**"
],
"metadata": {
"id": "y2UWo1OBYWeM",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"file = '../data/heart_failure_lab.csv'\n",
"\n",
"##Read dataframe##\n",
"\n",
"df = pd.read_csv(file,index_col=0)\n",
"df= df.astype({'anaemia':'category','diabetes':'category','high_blood_pressure':'category','sex':'category','smoking':'category',})\n",
"print(df.head())\n",
"print(df.info())"
],
"metadata": {
"id": "0Hm-3k-tU-hh",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": 51,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" age anaemia creatinine_phosphokinase diabetes ejection_fraction \\\n",
"0 75.0 0.0 582.0 0.0 20.0 \n",
"1 55.0 0.0 7861.0 0.0 38.0 \n",
"2 65.0 0.0 146.0 0.0 20.0 \n",
"3 50.0 1.0 111.0 0.0 20.0 \n",
"4 65.0 1.0 160.0 1.0 20.0 \n",
"\n",
" high_blood_pressure platelets serum_creatinine serum_sodium sex smoking \\\n",
"0 1.0 265000.00 1.9 130.0 1.0 0.0 \n",
"1 0.0 263358.03 NaN 136.0 1.0 0.0 \n",
"2 0.0 162000.00 1.3 129.0 1.0 1.0 \n",
"3 0.0 210000.00 1.9 137.0 1.0 0.0 \n",
"4 0.0 327000.00 2.7 116.0 0.0 0.0 \n",
"\n",
" time DEATH_EVENT \n",
"0 4 1.0 \n",
"1 6 1.0 \n",
"2 7 1.0 \n",
"3 7 1.0 \n",
"4 8 NaN \n",
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 299 entries, 0 to 298\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 age 294 non-null float64 \n",
" 1 anaemia 291 non-null category\n",
" 2 creatinine_phosphokinase 292 non-null float64 \n",
" 3 diabetes 287 non-null category\n",
" 4 ejection_fraction 296 non-null float64 \n",
" 5 high_blood_pressure 291 non-null category\n",
" 6 platelets 294 non-null float64 \n",
" 7 serum_creatinine 290 non-null float64 \n",
" 8 serum_sodium 296 non-null float64 \n",
" 9 sex 292 non-null category\n",
" 10 smoking 293 non-null category\n",
" 11 time 299 non-null int64 \n",
" 12 DEATH_EVENT 292 non-null float64 \n",
"dtypes: category(5), float64(7), int64(1)\n",
"memory usage: 23.1 KB\n",
"None\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"**Remove the entire row wherever the corresponding column 'DEATH_EVENT' contains a missing value. Change the data type of the column 'DEATH_EVENT' to integer.**"
],
"metadata": {
"id": "9Y46JzzHZnp0",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"age 5\n",
"anaemia 8\n",
"creatinine_phosphokinase 7\n",
"diabetes 11\n",
"ejection_fraction 3\n",
"high_blood_pressure 8\n",
"platelets 5\n",
"serum_creatinine 9\n",
"serum_sodium 3\n",
"sex 7\n",
"smoking 6\n",
"time 0\n",
"DEATH_EVENT 0\n",
"dtype: int64\n"
]
}
],
"source": [
"df.dropna(axis=0, subset=['DEATH_EVENT'], inplace=True)\n",
"\n",
"print(df.isna().sum())\n",
"df = df.astype({'DEATH_EVENT':'int64'})"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"**Replace categorical missing values by the most frequent occurrence of the corresponding column. For continuous missing values, replace them by the mean. Create a new dataframe 'df_new' which does not contain the variable 'time'.**"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"imp_cont = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
"imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
"\n",
"cat_columns = df.select_dtypes(include=['category']).columns\n",
"cont_columns = df.select_dtypes(exclude=['category']).columns\n",
"\n",
"df[cat_columns] = imp_cat.fit_transform(df[cat_columns])\n",
"df[cont_columns] = imp_cont.fit_transform(df[cont_columns])\n",
"\n",
"df_new = df.drop(['time'], axis=1)\n",
"print(df_new.isna().sum())"
],
"metadata": {
"id": "Pj_OfdtzV4n9",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": 53,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"age 0\n",
"anaemia 0\n",
"creatinine_phosphokinase 0\n",
"diabetes 0\n",
"ejection_fraction 0\n",
"high_blood_pressure 0\n",
"platelets 0\n",
"serum_creatinine 0\n",
"serum_sodium 0\n",
"sex 0\n",
"smoking 0\n",
"DEATH_EVENT 0\n",
"dtype: int64\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "mb_pNlE8aWJY",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": 53,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"**Fit a model to predict the target variable 'DEATH_EVENT' given the remaining columns of 'df_new'. To this end, use a SVC, a RandomForestClassifier, and a LogisticRegression. Fit each model using 10-fold cross-validation, and report the mean training and test accuracy across all folds.**"
],
"metadata": {
"id": "HwkQkulGa76d",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"X = df_new.loc[:, df_new.columns != 'DEATH_EVENT']\n",
"Y = df_new['DEATH_EVENT'].values\n",
"models = [SVC(), RandomForestClassifier(), LogisticRegression()]\n",
"\n",
"for model in models:\n",
"    cv_results = cross_validate(model, X, Y, cv=10, scoring='accuracy', return_train_score=True)\n",
"    print(sorted(cv_results.keys()))\n",
"\n",
"    print(np.mean(cv_results['train_score']))\n",
"    print(np.mean(cv_results['test_score']))\n"
],
"metadata": {
"id": "c_W4FeGVaidW",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": 59,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['fit_time', 'score_time', 'test_score', 'train_score']\n",
"0.6780831857893361\n",
"0.6781609195402301\n",
"['fit_time', 'score_time', 'test_score', 'train_score']\n",
"0.9996197718631178\n",
"0.7547126436781608\n",
"['fit_time', 'score_time', 'test_score', 'train_score']\n",
"0.8268699387571475\n",
"0.7955172413793103\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"**Like last week, grid search is a useful procedure when trying to find the best subset of hyper-parameters for a model. Go check the documentation of the class RandomForestClassifier(), and perform a grid search on a given range of selected hyper-parameters. Set the number of folds to 10, and evaluate on the accuracy. Report the best subset of hyper-parameters, and the best score.**"
],
"metadata": {
"id": "yifQfW7-cE0t",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"grid_param = {\"n_estimators\":[10, 20, 30]}\n",
"model = RandomForestClassifier()\n",
"grid = GridSearchCV(model, grid_param, scoring='accuracy', cv=10)\n",
"grid.fit(X,Y)\n",
"\n",
"best_params = grid.best_params_\n",
"best_test_score = grid.cv_results_['mean_test_score'].max()\n",
"\n",
"print(best_params)\n",
"print(best_test_score)\n"
],
"metadata": {
"id": "t7BtcV33i-bR",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": 66,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'n_estimators': 20}\n",
"0.7648275862068967\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"**Scaling the variables sometimes helps the classifier in its predictions. Check the class StandardScaler() and create a pipeline (using the method 'make_pipeline') that combines both the scaler and a RandomForestClassifier(). Be careful, the scaler must only be applied to continuous variables. To this end, check the class ColumnTransformer(), and how it can be used to apply a scaler to only a set of specified columns. Use this pipeline to perform the same grid search as before. Repeat the experiment using a RobustScaler().**"
],
"metadata": {
"id": "piIwQ3VMdWdQ",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"ct = ColumnTransformer([[\"test\",StandardScaler(),df_new.select_dtypes(include=['float']).columns]])\n",
"print(ct.fit_transform(df_new))"
],
"metadata": {
"id": "23Wm9KB5KWTe",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": 70,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.78947368 0. 0.69822485 ... 0. 0.\n",
" 1. ]\n",
" [-0.26315789 0. 16.36040882 ... 0. 0.\n",
" 1. ]\n",
" [ 0.26315789 0. -0.23991393 ... 0. 1.\n",
" 1. ]\n",
" ...\n",
" [-0.26315789 0. 3.36202259 ... -1. 0.\n",
" 0. ]\n",
" [-0.78947368 0. 4.63797741 ... 0. 1.\n",
" 0. ]\n",
" [-0.52631579 0. -0.13232921 ... 0. 1.\n",
" 0. ]]\n"
]
}
]
},
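{
"cell_type": "code",
"source": [
"# Sketch (not part of the original notebook): one possible way to combine the\n",
"# column-wise scaler with a RandomForestClassifier via make_pipeline and rerun\n",
"# the same grid search. Note that inside a pipeline, hyper-parameter names are\n",
"# prefixed with the (lowercased) step name, e.g. 'randomforestclassifier__'.\n",
"cont_cols = df_new.select_dtypes(include=['float']).columns\n",
"\n",
"for scaler in [StandardScaler(), RobustScaler()]:\n",
"    ct = ColumnTransformer([('scale', scaler, cont_cols)], remainder='passthrough')\n",
"    pipe = make_pipeline(ct, RandomForestClassifier())\n",
"    grid = GridSearchCV(pipe, {'randomforestclassifier__n_estimators': [10, 20, 30]},\n",
"                        scoring='accuracy', cv=10)\n",
"    grid.fit(X, Y)\n",
"    print(type(scaler).__name__, grid.best_params_, grid.best_score_)\n"
],
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},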
{
"cell_type": "markdown",
"source": [
"**Let's now see which model amongst a logistic regression, a random forest, a SVC and a multi-layer perceptron performs the best on predicting heart failure. For each model, perform a grid search cross-validation on a selected subset of hyper-parameters, and report the best model amongst the above. Check the documentation of LogisticRegression(), RandomForestClassifier(), SVC() and MLPClassifier() to choose the hyper-parameters. Use the same pipeline as before, and evaluate on the accuracy using 10-fold cross-validation. Report the best model and the best score.**"
],
"metadata": {
"id": "wItdjyS4f24T",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n"
],
"metadata": {
"id": "VQGyZpj797-p",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},
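{
"cell_type": "code",
"source": [
"# Sketch (not part of the original notebook): small, illustrative hyper-parameter\n",
"# grids per model; any reasonable ranges taken from the documentation would do.\n",
"ct = ColumnTransformer([('scale', StandardScaler(), df_new.select_dtypes(include=['float']).columns)],\n",
"                       remainder='passthrough')\n",
"candidates = [\n",
"    (LogisticRegression(max_iter=1000), {'logisticregression__C': [0.1, 1, 10]}),\n",
"    (RandomForestClassifier(), {'randomforestclassifier__n_estimators': [10, 20, 30]}),\n",
"    (SVC(), {'svc__C': [0.1, 1, 10]}),\n",
"    (MLPClassifier(max_iter=1000), {'mlpclassifier__hidden_layer_sizes': [(10,), (50,)]}),\n",
"]\n",
"\n",
"best_model, best_score = None, -np.inf\n",
"for clf, params in candidates:\n",
"    grid = GridSearchCV(make_pipeline(ct, clf), params, scoring='accuracy', cv=10)\n",
"    grid.fit(X, Y)\n",
"    print(type(clf).__name__, grid.best_params_, grid.best_score_)\n",
"    if grid.best_score_ > best_score:\n",
"        best_model, best_score = grid.best_estimator_, grid.best_score_\n",
"\n",
"print('Best:', best_model, best_score)\n"
],
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},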
{
"cell_type": "markdown",
"source": [
"**As you might have noticed, grid search can quickly take very long to compute when the hyper-parameter space on which to search becomes large. Another approach, that trades precision in the solution for a lower runtime, is called Random Search. In a Random Search, only a defined number of hyper-parameter subsets are selected randomly and used to fit the model, which considerably speeds up the procedure. Perform the same experiment as above, but use the class RandomizedSearchCV() this time. Set the number of subsets to try to 5.**"
],
"metadata": {
"id": "1G5RUgPhhL8s",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n"
],
"metadata": {
"id": "5sVKQJVqbaGA",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},
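{
"cell_type": "code",
"source": [
"# Sketch (not part of the original notebook): same pipeline search as above, but\n",
"# RandomizedSearchCV draws only n_iter=5 hyper-parameter subsets at random from\n",
"# the given lists, instead of trying every combination.\n",
"ct = ColumnTransformer([('scale', StandardScaler(), df_new.select_dtypes(include=['float']).columns)],\n",
"                       remainder='passthrough')\n",
"pipe = make_pipeline(ct, RandomForestClassifier())\n",
"param_dist = {'randomforestclassifier__n_estimators': list(range(10, 200, 10)),\n",
"              'randomforestclassifier__max_depth': [None, 3, 5, 10]}\n",
"\n",
"rand = RandomizedSearchCV(pipe, param_dist, n_iter=5, scoring='accuracy', cv=10)\n",
"rand.fit(X, Y)\n",
"print(rand.best_params_)\n",
"print(rand.best_score_)\n"
],
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},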
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "CQX7lUwSnLW3",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"**Using the best model found above, report the precision, the recall, the accuracy, the F1 score and the area under the ROC curve. Use 10-fold cross-validation, and report the means of these metrics across all folds for the training and test folds.**"
],
"metadata": {
"id": "ItK-qfL4i5NM",
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "h8dxh_1ykdUw",
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
}
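,
{
"cell_type": "code",
"source": [
"# Sketch (not part of the original notebook): cross_validate accepts a list of\n",
"# scorer names, so all five metrics can be computed in one pass. Substitute the\n",
"# best model found above for `best_model`; LogisticRegression is only a\n",
"# stand-in here so the cell runs on its own.\n",
"best_model = LogisticRegression(max_iter=1000)\n",
"\n",
"scoring = ['precision', 'recall', 'accuracy', 'f1', 'roc_auc']\n",
"cv_results = cross_validate(best_model, X, Y, cv=10, scoring=scoring,\n",
"                            return_train_score=True)\n",
"for metric in scoring:\n",
"    print(metric,\n",
"          'train:', np.mean(cv_results['train_' + metric]),\n",
"          'test:', np.mean(cv_results['test_' + metric]))\n"
],
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
}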
]
}