kaggle-competitions/competitions/identify-age-related-conditions/train_models/training.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "from xgboost import XGBClassifier\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.feature_selection import mutual_info_regression\n",
    "from sklearn.discriminant_analysis import StandardScaler\n",
    "import numpy as np\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selection import train_test_split, ParameterGrid\n",
    "from keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler\n",
    "from keras.models import Model\n",
    "from keras.layers import Activation, Dense, LSTM, Input\n",
    "from keras.optimizers import Adam, RMSprop, SGD\n",
    "from scikeras.wrappers import KerasClassifier\n",
    "import tensorflow as tf\n",
    "from os import path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(columns_drop) -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
    "    train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n",
    "    greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n",
    "    test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n",
    "    columns_drop = ['Id'] + columns_drop\n",
    "    id_list = test[\"Id\"]\n",
    "    train.drop(columns_drop, inplace=True, axis=1)\n",
    "    test.drop(columns_drop, inplace=True, axis=1)\n",
    "    print(len(train.columns))\n",
    "    return (train, greeks, test, id_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to split the data in validation and train set randomly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
    "    X = df.loc[:, df.columns != \"Class\"]\n",
    "    y = df.loc[:, \"Class\"]\n",
    "    return train_test_split(X, y, test_size=split, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to build a Tensorflow model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_tensorflow_model(input_shape:int, output_shape:int, units1: int, units2: int, units3: int, activation1: str, \n",
    "                activation2: str, activation3: str, optimizer: tf.keras.optimizers.Optimizer, learning_rate: float) -> Model:\n",
    "    input = Input(shape=input_shape)\n",
    "    x = Dense(units=units1, activation=activation1)(input)\n",
    "    x = Dense(units=units2, activation=activation2)(x)\n",
    "    x = Dense(units=units3, activation=activation3)(x)\n",
    "    output = Dense(units=output_shape, activation=\"softmax\")(x)\n",
    "    model = Model(inputs=[input], outputs=[output])\n",
    "    \n",
    "    model.compile(loss=\"categorical_crossentropy\",\n",
    "              optimizer=optimizer(learning_rate=learning_rate),\n",
    "              metrics=[\"accuracy\"])   \n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to plot the accuracy of the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_acc_tf_model(history:Model):\n",
    "    # summarize history for accuracy\n",
    "    plt.plot(history.history['accuracy'])\n",
    "    plt.plot(history.history['val_accuracy'])\n",
    "    plt.title('model accuracy')\n",
    "    plt.ylabel('accuracy')\n",
    "    plt.xlabel('epoch')\n",
    "    plt.legend(['Train', 'Validation'], loc='upper left')\n",
    "    plt.show()\n",
    "    \n",
    "    # summarize history for loss\n",
    "    plt.plot(history.history['loss'])\n",
    "    plt.plot(history.history['val_loss'])\n",
    "    plt.title('model loss')\n",
    "    plt.ylabel('loss')\n",
    "    plt.xlabel('epoch')\n",
    "    plt.legend(['Train', 'Validation'], loc='upper left')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to fit the Tensorflow model with ES Callback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "es_callback = EarlyStopping(\n",
    "        monitor=\"val_accuracy\",\n",
    "        patience=5,\n",
    "        verbose=1,\n",
    "        restore_best_weights=True,\n",
    "        min_delta=0.005\n",
    "    )\n",
    " \n",
    "def fit_model(model: Model, x: np.ndarray, y: np.ndarray, epochs: int, split: float) -> Model:\n",
    "    #split train and validation\n",
    "    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=split, random_state=42)\n",
    "    #fit the model\n",
    "    history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val,y_val), callbacks=[es_callback])\n",
    "    return history "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method for GridSearch of Tensorflow model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid_params = {\"units1\": [8,16], \"units2\": [16,32], \"units3\": [32,64], \"activation1\": [\"relu\"], \"activation2\": [\"relu\"], \n",
    "          \"activation3\": [\"relu\"], \"optimizer\": [Adam], \"learning_rate\": [0.001]}\n",
    "\n",
    "#GridSearch\n",
    "def grid_search_tf_model(X_train: pd.DataFrame, y_train: pd.DataFrame)->Model:\n",
    "    grid = ParameterGrid(param_grid = grid_params)\n",
    "    results = []\n",
    "    input_shape = len(X_train[1])\n",
    "    output_shape = 2\n",
    "    for idx,params in enumerate(grid):\n",
    "        model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **params)\n",
    "        history = fit_model(model, X_train, y_train, 100, 0.2)\n",
    "        val_loss = history.history['val_loss'][-1]    \n",
    "        val_acc = history.history['val_accuracy'][-1]\n",
    "        results.append([val_loss, val_acc])\n",
    "        \n",
    "    val_accuracies = [i[1] for i in results]\n",
    "    val_losses= [i[0] for i in results]\n",
    "    best_acc = val_accuracies.index(max(val_accuracies))\n",
    "    best_loss = val_losses.index(min(val_losses))\n",
    "    print(f\"best acc at index {best_acc}: {max(val_accuracies)}\")\n",
    "    print(f\"best loss at index {best_loss}: {min(val_losses)}\")\n",
    "    print(grid[best_acc])\n",
    "    \n",
    "    model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **grid[best_acc])\n",
    "    history = fit_model(model, X_train, y_train, 100, 0.2)\n",
    "    #plot_acc_tf_model(history)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to build a preprocessing pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_preprocessing_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
    "    # Preprocessing for numerical data    \n",
    "    numerical_transformer = Pipeline(steps=[\n",
    "        ('imputer',SimpleImputer(strategy='constant')),\n",
    "        ('scaler', StandardScaler())])\n",
    "    \n",
    "    # Preprocessing for categorical data\n",
    "    categorical_transformer = Pipeline(steps=[\n",
    "        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
    "        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
    "    ])\n",
    "\n",
    "    # Bundle preprocessing for numerical and categorical data\n",
    "    numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
    "    categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            ('num', numerical_transformer, numerical_cols),\n",
    "            ('cat', categorical_transformer, categorical_cols)\n",
    "        ])\n",
    "    print(f\"Number of columns: {len(df.columns)}\")\n",
    "    return preprocessor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method to generate the Mutual Info scores and plot them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def make_mi_scores(X: pd.DataFrame, y: pd.DataFrame, discrete_features: list):\n",
    "    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)\n",
    "    mi_scores = pd.Series(mi_scores, name=\"MI Scores\", index=X.columns)\n",
    "    mi_scores = mi_scores.sort_values(ascending=False)\n",
    "    return mi_scores\n",
    "\n",
    "def plot_mi_scores(scores):\n",
    "    scores = scores.sort_values(ascending=True)\n",
    "    width = np.arange(len(scores))\n",
    "    ticks = list(scores.index)\n",
    "    plt.figure(dpi=100, figsize=(16, 16))\n",
    "    plt.barh(width, scores)\n",
    "    plt.yticks(width, ticks)\n",
    "    plt.title(\"Mutual Information Scores\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display MI scores that are beneath 0.01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "57\n",
      "dataset shape: (617, 57)\n",
      "X shape: (431, 56) and y shape: (431,)\n",
      " Columns with MI equal zero: ['DN', 'CW ', 'CC', 'EG', 'CU', 'AH', 'CL', 'DF', 'CD ', 'GE', 'GB', 'FS', 'DE', 'FI', 'BD ', 'CB', 'FD ', 'DY', 'AY', 'EP', 'AZ', 'EJ', 'CF'] --> total length: 23\n"
     ]
    }
   ],
   "source": [
    "#get the mutual information of features\n",
    "train, greeks, test, id_list = load_dataset(columns_drop=[])\n",
    "print(f\"dataset shape: {train.shape}\")\n",
    "X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
    "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
    "\n",
    "cols = list(X_train.columns)\n",
    "cols.append(\"1\")\n",
    "X_transformed = X_train.fillna(0)\n",
    "X_transformed[\"EJ\"].replace(['A', 'B'], [0, 1], inplace=True)\n",
    "discrete_features = X_transformed.dtypes == int\n",
    "mi_scores = make_mi_scores(X_transformed, y_train, discrete_features)\n",
    "bad_scores = list(mi_scores.index[i] for i, score in zip(range(len(mi_scores)),mi_scores) if score < 0.01)\n",
    "#plot_mi_scores(mi_scores)\n",
    "print(f\" Columns with MI equal zero: {bad_scores} --> total length: {len(bad_scores)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a model and preprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "34\n",
      "dataset shape: (617, 34)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns: 33\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.5836 - accuracy: 0.8372 - val_loss: 0.4572 - val_accuracy: 0.8391\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5413 - accuracy: 0.8372 - val_loss: 0.4360 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5107 - accuracy: 0.8372 - val_loss: 0.4228 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4856 - accuracy: 0.8372 - val_loss: 0.4138 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4655 - accuracy: 0.8372 - val_loss: 0.4073 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.3383 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4464 - accuracy: 0.8372 - val_loss: 0.4030 - val_accuracy: 0.8391\n",
      "Epoch 6: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.6535 - accuracy: 0.7297 - val_loss: 0.5649 - val_accuracy: 0.8391\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.5767 - accuracy: 0.8372 - val_loss: 0.4984 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.5286 - accuracy: 0.8372 - val_loss: 0.4690 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4883 - accuracy: 0.8372 - val_loss: 0.4536 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4519 - accuracy: 0.8372 - val_loss: 0.4419 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.3956 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4211 - accuracy: 0.8372 - val_loss: 0.4367 - val_accuracy: 0.8391\n",
      "Epoch 6: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 16ms/step - loss: 0.7554 - accuracy: 0.4244 - val_loss: 0.6338 - val_accuracy: 0.6667\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.5835 - accuracy: 0.7733 - val_loss: 0.5112 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5046 - accuracy: 0.8372 - val_loss: 0.4532 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4713 - accuracy: 0.8372 - val_loss: 0.4223 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4393 - accuracy: 0.8372 - val_loss: 0.4078 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4144 - accuracy: 0.8372 - val_loss: 0.3960 - val_accuracy: 0.8391\n",
      "Epoch 7/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.3674 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 2.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.3898 - accuracy: 0.8401 - val_loss: 0.3847 - val_accuracy: 0.8391\n",
      "Epoch 7: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.6324 - accuracy: 0.8227 - val_loss: 0.5557 - val_accuracy: 0.8391\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5711 - accuracy: 0.8372 - val_loss: 0.5040 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5245 - accuracy: 0.8372 - val_loss: 0.4676 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4838 - accuracy: 0.8372 - val_loss: 0.4399 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4494 - accuracy: 0.8372 - val_loss: 0.4188 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.3685 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 1.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4130 - accuracy: 0.8372 - val_loss: 0.4003 - val_accuracy: 0.8391\n",
      "Epoch 6: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.8912 - accuracy: 0.1860 - val_loss: 0.7869 - val_accuracy: 0.2644\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.7306 - accuracy: 0.4070 - val_loss: 0.6774 - val_accuracy: 0.6207\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.6506 - accuracy: 0.7936 - val_loss: 0.6167 - val_accuracy: 0.8506\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.6128 - accuracy: 0.8401 - val_loss: 0.5752 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5760 - accuracy: 0.8401 - val_loss: 0.5514 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5444 - accuracy: 0.8401 - val_loss: 0.5272 - val_accuracy: 0.8391\n",
      "Epoch 7/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.5119 - accuracy: 0.8459 - val_loss: 0.4959 - val_accuracy: 0.8391\n",
      "Epoch 8/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.4723 - accuracy: 0.9062Restoring model weights from the end of the best epoch: 3.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4759 - accuracy: 0.8488 - val_loss: 0.4645 - val_accuracy: 0.8391\n",
      "Epoch 8: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 16ms/step - loss: 0.7030 - accuracy: 0.5262 - val_loss: 0.6235 - val_accuracy: 0.7701\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.6088 - accuracy: 0.8285 - val_loss: 0.5432 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5630 - accuracy: 0.8372 - val_loss: 0.4939 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5231 - accuracy: 0.8372 - val_loss: 0.4641 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4851 - accuracy: 0.8372 - val_loss: 0.4443 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4476 - accuracy: 0.8372 - val_loss: 0.4190 - val_accuracy: 0.8391\n",
      "Epoch 7/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.4761 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 2.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4080 - accuracy: 0.8401 - val_loss: 0.3984 - val_accuracy: 0.8391\n",
      "Epoch 7: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.8484 - accuracy: 0.2122 - val_loss: 0.7485 - val_accuracy: 0.3218\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.7046 - accuracy: 0.5000 - val_loss: 0.6538 - val_accuracy: 0.7471\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.6363 - accuracy: 0.7936 - val_loss: 0.5938 - val_accuracy: 0.8276\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5827 - accuracy: 0.8372 - val_loss: 0.5511 - val_accuracy: 0.8506\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5353 - accuracy: 0.8401 - val_loss: 0.5105 - val_accuracy: 0.8506\n",
      "Epoch 6/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4874 - accuracy: 0.8430 - val_loss: 0.4787 - val_accuracy: 0.8506\n",
      "Epoch 7/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4375 - accuracy: 0.8459 - val_loss: 0.4518 - val_accuracy: 0.8506\n",
      "Epoch 8/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.3944 - accuracy: 0.8488 - val_loss: 0.4320 - val_accuracy: 0.8506\n",
      "Epoch 9/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.4943 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 4.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8547 - val_loss: 0.4221 - val_accuracy: 0.8506\n",
      "Epoch 9: early stopping\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.6726 - accuracy: 0.6483 - val_loss: 0.5800 - val_accuracy: 0.8391\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5425 - accuracy: 0.8430 - val_loss: 0.5210 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4813 - accuracy: 0.8430 - val_loss: 0.5047 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4377 - accuracy: 0.8517 - val_loss: 0.4948 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.3982 - accuracy: 0.8547 - val_loss: 0.4875 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.3590 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.3683 - accuracy: 0.8663 - val_loss: 0.4850 - val_accuracy: 0.8391\n",
      "Epoch 6: early stopping\n",
      "best acc at index 6: 0.8505747318267822\n",
      "best loss at index 2: 0.3847053050994873\n",
      "{'units3': 32, 'units2': 32, 'units1': 16, 'optimizer': <class 'keras.optimizers.legacy.adam.Adam'>, 'learning_rate': 0.001, 'activation3': 'relu', 'activation2': 'relu', 'activation1': 'relu'}\n",
      "Epoch 1/100\n",
      "11/11 [==============================] - 0s 15ms/step - loss: 0.6807 - accuracy: 0.8372 - val_loss: 0.5413 - val_accuracy: 0.8391\n",
      "Epoch 2/100\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.6144 - accuracy: 0.8372 - val_loss: 0.5060 - val_accuracy: 0.8391\n",
      "Epoch 3/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5689 - accuracy: 0.8372 - val_loss: 0.4795 - val_accuracy: 0.8391\n",
      "Epoch 4/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.5273 - accuracy: 0.8372 - val_loss: 0.4565 - val_accuracy: 0.8391\n",
      "Epoch 5/100\n",
      "11/11 [==============================] - 0s 3ms/step - loss: 0.4915 - accuracy: 0.8372 - val_loss: 0.4387 - val_accuracy: 0.8391\n",
      "Epoch 6/100\n",
      " 1/11 [=>............................] - ETA: 0s - loss: 0.5601 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 1.\n",
      "11/11 [==============================] - 0s 4ms/step - loss: 0.4573 - accuracy: 0.8401 - val_loss: 0.4216 - val_accuracy: 0.8391\n",
      "Epoch 6: early stopping\n"
     ]
    }
   ],
   "source": [
    "#Newly load the dataset with columns drop and create preprocessing pipeline\n",
    "train, greeks, test, id_list = load_dataset(columns_drop=bad_scores)\n",
    "print(f\"dataset shape: {train.shape}\")\n",
    "X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
    "preprocessor = build_preprocessing_pipeline(X_train)\n",
    "\n",
    "\n",
    "# Define model for the pipeline\n",
    "#model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
    "#model = XGBClassifier(n_estimators=500)\n",
    "preprocessor.fit(X_train)\n",
    "X_preprocessed = preprocessor.transform(X_train)\n",
    "y_train_ohe = pd.get_dummies(y_train, columns = ['Class'])\n",
    "y_valid_ohe = pd.get_dummies(y_valid, columns = ['Class'])\n",
    "model = KerasClassifier(model=grid_search_tf_model(X_train=X_preprocessed, y_train=y_train_ohe), epochs=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build the final pipeline with preprocessor and model, fit it and display accuracy score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Building Pipeline\n",
    "def fit_pipeline(X_train: pd.DataFrame, y_train: pd.DataFrame, preprocessor, model):\n",
    "    pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                            ('model', model)\n",
    "                            ])\n",
    "    #Fit the Model and make preds\n",
    "    pipeline.fit(X_train, y_train)\n",
    "    preds = pipeline.predict(X_valid)\n",
    "    \"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
    "    print(f\"Accuracy of {score}\") \"\"\"\n",
    "    print(classification_report(y_train.to_numpy(), preds))\n",
    "    correct_answers = 0\n",
    "    for y_pred,y_true in zip(preds,y_valid_ohe.to_numpy()):\n",
    "        if(y_pred[0] == y_true[0]):correct_answers+=1\n",
    "    print(correct_answers/len(preds))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Print results of the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6/6 [==============================] - 0s 1ms/step\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Found input variables with inconsistent numbers of samples: [431, 186]",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[171], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m fit_pipeline(X_train, y_train_ohe, preprocessor, model)\n",
      "Cell \u001b[1;32mIn[170], line 11\u001b[0m, in \u001b[0;36mfit_pipeline\u001b[1;34m(X_train, y_train, preprocessor, model)\u001b[0m\n\u001b[0;32m      8\u001b[0m preds \u001b[39m=\u001b[39m pipeline\u001b[39m.\u001b[39mpredict(X_valid)\n\u001b[0;32m      9\u001b[0m \u001b[39m\"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\u001b[39;00m\n\u001b[0;32m     10\u001b[0m \u001b[39mprint(f\"Accuracy of {score}\") \"\"\"\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m \u001b[39mprint\u001b[39m(classification_report(y_train\u001b[39m.\u001b[39;49mto_numpy(), preds))\n\u001b[0;32m     12\u001b[0m correct_answers \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m     13\u001b[0m \u001b[39mfor\u001b[39;00m y_pred,y_true \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(preds,y_valid_ohe\u001b[39m.\u001b[39mto_numpy()):\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:2310\u001b[0m, in \u001b[0;36mclassification_report\u001b[1;34m(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)\u001b[0m\n\u001b[0;32m   2195\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclassification_report\u001b[39m(\n\u001b[0;32m   2196\u001b[0m     y_true,\n\u001b[0;32m   2197\u001b[0m     y_pred,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   2204\u001b[0m     zero_division\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mwarn\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m   2205\u001b[0m ):\n\u001b[0;32m   2206\u001b[0m     \u001b[39m\"\"\"Build a text report showing the main classification metrics.\u001b[39;00m\n\u001b[0;32m   2207\u001b[0m \n\u001b[0;32m   2208\u001b[0m \u001b[39m    Read more in the :ref:`User Guide <classification_report>`.\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   2307\u001b[0m \u001b[39m    <BLANKLINE>\u001b[39;00m\n\u001b[0;32m   2308\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2310\u001b[0m     y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m   2312\u001b[0m     \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m   2313\u001b[0m         labels \u001b[39m=\u001b[39m unique_labels(y_true, y_pred)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:86\u001b[0m, in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m     59\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_check_targets\u001b[39m(y_true, y_pred):\n\u001b[0;32m     60\u001b[0m     \u001b[39m\"\"\"Check that y_true and y_pred belong to the same classification task.\u001b[39;00m\n\u001b[0;32m     61\u001b[0m \n\u001b[0;32m     62\u001b[0m \u001b[39m    This converts multiclass or binary types to a common shape, and raises a\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     84\u001b[0m \u001b[39m    y_pred : array or indicator matrix\u001b[39;00m\n\u001b[0;32m     85\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m---> 86\u001b[0m     check_consistent_length(y_true, y_pred)\n\u001b[0;32m     87\u001b[0m     type_true \u001b[39m=\u001b[39m type_of_target(y_true, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m     88\u001b[0m     type_pred \u001b[39m=\u001b[39m type_of_target(y_pred, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_pred\u001b[39m\u001b[39m\"\u001b[39m)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:397\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[1;34m(*arrays)\u001b[0m\n\u001b[0;32m    395\u001b[0m uniques \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39munique(lengths)\n\u001b[0;32m    396\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(uniques) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m--> 397\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m    398\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m    399\u001b[0m         \u001b[39m%\u001b[39m [\u001b[39mint\u001b[39m(l) \u001b[39mfor\u001b[39;00m l \u001b[39min\u001b[39;00m lengths]\n\u001b[0;32m    400\u001b[0m     )\n",
      "\u001b[1;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [431, 186]"
     ]
    }
   ],
   "source": [
    "fit_pipeline(X_train, y_train_ohe, preprocessor, model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' submission = pd.DataFrame()\\nprediction = model.predict(x_test)\\nsubmission.insert(0, \"Id\", id_number, False)\\nsubmission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\\nsubmission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\\nsubmission.to_csv(\"/kaggle/working/submission.csv\",index = False) '"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\" submission = pd.DataFrame()\n",
    "prediction = model.predict(x_test)\n",
    "submission.insert(0, \"Id\", id_number, False)\n",
    "submission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\n",
    "submission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\n",
    "submission.to_csv(\"/kaggle/working/submission.csv\",index = False) \"\"\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}