This commit is contained in:
yann22ahlgrim
2023-07-19 13:30:26 +02:00
parent ee4ef23e06
commit 0e342529ec
3 changed files with 633 additions and 1692 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,633 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.metrics import accuracy_score\n",
"from xgboost import XGBClassifier\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.feature_selection import mutual_info_regression\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"import numpy as np\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import train_test_split, ParameterGrid\n",
"from keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler\n",
"from keras.models import Model\n",
"from keras.layers import Activation, Dense, LSTM, Input\n",
"from keras.optimizers import Adam, RMSprop, SGD\n",
"from scikeras.wrappers import KerasClassifier\n",
"import tensorflow as tf\n",
"from os import path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to load the dataset"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"def load_dataset(columns_drop: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
"    \"\"\"Load the train/greeks/test csv files and drop unwanted columns.\n",
"\n",
"    columns_drop: feature columns to remove (the 'Id' column is always dropped).\n",
"    Returns (train, greeks, test, id_list) where id_list holds the test ids.\n",
"    \"\"\"\n",
"    # NOTE(review): absolute local path -- consider making this configurable\n",
"    data_dir = \"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\"\n",
"    train = pd.read_csv(path.join(data_dir, \"train.csv\"))\n",
"    greeks = pd.read_csv(path.join(data_dir, \"greeks.csv\"))\n",
"    test = pd.read_csv(path.join(data_dir, \"test.csv\"))\n",
"    columns_drop = ['Id'] + columns_drop\n",
"    id_list = test[\"Id\"]\n",
"    train.drop(columns_drop, inplace=True, axis=1)\n",
"    test.drop(columns_drop, inplace=True, axis=1)\n",
"    print(len(train.columns))\n",
"    return (train, greeks, test, id_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to randomly split the data into train and validation sets"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
"    \"\"\"Split df into train/validation features and 'Class' targets (random_state=42).\"\"\"\n",
"    features = df.drop(columns=\"Class\")\n",
"    target = df[\"Class\"]\n",
"    return train_test_split(features, target, test_size=split, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to build a Tensorflow model"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"def build_tensorflow_model(input_shape:int, output_shape:int, units1: int, units2: int, units3: int, activation1: str, \n",
"                           activation2: str, activation3: str, optimizer: tf.keras.optimizers.Optimizer, learning_rate: float) -> Model:\n",
"    \"\"\"Build and compile a 3-hidden-layer dense classifier with a softmax head.\"\"\"\n",
"    inputs = Input(shape=input_shape)\n",
"    hidden = inputs\n",
"    # stack the three configurable dense layers\n",
"    for units, activation in ((units1, activation1), (units2, activation2), (units3, activation3)):\n",
"        hidden = Dense(units=units, activation=activation)(hidden)\n",
"    outputs = Dense(units=output_shape, activation=\"softmax\")(hidden)\n",
"    model = Model(inputs=[inputs], outputs=[outputs])\n",
"    \n",
"    model.compile(loss=\"categorical_crossentropy\",\n",
"                  optimizer=optimizer(learning_rate=learning_rate),\n",
"                  metrics=[\"accuracy\"]) \n",
"    return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to plot the accuracy of the model"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"def plot_acc_tf_model(history:Model):\n",
"    \"\"\"Plot training vs. validation accuracy and loss curves from a fit history.\"\"\"\n",
"    for metric in (\"accuracy\", \"loss\"):\n",
"        plt.plot(history.history[metric])\n",
"        plt.plot(history.history[f\"val_{metric}\"])\n",
"        plt.title(f\"model {metric}\")\n",
"        plt.ylabel(metric)\n",
"        plt.xlabel(\"epoch\")\n",
"        plt.legend([\"Train\", \"Validation\"], loc=\"upper left\")\n",
"        plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to fit the Tensorflow model with ES Callback"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"# Early stopping: halt once val_accuracy has not improved by >= 0.005 for 5 epochs,\n",
"# then restore the weights of the best epoch.\n",
"es_callback = EarlyStopping(\n",
"    monitor=\"val_accuracy\",\n",
"    patience=5,\n",
"    verbose=1,\n",
"    restore_best_weights=True,\n",
"    min_delta=0.005\n",
"    )\n",
"\n",
"def fit_model(model: Model, x: np.ndarray, y: np.ndarray, epochs: int, split: float) -> Model:\n",
"    \"\"\"Fit `model` on a random train/validation split of (x, y) with early stopping.\"\"\"\n",
"    x_fit, x_holdout, y_fit, y_holdout = train_test_split(x, y, test_size=split, random_state=42)\n",
"    return model.fit(x_fit, y_fit, epochs=epochs, validation_data=(x_holdout, y_holdout), callbacks=[es_callback])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method for GridSearch of Tensorflow model"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"grid_params = {\"units1\": [8,16], \"units2\": [16,32], \"units3\": [32,64], \"activation1\": [\"relu\"], \"activation2\": [\"relu\"], \n",
"               \"activation3\": [\"relu\"], \"optimizer\": [Adam], \"learning_rate\": [0.001]}\n",
"\n",
"#GridSearch: train one model per hyper-parameter combination and keep the best one\n",
"def grid_search_tf_model(X_train: pd.DataFrame, y_train: pd.DataFrame)->Model:\n",
"    \"\"\"Exhaustively search grid_params, then retrain and return the best-accuracy model.\"\"\"\n",
"    grid = ParameterGrid(param_grid = grid_params)\n",
"    results = []\n",
"    # feature count; len(X_train[1]) measured row 1 instead and broke for DataFrames\n",
"    input_shape = X_train.shape[1]\n",
"    output_shape = 2\n",
"    for params in grid:\n",
"        model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **params)\n",
"        history = fit_model(model, X_train, y_train, 100, 0.2)\n",
"        # keep the final-epoch validation metrics for this parameter set\n",
"        results.append([history.history['val_loss'][-1], history.history['val_accuracy'][-1]])\n",
"    \n",
"    val_accuracies = [acc for _, acc in results]\n",
"    val_losses = [loss for loss, _ in results]\n",
"    best_acc = val_accuracies.index(max(val_accuracies))\n",
"    best_loss = val_losses.index(min(val_losses))\n",
"    print(f\"best acc at index {best_acc}: {max(val_accuracies)}\")\n",
"    print(f\"best loss at index {best_loss}: {min(val_losses)}\")\n",
"    print(grid[best_acc])\n",
"    \n",
"    # retrain a fresh model with the winning hyper-parameters\n",
"    model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **grid[best_acc])\n",
"    history = fit_model(model, X_train, y_train, 100, 0.2)\n",
"    #plot_acc_tf_model(history)\n",
"    return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to build a preprocessing pipeline"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"def build_preprocessing_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
"    \"\"\"Build a ColumnTransformer: impute+scale numeric columns, impute+one-hot categoricals.\n",
"\n",
"    Fix: columns are now assigned to exactly one transformer. Previously every\n",
"    numeric column with fewer than 10 unique values was also listed as\n",
"    categorical and therefore duplicated in the transformed output.\n",
"    \"\"\"\n",
"    # Preprocessing for numerical data \n",
"    numerical_transformer = Pipeline(steps=[\n",
"        ('imputer',SimpleImputer(strategy='constant')),\n",
"        ('scaler', StandardScaler())])\n",
"    \n",
"    # Preprocessing for categorical data\n",
"    categorical_transformer = Pipeline(steps=[\n",
"        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
"        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
"    ])\n",
"\n",
"    # Bundle preprocessing for numerical and categorical data (disjoint column sets)\n",
"    numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
"    categorical_cols = [cname for cname in df.columns\n",
"                        if cname not in numerical_cols and df[cname].nunique() < 10]\n",
"    preprocessor = ColumnTransformer(\n",
"        transformers=[\n",
"            ('num', numerical_transformer, numerical_cols),\n",
"            ('cat', categorical_transformer, categorical_cols)\n",
"        ])\n",
"    print(f\"Number of columns: {len(df.columns)}\")\n",
"    return preprocessor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to generate the Mutual Info scores and plot them"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def make_mi_scores(X: pd.DataFrame, y: pd.DataFrame, discrete_features: list):\n",
"    \"\"\"Return the mutual-information score of each feature in X against y, best first.\"\"\"\n",
"    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)\n",
"    return pd.Series(raw_scores, name=\"MI Scores\", index=X.columns).sort_values(ascending=False)\n",
"\n",
"def plot_mi_scores(scores):\n",
"    \"\"\"Horizontal bar chart of MI scores, largest at the top.\"\"\"\n",
"    ordered = scores.sort_values(ascending=True)\n",
"    positions = np.arange(len(ordered))\n",
"    plt.figure(dpi=100, figsize=(16, 16))\n",
"    plt.barh(positions, ordered)\n",
"    plt.yticks(positions, list(ordered.index))\n",
"    plt.title(\"Mutual Information Scores\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Display the features whose MI scores fall below 0.01"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"57\n",
"dataset shape: (617, 57)\n",
"X shape: (431, 56) and y shape: (431,)\n",
" Columns with MI equal zero: ['DN', 'CW ', 'CC', 'EG', 'CU', 'AH', 'CL', 'DF', 'CD ', 'GE', 'GB', 'FS', 'DE', 'FI', 'BD ', 'CB', 'FD ', 'DY', 'AY', 'EP', 'AZ', 'EJ', 'CF'] --> total length: 23\n"
]
}
],
"source": [
"#get the mutual information of features\n",
"train, greeks, test, id_list = load_dataset(columns_drop=[])\n",
"print(f\"dataset shape: {train.shape}\")\n",
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
"\n",
"X_transformed = X_train.fillna(0)\n",
"# encode the only categorical feature; assignment instead of inplace-replace on a column view\n",
"X_transformed[\"EJ\"] = X_transformed[\"EJ\"].replace(['A', 'B'], [0, 1])\n",
"discrete_features = X_transformed.dtypes == int\n",
"mi_scores = make_mi_scores(X_transformed, y_train, discrete_features)\n",
"# features with (near-)zero mutual information carry no signal and are dropped later\n",
"bad_scores = [col for col, score in mi_scores.items() if score < 0.01]\n",
"#plot_mi_scores(mi_scores)\n",
"print(f\" Columns with MI equal zero: {bad_scores} --> total length: {len(bad_scores)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a model and preprocessor"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"34\n",
"dataset shape: (617, 34)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns: 33\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.5836 - accuracy: 0.8372 - val_loss: 0.4572 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5413 - accuracy: 0.8372 - val_loss: 0.4360 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5107 - accuracy: 0.8372 - val_loss: 0.4228 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4856 - accuracy: 0.8372 - val_loss: 0.4138 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4655 - accuracy: 0.8372 - val_loss: 0.4073 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3383 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4464 - accuracy: 0.8372 - val_loss: 0.4030 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6535 - accuracy: 0.7297 - val_loss: 0.5649 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5767 - accuracy: 0.8372 - val_loss: 0.4984 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5286 - accuracy: 0.8372 - val_loss: 0.4690 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4883 - accuracy: 0.8372 - val_loss: 0.4536 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4519 - accuracy: 0.8372 - val_loss: 0.4419 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3956 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4211 - accuracy: 0.8372 - val_loss: 0.4367 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 16ms/step - loss: 0.7554 - accuracy: 0.4244 - val_loss: 0.6338 - val_accuracy: 0.6667\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5835 - accuracy: 0.7733 - val_loss: 0.5112 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5046 - accuracy: 0.8372 - val_loss: 0.4532 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4713 - accuracy: 0.8372 - val_loss: 0.4223 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4393 - accuracy: 0.8372 - val_loss: 0.4078 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4144 - accuracy: 0.8372 - val_loss: 0.3960 - val_accuracy: 0.8391\n",
"Epoch 7/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3674 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 2.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3898 - accuracy: 0.8401 - val_loss: 0.3847 - val_accuracy: 0.8391\n",
"Epoch 7: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6324 - accuracy: 0.8227 - val_loss: 0.5557 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5711 - accuracy: 0.8372 - val_loss: 0.5040 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5245 - accuracy: 0.8372 - val_loss: 0.4676 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4838 - accuracy: 0.8372 - val_loss: 0.4399 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4494 - accuracy: 0.8372 - val_loss: 0.4188 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3685 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4130 - accuracy: 0.8372 - val_loss: 0.4003 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.8912 - accuracy: 0.1860 - val_loss: 0.7869 - val_accuracy: 0.2644\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.7306 - accuracy: 0.4070 - val_loss: 0.6774 - val_accuracy: 0.6207\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6506 - accuracy: 0.7936 - val_loss: 0.6167 - val_accuracy: 0.8506\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.6128 - accuracy: 0.8401 - val_loss: 0.5752 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5760 - accuracy: 0.8401 - val_loss: 0.5514 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5444 - accuracy: 0.8401 - val_loss: 0.5272 - val_accuracy: 0.8391\n",
"Epoch 7/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5119 - accuracy: 0.8459 - val_loss: 0.4959 - val_accuracy: 0.8391\n",
"Epoch 8/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.4723 - accuracy: 0.9062Restoring model weights from the end of the best epoch: 3.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4759 - accuracy: 0.8488 - val_loss: 0.4645 - val_accuracy: 0.8391\n",
"Epoch 8: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 16ms/step - loss: 0.7030 - accuracy: 0.5262 - val_loss: 0.6235 - val_accuracy: 0.7701\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6088 - accuracy: 0.8285 - val_loss: 0.5432 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5630 - accuracy: 0.8372 - val_loss: 0.4939 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5231 - accuracy: 0.8372 - val_loss: 0.4641 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4851 - accuracy: 0.8372 - val_loss: 0.4443 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4476 - accuracy: 0.8372 - val_loss: 0.4190 - val_accuracy: 0.8391\n",
"Epoch 7/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.4761 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 2.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4080 - accuracy: 0.8401 - val_loss: 0.3984 - val_accuracy: 0.8391\n",
"Epoch 7: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.8484 - accuracy: 0.2122 - val_loss: 0.7485 - val_accuracy: 0.3218\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.7046 - accuracy: 0.5000 - val_loss: 0.6538 - val_accuracy: 0.7471\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6363 - accuracy: 0.7936 - val_loss: 0.5938 - val_accuracy: 0.8276\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5827 - accuracy: 0.8372 - val_loss: 0.5511 - val_accuracy: 0.8506\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5353 - accuracy: 0.8401 - val_loss: 0.5105 - val_accuracy: 0.8506\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4874 - accuracy: 0.8430 - val_loss: 0.4787 - val_accuracy: 0.8506\n",
"Epoch 7/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4375 - accuracy: 0.8459 - val_loss: 0.4518 - val_accuracy: 0.8506\n",
"Epoch 8/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.3944 - accuracy: 0.8488 - val_loss: 0.4320 - val_accuracy: 0.8506\n",
"Epoch 9/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.4943 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 4.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8547 - val_loss: 0.4221 - val_accuracy: 0.8506\n",
"Epoch 9: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6726 - accuracy: 0.6483 - val_loss: 0.5800 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5425 - accuracy: 0.8430 - val_loss: 0.5210 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4813 - accuracy: 0.8430 - val_loss: 0.5047 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4377 - accuracy: 0.8517 - val_loss: 0.4948 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3982 - accuracy: 0.8547 - val_loss: 0.4875 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3590 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3683 - accuracy: 0.8663 - val_loss: 0.4850 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"best acc at index 6: 0.8505747318267822\n",
"best loss at index 2: 0.3847053050994873\n",
"{'units3': 32, 'units2': 32, 'units1': 16, 'optimizer': <class 'keras.optimizers.legacy.adam.Adam'>, 'learning_rate': 0.001, 'activation3': 'relu', 'activation2': 'relu', 'activation1': 'relu'}\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6807 - accuracy: 0.8372 - val_loss: 0.5413 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6144 - accuracy: 0.8372 - val_loss: 0.5060 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5689 - accuracy: 0.8372 - val_loss: 0.4795 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5273 - accuracy: 0.8372 - val_loss: 0.4565 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4915 - accuracy: 0.8372 - val_loss: 0.4387 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.5601 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4573 - accuracy: 0.8401 - val_loss: 0.4216 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n"
]
}
],
"source": [
"# Reload the dataset without the low-MI columns and build the preprocessing pipeline\n",
"train, greeks, test, id_list = load_dataset(columns_drop=bad_scores)\n",
"print(f\"dataset shape: {train.shape}\")\n",
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
"preprocessor = build_preprocessing_pipeline(X_train)\n",
"\n",
"\n",
"# Define model for the pipeline\n",
"#model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
"#model = XGBClassifier(n_estimators=500)\n",
"X_preprocessed = preprocessor.fit_transform(X_train)\n",
"# one-hot encode the binary target for the softmax head\n",
"y_train_ohe = pd.get_dummies(y_train, columns = ['Class'])\n",
"y_valid_ohe = pd.get_dummies(y_valid, columns = ['Class'])\n",
"# epochs=0: the wrapped model is already trained by the grid search, so pipeline.fit must not retrain it\n",
"model = KerasClassifier(model=grid_search_tf_model(X_train=X_preprocessed, y_train=y_train_ohe), epochs=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Build the final pipeline with preprocessor and model, fit it and display accuracy score"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"# Building Pipeline\n",
"def fit_pipeline(X_train: pd.DataFrame, y_train: pd.DataFrame, preprocessor, model):\n",
"    \"\"\"Fit preprocessor+model on the training data and report validation metrics.\n",
"\n",
"    NOTE(review): relies on the notebook-globals X_valid / y_valid_ohe for evaluation.\n",
"    Fix: the report previously compared y_train (431 rows) against the validation\n",
"    predictions (186 rows), raising 'inconsistent numbers of samples'.\n",
"    \"\"\"\n",
"    pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
"                               ('model', model)\n",
"                              ])\n",
"    #Fit the Model and make preds\n",
"    pipeline.fit(X_train, y_train)\n",
"    preds = pipeline.predict(X_valid)\n",
"    \"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
"    print(f\"Accuracy of {score}\") \"\"\"\n",
"    # compare validation predictions against the matching validation labels\n",
"    print(classification_report(y_valid_ohe.to_numpy(), preds))\n",
"    correct_answers = 0\n",
"    for y_pred, y_true in zip(preds, y_valid_ohe.to_numpy()):\n",
"        if(y_pred[0] == y_true[0]):correct_answers+=1\n",
"    print(correct_answers/len(preds))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print results of the model"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6/6 [==============================] - 0s 1ms/step\n"
]
},
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [431, 186]",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[171], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m fit_pipeline(X_train, y_train_ohe, preprocessor, model)\n",
"Cell \u001b[1;32mIn[170], line 11\u001b[0m, in \u001b[0;36mfit_pipeline\u001b[1;34m(X_train, y_train, preprocessor, model)\u001b[0m\n\u001b[0;32m 8\u001b[0m preds \u001b[39m=\u001b[39m pipeline\u001b[39m.\u001b[39mpredict(X_valid)\n\u001b[0;32m 9\u001b[0m \u001b[39m\"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[39mprint(f\"Accuracy of {score}\") \"\"\"\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m \u001b[39mprint\u001b[39m(classification_report(y_train\u001b[39m.\u001b[39;49mto_numpy(), preds))\n\u001b[0;32m 12\u001b[0m correct_answers \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m 13\u001b[0m \u001b[39mfor\u001b[39;00m y_pred,y_true \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(preds,y_valid_ohe\u001b[39m.\u001b[39mto_numpy()):\n",
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:2310\u001b[0m, in \u001b[0;36mclassification_report\u001b[1;34m(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)\u001b[0m\n\u001b[0;32m 2195\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclassification_report\u001b[39m(\n\u001b[0;32m 2196\u001b[0m y_true,\n\u001b[0;32m 2197\u001b[0m y_pred,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2204\u001b[0m zero_division\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mwarn\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 2205\u001b[0m ):\n\u001b[0;32m 2206\u001b[0m \u001b[39m\"\"\"Build a text report showing the main classification metrics.\u001b[39;00m\n\u001b[0;32m 2207\u001b[0m \n\u001b[0;32m 2208\u001b[0m \u001b[39m Read more in the :ref:`User Guide <classification_report>`.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2307\u001b[0m \u001b[39m <BLANKLINE>\u001b[39;00m\n\u001b[0;32m 2308\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2310\u001b[0m y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 2312\u001b[0m \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 2313\u001b[0m labels \u001b[39m=\u001b[39m unique_labels(y_true, y_pred)\n",
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:86\u001b[0m, in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_check_targets\u001b[39m(y_true, y_pred):\n\u001b[0;32m 60\u001b[0m \u001b[39m\"\"\"Check that y_true and y_pred belong to the same classification task.\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \n\u001b[0;32m 62\u001b[0m \u001b[39m This converts multiclass or binary types to a common shape, and raises a\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 84\u001b[0m \u001b[39m y_pred : array or indicator matrix\u001b[39;00m\n\u001b[0;32m 85\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 86\u001b[0m check_consistent_length(y_true, y_pred)\n\u001b[0;32m 87\u001b[0m type_true \u001b[39m=\u001b[39m type_of_target(y_true, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 88\u001b[0m type_pred \u001b[39m=\u001b[39m type_of_target(y_pred, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_pred\u001b[39m\u001b[39m\"\u001b[39m)\n",
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:397\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[1;34m(*arrays)\u001b[0m\n\u001b[0;32m 395\u001b[0m uniques \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39munique(lengths)\n\u001b[0;32m 396\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(uniques) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m--> 397\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 398\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 399\u001b[0m \u001b[39m%\u001b[39m [\u001b[39mint\u001b[39m(l) \u001b[39mfor\u001b[39;00m l \u001b[39min\u001b[39;00m lengths]\n\u001b[0;32m 400\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [431, 186]"
]
}
],
"source": [
"fit_pipeline(X_train, y_train_ohe, preprocessor, model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submission"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' submission = pd.DataFrame()\\nprediction = model.predict(x_test)\\nsubmission.insert(0, \"Id\", id_number, False)\\nsubmission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\\nsubmission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\\nsubmission.to_csv(\"/kaggle/working/submission.csv\",index = False) '"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\" submission = pd.DataFrame()\n",
"prediction = model.predict(x_test)\n",
"submission.insert(0, \"Id\", id_number, False)\n",
"submission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\n",
"submission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\n",
"submission.to_csv(\"/kaggle/working/submission.csv\",index = False) \"\"\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}