This commit is contained in:
yann22ahlgrim
2023-07-19 13:30:26 +02:00
parent ee4ef23e06
commit 0e342529ec
3 changed files with 633 additions and 1692 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,633 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.metrics import accuracy_score\n",
"from xgboost import XGBClassifier\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.feature_selection import mutual_info_regression\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"import numpy as np\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import train_test_split, ParameterGrid\n",
"from keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler\n",
"from keras.models import Model\n",
"from keras.layers import Activation, Dense, LSTM, Input\n",
"from keras.optimizers import Adam, RMSprop, SGD\n",
"from scikeras.wrappers import KerasClassifier\n",
"import tensorflow as tf\n",
"from os import path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to load the dataset"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"def load_dataset(columns_drop: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
"    \"\"\"Load the train/greeks/test csv files and drop unwanted columns.\n",
"\n",
"    columns_drop: feature columns to remove (the 'Id' column is always dropped).\n",
"    Returns (train, greeks, test, id_list) where id_list holds the test ids.\n",
"    \"\"\"\n",
"    # NOTE(review): absolute local path -- consider making this configurable\n",
"    data_dir = \"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\"\n",
"    train = pd.read_csv(path.join(data_dir, \"train.csv\"))\n",
"    greeks = pd.read_csv(path.join(data_dir, \"greeks.csv\"))\n",
"    test = pd.read_csv(path.join(data_dir, \"test.csv\"))\n",
"    columns_drop = ['Id'] + columns_drop\n",
"    id_list = test[\"Id\"]\n",
"    train.drop(columns_drop, inplace=True, axis=1)\n",
"    test.drop(columns_drop, inplace=True, axis=1)\n",
"    print(len(train.columns))\n",
"    return (train, greeks, test, id_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to randomly split the data into train and validation sets"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
"    \"\"\"Split df into train/validation features and 'Class' targets (random_state=42).\"\"\"\n",
"    features = df.drop(columns=\"Class\")\n",
"    target = df[\"Class\"]\n",
"    return train_test_split(features, target, test_size=split, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to build a Tensorflow model"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"def build_tensorflow_model(input_shape:int, output_shape:int, units1: int, units2: int, units3: int, activation1: str, \n",
"                           activation2: str, activation3: str, optimizer: tf.keras.optimizers.Optimizer, learning_rate: float) -> Model:\n",
"    \"\"\"Build and compile a 3-hidden-layer dense classifier with a softmax head.\"\"\"\n",
"    inputs = Input(shape=input_shape)\n",
"    hidden = inputs\n",
"    # stack the three configurable dense layers\n",
"    for units, activation in ((units1, activation1), (units2, activation2), (units3, activation3)):\n",
"        hidden = Dense(units=units, activation=activation)(hidden)\n",
"    outputs = Dense(units=output_shape, activation=\"softmax\")(hidden)\n",
"    model = Model(inputs=[inputs], outputs=[outputs])\n",
"    \n",
"    model.compile(loss=\"categorical_crossentropy\",\n",
"                  optimizer=optimizer(learning_rate=learning_rate),\n",
"                  metrics=[\"accuracy\"]) \n",
"    return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to plot the accuracy of the model"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"def plot_acc_tf_model(history:Model):\n",
"    \"\"\"Plot training vs. validation accuracy and loss curves from a fit history.\"\"\"\n",
"    for metric in (\"accuracy\", \"loss\"):\n",
"        plt.plot(history.history[metric])\n",
"        plt.plot(history.history[f\"val_{metric}\"])\n",
"        plt.title(f\"model {metric}\")\n",
"        plt.ylabel(metric)\n",
"        plt.xlabel(\"epoch\")\n",
"        plt.legend([\"Train\", \"Validation\"], loc=\"upper left\")\n",
"        plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to fit the Tensorflow model with ES Callback"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"# Early stopping: halt once val_accuracy has not improved by >= 0.005 for 5 epochs,\n",
"# then restore the weights of the best epoch.\n",
"es_callback = EarlyStopping(\n",
"    monitor=\"val_accuracy\",\n",
"    patience=5,\n",
"    verbose=1,\n",
"    restore_best_weights=True,\n",
"    min_delta=0.005\n",
"    )\n",
"\n",
"def fit_model(model: Model, x: np.ndarray, y: np.ndarray, epochs: int, split: float) -> Model:\n",
"    \"\"\"Fit `model` on a random train/validation split of (x, y) with early stopping.\"\"\"\n",
"    x_fit, x_holdout, y_fit, y_holdout = train_test_split(x, y, test_size=split, random_state=42)\n",
"    return model.fit(x_fit, y_fit, epochs=epochs, validation_data=(x_holdout, y_holdout), callbacks=[es_callback])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method for GridSearch of Tensorflow model"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"grid_params = {\"units1\": [8,16], \"units2\": [16,32], \"units3\": [32,64], \"activation1\": [\"relu\"], \"activation2\": [\"relu\"], \n",
"               \"activation3\": [\"relu\"], \"optimizer\": [Adam], \"learning_rate\": [0.001]}\n",
"\n",
"#GridSearch: train one model per hyper-parameter combination and keep the best one\n",
"def grid_search_tf_model(X_train: pd.DataFrame, y_train: pd.DataFrame)->Model:\n",
"    \"\"\"Exhaustively search grid_params, then retrain and return the best-accuracy model.\"\"\"\n",
"    grid = ParameterGrid(param_grid = grid_params)\n",
"    results = []\n",
"    # feature count; len(X_train[1]) measured row 1 instead and broke for DataFrames\n",
"    input_shape = X_train.shape[1]\n",
"    output_shape = 2\n",
"    for params in grid:\n",
"        model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **params)\n",
"        history = fit_model(model, X_train, y_train, 100, 0.2)\n",
"        # keep the final-epoch validation metrics for this parameter set\n",
"        results.append([history.history['val_loss'][-1], history.history['val_accuracy'][-1]])\n",
"    \n",
"    val_accuracies = [acc for _, acc in results]\n",
"    val_losses = [loss for loss, _ in results]\n",
"    best_acc = val_accuracies.index(max(val_accuracies))\n",
"    best_loss = val_losses.index(min(val_losses))\n",
"    print(f\"best acc at index {best_acc}: {max(val_accuracies)}\")\n",
"    print(f\"best loss at index {best_loss}: {min(val_losses)}\")\n",
"    print(grid[best_acc])\n",
"    \n",
"    # retrain a fresh model with the winning hyper-parameters\n",
"    model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **grid[best_acc])\n",
"    history = fit_model(model, X_train, y_train, 100, 0.2)\n",
"    #plot_acc_tf_model(history)\n",
"    return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to build a preprocessing pipeline"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"def build_preprocessing_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
"    \"\"\"Build a ColumnTransformer: impute+scale numeric columns, impute+one-hot categoricals.\n",
"\n",
"    Fix: columns are now assigned to exactly one transformer. Previously every\n",
"    numeric column with fewer than 10 unique values was also listed as\n",
"    categorical and therefore duplicated in the transformed output.\n",
"    \"\"\"\n",
"    # Preprocessing for numerical data \n",
"    numerical_transformer = Pipeline(steps=[\n",
"        ('imputer',SimpleImputer(strategy='constant')),\n",
"        ('scaler', StandardScaler())])\n",
"    \n",
"    # Preprocessing for categorical data\n",
"    categorical_transformer = Pipeline(steps=[\n",
"        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
"        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
"    ])\n",
"\n",
"    # Bundle preprocessing for numerical and categorical data (disjoint column sets)\n",
"    numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
"    categorical_cols = [cname for cname in df.columns\n",
"                        if cname not in numerical_cols and df[cname].nunique() < 10]\n",
"    preprocessor = ColumnTransformer(\n",
"        transformers=[\n",
"            ('num', numerical_transformer, numerical_cols),\n",
"            ('cat', categorical_transformer, categorical_cols)\n",
"        ])\n",
"    print(f\"Number of columns: {len(df.columns)}\")\n",
"    return preprocessor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to generate the Mutual Info scores and plot them"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def make_mi_scores(X: pd.DataFrame, y: pd.DataFrame, discrete_features: list):\n",
"    \"\"\"Return the mutual-information score of each feature in X against y, best first.\"\"\"\n",
"    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)\n",
"    return pd.Series(raw_scores, name=\"MI Scores\", index=X.columns).sort_values(ascending=False)\n",
"\n",
"def plot_mi_scores(scores):\n",
"    \"\"\"Horizontal bar chart of MI scores, largest at the top.\"\"\"\n",
"    ordered = scores.sort_values(ascending=True)\n",
"    positions = np.arange(len(ordered))\n",
"    plt.figure(dpi=100, figsize=(16, 16))\n",
"    plt.barh(positions, ordered)\n",
"    plt.yticks(positions, list(ordered.index))\n",
"    plt.title(\"Mutual Information Scores\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Display the features whose MI scores fall below 0.01"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"57\n",
"dataset shape: (617, 57)\n",
"X shape: (431, 56) and y shape: (431,)\n",
" Columns with MI equal zero: ['DN', 'CW ', 'CC', 'EG', 'CU', 'AH', 'CL', 'DF', 'CD ', 'GE', 'GB', 'FS', 'DE', 'FI', 'BD ', 'CB', 'FD ', 'DY', 'AY', 'EP', 'AZ', 'EJ', 'CF'] --> total length: 23\n"
]
}
],
"source": [
"#get the mutual information of features\n",
"train, greeks, test, id_list = load_dataset(columns_drop=[])\n",
"print(f\"dataset shape: {train.shape}\")\n",
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
"\n",
"X_transformed = X_train.fillna(0)\n",
"# encode the only categorical feature; assignment instead of inplace-replace on a column view\n",
"X_transformed[\"EJ\"] = X_transformed[\"EJ\"].replace(['A', 'B'], [0, 1])\n",
"discrete_features = X_transformed.dtypes == int\n",
"mi_scores = make_mi_scores(X_transformed, y_train, discrete_features)\n",
"# features with (near-)zero mutual information carry no signal and are dropped later\n",
"bad_scores = [col for col, score in mi_scores.items() if score < 0.01]\n",
"#plot_mi_scores(mi_scores)\n",
"print(f\" Columns with MI equal zero: {bad_scores} --> total length: {len(bad_scores)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a model and preprocessor"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"34\n",
"dataset shape: (617, 34)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns: 33\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.5836 - accuracy: 0.8372 - val_loss: 0.4572 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5413 - accuracy: 0.8372 - val_loss: 0.4360 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5107 - accuracy: 0.8372 - val_loss: 0.4228 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4856 - accuracy: 0.8372 - val_loss: 0.4138 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4655 - accuracy: 0.8372 - val_loss: 0.4073 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3383 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4464 - accuracy: 0.8372 - val_loss: 0.4030 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6535 - accuracy: 0.7297 - val_loss: 0.5649 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5767 - accuracy: 0.8372 - val_loss: 0.4984 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5286 - accuracy: 0.8372 - val_loss: 0.4690 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4883 - accuracy: 0.8372 - val_loss: 0.4536 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4519 - accuracy: 0.8372 - val_loss: 0.4419 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3956 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4211 - accuracy: 0.8372 - val_loss: 0.4367 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 16ms/step - loss: 0.7554 - accuracy: 0.4244 - val_loss: 0.6338 - val_accuracy: 0.6667\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5835 - accuracy: 0.7733 - val_loss: 0.5112 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5046 - accuracy: 0.8372 - val_loss: 0.4532 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4713 - accuracy: 0.8372 - val_loss: 0.4223 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4393 - accuracy: 0.8372 - val_loss: 0.4078 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4144 - accuracy: 0.8372 - val_loss: 0.3960 - val_accuracy: 0.8391\n",
"Epoch 7/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3674 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 2.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3898 - accuracy: 0.8401 - val_loss: 0.3847 - val_accuracy: 0.8391\n",
"Epoch 7: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6324 - accuracy: 0.8227 - val_loss: 0.5557 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5711 - accuracy: 0.8372 - val_loss: 0.5040 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5245 - accuracy: 0.8372 - val_loss: 0.4676 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4838 - accuracy: 0.8372 - val_loss: 0.4399 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4494 - accuracy: 0.8372 - val_loss: 0.4188 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3685 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4130 - accuracy: 0.8372 - val_loss: 0.4003 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.8912 - accuracy: 0.1860 - val_loss: 0.7869 - val_accuracy: 0.2644\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.7306 - accuracy: 0.4070 - val_loss: 0.6774 - val_accuracy: 0.6207\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6506 - accuracy: 0.7936 - val_loss: 0.6167 - val_accuracy: 0.8506\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.6128 - accuracy: 0.8401 - val_loss: 0.5752 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5760 - accuracy: 0.8401 - val_loss: 0.5514 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5444 - accuracy: 0.8401 - val_loss: 0.5272 - val_accuracy: 0.8391\n",
"Epoch 7/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.5119 - accuracy: 0.8459 - val_loss: 0.4959 - val_accuracy: 0.8391\n",
"Epoch 8/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.4723 - accuracy: 0.9062Restoring model weights from the end of the best epoch: 3.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4759 - accuracy: 0.8488 - val_loss: 0.4645 - val_accuracy: 0.8391\n",
"Epoch 8: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 16ms/step - loss: 0.7030 - accuracy: 0.5262 - val_loss: 0.6235 - val_accuracy: 0.7701\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6088 - accuracy: 0.8285 - val_loss: 0.5432 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5630 - accuracy: 0.8372 - val_loss: 0.4939 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5231 - accuracy: 0.8372 - val_loss: 0.4641 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4851 - accuracy: 0.8372 - val_loss: 0.4443 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4476 - accuracy: 0.8372 - val_loss: 0.4190 - val_accuracy: 0.8391\n",
"Epoch 7/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.4761 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 2.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4080 - accuracy: 0.8401 - val_loss: 0.3984 - val_accuracy: 0.8391\n",
"Epoch 7: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.8484 - accuracy: 0.2122 - val_loss: 0.7485 - val_accuracy: 0.3218\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.7046 - accuracy: 0.5000 - val_loss: 0.6538 - val_accuracy: 0.7471\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6363 - accuracy: 0.7936 - val_loss: 0.5938 - val_accuracy: 0.8276\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5827 - accuracy: 0.8372 - val_loss: 0.5511 - val_accuracy: 0.8506\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5353 - accuracy: 0.8401 - val_loss: 0.5105 - val_accuracy: 0.8506\n",
"Epoch 6/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4874 - accuracy: 0.8430 - val_loss: 0.4787 - val_accuracy: 0.8506\n",
"Epoch 7/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4375 - accuracy: 0.8459 - val_loss: 0.4518 - val_accuracy: 0.8506\n",
"Epoch 8/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.3944 - accuracy: 0.8488 - val_loss: 0.4320 - val_accuracy: 0.8506\n",
"Epoch 9/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.4943 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 4.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8547 - val_loss: 0.4221 - val_accuracy: 0.8506\n",
"Epoch 9: early stopping\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6726 - accuracy: 0.6483 - val_loss: 0.5800 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5425 - accuracy: 0.8430 - val_loss: 0.5210 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4813 - accuracy: 0.8430 - val_loss: 0.5047 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4377 - accuracy: 0.8517 - val_loss: 0.4948 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3982 - accuracy: 0.8547 - val_loss: 0.4875 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.3590 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.3683 - accuracy: 0.8663 - val_loss: 0.4850 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n",
"best acc at index 6: 0.8505747318267822\n",
"best loss at index 2: 0.3847053050994873\n",
"{'units3': 32, 'units2': 32, 'units1': 16, 'optimizer': <class 'keras.optimizers.legacy.adam.Adam'>, 'learning_rate': 0.001, 'activation3': 'relu', 'activation2': 'relu', 'activation1': 'relu'}\n",
"Epoch 1/100\n",
"11/11 [==============================] - 0s 15ms/step - loss: 0.6807 - accuracy: 0.8372 - val_loss: 0.5413 - val_accuracy: 0.8391\n",
"Epoch 2/100\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.6144 - accuracy: 0.8372 - val_loss: 0.5060 - val_accuracy: 0.8391\n",
"Epoch 3/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5689 - accuracy: 0.8372 - val_loss: 0.4795 - val_accuracy: 0.8391\n",
"Epoch 4/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.5273 - accuracy: 0.8372 - val_loss: 0.4565 - val_accuracy: 0.8391\n",
"Epoch 5/100\n",
"11/11 [==============================] - 0s 3ms/step - loss: 0.4915 - accuracy: 0.8372 - val_loss: 0.4387 - val_accuracy: 0.8391\n",
"Epoch 6/100\n",
" 1/11 [=>............................] - ETA: 0s - loss: 0.5601 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 1.\n",
"11/11 [==============================] - 0s 4ms/step - loss: 0.4573 - accuracy: 0.8401 - val_loss: 0.4216 - val_accuracy: 0.8391\n",
"Epoch 6: early stopping\n"
]
}
],
"source": [
"# Reload the dataset without the low-MI columns and build the preprocessing pipeline\n",
"train, greeks, test, id_list = load_dataset(columns_drop=bad_scores)\n",
"print(f\"dataset shape: {train.shape}\")\n",
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
"preprocessor = build_preprocessing_pipeline(X_train)\n",
"\n",
"\n",
"# Define model for the pipeline\n",
"#model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
"#model = XGBClassifier(n_estimators=500)\n",
"X_preprocessed = preprocessor.fit_transform(X_train)\n",
"# one-hot encode the binary target for the softmax head\n",
"y_train_ohe = pd.get_dummies(y_train, columns = ['Class'])\n",
"y_valid_ohe = pd.get_dummies(y_valid, columns = ['Class'])\n",
"# epochs=0: the wrapped model is already trained by the grid search, so pipeline.fit must not retrain it\n",
"model = KerasClassifier(model=grid_search_tf_model(X_train=X_preprocessed, y_train=y_train_ohe), epochs=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Build the final pipeline with preprocessor and model, fit it and display accuracy score"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"# Building Pipeline\n",
"def fit_pipeline(X_train: pd.DataFrame, y_train: pd.DataFrame, preprocessor, model):\n",
"    \"\"\"Fit preprocessor+model on the training data and report validation metrics.\n",
"\n",
"    NOTE(review): relies on the notebook-globals X_valid / y_valid_ohe for evaluation.\n",
"    Fix: the report previously compared y_train (431 rows) against the validation\n",
"    predictions (186 rows), raising 'inconsistent numbers of samples'.\n",
"    \"\"\"\n",
"    pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
"                               ('model', model)\n",
"                              ])\n",
"    #Fit the Model and make preds\n",
"    pipeline.fit(X_train, y_train)\n",
"    preds = pipeline.predict(X_valid)\n",
"    \"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
"    print(f\"Accuracy of {score}\") \"\"\"\n",
"    # compare validation predictions against the matching validation labels\n",
"    print(classification_report(y_valid_ohe.to_numpy(), preds))\n",
"    correct_answers = 0\n",
"    for y_pred, y_true in zip(preds, y_valid_ohe.to_numpy()):\n",
"        if(y_pred[0] == y_true[0]):correct_answers+=1\n",
"    print(correct_answers/len(preds))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print results of the model"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6/6 [==============================] - 0s 1ms/step\n"
]
},
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [431, 186]",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[171], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m fit_pipeline(X_train, y_train_ohe, preprocessor, model)\n",
"Cell \u001b[1;32mIn[170], line 11\u001b[0m, in \u001b[0;36mfit_pipeline\u001b[1;34m(X_train, y_train, preprocessor, model)\u001b[0m\n\u001b[0;32m 8\u001b[0m preds \u001b[39m=\u001b[39m pipeline\u001b[39m.\u001b[39mpredict(X_valid)\n\u001b[0;32m 9\u001b[0m \u001b[39m\"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[39mprint(f\"Accuracy of {score}\") \"\"\"\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m \u001b[39mprint\u001b[39m(classification_report(y_train\u001b[39m.\u001b[39;49mto_numpy(), preds))\n\u001b[0;32m 12\u001b[0m correct_answers \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m 13\u001b[0m \u001b[39mfor\u001b[39;00m y_pred,y_true \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(preds,y_valid_ohe\u001b[39m.\u001b[39mto_numpy()):\n",
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:2310\u001b[0m, in \u001b[0;36mclassification_report\u001b[1;34m(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)\u001b[0m\n\u001b[0;32m 2195\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclassification_report\u001b[39m(\n\u001b[0;32m 2196\u001b[0m y_true,\n\u001b[0;32m 2197\u001b[0m y_pred,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2204\u001b[0m zero_division\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mwarn\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 2205\u001b[0m ):\n\u001b[0;32m 2206\u001b[0m \u001b[39m\"\"\"Build a text report showing the main classification metrics.\u001b[39;00m\n\u001b[0;32m 2207\u001b[0m \n\u001b[0;32m 2208\u001b[0m \u001b[39m Read more in the :ref:`User Guide <classification_report>`.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2307\u001b[0m \u001b[39m <BLANKLINE>\u001b[39;00m\n\u001b[0;32m 2308\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2310\u001b[0m y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 2312\u001b[0m \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 2313\u001b[0m labels \u001b[39m=\u001b[39m unique_labels(y_true, y_pred)\n",
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:86\u001b[0m, in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_check_targets\u001b[39m(y_true, y_pred):\n\u001b[0;32m 60\u001b[0m \u001b[39m\"\"\"Check that y_true and y_pred belong to the same classification task.\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \n\u001b[0;32m 62\u001b[0m \u001b[39m This converts multiclass or binary types to a common shape, and raises a\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 84\u001b[0m \u001b[39m y_pred : array or indicator matrix\u001b[39;00m\n\u001b[0;32m 85\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 86\u001b[0m check_consistent_length(y_true, y_pred)\n\u001b[0;32m 87\u001b[0m type_true \u001b[39m=\u001b[39m type_of_target(y_true, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 88\u001b[0m type_pred \u001b[39m=\u001b[39m type_of_target(y_pred, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_pred\u001b[39m\u001b[39m\"\u001b[39m)\n",
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:397\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[1;34m(*arrays)\u001b[0m\n\u001b[0;32m 395\u001b[0m uniques \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39munique(lengths)\n\u001b[0;32m 396\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(uniques) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m--> 397\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 398\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 399\u001b[0m \u001b[39m%\u001b[39m [\u001b[39mint\u001b[39m(l) \u001b[39mfor\u001b[39;00m l \u001b[39min\u001b[39;00m lengths]\n\u001b[0;32m 400\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [431, 186]"
]
}
],
"source": [
"fit_pipeline(X_train, y_train_ohe, preprocessor, model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submission"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"' submission = pd.DataFrame()\\nprediction = model.predict(x_test)\\nsubmission.insert(0, \"Id\", id_number, False)\\nsubmission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\\nsubmission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\\nsubmission.to_csv(\"/kaggle/working/submission.csv\",index = False) '"
]
},
"execution_count": 154,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\" submission = pd.DataFrame()\n",
"prediction = model.predict(x_test)\n",
"submission.insert(0, \"Id\", id_number, False)\n",
"submission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\n",
"submission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\n",
"submission.to_csv(\"/kaggle/working/submission.csv\",index = False) \"\"\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}