634 lines
32 KiB
Plaintext
634 lines
32 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 159,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"from xgboost import XGBClassifier\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from sklearn.feature_selection import mutual_info_regression\n",
|
|
"from sklearn.discriminant_analysis import StandardScaler\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.metrics import classification_report\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"from sklearn.model_selection import train_test_split, ParameterGrid\n",
|
|
"from keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler\n",
|
|
"from keras.models import Model\n",
|
|
"from keras.layers import Activation, Dense, LSTM, Input\n",
|
|
"from keras.optimizers import Adam, RMSprop, SGD\n",
|
|
"from scikeras.wrappers import KerasClassifier\n",
|
|
"import tensorflow as tf\n",
|
|
"from os import path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to load the dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 160,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def load_dataset(columns_drop) -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
|
|
" train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n",
|
|
" greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n",
|
|
" test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n",
|
|
" columns_drop = ['Id'] + columns_drop\n",
|
|
" id_list = test[\"Id\"]\n",
|
|
" train.drop(columns_drop, inplace=True, axis=1)\n",
|
|
" test.drop(columns_drop, inplace=True, axis=1)\n",
|
|
" print(len(train.columns))\n",
|
|
" return (train, greeks, test, id_list)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to split the data in validation and train set randomly"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 161,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
|
|
" X = df.loc[:, df.columns != \"Class\"]\n",
|
|
" y = df.loc[:, \"Class\"]\n",
|
|
" return train_test_split(X, y, test_size=split, random_state=42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to build a Tensorflow model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 162,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def build_tensorflow_model(input_shape:int, output_shape:int, units1: int, units2: int, units3: int, activation1: str, \n",
|
|
" activation2: str, activation3: str, optimizer: tf.keras.optimizers.Optimizer, learning_rate: float) -> Model:\n",
|
|
" input = Input(shape=input_shape)\n",
|
|
" x = Dense(units=units1, activation=activation1)(input)\n",
|
|
" x = Dense(units=units2, activation=activation2)(x)\n",
|
|
" x = Dense(units=units3, activation=activation3)(x)\n",
|
|
" output = Dense(units=output_shape, activation=\"softmax\")(x)\n",
|
|
" model = Model(inputs=[input], outputs=[output])\n",
|
|
" \n",
|
|
" model.compile(loss=\"categorical_crossentropy\",\n",
|
|
" optimizer=optimizer(learning_rate=learning_rate),\n",
|
|
" metrics=[\"accuracy\"]) \n",
|
|
" return model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to plot the accuracy of the model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 163,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def plot_acc_tf_model(history:Model):\n",
|
|
" # summarize history for accuracy\n",
|
|
" plt.plot(history.history['accuracy'])\n",
|
|
" plt.plot(history.history['val_accuracy'])\n",
|
|
" plt.title('model accuracy')\n",
|
|
" plt.ylabel('accuracy')\n",
|
|
" plt.xlabel('epoch')\n",
|
|
" plt.legend(['Train', 'Validation'], loc='upper left')\n",
|
|
" plt.show()\n",
|
|
" \n",
|
|
" # summarize history for loss\n",
|
|
" plt.plot(history.history['loss'])\n",
|
|
" plt.plot(history.history['val_loss'])\n",
|
|
" plt.title('model loss')\n",
|
|
" plt.ylabel('loss')\n",
|
|
" plt.xlabel('epoch')\n",
|
|
" plt.legend(['Train', 'Validation'], loc='upper left')\n",
|
|
" plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to fit the Tensorflow model with ES Callback"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 164,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"es_callback = EarlyStopping(\n",
|
|
" monitor=\"val_accuracy\",\n",
|
|
" patience=5,\n",
|
|
" verbose=1,\n",
|
|
" restore_best_weights=True,\n",
|
|
" min_delta=0.005\n",
|
|
" )\n",
|
|
" \n",
|
|
"def fit_model(model: Model, x: np.ndarray, y: np.ndarray, epochs: int, split: float) -> Model:\n",
|
|
" #split train and validation\n",
|
|
" x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=split, random_state=42)\n",
|
|
" #fit the model\n",
|
|
" history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val,y_val), callbacks=[es_callback])\n",
|
|
" return history "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method for GridSearch of Tensorflow model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 165,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"grid_params = {\"units1\": [8,16], \"units2\": [16,32], \"units3\": [32,64], \"activation1\": [\"relu\"], \"activation2\": [\"relu\"], \n",
|
|
" \"activation3\": [\"relu\"], \"optimizer\": [Adam], \"learning_rate\": [0.001]}\n",
|
|
"\n",
|
|
"#GridSearch\n",
|
|
"def grid_search_tf_model(X_train: pd.DataFrame, y_train: pd.DataFrame)->Model:\n",
|
|
" grid = ParameterGrid(param_grid = grid_params)\n",
|
|
" results = []\n",
|
|
" input_shape = len(X_train[1])\n",
|
|
" output_shape = 2\n",
|
|
" for idx,params in enumerate(grid):\n",
|
|
" model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **params)\n",
|
|
" history = fit_model(model, X_train, y_train, 100, 0.2)\n",
|
|
" val_loss = history.history['val_loss'][-1] \n",
|
|
" val_acc = history.history['val_accuracy'][-1]\n",
|
|
" results.append([val_loss, val_acc])\n",
|
|
" \n",
|
|
" val_accuracies = [i[1] for i in results]\n",
|
|
" val_losses= [i[0] for i in results]\n",
|
|
" best_acc = val_accuracies.index(max(val_accuracies))\n",
|
|
" best_loss = val_losses.index(min(val_losses))\n",
|
|
" print(f\"best acc at index {best_acc}: {max(val_accuracies)}\")\n",
|
|
" print(f\"best loss at index {best_loss}: {min(val_losses)}\")\n",
|
|
" print(grid[best_acc])\n",
|
|
" \n",
|
|
" model = build_tensorflow_model(input_shape=input_shape, output_shape=output_shape, **grid[best_acc])\n",
|
|
" history = fit_model(model, X_train, y_train, 100, 0.2)\n",
|
|
" #plot_acc_tf_model(history)\n",
|
|
" return model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to build a preprocessing pipeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 166,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def build_preprocessing_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
|
|
" # Preprocessing for numerical data \n",
|
|
" numerical_transformer = Pipeline(steps=[\n",
|
|
" ('imputer',SimpleImputer(strategy='constant')),\n",
|
|
" ('scaler', StandardScaler())])\n",
|
|
" \n",
|
|
" # Preprocessing for categorical data\n",
|
|
" categorical_transformer = Pipeline(steps=[\n",
|
|
" ('imputer', SimpleImputer(strategy='most_frequent')),\n",
|
|
" ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
|
|
" ])\n",
|
|
"\n",
|
|
" # Bundle preprocessing for numerical and categorical data\n",
|
|
" numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
|
|
" categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
|
|
" preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', numerical_transformer, numerical_cols),\n",
|
|
" ('cat', categorical_transformer, categorical_cols)\n",
|
|
" ])\n",
|
|
" print(f\"Number of columns: {len(df.columns)}\")\n",
|
|
" return preprocessor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Method to generate the Mutual Info scores and plot them"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 167,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"def make_mi_scores(X: pd.DataFrame, y: pd.DataFrame, discrete_features: list):\n",
|
|
" mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)\n",
|
|
" mi_scores = pd.Series(mi_scores, name=\"MI Scores\", index=X.columns)\n",
|
|
" mi_scores = mi_scores.sort_values(ascending=False)\n",
|
|
" return mi_scores\n",
|
|
"\n",
|
|
"def plot_mi_scores(scores):\n",
|
|
" scores = scores.sort_values(ascending=True)\n",
|
|
" width = np.arange(len(scores))\n",
|
|
" ticks = list(scores.index)\n",
|
|
" plt.figure(dpi=100, figsize=(16, 16))\n",
|
|
" plt.barh(width, scores)\n",
|
|
" plt.yticks(width, ticks)\n",
|
|
" plt.title(\"Mutual Information Scores\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Display MI scores that are beneath 0.01"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 168,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"57\n",
|
|
"dataset shape: (617, 57)\n",
|
|
"X shape: (431, 56) and y shape: (431,)\n",
|
|
" Columns with MI equal zero: ['DN', 'CW ', 'CC', 'EG', 'CU', 'AH', 'CL', 'DF', 'CD ', 'GE', 'GB', 'FS', 'DE', 'FI', 'BD ', 'CB', 'FD ', 'DY', 'AY', 'EP', 'AZ', 'EJ', 'CF'] --> total length: 23\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#get the mutual information of features\n",
|
|
"train, greeks, test, id_list = load_dataset(columns_drop=[])\n",
|
|
"print(f\"dataset shape: {train.shape}\")\n",
|
|
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
|
|
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
|
|
"\n",
|
|
"cols = list(X_train.columns)\n",
|
|
"cols.append(\"1\")\n",
|
|
"X_transformed = X_train.fillna(0)\n",
|
|
"X_transformed[\"EJ\"].replace(['A', 'B'], [0, 1], inplace=True)\n",
|
|
"discrete_features = X_transformed.dtypes == int\n",
|
|
"mi_scores = make_mi_scores(X_transformed, y_train, discrete_features)\n",
|
|
"bad_scores = list(mi_scores.index[i] for i, score in zip(range(len(mi_scores)),mi_scores) if score < 0.01)\n",
|
|
"#plot_mi_scores(mi_scores)\n",
|
|
"print(f\" Columns with MI equal zero: {bad_scores} --> total length: {len(bad_scores)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Create a model and preprocessor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 169,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"34\n",
|
|
"dataset shape: (617, 34)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of columns: 33\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.5836 - accuracy: 0.8372 - val_loss: 0.4572 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5413 - accuracy: 0.8372 - val_loss: 0.4360 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5107 - accuracy: 0.8372 - val_loss: 0.4228 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4856 - accuracy: 0.8372 - val_loss: 0.4138 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4655 - accuracy: 0.8372 - val_loss: 0.4073 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3383 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4464 - accuracy: 0.8372 - val_loss: 0.4030 - val_accuracy: 0.8391\n",
|
|
"Epoch 6: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.6535 - accuracy: 0.7297 - val_loss: 0.5649 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5767 - accuracy: 0.8372 - val_loss: 0.4984 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5286 - accuracy: 0.8372 - val_loss: 0.4690 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4883 - accuracy: 0.8372 - val_loss: 0.4536 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4519 - accuracy: 0.8372 - val_loss: 0.4419 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3956 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4211 - accuracy: 0.8372 - val_loss: 0.4367 - val_accuracy: 0.8391\n",
|
|
"Epoch 6: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.7554 - accuracy: 0.4244 - val_loss: 0.6338 - val_accuracy: 0.6667\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5835 - accuracy: 0.7733 - val_loss: 0.5112 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5046 - accuracy: 0.8372 - val_loss: 0.4532 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4713 - accuracy: 0.8372 - val_loss: 0.4223 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4393 - accuracy: 0.8372 - val_loss: 0.4078 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4144 - accuracy: 0.8372 - val_loss: 0.3960 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3674 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3898 - accuracy: 0.8401 - val_loss: 0.3847 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.6324 - accuracy: 0.8227 - val_loss: 0.5557 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5711 - accuracy: 0.8372 - val_loss: 0.5040 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5245 - accuracy: 0.8372 - val_loss: 0.4676 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4838 - accuracy: 0.8372 - val_loss: 0.4399 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4494 - accuracy: 0.8372 - val_loss: 0.4188 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3685 - accuracy: 0.8750Restoring model weights from the end of the best epoch: 1.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4130 - accuracy: 0.8372 - val_loss: 0.4003 - val_accuracy: 0.8391\n",
|
|
"Epoch 6: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.8912 - accuracy: 0.1860 - val_loss: 0.7869 - val_accuracy: 0.2644\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.7306 - accuracy: 0.4070 - val_loss: 0.6774 - val_accuracy: 0.6207\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.6506 - accuracy: 0.7936 - val_loss: 0.6167 - val_accuracy: 0.8506\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.6128 - accuracy: 0.8401 - val_loss: 0.5752 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5760 - accuracy: 0.8401 - val_loss: 0.5514 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5444 - accuracy: 0.8401 - val_loss: 0.5272 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.5119 - accuracy: 0.8459 - val_loss: 0.4959 - val_accuracy: 0.8391\n",
|
|
"Epoch 8/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.4723 - accuracy: 0.9062Restoring model weights from the end of the best epoch: 3.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4759 - accuracy: 0.8488 - val_loss: 0.4645 - val_accuracy: 0.8391\n",
|
|
"Epoch 8: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 16ms/step - loss: 0.7030 - accuracy: 0.5262 - val_loss: 0.6235 - val_accuracy: 0.7701\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.6088 - accuracy: 0.8285 - val_loss: 0.5432 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5630 - accuracy: 0.8372 - val_loss: 0.4939 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5231 - accuracy: 0.8372 - val_loss: 0.4641 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4851 - accuracy: 0.8372 - val_loss: 0.4443 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4476 - accuracy: 0.8372 - val_loss: 0.4190 - val_accuracy: 0.8391\n",
|
|
"Epoch 7/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.4761 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 2.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4080 - accuracy: 0.8401 - val_loss: 0.3984 - val_accuracy: 0.8391\n",
|
|
"Epoch 7: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.8484 - accuracy: 0.2122 - val_loss: 0.7485 - val_accuracy: 0.3218\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.7046 - accuracy: 0.5000 - val_loss: 0.6538 - val_accuracy: 0.7471\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.6363 - accuracy: 0.7936 - val_loss: 0.5938 - val_accuracy: 0.8276\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5827 - accuracy: 0.8372 - val_loss: 0.5511 - val_accuracy: 0.8506\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5353 - accuracy: 0.8401 - val_loss: 0.5105 - val_accuracy: 0.8506\n",
|
|
"Epoch 6/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4874 - accuracy: 0.8430 - val_loss: 0.4787 - val_accuracy: 0.8506\n",
|
|
"Epoch 7/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4375 - accuracy: 0.8459 - val_loss: 0.4518 - val_accuracy: 0.8506\n",
|
|
"Epoch 8/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.3944 - accuracy: 0.8488 - val_loss: 0.4320 - val_accuracy: 0.8506\n",
|
|
"Epoch 9/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.4943 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 4.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8547 - val_loss: 0.4221 - val_accuracy: 0.8506\n",
|
|
"Epoch 9: early stopping\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.6726 - accuracy: 0.6483 - val_loss: 0.5800 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5425 - accuracy: 0.8430 - val_loss: 0.5210 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4813 - accuracy: 0.8430 - val_loss: 0.5047 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4377 - accuracy: 0.8517 - val_loss: 0.4948 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3982 - accuracy: 0.8547 - val_loss: 0.4875 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.3590 - accuracy: 0.8438Restoring model weights from the end of the best epoch: 1.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.3683 - accuracy: 0.8663 - val_loss: 0.4850 - val_accuracy: 0.8391\n",
|
|
"Epoch 6: early stopping\n",
|
|
"best acc at index 6: 0.8505747318267822\n",
|
|
"best loss at index 2: 0.3847053050994873\n",
|
|
"{'units3': 32, 'units2': 32, 'units1': 16, 'optimizer': <class 'keras.optimizers.legacy.adam.Adam'>, 'learning_rate': 0.001, 'activation3': 'relu', 'activation2': 'relu', 'activation1': 'relu'}\n",
|
|
"Epoch 1/100\n",
|
|
"11/11 [==============================] - 0s 15ms/step - loss: 0.6807 - accuracy: 0.8372 - val_loss: 0.5413 - val_accuracy: 0.8391\n",
|
|
"Epoch 2/100\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.6144 - accuracy: 0.8372 - val_loss: 0.5060 - val_accuracy: 0.8391\n",
|
|
"Epoch 3/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5689 - accuracy: 0.8372 - val_loss: 0.4795 - val_accuracy: 0.8391\n",
|
|
"Epoch 4/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.5273 - accuracy: 0.8372 - val_loss: 0.4565 - val_accuracy: 0.8391\n",
|
|
"Epoch 5/100\n",
|
|
"11/11 [==============================] - 0s 3ms/step - loss: 0.4915 - accuracy: 0.8372 - val_loss: 0.4387 - val_accuracy: 0.8391\n",
|
|
"Epoch 6/100\n",
|
|
" 1/11 [=>............................] - ETA: 0s - loss: 0.5601 - accuracy: 0.7812Restoring model weights from the end of the best epoch: 1.\n",
|
|
"11/11 [==============================] - 0s 4ms/step - loss: 0.4573 - accuracy: 0.8401 - val_loss: 0.4216 - val_accuracy: 0.8391\n",
|
|
"Epoch 6: early stopping\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#Newly load the dataset with columns drop and create preprocessing pipeline\n",
|
|
"train, greeks, test, id_list = load_dataset(columns_drop=bad_scores)\n",
|
|
"print(f\"dataset shape: {train.shape}\")\n",
|
|
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
|
|
"preprocessor = build_preprocessing_pipeline(X_train)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Define model for the pipeline\n",
|
|
"#model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
|
|
"#model = XGBClassifier(n_estimators=500)\n",
|
|
"preprocessor.fit(X_train)\n",
|
|
"X_preprocessed = preprocessor.transform(X_train)\n",
|
|
"y_train_ohe = pd.get_dummies(y_train, columns = ['Class'])\n",
|
|
"y_valid_ohe = pd.get_dummies(y_valid, columns = ['Class'])\n",
|
|
"model = KerasClassifier(model=grid_search_tf_model(X_train=X_preprocessed, y_train=y_train_ohe), epochs=0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Build the final pipeline with preprocessor and model, fit it and display accuracy score"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 170,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Building Pipeline\n",
|
|
"def fit_pipeline(X_train: pd.DataFrame, y_train: pd.DataFrame, preprocessor, model):\n",
|
|
" pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
|
|
" ('model', model)\n",
|
|
" ])\n",
|
|
" #Fit the Model and make preds\n",
|
|
" pipeline.fit(X_train, y_train)\n",
|
|
" preds = pipeline.predict(X_valid)\n",
|
|
" \"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
|
|
" print(f\"Accuracy of {score}\") \"\"\"\n",
|
|
" print(classification_report(y_train.to_numpy(), preds))\n",
|
|
" correct_answers = 0\n",
|
|
" for y_pred,y_true in zip(preds,y_valid_ohe.to_numpy()):\n",
|
|
" if(y_pred[0] == y_true[0]):correct_answers+=1\n",
|
|
" print(correct_answers/len(preds))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Print results of the model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 171,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"6/6 [==============================] - 0s 1ms/step\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "ValueError",
|
|
"evalue": "Found input variables with inconsistent numbers of samples: [431, 186]",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[1;32mIn[171], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m fit_pipeline(X_train, y_train_ohe, preprocessor, model)\n",
|
|
"Cell \u001b[1;32mIn[170], line 11\u001b[0m, in \u001b[0;36mfit_pipeline\u001b[1;34m(X_train, y_train, preprocessor, model)\u001b[0m\n\u001b[0;32m 8\u001b[0m preds \u001b[39m=\u001b[39m pipeline\u001b[39m.\u001b[39mpredict(X_valid)\n\u001b[0;32m 9\u001b[0m \u001b[39m\"\"\" score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[39mprint(f\"Accuracy of {score}\") \"\"\"\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m \u001b[39mprint\u001b[39m(classification_report(y_train\u001b[39m.\u001b[39;49mto_numpy(), preds))\n\u001b[0;32m 12\u001b[0m correct_answers \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m 13\u001b[0m \u001b[39mfor\u001b[39;00m y_pred,y_true \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(preds,y_valid_ohe\u001b[39m.\u001b[39mto_numpy()):\n",
|
|
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:2310\u001b[0m, in \u001b[0;36mclassification_report\u001b[1;34m(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)\u001b[0m\n\u001b[0;32m 2195\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclassification_report\u001b[39m(\n\u001b[0;32m 2196\u001b[0m y_true,\n\u001b[0;32m 2197\u001b[0m y_pred,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2204\u001b[0m zero_division\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mwarn\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 2205\u001b[0m ):\n\u001b[0;32m 2206\u001b[0m \u001b[39m\"\"\"Build a text report showing the main classification metrics.\u001b[39;00m\n\u001b[0;32m 2207\u001b[0m \n\u001b[0;32m 2208\u001b[0m \u001b[39m Read more in the :ref:`User Guide <classification_report>`.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2307\u001b[0m \u001b[39m <BLANKLINE>\u001b[39;00m\n\u001b[0;32m 2308\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2310\u001b[0m y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 2312\u001b[0m \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 2313\u001b[0m labels \u001b[39m=\u001b[39m unique_labels(y_true, y_pred)\n",
|
|
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:86\u001b[0m, in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_check_targets\u001b[39m(y_true, y_pred):\n\u001b[0;32m 60\u001b[0m \u001b[39m\"\"\"Check that y_true and y_pred belong to the same classification task.\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \n\u001b[0;32m 62\u001b[0m \u001b[39m This converts multiclass or binary types to a common shape, and raises a\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 84\u001b[0m \u001b[39m y_pred : array or indicator matrix\u001b[39;00m\n\u001b[0;32m 85\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 86\u001b[0m check_consistent_length(y_true, y_pred)\n\u001b[0;32m 87\u001b[0m type_true \u001b[39m=\u001b[39m type_of_target(y_true, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 88\u001b[0m type_pred \u001b[39m=\u001b[39m type_of_target(y_pred, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_pred\u001b[39m\u001b[39m\"\u001b[39m)\n",
|
|
"File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:397\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[1;34m(*arrays)\u001b[0m\n\u001b[0;32m 395\u001b[0m uniques \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39munique(lengths)\n\u001b[0;32m 396\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(uniques) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m--> 397\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 398\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 399\u001b[0m \u001b[39m%\u001b[39m [\u001b[39mint\u001b[39m(l) \u001b[39mfor\u001b[39;00m l \u001b[39min\u001b[39;00m lengths]\n\u001b[0;32m 400\u001b[0m )\n",
|
|
"\u001b[1;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [431, 186]"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"fit_pipeline(X_train, y_train_ohe, preprocessor, model)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Submission"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"' submission = pd.DataFrame()\\nprediction = model.predict(x_test)\\nsubmission.insert(0, \"Id\", id_number, False)\\nsubmission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\\nsubmission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\\nsubmission.to_csv(\"/kaggle/working/submission.csv\",index = False) '"
|
|
]
|
|
},
|
|
"execution_count": 154,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"\"\"\" submission = pd.DataFrame()\n",
|
|
"prediction = model.predict(x_test)\n",
|
|
"submission.insert(0, \"Id\", id_number, False)\n",
|
|
"submission.insert(1, \"class_0\", [round(1-i[0],2) for i in prediction], True)\n",
|
|
"submission.insert(2, \"class_1\", [round(i[0],2) for i in prediction], True)\n",
|
|
"submission.to_csv(\"/kaggle/working/submission.csv\",index = False) \"\"\""
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|