{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset() -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
    "    train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n",
    "    greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n",
    "    test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n",
    "    train.head()\n",
    "    return (train, greeks, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
    "    X = df.loc[:, df.columns != \"Class\"]\n",
    "    y = df.loc[:, \"Class\"]\n",
    "    return train_test_split(X, y, test_size=split, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
    "    # Preprocessing for numerical data    \n",
    "    numerical_transformer = SimpleImputer(strategy='constant')\n",
    "    \n",
    "    # Preprocessing for categorical data\n",
    "    categorical_transformer = Pipeline(steps=[\n",
    "        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
    "        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
    "    ])\n",
    "\n",
    "    # Bundle preprocessing for numerical and categorical data\n",
    "    numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
    "    categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            ('num', numerical_transformer, numerical_cols),\n",
    "            ('cat', categorical_transformer, categorical_cols)\n",
    "        ])\n",
    "    return preprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset shape: (617, 58)\n",
      "X shape: (431, 57) and y shape: (431,)\n"
     ]
    }
   ],
   "source": [
    "# Define model\n",
    "model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
    "\n",
    "#Splitting\n",
    "train, greeks, test = load_dataset()\n",
    "print(f\"dataset shape: {train.shape}\")\n",
    "X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
    "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
    "\n",
    "# Building Pipeline\n",
    "preprocessor = build_pipeline(X_train)\n",
    "pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                        ('model', model)\n",
    "                        ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "A given column is not a column of the dataframe",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3801\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 3802\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_engine\u001b[39m.\u001b[39;49mget_loc(casted_key)\n\u001b[0;32m   3803\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\pandas\\_libs\\index.pyx:138\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\pandas\\_libs\\index.pyx:165\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5745\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5753\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'Class'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\__init__.py:448\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m    447\u001b[0m \u001b[39mfor\u001b[39;00m col \u001b[39min\u001b[39;00m columns:\n\u001b[1;32m--> 448\u001b[0m     col_idx \u001b[39m=\u001b[39m all_columns\u001b[39m.\u001b[39;49mget_loc(col)\n\u001b[0;32m    449\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(col_idx, numbers\u001b[39m.\u001b[39mIntegral):\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3804\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3803\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n\u001b[1;32m-> 3804\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(key) \u001b[39mfrom\u001b[39;00m \u001b[39merr\u001b[39;00m\n\u001b[0;32m   3805\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m   3806\u001b[0m     \u001b[39m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m   3807\u001b[0m     \u001b[39m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m   3808\u001b[0m     \u001b[39m#  the TypeError.\u001b[39;00m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'Class'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[74], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m pipeline\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n\u001b[0;32m      2\u001b[0m preds \u001b[39m=\u001b[39m pipeline\u001b[39m.\u001b[39mpredict(X_valid)\n\u001b[0;32m      3\u001b[0m score \u001b[39m=\u001b[39m accuracy_score(y_valid, preds)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\pipeline.py:401\u001b[0m, in \u001b[0;36mPipeline.fit\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    375\u001b[0m \u001b[39m\"\"\"Fit the model.\u001b[39;00m\n\u001b[0;32m    376\u001b[0m \n\u001b[0;32m    377\u001b[0m \u001b[39mFit all the transformers one after the other and transform the\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    398\u001b[0m \u001b[39m    Pipeline with fitted steps.\u001b[39;00m\n\u001b[0;32m    399\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    400\u001b[0m fit_params_steps \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_fit_params(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mfit_params)\n\u001b[1;32m--> 401\u001b[0m Xt \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fit(X, y, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mfit_params_steps)\n\u001b[0;32m    402\u001b[0m \u001b[39mwith\u001b[39;00m _print_elapsed_time(\u001b[39m\"\u001b[39m\u001b[39mPipeline\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_log_message(\u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msteps) \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m)):\n\u001b[0;32m    403\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_final_estimator \u001b[39m!=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mpassthrough\u001b[39m\u001b[39m\"\u001b[39m:\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\pipeline.py:359\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, **fit_params_steps)\u001b[0m\n\u001b[0;32m    357\u001b[0m     cloned_transformer \u001b[39m=\u001b[39m clone(transformer)\n\u001b[0;32m    358\u001b[0m \u001b[39m# Fit or load from cache the current transformer\u001b[39;00m\n\u001b[1;32m--> 359\u001b[0m X, fitted_transformer \u001b[39m=\u001b[39m fit_transform_one_cached(\n\u001b[0;32m    360\u001b[0m     cloned_transformer,\n\u001b[0;32m    361\u001b[0m     X,\n\u001b[0;32m    362\u001b[0m     y,\n\u001b[0;32m    363\u001b[0m     \u001b[39mNone\u001b[39;00m,\n\u001b[0;32m    364\u001b[0m     message_clsname\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mPipeline\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m    365\u001b[0m     message\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_log_message(step_idx),\n\u001b[0;32m    366\u001b[0m     \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mfit_params_steps[name],\n\u001b[0;32m    367\u001b[0m )\n\u001b[0;32m    368\u001b[0m \u001b[39m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m    369\u001b[0m \u001b[39m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m    370\u001b[0m \u001b[39m# from the cache.\u001b[39;00m\n\u001b[0;32m    371\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msteps[step_idx] \u001b[39m=\u001b[39m (name, fitted_transformer)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\joblib\\memory.py:349\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    348\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m--> 349\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfunc(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\pipeline.py:893\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, **fit_params)\u001b[0m\n\u001b[0;32m    891\u001b[0m \u001b[39mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m    892\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(transformer, \u001b[39m\"\u001b[39m\u001b[39mfit_transform\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m--> 893\u001b[0m         res \u001b[39m=\u001b[39m transformer\u001b[39m.\u001b[39mfit_transform(X, y, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mfit_params)\n\u001b[0;32m    894\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[0;32m    895\u001b[0m         res \u001b[39m=\u001b[39m transformer\u001b[39m.\u001b[39mfit(X, y, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mfit_params)\u001b[39m.\u001b[39mtransform(X)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\_set_output.py:142\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m    140\u001b[0m \u001b[39m@wraps\u001b[39m(f)\n\u001b[0;32m    141\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mwrapped\u001b[39m(\u001b[39mself\u001b[39m, X, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m--> 142\u001b[0m     data_to_wrap \u001b[39m=\u001b[39m f(\u001b[39mself\u001b[39m, X, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m    143\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(data_to_wrap, \u001b[39mtuple\u001b[39m):\n\u001b[0;32m    144\u001b[0m         \u001b[39m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m    145\u001b[0m         \u001b[39mreturn\u001b[39;00m (\n\u001b[0;32m    146\u001b[0m             _wrap_data_with_container(method, data_to_wrap[\u001b[39m0\u001b[39m], X, \u001b[39mself\u001b[39m),\n\u001b[0;32m    147\u001b[0m             \u001b[39m*\u001b[39mdata_to_wrap[\u001b[39m1\u001b[39m:],\n\u001b[0;32m    148\u001b[0m         )\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py:724\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m    722\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_n_features(X, reset\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m    723\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_transformers()\n\u001b[1;32m--> 724\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_column_callables(X)\n\u001b[0;32m    725\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_remainder(X)\n\u001b[0;32m    727\u001b[0m result \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_fit_transform(X, y, _fit_transform_one)\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py:426\u001b[0m, in \u001b[0;36mColumnTransformer._validate_column_callables\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    424\u001b[0m         columns \u001b[39m=\u001b[39m columns(X)\n\u001b[0;32m    425\u001b[0m     all_columns\u001b[39m.\u001b[39mappend(columns)\n\u001b[1;32m--> 426\u001b[0m     transformer_to_input_indices[name] \u001b[39m=\u001b[39m _get_column_indices(X, columns)\n\u001b[0;32m    428\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_columns \u001b[39m=\u001b[39m all_columns\n\u001b[0;32m    429\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_transformer_to_input_indices \u001b[39m=\u001b[39m transformer_to_input_indices\n",
      "File \u001b[1;32mc:\\Users\\yann.MSI\\anaconda3\\lib\\site-packages\\sklearn\\utils\\__init__.py:456\u001b[0m, in \u001b[0;36m_get_column_indices\u001b[1;34m(X, key)\u001b[0m\n\u001b[0;32m    453\u001b[0m             column_indices\u001b[39m.\u001b[39mappend(col_idx)\n\u001b[0;32m    455\u001b[0m     \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m--> 456\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mA given column is not a column of the dataframe\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[0;32m    458\u001b[0m     \u001b[39mreturn\u001b[39;00m column_indices\n\u001b[0;32m    459\u001b[0m \u001b[39melse\u001b[39;00m:\n",
      "\u001b[1;31mValueError\u001b[0m: A given column is not a column of the dataframe"
     ]
    }
   ],
   "source": [
    "pipeline.fit(X_train, y_train)\n",
    "preds = pipeline.predict(X_valid)\n",
    "score = accuracy_score(y_valid, preds)\n",
    "print(f\"{score} out of {len(y_valid)} are right\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}