{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Identify Age-Related Conditions: baseline classifier\n",
    "\n",
    "Loads the competition CSV files, imputes and one-hot encodes the features, then scores an XGBoost classifier with 5-fold cross-validation on the training split and with accuracy on a hold-out split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "from xgboost import XGBClassifier\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding train.csv, greeks.csv and test.csv; edit to match your machine.\n",
    "DATA_DIR = Path(r\"C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\")\n",
    "\n",
    "# Seed shared by the train/validation split and the model.\n",
    "SEED = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data loading and preprocessing helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(data_dir: str | Path = DATA_DIR) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
    "    \"\"\"Read the three competition CSV files.\n",
    "\n",
    "    Args:\n",
    "        data_dir: Directory containing ``train.csv``, ``greeks.csv`` and ``test.csv``.\n",
    "            Defaults to ``DATA_DIR`` from the configuration cell.\n",
    "\n",
    "    Returns:\n",
    "        The ``(train, greeks, test)`` DataFrames, in that order.\n",
    "    \"\"\"\n",
    "    data_dir = Path(data_dir)\n",
    "    train = pd.read_csv(data_dir / \"train.csv\")\n",
    "    greeks = pd.read_csv(data_dir / \"greeks.csv\")\n",
    "    test = pd.read_csv(data_dir / \"test.csv\")\n",
    "    return train, greeks, test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_data(df: pd.DataFrame, split: float, random_state: int = 42) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:\n",
    "    \"\"\"Separate the \"Class\" target from the features and split both into train/validation parts.\n",
    "\n",
    "    Args:\n",
    "        df: Frame holding the feature columns plus a \"Class\" column.\n",
    "        split: Forwarded to ``train_test_split`` as ``test_size``.\n",
    "        random_state: Seed for the shuffle so the split is reproducible.\n",
    "\n",
    "    Returns:\n",
    "        ``(X_train, X_valid, y_train, y_valid)``.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If ``df`` has no \"Class\" column.\n",
    "    \"\"\"\n",
    "    X = df.drop(columns=\"Class\")\n",
    "    y = df[\"Class\"]\n",
    "    # train_test_split returns a list; convert so the result matches the annotation.\n",
    "    return tuple(train_test_split(X, y, test_size=split, random_state=random_state))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_pipeline(df: pd.DataFrame, max_categories: int = 10) -> ColumnTransformer:\n",
    "    \"\"\"Build the preprocessing step for the columns found in ``df``.\n",
    "\n",
    "    Numeric columns are imputed with a constant; non-numeric columns with fewer than\n",
    "    ``max_categories`` distinct values are imputed with their most frequent value and\n",
    "    one-hot encoded. Columns matching neither rule are not listed in the transformer.\n",
    "\n",
    "    Args:\n",
    "        df: Feature frame used to decide which columns go to which transformer.\n",
    "        max_categories: Exclusive upper bound on distinct values for a non-numeric\n",
    "            column to be treated as categorical.\n",
    "\n",
    "    Returns:\n",
    "        An unfitted ``ColumnTransformer`` with a 'num' and a 'cat' branch.\n",
    "    \"\"\"\n",
    "    # Preprocessing for numerical data.\n",
    "    # strategy='constant' with the default fill_value imputes 0 for numeric input.\n",
    "    numerical_transformer = SimpleImputer(strategy='constant')\n",
    "\n",
    "    # Preprocessing for categorical data\n",
    "    categorical_transformer = Pipeline(steps=[\n",
    "        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
    "        ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
    "    ])\n",
    "\n",
    "    # Bundle preprocessing for numerical and categorical data.\n",
    "    numerical_cols = df.select_dtypes(include=\"number\").columns.tolist()\n",
    "    # Only non-numeric columns qualify as categorical, so a numeric column with few\n",
    "    # distinct values is not sent through both branches.\n",
    "    categorical_cols = [\n",
    "        cname for cname in df.columns\n",
    "        if cname not in numerical_cols and df[cname].nunique() < max_categories\n",
    "    ]\n",
    "    preprocessor = ColumnTransformer(\n",
    "        transformers=[\n",
    "            ('num', numerical_transformer, numerical_cols),\n",
    "            ('cat', categorical_transformer, categorical_cols)\n",
    "        ])\n",
    "    return preprocessor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the model pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define model\n",
    "# Alternative baseline: RandomForestClassifier(n_estimators=100, random_state=22)\n",
    "model = XGBClassifier(n_estimators=500, random_state=SEED)\n",
    "\n",
    "# Splitting\n",
    "train, greeks, test = load_dataset()\n",
    "print(f\"dataset shape: {train.shape}\")\n",
    "X_train, X_valid, y_train, y_valid = split_data(train, 0.3, random_state=SEED)\n",
    "print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
    "\n",
    "# Building Pipeline\n",
    "preprocessor = build_pipeline(X_train)\n",
    "pipeline = Pipeline(steps=[\n",
    "    ('preprocessor', preprocessor),\n",
    "    ('model', model),\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit and evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cross-validate on the training split only: cross_val_score refits clones of the\n",
    "# pipeline on each fold, so the validation split stays untouched for the final check.\n",
    "score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')\n",
    "print(f\"Accuracy of {score}\")\n",
    "\n",
    "# Fit on the full training split and score the held-out validation rows.\n",
    "pipeline.fit(X_train, y_train)\n",
    "preds = pipeline.predict(X_valid)\n",
    "print(f\"Hold-out accuracy: {accuracy_score(y_valid, preds):.4f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}