152 lines
4.7 KiB
Plaintext
152 lines
4.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 65,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"from xgboost import XGBClassifier\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 66,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def load_dataset() -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
|
|
" train = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\train.csv\")\n",
|
|
" greeks = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\greeks.csv\")\n",
|
|
" test = pd.read_csv(\"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\\\\test.csv\")\n",
|
|
" train.head()\n",
|
|
" return (train, greeks, test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 67,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):\n",
|
|
" X = df.loc[:, df.columns != \"Class\"]\n",
|
|
" y = df.loc[:, \"Class\"]\n",
|
|
" return train_test_split(X, y, test_size=split, random_state=42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 68,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def build_pipeline(df: pd.DataFrame) -> ColumnTransformer:\n",
|
|
" # Preprocessing for numerical data \n",
|
|
" numerical_transformer = SimpleImputer(strategy='constant')\n",
|
|
" \n",
|
|
" # Preprocessing for categorical data\n",
|
|
" categorical_transformer = Pipeline(steps=[\n",
|
|
" ('imputer', SimpleImputer(strategy='most_frequent')),\n",
|
|
" ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
|
|
" ])\n",
|
|
"\n",
|
|
" # Bundle preprocessing for numerical and categorical data\n",
|
|
" numerical_cols = [cname for cname in df.columns if df[cname].dtype in [\"int64\", \"float64\"]]\n",
|
|
" categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]\n",
|
|
" preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', numerical_transformer, numerical_cols),\n",
|
|
" ('cat', categorical_transformer, categorical_cols)\n",
|
|
" ])\n",
|
|
" return preprocessor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 69,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"dataset shape: (617, 58)\n",
|
|
"X shape: (431, 57) and y shape: (431,)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Define model\n",
|
|
"#model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
|
|
"model = XGBClassifier(n_estimators=500)\n",
|
|
"\n",
|
|
"#Splitting\n",
|
|
"train, greeks, test = load_dataset()\n",
|
|
"print(f\"dataset shape: {train.shape}\")\n",
|
|
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
|
|
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")\n",
|
|
"\n",
|
|
"# Building Pipeline\n",
|
|
"preprocessor = build_pipeline(X_train)\n",
|
|
"pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
|
|
" ('model', model)\n",
|
|
" ])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 70,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Accuracy of [0.94736842 0.89189189 0.91891892 0.97297297 0.89189189]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#Fit the Model and make preds\n",
|
|
"pipeline.fit(X_train, y_train)\n",
|
|
"preds = pipeline.predict(X_valid)\n",
|
|
"score = cross_val_score(pipeline, X_valid, y_valid, cv=5, scoring='accuracy')\n",
|
|
"print(f\"Accuracy of {score}\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|