random forest model
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 116,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 117,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_dataset(\n",
"    data_dir: str = \"C:\\\\Projects\\\\kaggle\\\\competitions\\\\identify-age-related-conditions\\\\data\",\n",
") -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
"    \"\"\"Read the competition CSV files and return them as data frames.\n",
"\n",
"    Args:\n",
"        data_dir: Directory holding ``train.csv``, ``greeks.csv`` and\n",
"            ``test.csv``. Defaults to the location the notebook was\n",
"            written against.\n",
"\n",
"    Returns:\n",
"        The ``(train, greeks, test)`` data frames, in that order.\n",
"    \"\"\"\n",
"    from pathlib import Path\n",
"\n",
"    base = Path(data_dir)\n",
"    train = pd.read_csv(base / \"train.csv\")\n",
"    greeks = pd.read_csv(base / \"greeks.csv\")\n",
"    test = pd.read_csv(base / \"test.csv\")\n",
"    return train, greeks, test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 118,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def split_data(df: pd.DataFrame, split) -> list:\n",
"    \"\"\"Split ``df`` into training and validation subsets.\n",
"\n",
"    Args:\n",
"        df: Data frame that contains a ``Class`` column.\n",
"        split: Forwarded to ``train_test_split`` as ``test_size``.\n",
"\n",
"    Returns:\n",
"        ``[X_train, X_valid, y_train, y_valid]`` as produced by\n",
"        ``train_test_split`` with ``random_state=42``.\n",
"    \"\"\"\n",
"    # NOTE(review): ``.loc`` label slices include the end label, so X still\n",
"    # carries the ``Class`` column. It must be excluded before the features\n",
"    # reach a model, otherwise the label leaks into training.\n",
"    X = df.loc[:, :\"Class\"]\n",
"    y = df.loc[:, \"Class\"]\n",
"    return train_test_split(X, y, test_size=split, random_state=42)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_pipeline(df: pd.DataFrame, target: str = \"Class\") -> ColumnTransformer:\n",
"    \"\"\"Build the preprocessing transformer for the feature columns of ``df``.\n",
"\n",
"    Numeric columns are imputed with a constant. Non-numeric columns with\n",
"    fewer than 10 distinct values are imputed with their most frequent value\n",
"    and one-hot encoded. Columns that fall in neither group are not listed\n",
"    in the transformer.\n",
"\n",
"    Args:\n",
"        df: Frame whose columns and dtypes drive the column selection.\n",
"        target: Label column to keep out of the features when it is present.\n",
"\n",
"    Returns:\n",
"        An unfitted ``ColumnTransformer``.\n",
"    \"\"\"\n",
"    # Never hand the label to the model, even when the caller passes a frame\n",
"    # that still contains it.\n",
"    features = df.drop(columns=[target], errors=\"ignore\")\n",
"\n",
"    # Preprocessing for numerical data (fill_value is left at its default)\n",
"    numerical_transformer = SimpleImputer(strategy='constant')\n",
"\n",
"    # Preprocessing for categorical data\n",
"    categorical_transformer = Pipeline(steps=[\n",
"        ('imputer', SimpleImputer(strategy='most_frequent')),\n",
"        ('onehot', OneHotEncoder(handle_unknown='ignore')),\n",
"    ])\n",
"\n",
"    numerical_cols = features.select_dtypes(include=\"number\").columns.tolist()\n",
"    numeric_names = set(numerical_cols)\n",
"    # Only non-numeric columns are one-hot encoded, so no column is emitted\n",
"    # twice (once imputed and once encoded).\n",
"    categorical_cols = [\n",
"        cname\n",
"        for cname in features.columns\n",
"        if cname not in numeric_names and features[cname].nunique() < 10\n",
"    ]\n",
"\n",
"    # Bundle preprocessing for numerical and categorical data\n",
"    preprocessor = ColumnTransformer(\n",
"        transformers=[\n",
"            ('num', numerical_transformer, numerical_cols),\n",
"            ('cat', categorical_transformer, categorical_cols),\n",
"        ])\n",
"    return preprocessor"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 120,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"X shape: (61, 58) and y shape: (61,)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Define model\n",
"model = RandomForestClassifier(n_estimators=100, random_state=22)\n",
"\n",
"# Load the data and hold out 30% of the rows for validation\n",
"train, greeks, test = load_dataset()\n",
"X_train, X_valid, y_train, y_valid = split_data(train, 0.3)\n",
"\n",
"# Building Pipeline\n",
"# The preprocessor is derived from the training features only, with the\n",
"# label removed, so the model can never be fed the \"Class\" column.\n",
"preprocessor = build_pipeline(X_train.drop(columns=[\"Class\"], errors=\"ignore\"))\n",
"pipeline = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor),\n",
"    ('model', model),\n",
"])\n",
"\n",
"print(f\"X shape: {X_train.shape} and y shape: {y_train.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 121,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"556 out of 556 are right\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Fit on the training split, then predict the held-out rows.\n",
"preds = pipeline.fit(X_train, y_train).predict(X_valid)\n",
"\n",
"# normalize=False yields the number of correct predictions, not a ratio.\n",
"score = accuracy_score(y_true=y_valid, y_pred=preds, normalize=False)\n",
"print(f\"{score} out of {len(y_valid)} are right\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user