diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..94cb22e 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -35,19 +35,26 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#Libraries\n", + "# Running this after several \"not defined\" error messages: \n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "# First things first" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -200,7 +207,7 @@ "4 True " ] }, - "execution_count": 2, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -221,11 +228,46 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# Keeping these in just one block for simplicity\n", + "# Target\n", + "y = spaceship[\"Transported\"].astype(int)\n", + "\n", + "# Features\n", + "X = spaceship.drop(columns=[\"Transported\", \"PassengerId\", \"Name\"])\n", + "\n", + "# Feature engineering\n", + "X[\"Deck\"] = X[\"Cabin\"].astype(str).str[0]\n", + "X = X.drop(columns=[\"Cabin\"])\n", + "\n", + "# Column types\n", + "num_cols = X.select_dtypes(include=[\"int64\", \"float64\"]).columns\n", + "cat_cols = X.select_dtypes(include=[\"object\", \"bool\"]).columns\n", + "\n", + "# Pipelines\n", + "num_pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler())\n", + "])\n", + "\n", + "cat_pipeline = Pipeline([\n", + " 
(\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "# Preprocessor\n", + "preprocessor = ColumnTransformer([\n", + " (\"num\", num_pipeline, num_cols),\n", + " (\"cat\", cat_pipeline, cat_cols)\n", + "])\n", + "\n", + "# Split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42, stratify=y\n", + ")\n" ] }, { @@ -237,11 +279,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "param_grid = {\n", + " \"model__n_estimators\": [100, 200],\n", + " \"model__learning_rate\": [0.05, 0.1],\n", + " \"model__max_depth\": [3, 5]\n", + "}" ] }, { @@ -253,11 +300,35 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.8039102932719954\n", + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.77 0.80 863\n", + " 1 0.79 0.83 0.81 876\n", + "\n", + " accuracy 0.80 1739\n", + " macro avg 0.80 0.80 0.80 1739\n", + "weighted avg 0.80 0.80 0.80 1739\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "\n", + "gb_model.fit(X_train, y_train)\n", + "\n", + "y_pred = gb_model.predict(X_test)\n", + "\n", + "print(\"Baseline accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))" ] }, { @@ -283,11 +354,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Baseline accuracy: 0.8039102932719954\n", + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.77 0.80 863\n", + " 1 0.79 0.83 0.81 876\n", + "\n", + " accuracy 0.80 1739\n", + " macro avg 
0.80 0.80 0.80 1739\n", + "weighted avg 0.80 0.80 0.80 1739\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "\n", + "gb_model.fit(X_train, y_train)\n", + "\n", + "y_pred = gb_model.predict(X_test)\n", + "\n", + "print(\"Baseline accuracy:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))\n" ] }, { @@ -299,10 +394,1787 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+ " estimator=Pipeline(steps=[('prep',\n",
+ " ColumnTransformer(transformers=[('num',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='median')),\n",
+ " ('scaler',\n",
+ " StandardScaler())]),\n",
+ " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n",
+ " ('cat',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='most_frequent')),\n",
+ " ('onehot',\n",
+ " OneHotEncoder(handle_unknown='ignore'))]),\n",
+ " Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck'], dtype='object'))])),\n",
+ " ('model',\n",
+ " GradientBoostingClassifier(random_state=42))]),\n",
+ " n_jobs=-1,\n",
+ " param_grid={'model__learning_rate': [0.05, 0.1],\n",
+ " 'model__max_depth': [3, 5],\n",
+ " 'model__n_estimators': [100, 200]},\n",
+ " scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck'], dtype='object')