# %% [markdown]
# ## Imports and configuration
# Every import this section needs lives in one cell so a fresh kernel can run
# the section top to bottom.

# %%
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Ensemble estimators
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# %%
# Tunable constants for the whole preprocessing / baseline stage.
SEED = 0            # shared by the train/test split and the baseline forest
TEST_SIZE = 0.20    # fraction of rows held out for evaluation
ID_COLUMNS = ["PassengerId", "Name"]
CATEGORICAL_COLUMNS = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]
TARGET_COLUMN = "Transported"
DROPPED_FEATURES = ["Age", "VIP", "FoodCourt", "ShoppingMall"]

# %% [markdown]
# ## Clean and encode

# %%
def encode_spaceship(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``raw`` with label-encoded categoricals.

    Steps: drop rows with any missing value, drop the identifier columns,
    reduce ``Cabin`` to its first character, then label-encode each column in
    ``CATEGORICAL_COLUMNS`` with its own ``LabelEncoder``.

    ``raw`` is never modified, so the calling cell can be re-run safely.
    """
    encoded = raw.dropna().drop(columns=ID_COLUMNS)
    # Only the first character of the cabin string is kept as a feature.
    encoded["Cabin"] = encoded["Cabin"].str[0]
    for column in CATEGORICAL_COLUMNS:
        # A fresh encoder per column: the codes are independent between columns.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded


def split_and_scale(features: pd.DataFrame, target: pd.Series,
                    test_size: float = TEST_SIZE, seed: int = SEED):
    """Split into train/test and MinMax-scale the features.

    The scaler is fitted on the training rows only and then applied to both
    partitions, so no test-set statistics leak into training.

    Returns
    -------
    (X_train, X_test, y_train, y_test, X_train_norm, X_test_norm, scaler)
        The ``*_norm`` frames keep the column names and the row index of the
        unscaled frames, so they stay aligned with ``y_train`` / ``y_test``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=seed
    )
    scaler = MinMaxScaler().fit(X_train)
    X_train_norm = pd.DataFrame(
        scaler.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_norm = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )
    return X_train, X_test, y_train, y_test, X_train_norm, X_test_norm, scaler

# %%
# `spaceship` is the raw frame loaded earlier in the notebook (outside this
# section). It is left untouched; all cleaning goes into a new name.
spaceship_encoded = encode_spaceship(spaceship)

features = spaceship_encoded.drop(columns=[TARGET_COLUMN])
target = spaceship_encoded[TARGET_COLUMN]

# %%
# Sanity check: list any feature column that is still non-numeric.
# Expected to be empty after encoding -- confirm when running.
features.select_dtypes(exclude="number").columns.tolist()

# %% [markdown]
# ## Feature selection, split and scaling

# %%
features2 = features.drop(columns=DROPPED_FEATURES)

(X_train, X_test, y_train, y_test,
 X_train_norm, X_test_norm, normalizer) = split_and_scale(features2, target)

# %% [markdown]
# - Now let's use the best model we got so far in order to see how it can improve when we fine-tune its hyperparameters.

# %%
# Baseline random forest; seeded so the baseline score is reproducible.
forest = RandomForestClassifier(n_estimators=100,
                                max_depth=15,
                                random_state=SEED)
RandomForestClassifier(max_depth=15)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# %% [markdown]
# ## Baseline model

# %%
forest.fit(X_train_norm, y_train)

# %%
# `Transported` is a class label, so the model is scored with classification
# metrics. (On 0/1 labels MAE is just the misclassification rate, and R2 is
# not a meaningful score for a classifier.)
from sklearn.metrics import accuracy_score, classification_report

pred = forest.predict(X_test_norm)
print("Accuracy: ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

# %% [markdown]
# ## Hyperparameter search setup

# %%
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# %%
# Unfitted template that the grid search clones for every candidate.
# It gets its own name so the fitted baseline `forest` above is not overwritten.
# Note: the first positional argument of RandomForestClassifier is
# `n_estimators`, so an estimator object must not be passed there.
forest_search_base = RandomForestClassifier(random_state=0)

# %%
# 5 x 5 x 5 = 125 candidates, each cross-validated 5 times by the search below.
grid = {"n_estimators": [300, 400, 550, 750, 1000],
        "max_leaf_nodes": [15, 30, 50, 60, 75],
        "max_depth": [20, 30, 40, 50, 60]}

# %%
# n_jobs=-1 uses every available core. The fit itself happens in the next cell.
model = GridSearchCV(estimator=forest_search_base,
                     param_grid=grid,
                     cv=5,
                     n_jobs=-1)
"c:\\Users\\rcrds\\anaconda3\\envs\\MyEnvironment\\Lib\\site-packages\\joblib\\externals\\loky\\process_executor.py:782: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=RandomForestClassifier(n_estimators=DecisionTreeClassifier()),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'max_depth': [20, 30, 40, 50, 60],\n",
+       "                         'max_leaf_nodes': [15, 30, 50, 60, 75],\n",
+       "                         'n_estimators': [300, 400, 550, 750, 1000]})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# %% [markdown]
# ## Run the grid search

# %%
# Fits every candidate in the parameter grid with cross-validation on the
# scaled training data; this is the expensive cell of the notebook.
model.fit(X=X_train_norm, y=y_train)

# %%
# Hyperparameter combination with the best mean cross-validated score.
model.best_params_
RandomForestClassifier(max_depth=50, max_leaf_nodes=75, n_estimators=550)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# %% [markdown]
# ## Evaluate the tuned model

# %%
# Estimator refitted by the search using the best parameter combination.
best_model = model.best_estimator_
best_model

# %%
pred = best_model.predict(X_test_norm)

# %%
# Classification metrics, because the target is a class label.
# In the originally recorded run the misclassification rate fell only from
# about 0.2209 (baseline) to about 0.2194 (tuned), i.e. accuracy rose from
# roughly 0.779 to 0.781.
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy: ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))