diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..bea1f5a 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -35,178 +35,31 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Libraries\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split" + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from sklearn.model_selection import GridSearchCV\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PassengerIdHomePlanetCryoSleepCabinDestinationAgeVIPRoomServiceFoodCourtShoppingMallSpaVRDeckNameTransported
00001_01EuropaFalseB/0/PTRAPPIST-1e39.0False0.00.00.00.00.0Maham OfracculyFalse
10002_01EarthFalseF/0/STRAPPIST-1e24.0False109.09.025.0549.044.0Juanna VinesTrue
20003_01EuropaFalseA/0/STRAPPIST-1e58.0True43.03576.00.06715.049.0Altark SusentFalse
30003_02EuropaFalseA/0/STRAPPIST-1e33.0False0.01283.0371.03329.0193.0Solam SusentFalse
40004_01EarthFalseF/1/STRAPPIST-1e16.0False303.070.0151.0565.02.0Willy SantantinesTrue
\n", - "
" - ], - "text/plain": [ - " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n", - "0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False \n", - "1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False \n", - "2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True \n", - "3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False \n", - "4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False \n", - "\n", - " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n", - "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n", - "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n", - "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n", - "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n", - "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n", - "\n", - " Transported \n", - "0 False \n", - "1 True \n", - "2 False \n", - "3 False \n", - "4 True " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "spaceship = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\")\n", + "spaceship = pd.read_csv(\n", + " \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + ")\n", + "\n", "spaceship.head()" ] }, @@ -221,12 +74,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "#your code here" - ] + "source": [] }, { "cell_type": "markdown", @@ -241,30 +92,152 @@ "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "\n", + "X = spaceship.drop(\"Transported\", axis=1)\n", + "y = spaceship[\"Transported\"]\n", + "\n", + "# Eliminar columnas no numéricas / identificadores\n", + "X = X.drop([\"PassengerId\", \"Name\", \"Cabin\"], axis=1)\n", + "\n", + "# One-hot encoding\n", + "X = pd.get_dummies(X, drop_first=True)\n", + "\n", + "# Train / Test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, 
random_state=42\n", + ")\n", + "\n", + "\n", + "# Escalado\n", + "scaler = StandardScaler()\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)\n", + "\n", + "# Modelo base\n", + "log_reg = LogisticRegression(max_iter=1000)\n", + "log_reg.fit(X_train_scaled, y_train)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Evaluate your model" + "- Evaluate your model\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy modelo optimizado: 0.7768832662449684\n", + " precision recall f1-score support\n", + "\n", + " False 0.79 0.74 0.77 861\n", + " True 0.76 0.81 0.79 878\n", + "\n", + " accuracy 0.78 1739\n", + " macro avg 0.78 0.78 0.78 1739\n", + "weighted avg 0.78 0.78 0.78 1739\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "# =========================\n", + "# 1. Cargar datos\n", + "# =========================\n", + "spaceship = pd.read_csv(\n", + " \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + ")\n", + "\n", + "# =========================\n", + "# 2. 
Separar X e y\n", + "# =========================\n", + "X = spaceship.drop(\"Transported\", axis=1)\n", + "y = spaceship[\"Transported\"]\n", + "\n", + "# =========================\n", + "# 3. Eliminar columnas irrelevantes\n", + "# =========================\n", + "X = X.drop([\"PassengerId\", \"Name\", \"Cabin\"], axis=1)\n", + "\n", + "# =========================\n", + "# 4. One-hot encoding\n", + "# =========================\n", + "X = pd.get_dummies(X, drop_first=True)\n", + "\n", + "# =========================\n", + "# 5. Train / Test split\n", + "# =========================\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "# =========================\n", + "# 6. Pipeline\n", + "# =========================\n", + "pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler()),\n", + " (\"model\", LogisticRegression(max_iter=1000))\n", + "])\n", + "\n", + "# =========================\n", + "# 7. Grid Search\n", + "# =========================\n", + "param_grid = {\n", + " \"model__C\": [0.01, 0.1, 1, 10]\n", + "}\n", + "\n", + "grid_search = GridSearchCV(\n", + " pipeline,\n", + " param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "# =========================\n", + "# 8. Evaluación final\n", + "# =========================\n", + "best_model = grid_search.best_estimator_\n", + "y_pred = best_model.predict(X_test)\n", + "\n", + "print(\"Accuracy modelo optimizado:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Grid/Random Search**" + "**Grid/Random Search**\n", + "\n", + "\n" ] }, { @@ -278,7 +251,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- Define hyperparameters to fine tune." 
+ "- Define hyperparameters to fine tune.\n", + "\n" ] }, { @@ -287,7 +261,8 @@ "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "\n" ] }, { @@ -299,10 +274,89 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy modelo optimizado: 0.7768832662449684\n", + " precision recall f1-score support\n", + "\n", + " False 0.79 0.74 0.77 861\n", + " True 0.76 0.81 0.79 878\n", + "\n", + " accuracy 0.78 1739\n", + " macro avg 0.78 0.78 0.78 1739\n", + "weighted avg 0.78 0.78 0.78 1739\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "# 1. Cargar datos\n", + "spaceship = pd.read_csv(\n", + " \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\"\n", + ")\n", + "\n", + "# 2. Separar X e y\n", + "X = spaceship.drop(\"Transported\", axis=1)\n", + "y = spaceship[\"Transported\"]\n", + "\n", + "# 3. Eliminar columnas irrelevantes\n", + "X = X.drop([\"PassengerId\", \"Name\", \"Cabin\"], axis=1)\n", + "\n", + "# 4. One-hot encoding\n", + "X = pd.get_dummies(X, drop_first=True)\n", + "\n", + "# 5. Train / Test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "# 6. 
Pipeline\n", + "pipeline = Pipeline([\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"scaler\", StandardScaler()),\n", + " (\"model\", LogisticRegression(max_iter=1000))\n", + "])\n", + "\n", + "# 7. Grid Search\n", + "param_grid = {\n", + " \"model__C\": [0.01, 0.1, 1, 10]\n", + "}\n", + "\n", + "grid_search = GridSearchCV(\n", + " pipeline,\n", + " param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "# 8. Evaluación final\n", + "best_model = grid_search.best_estimator_\n", + "y_pred = best_model.predict(X_test)\n", + "\n", + "print(\"Accuracy modelo optimizado:\", accuracy_score(y_test, y_pred))\n", + "print(classification_report(y_test, y_pred))\n", + "\n", + "\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -313,10 +367,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tuned Accuracy: 0.6716503737780334\n", + " precision recall f1-score support\n", + "\n", + " False 0.66 0.69 0.68 861\n", + " True 0.68 0.65 0.67 878\n", + "\n", + " accuracy 0.67 1739\n", + " macro avg 0.67 0.67 0.67 1739\n", + "weighted avg 0.67 0.67 0.67 1739\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\deysi.galvez\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\utils\\validation.py:2691: UserWarning: X does not have valid feature names, but SimpleImputer was fitted with feature names\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "best_model = grid_search.best_estimator_\n", + "\n", + "y_pred_best = best_model.predict(X_test)\n", + "\n", + "print(\"Tuned Accuracy:\", accuracy_score(y_test, y_pred_best))\n", + "print(classification_report(y_test, y_pred_best))\n" + ] } ], "metadata": { @@ -335,7 +421,7 @@ "name": "python", &#13;
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.9" } }, "nbformat": 4,