From 2baf778b14b9ac4ec6594bc507d00ebb5489be93 Mon Sep 17 00:00:00 2001 From: slmj1990-ai Date: Thu, 26 Feb 2026 12:56:53 +0100 Subject: [PATCH] lab tuning --- lab-hyper-tuning.ipynb | 3538 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 3524 insertions(+), 14 deletions(-) diff --git a/lab-hyper-tuning.ipynb b/lab-hyper-tuning.ipynb index 847d487..d4be8f3 100644 --- a/lab-hyper-tuning.ipynb +++ b/lab-hyper-tuning.ipynb @@ -219,13 +219,40 @@ "- Feature Selection\n" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#separarvariable objetivo\n", + "y = spaceship[\"Transported\"]\n", + "X = spaceship.drop(\"Transported\", axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#seleccionr variables numéricas \n", + "\n", + "num_cols = X.select_dtypes(include=[\"int64\", \"float64\"]).columns\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#aplicar standard scaler \n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler()\n", + "X[num_cols] = scaler.fit_transform(X[num_cols])" ] }, { @@ -237,11 +264,1688 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RoomService 0.200815\n", + "Spa 0.196902\n", + "VRDeck 0.176947\n", + "FoodCourt 0.158392\n", + "ShoppingMall 0.138611\n", + "Age 0.128334\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#random forest \n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rf = RandomForestClassifier(random_state=42)\n", + "rf.fit(X[num_cols], y)\n", + "\n", + "importances = pd.Series(rf.feature_importances_, index=num_cols)\n", + "importances.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#dividir datos \n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#separara columnas\n", + "cat_cols = X.select_dtypes(include=\"object\").columns\n", + "num_cols = X.select_dtypes(exclude=\"object\").columns" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(transformers=[('num',\n",
+       "                                                  Pipeline(steps=[('imputer',\n",
+       "                                                                   SimpleImputer(strategy='median'))]),\n",
+       "                                                  Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n",
+       "                                                 ('cat',\n",
+       "                                                  Pipeline(steps=[('imputer',\n",
+       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
+       "                                                                  ('onehot',\n",
+       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
+       "                                                  Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='object'))])),\n",
+       "                ('classifier', RandomForestClassifier(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median'))]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),\n", + " ('onehot',\n", + " OneHotEncoder(handle_unknown='ignore'))]),\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='object'))])),\n", + " ('classifier', RandomForestClassifier(random_state=42))])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "# Separar columnas\n", + "cat_cols = X.select_dtypes(include=\"object\").columns\n", + "num_cols = X.select_dtypes(exclude=\"object\").columns\n", + "\n", + "# Transformador para variables numéricas\n", + "num_transformer = Pipeline(steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"median\"))\n", + "])\n", + "\n", + "# Transformador para variables categóricas\n", + "cat_transformer = Pipeline(steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", + " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\"))\n", + "])\n", + "\n", + "# ColumnTransformer\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", num_transformer, num_cols),\n", + " (\"cat\", cat_transformer, cat_cols)\n", + " ]\n", + ")\n", + "\n", + "# Pipeline final\n", + "model = Pipeline(steps=[\n", + " (\"preprocessor\", preprocessor),\n", + " (\"classifier\", RandomForestClassifier(random_state=42))\n", + "])\n", + "\n", + "# Entrenar\n", + "model.fit(X_train, y_train)" ] }, { @@ -253,11 +1957,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "y_pred = model.predict(X_test)" ] }, { @@ -283,11 +1987,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}\n", + "Best CV score: 0.7855927881706137\n" + ] + } + ], "source": [ - "#your code here" + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid = {\n", + " \"classifier__n_estimators\": [100, 200],\n", + " \"classifier__max_depth\": [None, 10, 20],\n", + " \"classifier__min_samples_split\": [2, 5],\n", + " \"classifier__min_samples_leaf\": [1, 2],\n", + " \"classifier__max_features\": [\"sqrt\", \"log2\"]\n", + "}\n", + "\n", + "grid_search = GridSearchCV(\n", + " model, # tu pipeline\n", + " param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "print(\"Best parameters:\", grid_search.best_params_)\n", + "print(\"Best CV score:\", grid_search.best_score_)" ] }, { @@ -299,10 +2033,1770 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#definir los hioerparametros\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "param_grid = {\n", + " \"classifier__n_estimators\": [100, 200],\n", + " \"classifier__max_depth\": [None, 10, 20],\n", + " \"classifier__min_samples_split\": [2, 5],\n", + " \"classifier__min_samples_leaf\": [1, 2],\n", + " \"classifier__max_features\": [\"sqrt\", \"log2\"]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters:\n", + "{'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}\n", + "\n", + "Best cross-validation score:\n", + "0.7855927881706137\n" + ] + } + ], + "source": [ + "#mejores resultados\n", + "print(\"Best parameters:\")\n", + "print(grid_search.best_params_)\n", + "\n", + "print(\"\\nBest cross-validation score:\")\n", + "print(grid_search.best_score_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 48 candidates, totalling 240 fits\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessor',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='median'))]),\n",
+       "                                                                         Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         Pipeline(steps=[('imputer',\n",
+       "                                                                                          SimpleImputer(strategy='most_frequent')),\n",
+       "                                                                                         ('onehot',\n",
+       "                                                                                          OneHotEncoder(ha...\n",
+       "                                                                         Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n",
+       "       'Name'],\n",
+       "      dtype='object'))])),\n",
+       "                                       ('classifier',\n",
+       "                                        RandomForestClassifier(random_state=42))]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'classifier__max_depth': [None, 10, 20],\n",
+       "                         'classifier__max_features': ['sqrt', 'log2'],\n",
+       "                         'classifier__min_samples_leaf': [1, 2],\n",
+       "                         'classifier__min_samples_split': [2, 5],\n",
+       "                         'classifier__n_estimators': [100, 200]},\n",
+       "             scoring='accuracy', verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='median'))]),\n", + " Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')),\n", + " ('cat',\n", + " Pipeline(steps=[('imputer',\n", + " SimpleImputer(strategy='most_frequent')),\n", + " ('onehot',\n", + " OneHotEncoder(ha...\n", + " Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',\n", + " 'Name'],\n", + " dtype='object'))])),\n", + " ('classifier',\n", + " RandomForestClassifier(random_state=42))]),\n", + " n_jobs=-1,\n", + " param_grid={'classifier__max_depth': [None, 10, 20],\n", + " 'classifier__max_features': ['sqrt', 'log2'],\n", + " 'classifier__min_samples_leaf': [1, 2],\n", + " 'classifier__min_samples_split': [2, 5],\n", + " 'classifier__n_estimators': [100, 200]},\n", + " scoring='accuracy', verbose=2)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Ejecutar grid search\n", + "grid_search = GridSearchCV(\n", + " estimator=model,\n", + " param_grid=param_grid,\n", + " cv=5,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1,\n", + " verbose=2\n", + ")\n", + "\n", + "grid_search.fit(X_train, y_train)" + ] }, { "cell_type": "markdown", @@ -313,10 +3807,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Accuracy: 0.7843588269120184\n" + ] + } + ], + "source": [ + "#evaluar en test set\n", + "best_model = grid_search.best_estimator_\n", + "\n", + "y_pred = best_model.predict(X_test)\n", + "\n", + "from sklearn.metrics import accuracy_score\n", + "print(\"Test Accuracy:\", accuracy_score(y_test, y_pred))" + ] } ], "metadata": { @@ -335,7 +3845,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.14.2" } }, "nbformat": 4,