diff --git a/antora-playbook.yml b/antora-playbook.yml index 36799e8..8c333cf 100644 --- a/antora-playbook.yml +++ b/antora-playbook.yml @@ -9,6 +9,7 @@ content: asciidoc: attributes: page-pagination: true + experimental: true ui: bundle: diff --git a/exercises/.gitignore b/exercises/.gitignore new file mode 100644 index 0000000..68bc17f --- /dev/null +++ b/exercises/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/exercises/data-pandas.ipynb b/exercises/data-pandas.ipynb new file mode 100644 index 0000000..4a47de3 --- /dev/null +++ b/exercises/data-pandas.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "3276ac7a", + "metadata": {}, + "source": [ + "# Lab: Load Data\n", + "\n", + "## Prerequisites\n", + "* Copy the `data/hotels/reservations.csv` file into your S3 bucket.\n", + "* Create a new Data Connection in RHODS and specify the S3 connection parameters.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a15e9893", + "metadata": {}, + "source": [ + "## Read Local Files" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "205a6432", + "metadata": {}, + "source": [ + "You can read a local CSV file with Pandas, by using the `read_csv` function.\n", + "This file loads the file contents into a `DataFrame` object.\n", + "A data frame is the main data abstraction in Pandas.\n", + "\n", + "Read the `data/hotels/reservations.csv` file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0f5a53d", + "metadata": {}, + "outputs": [], + "source": [ + "pd.read_csv(\"data/hotels/reservations.csv\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "874c944f", + "metadata": {}, + "source": [ + "You can specify how Pandas reads the CSV file.\n", + "\n", + "Reload the CSV by parsing the dates and specifying the index column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6605355f", + "metadata": {}, + "outputs": [], + "source": [ + "reservations = pd.read_csv(\"data/hotels/reservations.csv\", index_col=[\"id\"], parse_dates=[\"reservation_date\", \"arrival_date\"])\n", + "reservations" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7a0a8a4c", + "metadata": {}, + "source": [ + "Inspect the index of the data frame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0c52f8d", + "metadata": {}, + "outputs": [], + "source": [ + "reservations.index" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3e43af60", + "metadata": {}, + "source": [ + "Inspect the type of the `reservation_date` column.\n", + "The type is `Series`.\n", + "A series is a one-dimensional array of data.\n", + "This array is stored in the `values` property of the series.\n", + "\n", + "A series also includes the name, which in this case corresponds to the column name, and the index that identifies each row.\n", + "In this case, the series index is the dataframe index." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3ca962e", + "metadata": {}, + "outputs": [], + "source": [ + "print(type(reservations.reservation_date))\n", + "\n", + "print(type(reservations.reservation_date.values))\n", + "\n", + "print(reservations.reservation_date.name)\n", + "\n", + "print(reservations.reservation_date.index)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c9912750", + "metadata": {}, + "source": [ + "Inspect the column. The type is `datetime64`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "212921bb", + "metadata": {}, + "outputs": [], + "source": [ + "reservations.reservation_date" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1fe23c0f", + "metadata": {}, + "source": [ + "## Read Remote Files\n", + "\n", + "You can also read a remote CSV file, such as from S3.\n", + "If you need to read remote files, then you must install the `fsspec` library.\n", + "If you want to read from an S3 bucket, then you must also install the `s3fs` library.\n", + "\n", + "
\n", + "Tip: You can skip this part if you have not configured an S3 data connection.\n", + "
\n", + "\n", + "Install `fsspec` and the `s3fs` libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1215fee", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install fsspec s3fs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "116fc53c", + "metadata": {}, + "source": [ + "Read the CSV file from the S3 bucket.\n", + "To configure the connection, use the environment variables injected by the RHODS data connection." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1129104b", + "metadata": {}, + "outputs": [], + "source": [ + "s3_bucket = os.getenv(\"AWS_S3_BUCKET\")\n", + "s3_key = os.getenv(\"AWS_ACCESS_KEY_ID\")\n", + "s3_secret = os.getenv(\"AWS_SECRET_ACCESS_KEY\")\n", + "s3_endpoint = os.getenv(\"AWS_S3_ENDPOINT\")\n", + "storage_options = {\n", + " \"key\": s3_key,\n", + " \"secret\": s3_secret,\n", + " \"endpoint_url\": s3_endpoint\n", + " }\n", + "\n", + "pd.read_csv(f\"s3://{s3_bucket}/reservations.csv\", storage_options=storage_options)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3cafa923", + "metadata": {}, + "source": [ + "### Reading Large Files in Chunks\n", + "\n", + "If the CSV file is large, then you can read the file in chunks, by using the `chunksize` parameter.\n", + "\n", + "Read and print the file row by row." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4980264f", + "metadata": {}, + "outputs": [], + "source": [ + "url = f\"s3://{s3_bucket}/reservations.csv\"\n", + "\n", + "with pd.read_csv(url, storage_options=storage_options, chunksize=1) as reader:\n", + " for chunk in reader:\n", + " print(chunk)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1816f599", + "metadata": {}, + "source": [ + "For more details visit https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/exercises/data/hotels/hotels.csv b/exercises/data/hotels/hotels.csv new file mode 100644 index 0000000..c713b8a --- /dev/null +++ b/exercises/data/hotels/hotels.csv @@ -0,0 +1,3 @@ +id,hotel_name,rating +1,Hotel Resort ABC,5 +2,Urban Hotel XYZ,4 diff --git a/exercises/data/hotels/reservations.csv b/exercises/data/hotels/reservations.csv new file mode 100644 index 0000000..ac0f7a7 --- /dev/null +++ b/exercises/data/hotels/reservations.csv @@ -0,0 +1,6 @@ +id,hotel_id,room_type,reservation_date,arrival_date,nights,adults,children +1,1,double,2023-02-24,2023-08-22,4,1,0 +2,1,double,2023-01-21,2023-08-04,6,2,2 +3,2,double,2023-06-01,2023-06-12,7,2,1 +4,2,double,2023-04-17,2023-10-02,7,1,0 +5,2,suite,2023-03-02,2023-08-15,2,2,0 diff --git a/exercises/requirements.local.txt b/exercises/requirements.local.txt new file mode 100644 index 0000000..b963da4 --- /dev/null +++ b/exercises/requirements.local.txt @@ -0,0 +1,115 @@ +aiobotocore==2.5.1 +aiohttp==3.8.4 +aioitertools==0.11.0 +aiosignal==1.3.1 +anyio==3.7.0 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +arrow==1.2.3 +asttokens==2.2.1 +async-lru==2.0.2 +async-timeout==4.0.2 +attrs==23.1.0 +Babel==2.12.1 +backcall==0.2.0 +beautifulsoup4==4.12.2 +bleach==6.0.0 +botocore==1.29.161 +certifi==2023.5.7 +cffi==1.15.1 +charset-normalizer==3.1.0 +comm==0.1.3 +contourpy==1.1.0 +cycler==0.11.0 +debugpy==1.6.7 +decorator==5.1.1 +defusedxml==0.7.1 +exceptiongroup==1.1.1 +executing==1.2.0 +fastjsonschema==2.17.1 +fonttools==4.40.0 +fqdn==1.5.1 +frozenlist==1.3.3 +fsspec==2023.6.0 
+idna==3.4 +importlib-metadata==6.7.0 +importlib-resources==5.12.0 +ipykernel==6.23.3 +ipython==8.14.0 +isoduration==20.11.0 +jedi==0.18.2 +Jinja2==3.1.2 +jmespath==1.0.1 +json5==0.9.14 +jsonpointer==2.4 +jsonschema==4.17.3 +jupyter-events==0.6.3 +jupyter-lsp==2.2.0 +jupyter_client==8.3.0 +jupyter_core==5.3.1 +jupyter_server==2.7.0 +jupyter_server_terminals==0.4.4 +jupyterlab==4.0.2 +jupyterlab-pygments==0.2.2 +jupyterlab_server==2.23.0 +kiwisolver==1.4.4 +MarkupSafe==2.1.3 +matplotlib==3.7.1 +matplotlib-inline==0.1.6 +mistune==3.0.1 +multidict==6.0.4 +nbclient==0.8.0 +nbconvert==7.6.0 +nbformat==5.9.0 +nest-asyncio==1.5.6 +notebook_shim==0.2.3 +numpy==1.25.0 +overrides==7.3.1 +packaging==23.1 +pandas==2.0.2 +pandocfilters==1.5.0 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.5.0 +platformdirs==3.8.0 +prometheus-client==0.17.0 +prompt-toolkit==3.0.38 +psutil==5.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.15.1 +pyparsing==3.1.0 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-json-logger==2.0.7 +pytz==2023.3 +PyYAML==6.0 +pyzmq==25.1.0 +requests==2.31.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +s3fs==2023.6.0 +Send2Trash==1.8.2 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.4.1 +stack-data==0.6.2 +terminado==0.17.1 +tinycss2==1.2.1 +tomli==2.0.1 +tornado==6.3.2 +traitlets==5.9.0 +typing_extensions==4.6.3 +tzdata==2023.3 +uri-template==1.3.0 +urllib3==1.26.16 +wcwidth==0.2.6 +webcolors==1.13 +webencodings==0.5.1 +websocket-client==1.6.1 +wrapt==1.15.0 +yarl==1.9.2 +zipp==3.15.0 diff --git a/modules/data/images/created-workbench.png b/modules/data/images/created-workbench.png new file mode 100644 index 0000000..aea134f Binary files /dev/null and b/modules/data/images/created-workbench.png differ diff --git a/modules/data/images/data-connection-form.png b/modules/data/images/data-connection-form.png new file mode 100644 index 0000000..7f82a95 Binary files /dev/null and 
b/modules/data/images/data-connection-form.png differ diff --git a/modules/data/images/git-clone-menu.png b/modules/data/images/git-clone-menu.png new file mode 100644 index 0000000..ace59c9 Binary files /dev/null and b/modules/data/images/git-clone-menu.png differ diff --git a/modules/data/images/ocp-top-bar.png b/modules/data/images/ocp-top-bar.png new file mode 100644 index 0000000..62d3a75 Binary files /dev/null and b/modules/data/images/ocp-top-bar.png differ diff --git a/modules/data/images/rhods-side-menu.png b/modules/data/images/rhods-side-menu.png new file mode 100644 index 0000000..8fc4413 Binary files /dev/null and b/modules/data/images/rhods-side-menu.png differ diff --git a/modules/data/images/workbench-open-link.png b/modules/data/images/workbench-open-link.png new file mode 100644 index 0000000..3b71287 Binary files /dev/null and b/modules/data/images/workbench-open-link.png differ diff --git a/modules/data/pages/pandas.adoc b/modules/data/pages/pandas.adoc index 89af044..1a99743 100644 --- a/modules/data/pages/pandas.adoc +++ b/modules/data/pages/pandas.adoc @@ -2,15 +2,117 @@ == Objectives -* Use Pandas to analyze and transform data in Jupyter notebooks +* Load data into Jupyter notebooks by using RHODS and Pandas. == Steps -* https://www.kaggle.com/learn/pandas -* Huge collection of data sets https://www.kaggle.com/datasets -* Data types -* Read and Write data -* Grouping and Sorting -* Summary functions and maps -* Focus on “Business” use-cases and concepts rather than pure science/pure math use-cases. The social sciences are also good because the data sets and analysis is easy to understand. -* some good ones here - https://www.kaggle.com/code/residentmario/creating-reading-and-writing/data \ No newline at end of file + +1. *Open the RHODS dashboard.* + +a. In a web browser, navigate to the Web Console of your Red{nbsp}Hat OpenShift cluster, and log in. + +b. 
Click the applications menu in the top navigation bar of OpenShift, +then click btn:[Red{nbsp}Hat OpenShift Data Science]. ++ +image::ocp-top-bar.png[align="center"] + +c. If prompted, log in with your Red{nbsp}Hat OpenShift credentials. + + +2. *Configure the workbench of your data science project.* ++ +A RHODS workbench is a containerized application that includes commonly used data science tools and libraries, such as JupyterLab, Tensorflow, and PyTorch. +RHODS provides you with a collection of workbench container images, each one preconfigured and tailored to a specific data science use case. + +a. Click btn:[Data Science Projects] in the left sidebar. ++ +image::rhods-side-menu.png[width=35%,align="center"] + +b. Create a data science project. +Click btn:[Create data science project]. +In the modal window that opens, enter a name and click btn:[Create]. ++ +[NOTE] +==== +If you are using Red{nbsp}Hat OpenShift from the developer sandbox, then a project is already created for you. +==== + +c. Click the newly created project. + +d. In the project page, click btn:[Create workbench] and complete the form with the following values. ++ +[cols="1,1"] +|=== +|*Name* +|`data-load` + +|*Notebook image* - Image selection +|`Standard Data Science` + +|*Notebook image* - Version selection +|Select the recommended option +|=== ++ +Do not modify the default values of the rest of the fields. + +e. Click btn:[Create workbench]. +RHODS creates the workbench and the associated persistent storage. + +3. *Configure a data connection.* ++ +A data connection provides the workbench with access to a storage layer. +In this exercise, you use the storage layer to read the data files that you load with Pandas. ++ +Additionally, a data connection also configures RHODS Model Serving with the required settings to download the model to be served. ++ +[NOTE] +==== +If you do not have access to an S3 bucket, you can continue to the next step. +==== + +a. Click btn:[Add data connection]. + +b. 
In the `name` field, enter `data-load-data-connection`. + +c. Complete the `AWS_*` fields with the connection details of an S3-compatible API. ++ +image::data-connection-form.png[width=70%,align="center"] ++ +[NOTE] +==== +This example uses IBM Cloud Object Storage, but you can use any storage service that provides an S3 API. +==== + +d. In the `Connected workbench` field, select `data-load` +to assign this data connection to the `data-load` workbench. + +e. Click btn:[Add data connection]. +This data connection injects the S3 configuration values as environment variables in the `data-load` workbench. +RHODS restarts the workbench to inject the variables. + + +4. *Open the workbench and clone the repository.* ++ + +a. Make sure that the `data-load` workbench is running and click btn:[Open]. ++ +image::workbench-open-link.png[width=40%,align="center"] + +b. If prompted, log in with your Red{nbsp}Hat OpenShift credentials. + +c. Click btn:[Allow selected permissions] to grant the workbench access to your data science project. + +d. Verify that the JupyterLab interface opens in a new browser tab. + +e. Click the btn:[Git] icon in the left sidebar. + +f. Click btn:[Clone a repository]. ++ +image::git-clone-menu.png[width=40%,align="center"] + +g. Enter https://github.com/RedHatTraining/rhods-quick-course.git as the repository, and click btn:[Clone]. + +h. In the file explorer, navigate to the `rhods-quick-course/exercises` directory. + + +5. *Open the `data-pandas.ipynb` notebook and follow the instructions.* \ No newline at end of file