diff --git a/demos/notebooks/6_pydough_TCPH_guide.ipynb b/demos/notebooks/6_pydough_TCPH_guide.ipynb
new file mode 100644
index 000000000..c0c064374
--- /dev/null
+++ b/demos/notebooks/6_pydough_TCPH_guide.ipynb
@@ -0,0 +1,3391 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext pydough.jupyter_extensions\n",
+ "\n",
+ "import datetime\n",
+ "import pandas as pd\n",
+ "import pydough\n",
+ "# Load the TPCH metadata graph and connect the session to the SQLite TPC-H database\n",
+ "pydough.active_session.load_metadata_graph(\"../metadata/tpch_demo_graph.json\", \"TPCH\")\n",
+ "pydough.active_session.connect_database(\"sqlite\", database=\"../../tpch.db\")\n",
+ "pd.options.display.float_format = '{:.6f}'.format"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Tutorial: From SQL to PyDough – A Step-by-Step Guide\n",
+ "\n",
+ "In this tutorial, we will explore how to perform similar queries in PyDough that we would typically write in SQL. We will focus on the Customers, Orders, Nations, and Regions tables (from the TPC-H database). For each query, we will first see the SQL version and then convert it into the PyDough equivalent, explaining each step.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Get All Nations\n",
+ "\n",
+ "In SQL, the query to retrieve all nations would look like this:\n",
+ "\n",
+ "```SQL\n",
+ "SELECT * FROM nation; \n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, we can perform the same operation by accessing the `nations` collection, which retrieves every record in that collection. \n",
+ "A collection in PyDough is an abstraction for any set of \"documents\", but in most cases it represents a table. To access the nation table, we use the corresponding PyDough collection, `nations`, and list the properties we want to see."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " name | \n",
+ " comment | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " ALGERIA | \n",
+ " haggle. carefully final deposits detect slyly... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " ARGENTINA | \n",
+ " al foxes promise slyly according to the regula... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " BRAZIL | \n",
+ " y alongside of the pending deposits. carefully... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " CANADA | \n",
+ " eas hang ironic, silent packages. slyly regula... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " EGYPT | \n",
+ " y above the carefully unusual theodolites. fin... | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 5 | \n",
+ " ETHIOPIA | \n",
+ " ven packages wake quickly. regu | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " FRANCE | \n",
+ " refully final requests. regular, ironi | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7 | \n",
+ " GERMANY | \n",
+ " l platelets. regular accounts x-ray: unusual, ... | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " INDIA | \n",
+ " ss excuses cajole slyly across the packages. d... | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9 | \n",
+ " INDONESIA | \n",
+ " slyly express asymptotes. regular deposits ha... | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 10 | \n",
+ " IRAN | \n",
+ " efully alongside of the slyly final dependenci... | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 11 | \n",
+ " IRAQ | \n",
+ " nic deposits boost atop the quickly final requ... | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 12 | \n",
+ " JAPAN | \n",
+ " ously. final, express gifts cajole a | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 13 | \n",
+ " JORDAN | \n",
+ " ic deposits are blithely about the carefully r... | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 14 | \n",
+ " KENYA | \n",
+ " pending excuses haggle furiously deposits. pe... | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 15 | \n",
+ " MOROCCO | \n",
+ " rns. blithely bold courts among the closely re... | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 16 | \n",
+ " MOZAMBIQUE | \n",
+ " s. ironic, unusual asymptotes wake blithely r | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 17 | \n",
+ " PERU | \n",
+ " platelets. blithely pending dependencies use f... | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 18 | \n",
+ " CHINA | \n",
+ " c dependencies. furiously express notornis sle... | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 19 | \n",
+ " ROMANIA | \n",
+ " ular asymptotes are about the furious multipli... | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " 20 | \n",
+ " SAUDI ARABIA | \n",
+ " ts. silent requests haggle. closely express pa... | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 21 | \n",
+ " VIETNAM | \n",
+ " hely enticingly express accounts. even, final | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " 22 | \n",
+ " RUSSIA | \n",
+ " requests against the platelets use never acco... | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 23 | \n",
+ " UNITED KINGDOM | \n",
+ " eans boost carefully special requests. account... | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " 24 | \n",
+ " UNITED STATES | \n",
+ " y final packages. slow foxes cajole quickly. q... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key name comment\n",
+ "0 0 ALGERIA haggle. carefully final deposits detect slyly...\n",
+ "1 1 ARGENTINA al foxes promise slyly according to the regula...\n",
+ "2 2 BRAZIL y alongside of the pending deposits. carefully...\n",
+ "3 3 CANADA eas hang ironic, silent packages. slyly regula...\n",
+ "4 4 EGYPT y above the carefully unusual theodolites. fin...\n",
+ "5 5 ETHIOPIA ven packages wake quickly. regu\n",
+ "6 6 FRANCE refully final requests. regular, ironi\n",
+ "7 7 GERMANY l platelets. regular accounts x-ray: unusual, ...\n",
+ "8 8 INDIA ss excuses cajole slyly across the packages. d...\n",
+ "9 9 INDONESIA slyly express asymptotes. regular deposits ha...\n",
+ "10 10 IRAN efully alongside of the slyly final dependenci...\n",
+ "11 11 IRAQ nic deposits boost atop the quickly final requ...\n",
+ "12 12 JAPAN ously. final, express gifts cajole a\n",
+ "13 13 JORDAN ic deposits are blithely about the carefully r...\n",
+ "14 14 KENYA pending excuses haggle furiously deposits. pe...\n",
+ "15 15 MOROCCO rns. blithely bold courts among the closely re...\n",
+ "16 16 MOZAMBIQUE s. ironic, unusual asymptotes wake blithely r\n",
+ "17 17 PERU platelets. blithely pending dependencies use f...\n",
+ "18 18 CHINA c dependencies. furiously express notornis sle...\n",
+ "19 19 ROMANIA ular asymptotes are about the furious multipli...\n",
+ "20 20 SAUDI ARABIA ts. silent requests haggle. closely express pa...\n",
+ "21 21 VIETNAM hely enticingly express accounts. even, final \n",
+ "22 22 RUSSIA requests against the platelets use never acco...\n",
+ "23 23 UNITED KINGDOM eans boost carefully special requests. account...\n",
+ "24 24 UNITED STATES y final packages. slow foxes cajole quickly. q..."
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# No filter is applied here: every nation is returned with its key, name and comment\n",
+ "filter_c = nations(key, name, comment)\n",
+ "pydough.to_df(filter_c)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`nations(key, name, comment)`: refers to the `nations` collection (the counterpart of the `nation` table in SQL) and selects its `key`, `name` and `comment` properties.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Find Nations Whose Name Starts with \"A\"\n",
+ "Next, we’ll use WHERE and LIKE to filter nations whose names start with the letter \"A\".\n",
+ "\n",
+ "```SQL\n",
+ "SELECT N_NAME, N_COMMENT\n",
+ "FROM nation\n",
+ "WHERE N_NAME LIKE 'A%';\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, the WHERE statement is similar to the WHERE operation in SQL. It can be used to filter unwanted entries in a context. You can use the STARTSWITH() method to match patterns or LIKE() using the same syntax as SQL. Here's how you would do it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Prefix match using STARTSWITH\n",
+ "nations_startwith = nations(n_name=name, n_comment=comment).WHERE(STARTSWITH(name, 'A'))\n",
+ "\n",
+ "# The same prefix match written as a SQL-style LIKE pattern\n",
+ "nations_like = nations(n_name=name, n_comment=comment).WHERE(LIKE(name, 'A%'))\n",
+ "\n",
+ "print(pydough.to_df(nations_startwith))\n",
+ "pydough.to_df(nations_like)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Filter Customers by Nation\n",
+ "\n",
+ "The next situation involves a client seeking to identify all customers from a specific country, Peru.\n",
+ "\n",
+ "SQL example: \n",
+ "\n",
+ "```SQL\n",
+ "SELECT C.C_NAME\n",
+ "FROM customer C\n",
+ "JOIN nation N\n",
+ "ON C.C_NATIONKEY = N.N_NATIONKEY\n",
+ "WHERE N.N_NAME = 'PERU';\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, the query starts from the customers collection. Instead of writing an explicit join against the nation table, we reach each customer's nation through the `nation` relationship defined in the metadata, and the WHERE method keeps only the customers whose nation name is \"PERU\". The outcome is a list of Peruvian customer names.\n",
+ "\n",
+ "It's crucial to understand that the path between collections is how we integrate data across multiple tables. This approach enables efficient data retrieval and manipulation without requiring explicit joins in the code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " n_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Customer#000000008 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Customer#000000033 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Customer#000000035 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Customer#000000061 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Customer#000000077 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 5970 | \n",
+ " Customer#000149914 | \n",
+ "
\n",
+ " \n",
+ " | 5971 | \n",
+ " Customer#000149928 | \n",
+ "
\n",
+ " \n",
+ " | 5972 | \n",
+ " Customer#000149939 | \n",
+ "
\n",
+ " \n",
+ " | 5973 | \n",
+ " Customer#000149948 | \n",
+ "
\n",
+ " \n",
+ " | 5974 | \n",
+ " Customer#000149970 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5975 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " n_name\n",
+ "0 Customer#000000008\n",
+ "1 Customer#000000033\n",
+ "2 Customer#000000035\n",
+ "3 Customer#000000061\n",
+ "4 Customer#000000077\n",
+ "... ...\n",
+ "5970 Customer#000149914\n",
+ "5971 Customer#000149928\n",
+ "5972 Customer#000149939\n",
+ "5973 Customer#000149948\n",
+ "5974 Customer#000149970\n",
+ "\n",
+ "[5975 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Despite its name, peru_nations holds customers: those whose nation is PERU\n",
+ "peru_nations = customers.WHERE(nation.name == \"PERU\")\n",
+ "# n_name is only the output column label; it carries each customer's name\n",
+ "customers_from_peru = peru_nations(n_name=name)\n",
+ "pydough.to_df(customers_from_peru)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Get a specific customer\n",
+ "\n",
+ "The next situation involves analyzing a specific set of customers based on several conditions. The goal is to identify customers who are in debt, meaning their account balance is negative, and who have placed at least 5 orders. Additionally, the focus is on customers from the AMERICA region, excluding those from Brazil."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query filters customers based on specific conditions related to their account balance, order count, and geographical region. It checks if a customer’s account balance is negative, if they have made at least 5 orders, if their region is \"AMERICA,\" and if they are not from Brazil. The WHERE clause applies all these conditions using & (AND) to ensure that all must be true for a customer to be included in the results.\n",
+ "\n",
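+ "PyDough does not yet support the Python keywords `and`, `or`, `not` and `in` inside expressions; use the `&`, `|` and `~` operators instead, wrapping each comparison in parentheses. Chained comparisons such as `1 < x < 5` are not supported either; write `(1 < x) & (x < 5)`."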
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Customer#000000064 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Customer#000000478 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Customer#000000488 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Customer#000000632 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Customer#000000872 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1441 | \n",
+ " Customer#000149812 | \n",
+ "
\n",
+ " \n",
+ " | 1442 | \n",
+ " Customer#000149815 | \n",
+ "
\n",
+ " \n",
+ " | 1443 | \n",
+ " Customer#000149831 | \n",
+ "
\n",
+ " \n",
+ " | 1444 | \n",
+ " Customer#000149890 | \n",
+ "
\n",
+ " \n",
+ " | 1445 | \n",
+ " Customer#000149914 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1446 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name\n",
+ "0 Customer#000000064\n",
+ "1 Customer#000000478\n",
+ "2 Customer#000000488\n",
+ "3 Customer#000000632\n",
+ "4 Customer#000000872\n",
+ "... ...\n",
+ "1441 Customer#000149812\n",
+ "1442 Customer#000149815\n",
+ "1443 Customer#000149831\n",
+ "1444 Customer#000149890\n",
+ "1445 Customer#000149914\n",
+ "\n",
+ "[1446 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# A customer is kept only when all four conditions hold (& is the logical AND)\n",
+ "customer_in_debt = customers(name).WHERE(\n",
+ "    (acctbal < 0)  # negative account balance, i.e. in debt\n",
+ "    & (COUNT(orders) >= 5)  # at least 5 orders placed\n",
+ "    & (nation.region.name == \"AMERICA\")  # nation belongs to the AMERICA region\n",
+ "    & (nation.name != \"BRAZIL\")  # ...but the nation is not BRAZIL\n",
+ ")\n",
+ "\n",
+ "pydough.to_df(customer_in_debt)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Find the total number of orders per customers placed in 1998\n",
+ "\n",
+ "The next situation consists of analyzing customer order activity from the year 1998. The query retrieves customer details, specifically their customer keys and names, while also counting how many orders each customer placed in 1998. The goal is to identify which customers were the most active that year, ordering them by the total number of orders in descending order\n",
+ "\n",
+ "```SQL\n",
+ "SELECT c.c_custkey, c.c_name, COUNT(o.o_orderkey) AS total_orders\n",
+ "FROM customer c JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ "WHERE strftime('%Y', o.o_orderdate) = '1998'\n",
+ "GROUP BY c.c_custkey, c.c_name\n",
+ "ORDER BY total_orders DESC;\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this query, we introduce ORDER_BY. In PyDough, the ORDER_BY method plays the same role as ORDER BY in SQL. The query uses a COUNT function over orders filtered with a WHERE clause to count only the orders placed in 1998. The results are then sorted by the total number of orders in descending order, showing the most active customers first.\n",
+ "\n",
+ "Here's how you would perform this task in PyDough:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " name | \n",
+ " num_orders | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 11719 | \n",
+ " Customer#000011719 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 93778 | \n",
+ " Customer#000093778 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 102295 | \n",
+ " Customer#000102295 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 111394 | \n",
+ " Customer#000111394 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4789 | \n",
+ " Customer#000004789 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 149995 | \n",
+ " 149991 | \n",
+ " Customer#000149991 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149996 | \n",
+ " 149993 | \n",
+ " Customer#000149993 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149997 | \n",
+ " 149994 | \n",
+ " Customer#000149994 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149998 | \n",
+ " 149997 | \n",
+ " Customer#000149997 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149999 | \n",
+ " 150000 | \n",
+ " Customer#000150000 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
150000 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key name num_orders\n",
+ "0 11719 Customer#000011719 9\n",
+ "1 93778 Customer#000093778 9\n",
+ "2 102295 Customer#000102295 9\n",
+ "3 111394 Customer#000111394 9\n",
+ "4 4789 Customer#000004789 8\n",
+ "... ... ... ...\n",
+ "149995 149991 Customer#000149991 0\n",
+ "149996 149993 Customer#000149993 0\n",
+ "149997 149994 Customer#000149994 0\n",
+ "149998 149997 Customer#000149997 0\n",
+ "149999 150000 Customer#000150000 0\n",
+ "\n",
+ "[150000 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# For each customer, count only the orders whose order_date falls in 1998,\n",
+ "# then list the customers with the most 1998 orders first.\n",
+ "customer_order_counts = customers(\n",
+ "    key,\n",
+ "    name,\n",
+ "    num_orders=COUNT(orders.WHERE(YEAR(order_date) == 1998)),\n",
+ ").ORDER_BY(num_orders.DESC())\n",
+ "\n",
+ "pydough.to_df(customer_order_counts)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This demonstrates an alternative approach to the same query that filters on a date range instead of extracting the year. We keep only the orders whose order_date is between January 1, 1998 (inclusive) and January 1, 1999 (exclusive). The WHERE clause is applied to the orders collection, and the filtered orders are then counted for each customer; `key` and `name` are simply the customer properties included in the output alongside that count."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " name | \n",
+ " total_orders_in_1998 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 11719 | \n",
+ " Customer#000011719 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 93778 | \n",
+ " Customer#000093778 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 102295 | \n",
+ " Customer#000102295 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 111394 | \n",
+ " Customer#000111394 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4789 | \n",
+ " Customer#000004789 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 149995 | \n",
+ " 149991 | \n",
+ " Customer#000149991 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149996 | \n",
+ " 149993 | \n",
+ " Customer#000149993 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149997 | \n",
+ " 149994 | \n",
+ " Customer#000149994 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149998 | \n",
+ " 149997 | \n",
+ " Customer#000149997 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 149999 | \n",
+ " 150000 | \n",
+ " Customer#000150000 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
150000 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key name total_orders_in_1998\n",
+ "0 11719 Customer#000011719 9\n",
+ "1 93778 Customer#000093778 9\n",
+ "2 102295 Customer#000102295 9\n",
+ "3 111394 Customer#000111394 9\n",
+ "4 4789 Customer#000004789 8\n",
+ "... ... ... ...\n",
+ "149995 149991 Customer#000149991 0\n",
+ "149996 149993 Customer#000149993 0\n",
+ "149997 149994 Customer#000149994 0\n",
+ "149998 149997 Customer#000149997 0\n",
+ "149999 150000 Customer#000150000 0\n",
+ "\n",
+ "[150000 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Orders whose order_date lies in [1998-01-01, 1999-01-01), i.e. placed during 1998\n",
+ "orders_in_1998 = orders.WHERE(\n",
+ "    (order_date >= datetime.date(1998, 1, 1)) & (order_date < datetime.date(1999, 1, 1))\n",
+ ")\n",
+ "\n",
+ "# For every customer, output its key and name together with the number of its\n",
+ "# 1998 orders, sorted so that the largest counts come first.\n",
+ "customer_order_summary = customers(\n",
+ "    key,\n",
+ "    name,\n",
+ "    total_orders_in_1998=COUNT(orders_in_1998),\n",
+ ").ORDER_BY(total_orders_in_1998.DESC())\n",
+ "\n",
+ "pydough.to_df(customer_order_summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 6. Count the Number of Orders in Each Nation:\n",
+ "\n",
+ "This query counts the number of orders placed in each nation, showing them in descending order of count.\n",
+ "```SQL\n",
+ "\n",
+ "SELECT n.n_name, COUNT(o.o_orderkey) AS order_count\n",
+ "FROM nation n\n",
+ "JOIN customer c ON n.n_nationkey = c.c_nationkey\n",
+ "JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ "GROUP BY n.n_name\n",
+ "ORDER BY order_count DESC;\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, to achieve this, we use the metadata relationships and the PARTITION operation, which plays the role of GROUP BY in SQL. The `by` argument specifies the key(s) to group on, and the `name` argument gives the name through which the grouped records can be referenced when aggregating (here, `COUNT(order)`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " nation_name | \n",
+ " total_orders_in_region | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " FRANCE | \n",
+ " 61600 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " RUSSIA | \n",
+ " 61495 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " INDONESIA | \n",
+ " 61377 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " MOZAMBIQUE | \n",
+ " 61267 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ROMANIA | \n",
+ " 61012 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " CHINA | \n",
+ " 60784 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " JORDAN | \n",
+ " 60736 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " CANADA | \n",
+ " 60480 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " VIETNAM | \n",
+ " 60347 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " BRAZIL | \n",
+ " 60137 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " UNITED STATES | \n",
+ " 59921 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " INDIA | \n",
+ " 59827 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " GERMANY | \n",
+ " 59724 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " ETHIOPIA | \n",
+ " 59706 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " IRAN | \n",
+ " 59675 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " ALGERIA | \n",
+ " 59622 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " ARGENTINA | \n",
+ " 59547 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " MOROCCO | \n",
+ " 59459 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " UNITED KINGDOM | \n",
+ " 59455 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " JAPAN | \n",
+ " 59405 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " EGYPT | \n",
+ " 59111 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " PERU | \n",
+ " 59018 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " KENYA | \n",
+ " 58940 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " IRAQ | \n",
+ " 58902 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " SAUDI ARABIA | \n",
+ " 58453 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " nation_name total_orders_in_region\n",
+ "0 FRANCE 61600\n",
+ "1 RUSSIA 61495\n",
+ "2 INDONESIA 61377\n",
+ "3 MOZAMBIQUE 61267\n",
+ "4 ROMANIA 61012\n",
+ "5 CHINA 60784\n",
+ "6 JORDAN 60736\n",
+ "7 CANADA 60480\n",
+ "8 VIETNAM 60347\n",
+ "9 BRAZIL 60137\n",
+ "10 UNITED STATES 59921\n",
+ "11 INDIA 59827\n",
+ "12 GERMANY 59724\n",
+ "13 ETHIOPIA 59706\n",
+ "14 IRAN 59675\n",
+ "15 ALGERIA 59622\n",
+ "16 ARGENTINA 59547\n",
+ "17 MOROCCO 59459\n",
+ "18 UNITED KINGDOM 59455\n",
+ "19 JAPAN 59405\n",
+ "20 EGYPT 59111\n",
+ "21 PERU 59018\n",
+ "22 KENYA 58940\n",
+ "23 IRAQ 58902\n",
+ "24 SAUDI ARABIA 58453"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Walk from each order to its customer and attach that customer's nation name.\n",
+ "# Nothing is filtered here; despite the variable names, the grouping is by nation, not region.\n",
+ "orders_by_region = orders.customer(nation_name=nation.name)\n",
+ "\n",
+ "# Group those records by nation_name; each group is referenced through the name \"order\"\n",
+ "grouped_orders_by_region = PARTITION(\n",
+ "    orders_by_region,\n",
+ "    name=\"order\",\n",
+ "    by=nation_name,\n",
+ ")(\n",
+ "    nation_name=nation_name,\n",
+ "    total_orders_in_region=COUNT(order),  # number of records grouped under this nation\n",
+ ").ORDER_BY(total_orders_in_region.DESC())  # largest count first\n",
+ "\n",
+ "pydough.to_df(grouped_orders_by_region)\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7. Determine the number of orders placed in each month of a year:\t\n",
+ "\n",
+ "The next situation consists of analyzing the number of orders placed each month during 1995. The query focuses on extracting the month from each order date and counting how many orders were placed within each month of that year.\n",
+ "\n",
+ "```SQL\n",
+ "SELECT\n",
+ " strftime('%m', o_orderdate) AS order_month,\n",
+ " COUNT(o_orderkey) AS num_orders \n",
+ "FROM\n",
+ " orders\n",
+ "WHERE\n",
+ " o_orderdate >= '1995-01-01' \n",
+ " AND o_orderdate < '1996-01-01'\n",
+ "GROUP BY\n",
+ " order_month\n",
+ "ORDER BY\n",
+ " order_month;\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To write this query in PyDough we need Boolean expressions. Here we use the **&** operator for logical AND; **|** and **~** are likewise available as logical OR and NOT. The query filters the orders with the WHERE function to keep only those placed in 1995, then uses the MONTH() function to extract the month from each order's date. The PARTITION function groups the filtered orders by month, and the COUNT function counts the number of orders in each month. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " order_month | \n",
+ " total_orders_in_month | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 19472 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 17721 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 19313 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 18901 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 19342 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " 18874 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 7 | \n",
+ " 19471 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8 | \n",
+ " 19460 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 9 | \n",
+ " 18746 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 10 | \n",
+ " 19502 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 11 | \n",
+ " 18619 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 12 | \n",
+ " 19216 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " order_month total_orders_in_month\n",
+ "0 1 19472\n",
+ "1 2 17721\n",
+ "2 3 19313\n",
+ "3 4 18901\n",
+ "4 5 19342\n",
+ "5 6 18874\n",
+ "6 7 19471\n",
+ "7 8 19460\n",
+ "8 9 18746\n",
+ "9 10 19502\n",
+ "10 11 18619\n",
+ "11 12 19216"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "# Keep the orders dated in [1995-01-01, 1996-01-01) and tag each with its calendar month\n",
+ "orders_in_1995_by_month = orders.WHERE(\n",
+ "    (order_date >= datetime.date(1995, 1, 1)) & (order_date < datetime.date(1996, 1, 1))\n",
+ ")(order_month=MONTH(order_date))\n",
+ "\n",
+ "# One output row per month, holding the number of 1995 orders placed in that month\n",
+ "monthly_order_summary_1995 = PARTITION(\n",
+ "    orders_in_1995_by_month, name=\"order\", by=order_month\n",
+ ")(order_month=order_month, total_orders_in_month=COUNT(order))\n",
+ "\n",
+ "pydough.to_df(monthly_order_summary_1995)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8. Find Customers Who Have Placed Orders Above $1000\n",
+ "\n",
+ "Let’s find the customers in the ASIA region whose orders add up to more than $1000 in total.\n",
+ "\n",
+ "```SQL\n",
+ "SELECT c.c_custkey, c.c_name, SUM(o.o_totalprice) AS total_spent\n",
+ "FROM customer c\n",
+ "JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ "JOIN nation n ON c.c_nationkey = n.n_nationkey\n",
+ "JOIN region r ON n.n_regionkey = r.r_regionkey\n",
+ "WHERE r.r_name = 'ASIA'\n",
+ "GROUP BY c.c_custkey, c.c_name\n",
+ "HAVING SUM(o.o_totalprice) > 1000;\n",
+ "```\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the PyDough version we can see that aggregation functions are available just like in SQL. In this example, we use the SUM function, which works like SQL's SUM, to calculate each customer's total spending."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_key | \n",
+ " customer_name | \n",
+ " total_spent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 7 | \n",
+ " Customer#000000007 | \n",
+ " 2957861.160000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 19 | \n",
+ " Customer#000000019 | \n",
+ " 3611713.600000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 25 | \n",
+ " Customer#000000025 | \n",
+ " 3135039.320000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 28 | \n",
+ " Customer#000000028 | \n",
+ " 2429022.210000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 37 | \n",
+ " Customer#000000037 | \n",
+ " 2860377.420000 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 20019 | \n",
+ " 149980 | \n",
+ " Customer#000149980 | \n",
+ " 3115223.230000 | \n",
+ "
\n",
+ " \n",
+ " | 20020 | \n",
+ " 149981 | \n",
+ " Customer#000149981 | \n",
+ " 1700503.960000 | \n",
+ "
\n",
+ " \n",
+ " | 20021 | \n",
+ " 149984 | \n",
+ " Customer#000149984 | \n",
+ " 1153164.880000 | \n",
+ "
\n",
+ " \n",
+ " | 20022 | \n",
+ " 149987 | \n",
+ " Customer#000149987 | \n",
+ " 472026.460000 | \n",
+ "
\n",
+ " \n",
+ " | 20023 | \n",
+ " 149989 | \n",
+ " Customer#000149989 | \n",
+ " 1703476.450000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20024 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_key customer_name total_spent\n",
+ "0 7 Customer#000000007 2957861.160000\n",
+ "1 19 Customer#000000019 3611713.600000\n",
+ "2 25 Customer#000000025 3135039.320000\n",
+ "3 28 Customer#000000028 2429022.210000\n",
+ "4 37 Customer#000000037 2860377.420000\n",
+ "... ... ... ...\n",
+ "20019 149980 Customer#000149980 3115223.230000\n",
+ "20020 149981 Customer#000149981 1700503.960000\n",
+ "20021 149984 Customer#000149984 1153164.880000\n",
+ "20022 149987 Customer#000149987 472026.460000\n",
+ "20023 149989 Customer#000149989 1703476.450000\n",
+ "\n",
+ "[20024 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# total_spent sums total_price over a customer's orders; keep ASIA customers whose sum exceeds 1000\n",
+ "high_value_customers_in_asia = customers(\n",
+ "    customer_key=key, customer_name=name, total_spent=SUM(orders.total_price)\n",
+ ").WHERE(\n",
+ "    (total_spent > 1000) & (nation.region.name == \"ASIA\")\n",
+ ")\n",
+ "\n",
+ "pydough.to_df(high_value_customers_in_asia)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
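+ "An alternative way to write the same query: first filter the nations down to the \"ASIA\" region, then navigate to the customers of those nations and apply the spending filter."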
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_key | \n",
+ " customer_name | \n",
+ " total_spent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 7 | \n",
+ " Customer#000000007 | \n",
+ " 2957861.160000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 19 | \n",
+ " Customer#000000019 | \n",
+ " 3611713.600000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 25 | \n",
+ " Customer#000000025 | \n",
+ " 3135039.320000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 28 | \n",
+ " Customer#000000028 | \n",
+ " 2429022.210000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 37 | \n",
+ " Customer#000000037 | \n",
+ " 2860377.420000 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 20019 | \n",
+ " 149980 | \n",
+ " Customer#000149980 | \n",
+ " 3115223.230000 | \n",
+ "
\n",
+ " \n",
+ " | 20020 | \n",
+ " 149981 | \n",
+ " Customer#000149981 | \n",
+ " 1700503.960000 | \n",
+ "
\n",
+ " \n",
+ " | 20021 | \n",
+ " 149984 | \n",
+ " Customer#000149984 | \n",
+ " 1153164.880000 | \n",
+ "
\n",
+ " \n",
+ " | 20022 | \n",
+ " 149987 | \n",
+ " Customer#000149987 | \n",
+ " 472026.460000 | \n",
+ "
\n",
+ " \n",
+ " | 20023 | \n",
+ " 149989 | \n",
+ " Customer#000149989 | \n",
+ " 1703476.450000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20024 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_key customer_name total_spent\n",
+ "0 7 Customer#000000007 2957861.160000\n",
+ "1 19 Customer#000000019 3611713.600000\n",
+ "2 25 Customer#000000025 3135039.320000\n",
+ "3 28 Customer#000000028 2429022.210000\n",
+ "4 37 Customer#000000037 2860377.420000\n",
+ "... ... ... ...\n",
+ "20019 149980 Customer#000149980 3115223.230000\n",
+ "20020 149981 Customer#000149981 1700503.960000\n",
+ "20021 149984 Customer#000149984 1153164.880000\n",
+ "20022 149987 Customer#000149987 472026.460000\n",
+ "20023 149989 Customer#000149989 1703476.450000\n",
+ "\n",
+ "[20024 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Filter nations to include only those in the \"ASIA\" region\n",
+ "asian_nations = nations.WHERE(region.name == \"ASIA\")\n",
+ "\n",
+ "# Retrieve high-value customers from Asian nations who have spent more than 1000\n",
+ "high_value_customers_in_asia = asian_nations.customers(\n",
+ " customer_key=key, \n",
+ " customer_name=name, \n",
+ " total_spent=SUM(orders.total_price) \n",
+ ").WHERE(\n",
+ " total_spent > 1000 \n",
+ ")\n",
+ "\n",
+ "pydough.to_df(high_value_customers_in_asia)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 9. Average Order Value by Region \n",
+ "\n",
+ "```SQL \n",
+ "SELECT \n",
+ " r.r_name AS Region, \n",
+ " AVG(o.o_totalprice) AS AvgOrderValue \n",
+ "FROM \n",
+ " orders o\n",
+ "JOIN \n",
+ " customer c ON o.o_custkey = c.c_custkey\n",
+ "JOIN \n",
+ " nation n ON c.c_nationkey = n.n_nationkey\n",
+ "JOIN \n",
+ " region r ON n.n_regionkey = r.r_regionkey\n",
+ "GROUP BY \n",
+ " r.r_name;\n```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The main idea is to link customers with orders. We store the customer's region and then access the orders subcollection to retrieve the total price. We use the BACK method to return to the ancestor collection and get the variable we defined earlier. Then, we perform the PARTITION to group the collection by region name. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " region_name | \n",
+ " total_revenue | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " AFRICA | \n",
+ " 151274.687459 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " AMERICA | \n",
+ " 151476.057596 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " ASIA | \n",
+ " 151167.942741 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " EUROPE | \n",
+ " 150990.370343 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " MIDDLE EAST | \n",
+ " 151192.105780 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " region_name total_revenue\n",
+ "0 AFRICA 151274.687459\n",
+ "1 AMERICA 151476.057596\n",
+ "2 ASIA 151167.942741\n",
+ "3 EUROPE 150990.370343\n",
+ "4 MIDDLE EAST 151192.105780"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Retrieve orders from customers by region and calculate their total order price\n",
+ "selected_customers_by_region = customers(customer_region_name=nation.region.name).orders(\n",
+ " order_price=total_price, \n",
+ " customer_region_name=BACK(1).customer_region_name \n",
+ ")\n",
+ "\n",
+ "# Group the selected customers by region and calculate the average order price for each region\n",
+ "region_revenue_summary = PARTITION(selected_customers_by_region, \"customer\", by=customer_region_name)(\n",
+ " region_name=customer_region_name, \n",
+ " total_revenue=AVG(customer.order_price) \n",
+ ")\n",
+ "\n",
+ "pydough.to_df(region_revenue_summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 10. Identifying the 5 Most Profitable Regions\n",
+ "\n",
+ "Find regions contributing the most to revenue.\n",
+ "```SQL\n",
+ "SELECT \n",
+ " r.r_name AS RegionName, \n",
+ " SUM(o.o_totalprice) AS TotalRevenue\n",
+ "FROM \n",
+ " region r\n",
+ "JOIN nation n ON r.r_regionkey = n.n_regionkey\n",
+ "JOIN customer c ON n.n_nationkey = c.c_nationkey\n",
+ "JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ "GROUP BY \n",
+ " r.r_name\n",
+ "ORDER BY \n",
+ " TotalRevenue DESC\n",
+ "LIMIT 5;\n```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, to achieve this, we would use the TOP_K method. The TOP_K operation sorts a collection and then selects the first k values from the ordered results. First we select customers by region and their order total prices. Then, we group the data by region and calculate the total revenue for each region. Finally, we sort the results by total revenue and return the top 5 regions with the highest revenue.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " region_name | \n",
+ " TOTALREVENUE | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " EUROPE | \n",
+ " 9318715232.780001 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " ASIA | \n",
+ " 9300830039.290001 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " EUROPE | \n",
+ " 9282323186.280001 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " AFRICA | \n",
+ " 9249114609.660000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " MIDDLE EAST | \n",
+ " 9229296044.980000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " EUROPE | \n",
+ " 9196280024.510000 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " ASIA | \n",
+ " 9161685172.340000 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " AMERICA | \n",
+ " 9143635385.190001 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " ASIA | \n",
+ " 9121689438.200001 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " AMERICA | \n",
+ " 9107367126.700001 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " AMERICA | \n",
+ " 9086969258.889999 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " AFRICA | \n",
+ " 9065723966.780001 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " EUROPE | \n",
+ " 9039194008.740000 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " ASIA | \n",
+ " 9035791922.090000 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " AFRICA | \n",
+ " 9032642974.379999 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " MIDDLE EAST | \n",
+ " 9025552858.090000 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " AMERICA | \n",
+ " 9022490350.049999 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " ASIA | \n",
+ " 8993418470.639999 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " AFRICA | \n",
+ " 8985579085.219999 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " EUROPE | \n",
+ " 8956753007.400000 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " AMERICA | \n",
+ " 8946481134.379999 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " MIDDLE EAST | \n",
+ " 8925726169.170000 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " AFRICA | \n",
+ " 8897163266.180000 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " MIDDLE EAST | \n",
+ " 8892603095.990000 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " MIDDLE EAST | \n",
+ " 8812280619.530001 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " region_name TOTALREVENUE\n",
+ "0 EUROPE 9318715232.780001\n",
+ "1 ASIA 9300830039.290001\n",
+ "2 EUROPE 9282323186.280001\n",
+ "3 AFRICA 9249114609.660000\n",
+ "4 MIDDLE EAST 9229296044.980000\n",
+ "5 EUROPE 9196280024.510000\n",
+ "6 ASIA 9161685172.340000\n",
+ "7 AMERICA 9143635385.190001\n",
+ "8 ASIA 9121689438.200001\n",
+ "9 AMERICA 9107367126.700001\n",
+ "10 AMERICA 9086969258.889999\n",
+ "11 AFRICA 9065723966.780001\n",
+ "12 EUROPE 9039194008.740000\n",
+ "13 ASIA 9035791922.090000\n",
+ "14 AFRICA 9032642974.379999\n",
+ "15 MIDDLE EAST 9025552858.090000\n",
+ "16 AMERICA 9022490350.049999\n",
+ "17 ASIA 8993418470.639999\n",
+ "18 AFRICA 8985579085.219999\n",
+ "19 EUROPE 8956753007.400000\n",
+ "20 AMERICA 8946481134.379999\n",
+ "21 MIDDLE EAST 8925726169.170000\n",
+ "22 AFRICA 8897163266.180000\n",
+ "23 MIDDLE EAST 8892603095.990000\n",
+ "24 MIDDLE EAST 8812280619.530001"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "selected_customers = nations(region_name= region.name, TOTALREVENUE= SUM(customers.orders.total_price))\n",
+ "\n",
+ "pydough.to_df(selected_customers.ORDER_BY(TOTALREVENUE.DESC()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 11. Max and Min Order Value Difference by Nation and Region\n",
+ "\n",
+ "Retrieve the name of the region and nation for each unique combination of region and nation. For each region-nation pair, calculate the maximum and minimum order values. Then, find the difference between the maximum and minimum order values. Additionally, count the total number of orders for each region-nation pair. Group the data by region and nation to ensure calculations are done separately for each pair. Finally, sort the results by the difference in order values, with the highest difference appearing first.\n",
+ "\n",
+ "```SQL\n",
+ "SELECT \n",
+ " r.r_name AS region_name,\n",
+ " n.n_name AS nation_name,\n",
+ " MAX(o.o_totalprice) AS max_order_value,\n",
+ " MIN(o.o_totalprice) AS min_order_value,\n",
+ " MAX(o.o_totalprice) - MIN(o.o_totalprice) AS order_value_difference,\n",
+ " COUNT(o.o_orderkey) AS total_orders\n",
+ "FROM region r\n",
+ "JOIN nation n ON r.r_regionkey = n.n_regionkey \n",
+ "JOIN customer c ON c.c_nationkey = n.n_nationkey\n",
+ "JOIN orders o ON o.o_custkey = c.c_custkey\n",
+ "GROUP BY r.r_name, n.n_name\n",
+ "ORDER BY order_value_difference DESC;\n```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, we start from the nations collection. Each nation belongs to exactly one region, so every record already corresponds to one region-nation pair and no PARTITION is needed. For each nation we take the region name and the nation name, and then aggregate over its customers.orders sub-collection: the maximum and minimum order values using the MAX and MIN functions, their difference, and the total number of orders using COUNT. Finally, we sort the results by the order value difference in descending order using ORDER_BY."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " region_name | \n",
+ " nation_name | \n",
+ " max_order_value | \n",
+ " min_order_value | \n",
+ " order_value_difference | \n",
+ " total_orders | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " EUROPE | \n",
+ " RUSSIA | \n",
+ " 555285.160000 | \n",
+ " 932.410000 | \n",
+ " 554352.750000 | \n",
+ " 61495 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " AMERICA | \n",
+ " PERU | \n",
+ " 544089.090000 | \n",
+ " 891.740000 | \n",
+ " 543197.350000 | \n",
+ " 59018 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " AMERICA | \n",
+ " ARGENTINA | \n",
+ " 530604.440000 | \n",
+ " 877.300000 | \n",
+ " 529727.140000 | \n",
+ " 59547 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " AMERICA | \n",
+ " UNITED STATES | \n",
+ " 525590.570000 | \n",
+ " 913.450000 | \n",
+ " 524677.120000 | \n",
+ " 59921 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " MIDDLE EAST | \n",
+ " IRAN | \n",
+ " 522644.480000 | \n",
+ " 924.510000 | \n",
+ " 521719.970000 | \n",
+ " 59675 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " AMERICA | \n",
+ " CANADA | \n",
+ " 515531.820000 | \n",
+ " 908.180000 | \n",
+ " 514623.640000 | \n",
+ " 60480 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " EUROPE | \n",
+ " FRANCE | \n",
+ " 508668.520000 | \n",
+ " 885.750000 | \n",
+ " 507782.770000 | \n",
+ " 61600 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " AFRICA | \n",
+ " MOZAMBIQUE | \n",
+ " 508047.990000 | \n",
+ " 896.590000 | \n",
+ " 507151.400000 | \n",
+ " 61267 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " ASIA | \n",
+ " VIETNAM | \n",
+ " 504509.060000 | \n",
+ " 911.670000 | \n",
+ " 503597.390000 | \n",
+ " 60347 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " ASIA | \n",
+ " JAPAN | \n",
+ " 502742.760000 | \n",
+ " 857.710000 | \n",
+ " 501885.050000 | \n",
+ " 59405 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " ASIA | \n",
+ " CHINA | \n",
+ " 499794.580000 | \n",
+ " 920.580000 | \n",
+ " 498874.000000 | \n",
+ " 60784 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " EUROPE | \n",
+ " GERMANY | \n",
+ " 499753.010000 | \n",
+ " 927.160000 | \n",
+ " 498825.850000 | \n",
+ " 59724 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " AFRICA | \n",
+ " ETHIOPIA | \n",
+ " 498810.260000 | \n",
+ " 866.900000 | \n",
+ " 497943.360000 | \n",
+ " 59706 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " AFRICA | \n",
+ " ALGERIA | \n",
+ " 498599.910000 | \n",
+ " 956.850000 | \n",
+ " 497643.060000 | \n",
+ " 59622 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " ASIA | \n",
+ " INDIA | \n",
+ " 496620.480000 | \n",
+ " 961.730000 | \n",
+ " 495658.750000 | \n",
+ " 59827 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " MIDDLE EAST | \n",
+ " SAUDI ARABIA | \n",
+ " 492147.150000 | \n",
+ " 896.800000 | \n",
+ " 491250.350000 | \n",
+ " 58453 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " EUROPE | \n",
+ " UNITED KINGDOM | \n",
+ " 490788.010000 | \n",
+ " 875.520000 | \n",
+ " 489912.490000 | \n",
+ " 59455 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " AFRICA | \n",
+ " KENYA | \n",
+ " 487405.740000 | \n",
+ " 945.800000 | \n",
+ " 486459.940000 | \n",
+ " 58940 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " AMERICA | \n",
+ " BRAZIL | \n",
+ " 487207.010000 | \n",
+ " 947.840000 | \n",
+ " 486259.170000 | \n",
+ " 60137 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " MIDDLE EAST | \n",
+ " JORDAN | \n",
+ " 487033.380000 | \n",
+ " 884.520000 | \n",
+ " 486148.860000 | \n",
+ " 60736 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " EUROPE | \n",
+ " ROMANIA | \n",
+ " 486417.970000 | \n",
+ " 945.170000 | \n",
+ " 485472.800000 | \n",
+ " 61012 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " AFRICA | \n",
+ " MOROCCO | \n",
+ " 484188.380000 | \n",
+ " 942.270000 | \n",
+ " 483246.110000 | \n",
+ " 59459 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " MIDDLE EAST | \n",
+ " IRAQ | \n",
+ " 483445.970000 | \n",
+ " 927.910000 | \n",
+ " 482518.060000 | \n",
+ " 58902 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " ASIA | \n",
+ " INDONESIA | \n",
+ " 474185.370000 | \n",
+ " 909.180000 | \n",
+ " 473276.190000 | \n",
+ " 61377 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " MIDDLE EAST | \n",
+ " EGYPT | \n",
+ " 472194.820000 | \n",
+ " 884.820000 | \n",
+ " 471310.000000 | \n",
+ " 59111 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " region_name nation_name max_order_value min_order_value \\\n",
+ "0 EUROPE RUSSIA 555285.160000 932.410000 \n",
+ "1 AMERICA PERU 544089.090000 891.740000 \n",
+ "2 AMERICA ARGENTINA 530604.440000 877.300000 \n",
+ "3 AMERICA UNITED STATES 525590.570000 913.450000 \n",
+ "4 MIDDLE EAST IRAN 522644.480000 924.510000 \n",
+ "5 AMERICA CANADA 515531.820000 908.180000 \n",
+ "6 EUROPE FRANCE 508668.520000 885.750000 \n",
+ "7 AFRICA MOZAMBIQUE 508047.990000 896.590000 \n",
+ "8 ASIA VIETNAM 504509.060000 911.670000 \n",
+ "9 ASIA JAPAN 502742.760000 857.710000 \n",
+ "10 ASIA CHINA 499794.580000 920.580000 \n",
+ "11 EUROPE GERMANY 499753.010000 927.160000 \n",
+ "12 AFRICA ETHIOPIA 498810.260000 866.900000 \n",
+ "13 AFRICA ALGERIA 498599.910000 956.850000 \n",
+ "14 ASIA INDIA 496620.480000 961.730000 \n",
+ "15 MIDDLE EAST SAUDI ARABIA 492147.150000 896.800000 \n",
+ "16 EUROPE UNITED KINGDOM 490788.010000 875.520000 \n",
+ "17 AFRICA KENYA 487405.740000 945.800000 \n",
+ "18 AMERICA BRAZIL 487207.010000 947.840000 \n",
+ "19 MIDDLE EAST JORDAN 487033.380000 884.520000 \n",
+ "20 EUROPE ROMANIA 486417.970000 945.170000 \n",
+ "21 AFRICA MOROCCO 484188.380000 942.270000 \n",
+ "22 MIDDLE EAST IRAQ 483445.970000 927.910000 \n",
+ "23 ASIA INDONESIA 474185.370000 909.180000 \n",
+ "24 MIDDLE EAST EGYPT 472194.820000 884.820000 \n",
+ "\n",
+ " order_value_difference total_orders \n",
+ "0 554352.750000 61495 \n",
+ "1 543197.350000 59018 \n",
+ "2 529727.140000 59547 \n",
+ "3 524677.120000 59921 \n",
+ "4 521719.970000 59675 \n",
+ "5 514623.640000 60480 \n",
+ "6 507782.770000 61600 \n",
+ "7 507151.400000 61267 \n",
+ "8 503597.390000 60347 \n",
+ "9 501885.050000 59405 \n",
+ "10 498874.000000 60784 \n",
+ "11 498825.850000 59724 \n",
+ "12 497943.360000 59706 \n",
+ "13 497643.060000 59622 \n",
+ "14 495658.750000 59827 \n",
+ "15 491250.350000 58453 \n",
+ "16 489912.490000 59455 \n",
+ "17 486459.940000 58940 \n",
+ "18 486259.170000 60137 \n",
+ "19 486148.860000 60736 \n",
+ "20 485472.800000 61012 \n",
+ "21 483246.110000 59459 \n",
+ "22 482518.060000 58902 \n",
+ "23 473276.190000 61377 \n",
+ "24 471310.000000 59111 "
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "# Retrieve customers' order details for each nation, calculating max, min, and order value differences\n",
+ "selected_customers_by_nation = nations(\n",
+ " region_name=region.name, \n",
+ " nation_name=name, \n",
+ " max_order_value=MAX(customers.orders.total_price), \n",
+ " min_order_value=MIN(customers.orders.total_price), \n",
+ " order_value_difference=MAX(customers.orders.total_price) - MIN(customers.orders.total_price), \n",
+ " total_orders=COUNT(customers.orders.total_price) \n",
+ ")\n",
+ "\n",
+ "# Sort the customers by the difference between max and min order values, in descending order\n",
+ "sorted_customers_by_order_value_difference = selected_customers_by_nation.ORDER_BY(order_value_difference.DESC())\n",
+ "\n",
+ "pydough.to_df(sorted_customers_by_order_value_difference)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 12. Nations with the Most Customers in a Specific Market Segment\n",
+ "\n",
+ "Find the nations with the most customers in the MACHINERY and AUTOMOBILE market segments.\n",
+ "\n",
+ "```SQL\n",
+ "\n",
+ "SELECT \n",
+ " n.n_name AS nation_name,\n",
+ " COUNT(c.c_custkey) AS customer_count\n",
+ "FROM nation n\n",
+ "JOIN customer c ON c.c_nationkey = n.n_nationkey\n",
+ "WHERE c.c_mktsegment IN ('MACHINERY', 'AUTOMOBILE') \n",
+ "GROUP BY n.n_name\n",
+ "ORDER BY customer_count DESC;\n```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, we filter the customers by market segment using WHERE together with ISIN, which plays the role of the IN operator in SQL. First, we keep only the customers that belong to the target market segments, \"MACHINERY\" and \"AUTOMOBILE\". Then, for each of those customers, we select the name of their nation and their customer key. Afterward, we use PARTITION to group the selected customers by nation name and count the customers in each nation. Finally, we sort the results by customer count in descending order. This allows us to identify the nations with the highest number of customers in the target market segments, helping us focus on the most relevant nations for business analysis or marketing efforts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " nation_name | \n",
+ " customer_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " ROMANIA | \n",
+ " 2545 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " INDONESIA | \n",
+ " 2489 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " CHINA | \n",
+ " 2481 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ETHIOPIA | \n",
+ " 2423 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " BRAZIL | \n",
+ " 2419 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " EGYPT | \n",
+ " 2414 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " RUSSIA | \n",
+ " 2414 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " GERMANY | \n",
+ " 2402 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " UNITED STATES | \n",
+ " 2399 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " JORDAN | \n",
+ " 2397 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " FRANCE | \n",
+ " 2396 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " CANADA | \n",
+ " 2383 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " KENYA | \n",
+ " 2383 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " ARGENTINA | \n",
+ " 2378 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " MOZAMBIQUE | \n",
+ " 2373 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " UNITED KINGDOM | \n",
+ " 2365 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " VIETNAM | \n",
+ " 2363 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " ALGERIA | \n",
+ " 2355 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " MOROCCO | \n",
+ " 2354 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " INDIA | \n",
+ " 2347 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " IRAQ | \n",
+ " 2347 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " JAPAN | \n",
+ " 2340 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " SAUDI ARABIA | \n",
+ " 2329 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " PERU | \n",
+ " 2328 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " IRAN | \n",
+ " 2277 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " nation_name customer_count\n",
+ "0 ROMANIA 2545\n",
+ "1 INDONESIA 2489\n",
+ "2 CHINA 2481\n",
+ "3 ETHIOPIA 2423\n",
+ "4 BRAZIL 2419\n",
+ "5 EGYPT 2414\n",
+ "6 RUSSIA 2414\n",
+ "7 GERMANY 2402\n",
+ "8 UNITED STATES 2399\n",
+ "9 JORDAN 2397\n",
+ "10 FRANCE 2396\n",
+ "11 CANADA 2383\n",
+ "12 KENYA 2383\n",
+ "13 ARGENTINA 2378\n",
+ "14 MOZAMBIQUE 2373\n",
+ "15 UNITED KINGDOM 2365\n",
+ "16 VIETNAM 2363\n",
+ "17 ALGERIA 2355\n",
+ "18 MOROCCO 2354\n",
+ "19 INDIA 2347\n",
+ "20 IRAQ 2347\n",
+ "21 JAPAN 2340\n",
+ "22 SAUDI ARABIA 2329\n",
+ "23 PERU 2328\n",
+ "24 IRAN 2277"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Retrieve customers in the 'MACHINERY' or 'AUTOMOBILE' market segments\n",
+ "customer_in_target_mktsegment = customers.WHERE(ISIN(mktsegment, ('MACHINERY', 'AUTOMOBILE')))\n",
+ "\n",
+ "# For each selected customer, keep the name of its nation and the customer key\n",
+ "selected_customers_by_nation = customer_in_target_mktsegment(\n",
+ " nation_name=nation.name, \n",
+ " customer_key=key \n",
+ ")\n",
+ "\n",
+ "# Group selected customers by nation and calculate the count of customers in each nation\n",
+ "nation_customer_count = PARTITION(selected_customers_by_nation, name=\"customer\", by=(nation_name))(\n",
+ " nation_name=nation_name, \n",
+ " customer_count=COUNT(customer.key) \n",
+ ").ORDER_BY(customer_count.DESC()) \n",
+ "\n",
+ "\n",
+ "pydough.to_df(nation_customer_count)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 14. Region with the Highest Percentage of High-Priority Orders\n",
+ "\n",
+ "Calculate the percentage of high-priority orders (e.g., '1-URGENT', '2-HIGH') for each region.\n",
+ "\n",
+ "```SQL\n",
+ "\n",
+ "SELECT r.r_name AS region_name, \n",
+ " ROUND(\n",
+ " SUM(\n",
+ " CASE \n",
+ " WHEN o.o_orderpriority IN ('1-URGENT', '2-HIGH') THEN 1 \n",
+ " ELSE 0 \n",
+ " END\n",
+ " ) * 100.0 / COUNT(o.o_orderkey),\n",
+ " 2\n",
+ " ) AS high_priority_percentage\n",
+ " \n",
+ "FROM orders o\n",
+ "JOIN customer c ON o.o_custkey = c.c_custkey\n",
+ "JOIN nation n ON c.c_nationkey = n.n_nationkey\n",
+ "JOIN region r ON n.n_regionkey = r.r_regionkey\n",
+ "GROUP BY r.r_name\n",
+ "ORDER BY high_priority_percentage DESC;\n```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We start by annotating every customer with the name of its region, stored as region_name and reached through nation.region.name; no customers are filtered out at this step. Then, we step into the orders placed by these customers, using BACK(1) to carry region_name from the customer down to each order. Afterward, we partition the orders by region and calculate the percentage of high-priority orders for each region. Specifically, ISIN flags the orders whose priority is \"1-URGENT\" or \"2-HIGH\"; we SUM those flags, multiply by 100, divide by the total number of orders in the region, and ROUND the result to 2 decimal places. Finally, we sort the results by the high-priority order percentage in descending order. This allows us to identify the regions with the highest percentage of high-priority orders, providing valuable insights into order urgency across different regions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " region_name | \n",
+ " high_priority_percentage | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " MIDDLE EAST | \n",
+ " 40.200000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " AMERICA | \n",
+ " 40.160000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " EUROPE | \n",
+ " 39.990000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " ASIA | \n",
+ " 39.910000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " AFRICA | \n",
+ " 39.890000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " region_name high_priority_percentage\n",
+ "0 MIDDLE EAST 40.200000\n",
+ "1 AMERICA 40.160000\n",
+ "2 EUROPE 39.990000\n",
+ "3 ASIA 39.910000\n",
+ "4 AFRICA 39.890000"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Retrieve customers based on their region\n",
+ "customers_in_region = customers(region_name=nation.region.name)\n",
+ "\n",
+ "# Retrieve orders placed by customers in the selected region\n",
+ "selected_orders_by_region = customers_in_region.orders(\n",
+ " customer_key=key,\n",
+ " region_name=BACK(1).region_name \n",
+ ")\n",
+ "\n",
+ "# Partition the selected orders by region and calculate the high-priority order percentage for each region\n",
+ "region_priority_summary = PARTITION(selected_orders_by_region, name=\"order\", by=(region_name))(\n",
+ " region_name=region_name, \n",
+ " high_priority_percentage=ROUND(\n",
+ " (SUM(ISIN(order.order_priority, ('1-URGENT', '2-HIGH'))) * 100) / COUNT(order.customer_key), 2\n",
+ " ) \n",
+ ").ORDER_BY(high_priority_percentage.DESC()) \n",
+ "\n",
+ "pydough.to_df(region_priority_summary)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 15. Customers Who Have Never Placed Orders\n",
+ "\n",
+ "The next task is to identify customers who have not placed any orders. The query retrieves the customer details, such as their customer key and name.\n",
+ "\n",
+ "```SQL\n",
+ "SELECT c.c_custkey, c.c_name\n",
+ "FROM customer c\n",
+ "LEFT JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ "WHERE o.o_orderkey IS NULL;\n```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In PyDough, we use the HAS and HASNOT functions to express this kind of IS NULL check. HAS returns True if at least one record of the sub-collection exists, and HASNOT returns True if the sub-collection has no records at all. So the first step is to filter the customers who have not placed any orders using the WHERE clause combined with the HASNOT function. This identifies the customers who have no associated orders. Then, we select these customers by retrieving their unique customer_key and customer_name. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_key | \n",
+ " customer_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 3 | \n",
+ " Customer#000000003 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 6 | \n",
+ " Customer#000000006 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 9 | \n",
+ " Customer#000000009 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 12 | \n",
+ " Customer#000000012 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 15 | \n",
+ " Customer#000000015 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 49999 | \n",
+ " 149988 | \n",
+ " Customer#000149988 | \n",
+ "
\n",
+ " \n",
+ " | 50000 | \n",
+ " 149991 | \n",
+ " Customer#000149991 | \n",
+ "
\n",
+ " \n",
+ " | 50001 | \n",
+ " 149994 | \n",
+ " Customer#000149994 | \n",
+ "
\n",
+ " \n",
+ " | 50002 | \n",
+ " 149997 | \n",
+ " Customer#000149997 | \n",
+ "
\n",
+ " \n",
+ " | 50003 | \n",
+ " 150000 | \n",
+ " Customer#000150000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
50004 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_key customer_name\n",
+ "0 3 Customer#000000003\n",
+ "1 6 Customer#000000006\n",
+ "2 9 Customer#000000009\n",
+ "3 12 Customer#000000012\n",
+ "4 15 Customer#000000015\n",
+ "... ... ...\n",
+ "49999 149988 Customer#000149988\n",
+ "50000 149991 Customer#000149991\n",
+ "50001 149994 Customer#000149994\n",
+ "50002 149997 Customer#000149997\n",
+ "50003 150000 Customer#000150000\n",
+ "\n",
+ "[50004 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Retrieve customers who have not placed any orders\n",
+ "customers_without_orders = customers.WHERE(HASNOT(orders))\n",
+ "\n",
+ "# Select the customers who do not have any orders, retrieving their unique key and name\n",
+ "selected_customers_without_orders = customers_without_orders(\n",
+ " customer_key=key, \n",
+ " customer_name=name \n",
+ ")\n",
+ "\n",
+ "pydough.to_df(selected_customers_without_orders)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 16. Customer Activity by Nation\n",
+ "Question: How many total, active, and inactive customers are there in each nation, sorted by the total number of customers?\n",
+ "\n",
+ "Purpose: This query shows the total number of customers, active customers (those with orders), and inactive customers (those without orders) for each nation. The results are sorted by the total number of customers.\n",
+ "\n",
+ "```SQL\n",
+ "\n",
+ "SELECT\n",
+ " n.n_name,\n",
+ " COUNT(DISTINCT c.c_custkey) AS total_customers,\n",
+ " COUNT(DISTINCT CASE WHEN o.o_orderkey IS NOT NULL THEN c.c_custkey END) AS active_customers,\n",
+ " COUNT(DISTINCT CASE WHEN o.o_orderkey IS NULL THEN c.c_custkey END) AS inactive_customers\n",
+ "FROM\n",
+ " nation n\n",
+ "JOIN customer c ON n.n_nationkey = c.c_nationkey\n",
+ "LEFT JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ "GROUP BY n.n_name\n",
+    "ORDER BY total_customers DESC;\n",
+    "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "In PyDough, to achieve this, we first annotate each customer with the name of their nation and classify them as active or inactive based on their order history using the KEEP_IF function, which is similar to the SQL expression CASE WHEN b THEN a END: the customer's key is kept only when the condition holds and is null otherwise. Then, we partition the customers by nation, calculating the total, active, and inactive customer counts for each nation. Finally, we sort the results by total customers in descending order, allowing us to identify nations with the highest customer distribution and activity levels."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " nation_name | \n",
+ " total_customers | \n",
+ " active_customers | \n",
+ " inactive_customers | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " INDONESIA | \n",
+ " 6161 | \n",
+ " 4081 | \n",
+ " 2080 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " FRANCE | \n",
+ " 6100 | \n",
+ " 4149 | \n",
+ " 1951 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " ROMANIA | \n",
+ " 6100 | \n",
+ " 4087 | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " RUSSIA | \n",
+ " 6078 | \n",
+ " 4089 | \n",
+ " 1989 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " INDIA | \n",
+ " 6042 | \n",
+ " 3958 | \n",
+ " 2084 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " JORDAN | \n",
+ " 6033 | \n",
+ " 4025 | \n",
+ " 2008 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " CHINA | \n",
+ " 6024 | \n",
+ " 4011 | \n",
+ " 2013 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " CANADA | \n",
+ " 6020 | \n",
+ " 4006 | \n",
+ " 2014 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " UNITED KINGDOM | \n",
+ " 6011 | \n",
+ " 3989 | \n",
+ " 2022 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " IRAN | \n",
+ " 6009 | \n",
+ " 4013 | \n",
+ " 1996 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " VIETNAM | \n",
+ " 6008 | \n",
+ " 3985 | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " BRAZIL | \n",
+ " 5999 | \n",
+ " 4024 | \n",
+ " 1975 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " EGYPT | \n",
+ " 5995 | \n",
+ " 3972 | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " KENYA | \n",
+ " 5992 | \n",
+ " 3987 | \n",
+ " 2005 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " UNITED STATES | \n",
+ " 5983 | \n",
+ " 3984 | \n",
+ " 1999 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " ARGENTINA | \n",
+ " 5975 | \n",
+ " 3985 | \n",
+ " 1990 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " PERU | \n",
+ " 5975 | \n",
+ " 3909 | \n",
+ " 2066 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " MOZAMBIQUE | \n",
+ " 5974 | \n",
+ " 4080 | \n",
+ " 1894 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " IRAQ | \n",
+ " 5963 | \n",
+ " 3917 | \n",
+ " 2046 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " ETHIOPIA | \n",
+ " 5952 | \n",
+ " 3991 | \n",
+ " 1961 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " JAPAN | \n",
+ " 5948 | \n",
+ " 3989 | \n",
+ " 1959 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " ALGERIA | \n",
+ " 5925 | \n",
+ " 3940 | \n",
+ " 1985 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " MOROCCO | \n",
+ " 5921 | \n",
+ " 3957 | \n",
+ " 1964 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " GERMANY | \n",
+ " 5908 | \n",
+ " 3974 | \n",
+ " 1934 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " SAUDI ARABIA | \n",
+ " 5904 | \n",
+ " 3894 | \n",
+ " 2010 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " nation_name total_customers active_customers inactive_customers\n",
+ "0 INDONESIA 6161 4081 2080\n",
+ "1 FRANCE 6100 4149 1951\n",
+ "2 ROMANIA 6100 4087 2013\n",
+ "3 RUSSIA 6078 4089 1989\n",
+ "4 INDIA 6042 3958 2084\n",
+ "5 JORDAN 6033 4025 2008\n",
+ "6 CHINA 6024 4011 2013\n",
+ "7 CANADA 6020 4006 2014\n",
+ "8 UNITED KINGDOM 6011 3989 2022\n",
+ "9 IRAN 6009 4013 1996\n",
+ "10 VIETNAM 6008 3985 2023\n",
+ "11 BRAZIL 5999 4024 1975\n",
+ "12 EGYPT 5995 3972 2023\n",
+ "13 KENYA 5992 3987 2005\n",
+ "14 UNITED STATES 5983 3984 1999\n",
+ "15 ARGENTINA 5975 3985 1990\n",
+ "16 PERU 5975 3909 2066\n",
+ "17 MOZAMBIQUE 5974 4080 1894\n",
+ "18 IRAQ 5963 3917 2046\n",
+ "19 ETHIOPIA 5952 3991 1961\n",
+ "20 JAPAN 5948 3989 1959\n",
+ "21 ALGERIA 5925 3940 1985\n",
+ "22 MOROCCO 5921 3957 1964\n",
+ "23 GERMANY 5908 3974 1934\n",
+ "24 SAUDI ARABIA 5904 3894 2010"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+ "# Retrieve customers by nation, classifying them into active and inactive based on order history\n",
+ "customers_by_nation = customers(\n",
+ " customer_nation_name=nation.name, \n",
+ " active_customers=KEEP_IF(key, HAS(orders)), \n",
+ " inactive_customers=KEEP_IF(key, HASNOT(orders)) \n",
+ ")\n",
+ "\n",
+ "# Partition the selected customers by nation and calculate customer statistics\n",
+ "nation_customer_summary = PARTITION(customers_by_nation, \"customer\", by=customer_nation_name)(\n",
+ " nation_name=customer_nation_name, \n",
+ " total_customers=COUNT(customer), \n",
+ " active_customers=NDISTINCT(customer.active_customers), \n",
+ " inactive_customers=NDISTINCT(customer.inactive_customers), \n",
+ ").ORDER_BY(total_customers.DESC()) # Sort by total customers in descending order\n",
+ "\n",
+ "pydough.to_df(nation_customer_summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### 17. Customers with High Balance but Low Order Activity\n",
+ "\n",
+ "Question: Retrieve customers who belong to the top 10% in account balance but rank in the bottom 25% in terms of order activity\n",
+ "\n",
+ "Purpose: Find customers with top 10% account balances but bottom 25% order activity.\n",
+ "\n",
+ "```SQL\n",
+ "SELECT c_name, c_acctbal, total_orders\n",
+ "FROM (\n",
+ " SELECT \n",
+ " c.c_name,\n",
+ " c.c_acctbal,\n",
+ " COUNT(o.o_orderkey) AS total_orders,\n",
+ " PERCENT_RANK() OVER (ORDER BY c.c_acctbal DESC) AS balance_percentile,\n",
+ " PERCENT_RANK() OVER (ORDER BY COUNT(o.o_orderkey)) AS order_activity_percentile\n",
+ " FROM customer c\n",
+ " LEFT JOIN orders o ON c.c_custkey = o.o_custkey\n",
+ " GROUP BY c.c_custkey, c.c_name, c.c_acctbal\n",
+ ") sub\n",
+ "WHERE \n",
+ " balance_percentile <= 0.1 \n",
+ " AND order_activity_percentile <= 0.25 \n",
+    "ORDER BY c_acctbal DESC;\n",
+    "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ara/miniconda3/envs/PyDough/lib/python3.12/site-packages/pydough/sqlglot/sqlglot_relational_expression_visitor.py:82: UserWarning: PyDough when using SQLITE dialect does not support ascending ordering with nulls last (changed to nulls first)\n",
+ " warnings.warn(\n",
+ "/home/ara/miniconda3/envs/PyDough/lib/python3.12/site-packages/pydough/sqlglot/sqlglot_relational_expression_visitor.py:88: UserWarning: PyDough when using SQLITE dialect does not support ascending ordering with nulls first (changed to nulls last)\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " key | \n",
+ " name | \n",
+ " acctbal | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 69321 | \n",
+ " Customer#000069321 | \n",
+ " 9999.960000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2487 | \n",
+ " Customer#000002487 | \n",
+ " 9999.720000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 43044 | \n",
+ " Customer#000043044 | \n",
+ " 9999.490000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 76146 | \n",
+ " Customer#000076146 | \n",
+ " 9999.230000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 34047 | \n",
+ " Customer#000034047 | \n",
+ " 9998.970000 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 3708 | \n",
+ " 62682 | \n",
+ " Customer#000062682 | \n",
+ " 8894.780000 | \n",
+ "
\n",
+ " \n",
+ " | 3709 | \n",
+ " 82611 | \n",
+ " Customer#000082611 | \n",
+ " 8894.490000 | \n",
+ "
\n",
+ " \n",
+ " | 3710 | \n",
+ " 13560 | \n",
+ " Customer#000013560 | \n",
+ " 8894.430000 | \n",
+ "
\n",
+ " \n",
+ " | 3711 | \n",
+ " 78429 | \n",
+ " Customer#000078429 | \n",
+ " 8894.390000 | \n",
+ "
\n",
+ " \n",
+ " | 3712 | \n",
+ " 99744 | \n",
+ " Customer#000099744 | \n",
+ " 8894.210000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3713 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " key name acctbal\n",
+ "0 69321 Customer#000069321 9999.960000\n",
+ "1 2487 Customer#000002487 9999.720000\n",
+ "2 43044 Customer#000043044 9999.490000\n",
+ "3 76146 Customer#000076146 9999.230000\n",
+ "4 34047 Customer#000034047 9998.970000\n",
+ "... ... ... ...\n",
+ "3708 62682 Customer#000062682 8894.780000\n",
+ "3709 82611 Customer#000082611 8894.490000\n",
+ "3710 13560 Customer#000013560 8894.430000\n",
+ "3711 78429 Customer#000078429 8894.390000\n",
+ "3712 99744 Customer#000099744 8894.210000\n",
+ "\n",
+ "[3713 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%pydough\n",
+ "\n",
+    "# Filter customers who are in the top 10% by account balance and the bottom 25% by order count\n",
+ "customers_in_low_percentiles = customers.WHERE(\n",
+ " (PERCENTILE(by=acctbal.DESC()) <= 10) \n",
+ " & (PERCENTILE(by=COUNT(orders.key).ASC()) <= 25) \n",
+ ")\n",
+ "\n",
+ "# Select the filtered customers, retrieving their key, name, and account balance\n",
+ "selected_customers_in_low_percentiles = customers_in_low_percentiles(\n",
+ " customer_key=key, \n",
+ " customer_name=name, \n",
+ " account_balance=acctbal \n",
+ ")\n",
+ "\n",
+ "# Partition the selected customers by key, name, and account balance, and order by account balance\n",
+ "customer_activity_summary = PARTITION(selected_customers_in_low_percentiles, name=\"customer\", by=(key, name, acctbal))(\n",
+ " key=key, \n",
+ " name=name, \n",
+ " acctbal=acctbal \n",
+ ").ORDER_BY(acctbal.DESC()) \n",
+ "\n",
+ "pydough.to_df(customer_activity_summary)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "PyDough",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}