diff --git a/conda/rapids-gpu-bdb-dask-sql.yml b/conda/rapids-gpu-bdb-dask-sql.yml new file mode 100755 index 00000000..54156f8c --- /dev/null +++ b/conda/rapids-gpu-bdb-dask-sql.yml @@ -0,0 +1,33 @@ +channels: + - rapidsai-nightly + - nvidia + - conda-forge + +dependencies: + - python=3.8 + - cudatoolkit=11.2 + - cudf + - rmm + - dask-cuda + - dask-cudf + - cuml + - dask + - distributed + - ucx-py + - ucx-proc=*=gpu + - dask-sql>=2022.1 + - numba=0.54.* + - scipy + - scikit-learn + - cupy + - spacy=2.3 + - oauth2client + - asyncssh + - psutil + - ipykernel + - jupyterlab + - gspread + - pytest + - pip + - pip: + - jupyter-server-proxy diff --git a/gpu_bdb/bdb_tools/__init__.py b/gpu_bdb/bdb_tools/__init__.py index ccbb1625..2b586df8 100755 --- a/gpu_bdb/bdb_tools/__init__.py +++ b/gpu_bdb/bdb_tools/__init__.py @@ -1 +1,4 @@ # Copyright (c) 2020, NVIDIA CORPORATION. + +from .rmm_monitor import RMMResourceMonitor +from .dasktasklogger import DaskTaskLogger diff --git a/gpu_bdb/bdb_tools/cluster_startup.py b/gpu_bdb/bdb_tools/cluster_startup.py index e90e2737..02e56640 100755 --- a/gpu_bdb/bdb_tools/cluster_startup.py +++ b/gpu_bdb/bdb_tools/cluster_startup.py @@ -24,43 +24,13 @@ from dask.utils import parse_bytes -def get_bsql_config_options(): - """Loads configuration environment variables. - In case it is not previously set, returns a default value for each one. - - Returns a dictionary object. 
- For more info: https://docs.blazingdb.com/docs/config_options - """ - config_options = {} - config_options['JOIN_PARTITION_SIZE_THRESHOLD'] = os.environ.get("JOIN_PARTITION_SIZE_THRESHOLD", 300000000) - config_options['MAX_DATA_LOAD_CONCAT_CACHE_BYTE_SIZE'] = os.environ.get("MAX_DATA_LOAD_CONCAT_CACHE_BYTE_SIZE", 400000000) - config_options['BLAZING_DEVICE_MEM_CONSUMPTION_THRESHOLD'] = os.environ.get("BLAZING_DEVICE_MEM_CONSUMPTION_THRESHOLD", 0.6) - config_options['BLAZ_HOST_MEM_CONSUMPTION_THRESHOLD'] = os.environ.get("BLAZ_HOST_MEM_CONSUMPTION_THRESHOLD", 0.6) - config_options['MAX_KERNEL_RUN_THREADS'] = os.environ.get("MAX_KERNEL_RUN_THREADS", 3) - config_options['TABLE_SCAN_KERNEL_NUM_THREADS'] = os.environ.get("TABLE_SCAN_KERNEL_NUM_THREADS", 1) - config_options['MAX_NUM_ORDER_BY_PARTITIONS_PER_NODE'] = os.environ.get("MAX_NUM_ORDER_BY_PARTITIONS_PER_NODE", 20) - config_options['NUM_BYTES_PER_ORDER_BY_PARTITION'] = os.environ.get("NUM_BYTES_PER_ORDER_BY_PARTITION", 400000000) - config_options['MAX_ORDER_BY_SAMPLES_PER_NODE'] = os.environ.get("MAX_ORDER_BY_SAMPLES_PER_NODE", 10000) - config_options['MAX_SEND_MESSAGE_THREADS'] = os.environ.get("MAX_SEND_MESSAGE_THREADS", 20) - config_options['MEMORY_MONITOR_PERIOD'] = os.environ.get("MEMORY_MONITOR_PERIOD", 50) - config_options['TRANSPORT_BUFFER_BYTE_SIZE'] = os.environ.get("TRANSPORT_BUFFER_BYTE_SIZE", 1048576) # 1 MBs - config_options['TRANSPORT_POOL_NUM_BUFFERS'] = os.environ.get("TRANSPORT_POOL_NUM_BUFFERS", 1000) - config_options['BLAZING_LOGGING_DIRECTORY'] = os.environ.get("BLAZING_LOGGING_DIRECTORY", 'blazing_log') - config_options['BLAZING_CACHE_DIRECTORY'] = os.environ.get("BLAZING_CACHE_DIRECTORY", '/tmp/') - config_options['LOGGING_LEVEL'] = os.environ.get("LOGGING_LEVEL", "trace") - config_options['MAX_JOIN_SCATTER_MEM_OVERHEAD'] = os.environ.get("MAX_JOIN_SCATTER_MEM_OVERHEAD", 500000000) - config_options['PROTOCOL'] = os.environ.get("PROTOCOL", "AUTO") - - return config_options - - -def 
attach_to_cluster(config, create_blazing_context=False): +def attach_to_cluster(config, create_sql_context=False): """Attaches to an existing cluster if available. By default, tries to attach to a cluster running on localhost:8786 (dask's default). This is currently hardcoded to assume the dashboard is running on port 8787. - Optionally, this will also create a BlazingContext. + Optionally, this will also create a Dask-SQL Context. """ scheduler_file = config.get("scheduler_file_path") host = config.get("cluster_host") @@ -131,19 +101,12 @@ def maybe_create_worker_directories(dask_worker): config["40GB_workers"] = worker_counts.get("40GB", 0) config["80GB_workers"] = worker_counts.get("80GB", 0) - bc = None - if create_blazing_context: - from blazingsql import BlazingContext - bc = BlazingContext( - dask_client=client, - pool=os.environ.get("BLAZING_POOL", False), - network_interface=os.environ.get("INTERFACE", "ib0"), - config_options=get_bsql_config_options(), - allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "existing"), - initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE", None) - ) + c = None + if create_sql_context: + from dask_sql import Context + c = Context() - return client, bc + return client, c def worker_count_info(client): @@ -173,7 +136,7 @@ def _get_ucx_config(): Get a subset of ucx config variables relevant for benchmarking """ relevant_configs = ["infiniband", "nvlink"] - ucx_config = dask.config.get("ucx") + ucx_config = dask.config.get("distributed.comm.ucx") # Doing this since when relevant configs are not enabled the value is `None` instead of `False` filtered_ucx_config = { config: ucx_config.get(config) if ucx_config.get(config) else False @@ -196,11 +159,5 @@ def import_query_libs(): "spacy", ] - # optionally include blazingsql - # this is brittle, but it resolves breaking change - # issues as we can control the environment - if os.environ.get("RUNNER_INCLUDE_BSQL"): - library_list.append("blazingsql") - for lib in 
library_list: importlib.import_module(lib) diff --git a/gpu_bdb/bdb_tools/dasktasklogger.py b/gpu_bdb/bdb_tools/dasktasklogger.py new file mode 100644 index 00000000..fcb4b9bb --- /dev/null +++ b/gpu_bdb/bdb_tools/dasktasklogger.py @@ -0,0 +1,20 @@ +import re +import os +import json +import numpy as np + +class DaskTaskLogger(): + key_expr=re.compile( '([\w-]+)-([0-9a-f-]{32,36})' ) + + def __init__(self, client, outputdir='/tmp'): + self._client=client + self._outputdir=outputdir + + def mark_begin( self ): + self._client.get_task_stream() + + def save_tasks( self, prefix='dask' ): + plotfname=os.path.join(self._outputdir, f"{prefix}_plot.html") + pdata, pfigure = self._client.get_task_stream(plot='save', filename=plotfname) + with open( os.path.join(self._outputdir, f"{prefix}_tasks.json"), 'w') as outf: + json.dump([{k:t[k] for k in filter( lambda x: type(t[x]) != bytes().__class__, t)} for t in pdata],outf) diff --git a/gpu_bdb/bdb_tools/q01_utils.py b/gpu_bdb/bdb_tools/q01_utils.py new file mode 100644 index 00000000..471b96f8 --- /dev/null +++ b/gpu_bdb/bdb_tools/q01_utils.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +from bdb_tools.readers import build_reader + +# -------- Q1 ----------- +q01_i_category_id_IN = 1, 2, 3 +# -- sf1 -> 11 stores, 90k sales in 820k lines +q01_ss_store_sk_IN = 10, 20, 33, 40, 50 +q01_viewed_together_count = 50 +q01_limit = 100 + + +item_cols = ["i_item_sk", "i_category_id"] +ss_cols = ["ss_item_sk", "ss_store_sk", "ss_ticket_number"] + + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + item_df = table_reader.read("item", relevant_cols=item_cols) + ss_df = table_reader.read("store_sales", relevant_cols=ss_cols) + + if c: + c.create_table("item", item_df, persist=False) + c.create_table("store_sales", ss_df, persist=False) + + return item_df, ss_df + diff --git a/gpu_bdb/bdb_tools/q02_utils.py b/gpu_bdb/bdb_tools/q02_utils.py new file mode 100644 index 00000000..b9f058a6 --- /dev/null +++ b/gpu_bdb/bdb_tools/q02_utils.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.readers import build_reader + +q02_item_sk = 10001 +q02_limit = 30 +q02_session_timeout_inSec = 3600 +q02_MAX_ITEMS_PER_BASKET = 5000000 + + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + wcs_cols = ["wcs_user_sk", "wcs_item_sk", "wcs_click_date_sk", "wcs_click_time_sk"] + wcs_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols) + + if c: + c.create_table("web_clickstreams", wcs_df, persist=False) + + return wcs_df + diff --git a/gpu_bdb/bdb_tools/q03_utils.py b/gpu_bdb/bdb_tools/q03_utils.py new file mode 100644 index 00000000..1a4fb387 --- /dev/null +++ b/gpu_bdb/bdb_tools/q03_utils.py @@ -0,0 +1,138 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import cudf + +from numba import cuda + +from bdb_tools.readers import build_reader + +q03_days_in_sec_before_purchase = 864000 +q03_views_before_purchase = 5 +q03_purchased_item_IN = 10001 +q03_purchased_item_category_IN = 2, 3 +q03_limit = 100 + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + item_cols = ["i_category_id", "i_item_sk"] + wcs_cols = [ + "wcs_user_sk", + "wcs_click_time_sk", + "wcs_click_date_sk", + "wcs_item_sk", + "wcs_sales_sk", + ] + + item_df = table_reader.read("item", relevant_cols=item_cols) + wcs_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols) + + if c: + c.create_table("web_clickstreams", wcs_df, persist=False) + c.create_table("item", item_df, persist=False) + + return item_df + + +@cuda.jit +def find_items_viewed_before_purchase_kernel( + relevant_idx_col, user_col, timestamp_col, item_col, out_col, N +): + """ + Find the past N items viewed before a relevant purchase was made, + as defined by the configuration of this query. 
+ """ + i = cuda.grid(1) + + if i < (relevant_idx_col.size): # boundary guard + # every relevant row gets N rows in the output, so we need to map the indexes + # back into their position in the original array + orig_idx = relevant_idx_col[i] + current_user = user_col[orig_idx] + + # look at the previous N clicks (assume sorted descending) + rows_to_check = N + remaining_rows = user_col.size - orig_idx + + if remaining_rows <= rows_to_check: + rows_to_check = remaining_rows - 1 + + for k in range(1, rows_to_check + 1): + if current_user != user_col[orig_idx + k]: + out_col[i * N + k - 1] = 0 + + # only checking relevant purchases via the relevant_idx_col + elif (timestamp_col[orig_idx + k] <= timestamp_col[orig_idx]) & ( + timestamp_col[orig_idx + k] + >= (timestamp_col[orig_idx] - q03_days_in_sec_before_purchase) + ): + out_col[i * N + k - 1] = item_col[orig_idx + k] + else: + out_col[i * N + k - 1] = 0 + + +def apply_find_items_viewed(df, item_mappings): + + # need to sort descending to ensure that the + # next N rows are the previous N clicks + df = df.sort_values( + by=["wcs_user_sk", "tstamp", "wcs_sales_sk", "wcs_item_sk"], + ascending=[False, False, False, False], + ) + df.reset_index(drop=True, inplace=True) + df["relevant_flag"] = (df.wcs_sales_sk != 0) & ( + df.wcs_item_sk == q03_purchased_item_IN + ) + df["relevant_idx_pos"] = df.index.to_series() + df.reset_index(drop=True, inplace=True) + # only allocate output for the relevant rows + sample = df.loc[df.relevant_flag == True] + sample.reset_index(drop=True, inplace=True) + + N = q03_views_before_purchase + size = len(sample) + + # we know this can be int32, since it's going to contain item_sks + out_arr = cuda.device_array(size * N, dtype=df["wcs_item_sk"].dtype) + + find_items_viewed_before_purchase_kernel.forall(size)( + sample["relevant_idx_pos"], + df["wcs_user_sk"], + df["tstamp"], + df["wcs_item_sk"], + out_arr, + N, + ) + + result = cudf.DataFrame({"prior_item_viewed": out_arr}) + + del out_arr + 
del df + del sample + + filtered = result.merge( + item_mappings, + how="inner", + left_on=["prior_item_viewed"], + right_on=["i_item_sk"], + ) + return filtered + diff --git a/gpu_bdb/bdb_tools/q04_utils.py b/gpu_bdb/bdb_tools/q04_utils.py new file mode 100644 index 00000000..b848f840 --- /dev/null +++ b/gpu_bdb/bdb_tools/q04_utils.py @@ -0,0 +1,96 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import cudf + +from bdb_tools.sessionization import get_sessions + +from bdb_tools.readers import build_reader + + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + wp_cols = ["wp_type", "wp_web_page_sk"] + wp_df = table_reader.read("web_page", relevant_cols=wp_cols) + + wcs_cols = [ + "wcs_user_sk", + "wcs_click_date_sk", + "wcs_click_time_sk", + "wcs_web_page_sk", + "wcs_sales_sk", + ] + wcs_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols) + + if c: + c.create_table('web_page_wo_categorical', wp_df, persist=False) + c.create_table('web_clickstreams', wcs_df, persist=False) + + return wp_df, wcs_df + + +def abandonedShoppingCarts(df, DYNAMIC_CAT_CODE, ORDER_CAT_CODE): + + # Select groups where last dynamic row comes after last order row + filtered_df = df[ + (df["wp_type_codes"] == ORDER_CAT_CODE) + | (df["wp_type_codes"] == DYNAMIC_CAT_CODE) + ] + # Create a new 
column that is the concatenation of timestamp and wp_type_codes + # (eg:123456:3, 234567:5) + filtered_df["wp_type_codes"] = ( + filtered_df["tstamp_inSec"] + .astype("str") + .str.cat(filtered_df["wp_type_codes"].astype("str"), sep=":") + ) + # This gives the last occurrence (by timestamp) within the "order", "dynamic" wp_types + filtered_df = filtered_df.groupby( + ["wcs_user_sk", "session_id"], as_index=False, sort=False + ).agg({"wp_type_codes": "max"}) + # If the max contains dynamic, keep the row else discard. + last_dynamic_df = filtered_df[ + filtered_df["wp_type_codes"].str.contains( + ":" + str(DYNAMIC_CAT_CODE), regex=False + ) + ] + del filtered_df + + # Find counts for each group + grouped_count_df = df.groupby( + ["wcs_user_sk", "session_id"], as_index=False, sort=False + ).agg({"tstamp_inSec": "count"}) + # Merge counts with the "dynamic" shopping cart groups + result = last_dynamic_df.merge( + grouped_count_df, on=["wcs_user_sk", "session_id"], how="inner" + ) + del (last_dynamic_df, grouped_count_df) + return cudf.DataFrame( + {"pagecount": result.tstamp_inSec.sum(), "count": len(result)} + ) + + +def reduction_function(df, keep_cols, DYNAMIC_CAT_CODE, ORDER_CAT_CODE): + df = get_sessions(df, keep_cols=keep_cols) + df = abandonedShoppingCarts( + df, DYNAMIC_CAT_CODE=DYNAMIC_CAT_CODE, ORDER_CAT_CODE=ORDER_CAT_CODE + ) + return df + diff --git a/gpu_bdb/bdb_tools/q05_utils.py b/gpu_bdb/bdb_tools/q05_utils.py new file mode 100644 index 00000000..c4b71fd0 --- /dev/null +++ b/gpu_bdb/bdb_tools/q05_utils.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import cupy as cp + +import cuml +from cuml.metrics import confusion_matrix + +from bdb_tools.cupy_metrics import cupy_precision_score + +from bdb_tools.readers import build_reader + +from sklearn.metrics import roc_auc_score + +# Logistic Regression params +# solver = "LBFGS" Used by passing `penalty=None` or "l2" +# step_size = 1 Not used +# numCorrections = 10 Not used +iterations = 100 +C = 10_000 # reg_lambda = 0 hence C for model is a large value +convergence_tol = 1e-9 + +wcs_columns = ["wcs_item_sk", "wcs_user_sk"] +items_columns = ["i_item_sk", "i_category", "i_category_id"] +customer_columns = ["c_customer_sk", "c_current_cdemo_sk"] +customer_dem_columns = ["cd_demo_sk", "cd_gender", "cd_education_status"] + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + item_ddf = table_reader.read("item", relevant_cols=items_columns, index=False) + customer_ddf = table_reader.read( + "customer", relevant_cols=customer_columns, index=False + ) + customer_dem_ddf = table_reader.read( + "customer_demographics", relevant_cols=customer_dem_columns, index=False + ) + wcs_ddf = table_reader.read( + "web_clickstreams", relevant_cols=wcs_columns, index=False + ) + + if c: + c.create_table("web_clickstreams", wcs_ddf, persist=False) + c.create_table("customer", customer_ddf, persist=False) + c.create_table("item", item_ddf, persist=False) + c.create_table("customer_demographics", customer_dem_ddf, persist=False) + + return 
(item_ddf, customer_ddf, customer_dem_ddf) + +def build_and_predict_model(ml_input_df): + """ + Create a standardized feature matrix X and target array y. + Returns the model and accuracy statistics + """ + + feature_names = ["college_education", "male"] + [ + "clicks_in_%d" % i for i in range(1, 8) + ] + X = ml_input_df[feature_names] + # Standardize input matrix + X = (X - X.mean()) / X.std() + y = ml_input_df["clicks_in_category"] + + model = cuml.LogisticRegression( + tol=convergence_tol, + penalty="none", + solver="qn", + fit_intercept=True, + max_iter=iterations, + C=C, + ) + model.fit(X, y) + # + # Predict and evaluate accuracy + # (Should be 1.0) at SF-1 + # + results_dict = {} + y_pred = model.predict(X) + + results_dict["auc"] = roc_auc_score(y.to_array(), y_pred.to_array()) + results_dict["precision"] = cupy_precision_score(cp.asarray(y), cp.asarray(y_pred)) + results_dict["confusion_matrix"] = confusion_matrix( + cp.asarray(y, dtype="int32"), cp.asarray(y_pred, dtype="int32") + ) + results_dict["output_type"] = "supervised" + return results_dict + diff --git a/gpu_bdb/bdb_tools/q06_utils.py b/gpu_bdb/bdb_tools/q06_utils.py new file mode 100644 index 00000000..ec4e02b3 --- /dev/null +++ b/gpu_bdb/bdb_tools/q06_utils.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.readers import build_reader + +# -------- Q6 ----------- +q06_LIMIT = 100 +# --web_sales and store_sales date +q06_YEAR = 2001 + + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + web_sales_cols = [ + "ws_bill_customer_sk", + "ws_sold_date_sk", + "ws_ext_list_price", + "ws_ext_wholesale_cost", + "ws_ext_discount_amt", + "ws_ext_sales_price", + ] + store_sales_cols = [ + "ss_customer_sk", + "ss_sold_date_sk", + "ss_ext_list_price", + "ss_ext_wholesale_cost", + "ss_ext_discount_amt", + "ss_ext_sales_price", + ] + date_cols = ["d_date_sk", "d_year", "d_moy"] + customer_cols = [ + "c_customer_sk", + "c_customer_id", + "c_email_address", + "c_first_name", + "c_last_name", + "c_preferred_cust_flag", + "c_birth_country", + "c_login", + ] + + ws_df = table_reader.read("web_sales", relevant_cols=web_sales_cols) + ss_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) + date_df = table_reader.read("date_dim", relevant_cols=date_cols) + customer_df = table_reader.read("customer", relevant_cols=customer_cols) + + if c: + c.create_table('web_sales', ws_df, persist=False) + c.create_table('store_sales', ss_df, persist=False) + c.create_table('date_dim', date_df, persist=False) + c.create_table('customer', customer_df, persist=False) + + return (ws_df, ss_df, date_df, customer_df) + diff --git a/gpu_bdb/bdb_tools/q07_utils.py b/gpu_bdb/bdb_tools/q07_utils.py new file mode 100644 index 00000000..e55b54f1 --- /dev/null +++ b/gpu_bdb/bdb_tools/q07_utils.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.readers import build_reader + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + item_cols = ["i_item_sk", "i_current_price", "i_category"] + store_sales_cols = ["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"] + date_cols = ["d_date_sk", "d_year", "d_moy"] + customer_cols = ["c_customer_sk", "c_current_addr_sk"] + customer_address_cols = ["ca_address_sk", "ca_state"] + + item_df = table_reader.read("item", relevant_cols=item_cols) + store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) + date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) + customer_df = table_reader.read("customer", relevant_cols=customer_cols) + customer_address_df = table_reader.read( + "customer_address", relevant_cols=customer_address_cols + ) + + if c: + c.create_table("item", item_df, persist=False) + c.create_table("customer", customer_df, persist=False) + c.create_table("store_sales", store_sales_df, persist=False) + c.create_table("date_dim", date_dim_df, persist=False) + c.create_table("customer_address", customer_address_df, persist=False) + + return ( + item_df, + store_sales_df, + date_dim_df, + customer_df, + customer_address_df, + ) + + diff --git a/gpu_bdb/bdb_tools/q08_utils.py b/gpu_bdb/bdb_tools/q08_utils.py new file mode 100644 index 00000000..2a220cb1 --- /dev/null +++ b/gpu_bdb/bdb_tools/q08_utils.py @@ -0,0 +1,158 @@ +# +# Copyright (c) 2019-2022, NVIDIA 
CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import cudf + +import cupy as cp +import numpy as np + +from bdb_tools.readers import build_reader + +q08_STARTDATE = "2001-09-02" +q08_ENDDATE = "2002-09-02" +q08_SECONDS_BEFORE_PURCHASE = 259200 +NA_FLAG = 0 + +def read_tables(config, c=None): + table_reader = build_reader( + data_format=config["file_format"], + basepath=config["data_dir"], + split_row_groups=config["split_row_groups"], + ) + + date_dim_cols = ["d_date_sk", "d_date"] + web_page_cols = ["wp_web_page_sk", "wp_type"] + web_sales_cols = ["ws_net_paid", "ws_order_number", "ws_sold_date_sk"] + wcs_cols = [ + "wcs_user_sk", + "wcs_sales_sk", + "wcs_click_date_sk", + "wcs_click_time_sk", + "wcs_web_page_sk", + ] + + date_dim_df = table_reader.read("date_dim", relevant_cols=date_dim_cols) + web_page_df = table_reader.read("web_page", relevant_cols=web_page_cols) + web_sales_df = table_reader.read("web_sales", relevant_cols=web_sales_cols) + wcs_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols) + + if c: + c.create_table("web_clickstreams", wcs_df, persist=False) + c.create_table("web_sales", web_sales_df, persist=False) + c.create_table("web_page", web_page_df, persist=False) + c.create_table("date_dim", date_dim_df, persist=False) + + return (date_dim_df, web_page_df, web_sales_df) + +def get_session_id_from_session_boundary(session_change_df, last_session_len): + """ + This function returns session starts given a session 
change df + """ + + user_session_ids = session_change_df.tstamp_inSec + + ### up shift the session length df + session_len = session_change_df["t_index"].diff().reset_index(drop=True) + session_len = session_len.shift(-1) + + try: + session_len.iloc[-1] = last_session_len + except (AssertionError, IndexError): # IndexError in numba >= 0.48 + session_len = cudf.Series([]) + + session_id_final_series = ( + cudf.Series(user_session_ids).repeat(session_len).reset_index(drop=True) + ) + return session_id_final_series + + +def get_session_id(df): + """ + This function creates a session id column for each click + The session id grows in increments for each user's subsequent session + Session boundary is defined by the time_out + """ + + df["user_change_flag"] = df["wcs_user_sk"].diff(periods=1) != 0 + df["user_change_flag"] = df["user_change_flag"].fillna(True) + df["session_change_flag"] = df["review_flag"] | df["user_change_flag"] + + df = df.reset_index(drop=True) + df["t_index"] = cp.arange(start=0, stop=len(df), dtype=np.int32) + + session_change_df = df[df["session_change_flag"]].reset_index(drop=True) + try: + last_session_len = len(df) - session_change_df["t_index"].iloc[-1] + except (AssertionError, IndexError): # IndexError in numba >= 0.48 + last_session_len = 0 + + session_ids = get_session_id_from_session_boundary( + session_change_df, last_session_len + ) + + assert len(session_ids) == len(df) + return session_ids + + +def get_sessions(df): + df = df.sort_values( + by=["wcs_user_sk", "tstamp_inSec", "wcs_sales_sk", "wp_type_codes"] + ).reset_index(drop=True) + df["session_id"] = get_session_id(df) + return df + + +def get_unique_sales_keys_from_sessions(sessionized, review_cat_code): + sessionized["relevant"] = ( + (sessionized.tstamp_inSec - sessionized.session_id) + <= q08_SECONDS_BEFORE_PURCHASE + ) & (sessionized.wcs_sales_sk != NA_FLAG) + unique_sales_sk = ( + sessionized.query(f"wcs_sales_sk != {NA_FLAG}") + .query("relevant == True") + 
.query(f"wp_type_codes != {review_cat_code}") + .wcs_sales_sk.unique() + ) + + return unique_sales_sk + + +def prep_for_sessionization(df, review_cat_code): + df = df.fillna(NA_FLAG) + df = df.sort_values( + by=["wcs_user_sk", "tstamp_inSec", "wcs_sales_sk", "wp_type_codes"] + ).reset_index(drop=True) + + review_df = df.loc[df["wp_type_codes"] == review_cat_code] + # per user, the index of the first review + # need this to decide if a review was "recent enough" + every_users_first_review = ( + review_df[["wcs_user_sk", "tstamp_inSec"]] + .drop_duplicates() + .reset_index() + .groupby("wcs_user_sk")["index"] + .min() + .reset_index() + ) + every_users_first_review.columns = ["wcs_user_sk", "first_review_index"] + + # then reset the index to keep the old index before parallel join + df_merged = df.reset_index().merge( + every_users_first_review, how="left", on="wcs_user_sk" + ) + df_filtered = df_merged.query("index >= first_review_index") + return df_filtered + diff --git a/gpu_bdb/bdb_tools/q09_utils.py b/gpu_bdb/bdb_tools/q09_utils.py new file mode 100644 index 00000000..42fce78d --- /dev/null +++ b/gpu_bdb/bdb_tools/q09_utils.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ===== gpu_bdb/bdb_tools/q09_utils.py =====

from bdb_tools.readers import build_reader


# -------- Q9 -----------
q09_year = 2001

q09_part1_ca_country = "United States"
q09_part1_ca_state_IN = ("KY", "GA", "NM")
q09_part1_net_profit_min = 0
q09_part1_net_profit_max = 2000
q09_part1_education_status = "4 yr Degree"
q09_part1_marital_status = "M"
q09_part1_sales_price_min = 100
q09_part1_sales_price_max = 150

q09_part2_ca_country = "United States"
q09_part2_ca_state_IN = ("MT", "OR", "IN")
q09_part2_net_profit_min = 150
q09_part2_net_profit_max = 3000
q09_part2_education_status = "4 yr Degree"
q09_part2_marital_status = "M"
q09_part2_sales_price_min = 50
q09_part2_sales_price_max = 200

q09_part3_ca_country = "United States"
q09_part3_ca_state_IN = ("WI", "MO", "WV")
q09_part3_net_profit_min = 50
q09_part3_net_profit_max = 25000
q09_part3_education_status = "4 yr Degree"
q09_part3_marital_status = "M"
q09_part3_sales_price_min = 150
q09_part3_sales_price_max = 200


def read_tables(config, c=None):
    """Read the five tables used by query 9.

    Parameters
    ----------
    config : dict
        Benchmark configuration; ``file_format``, ``data_dir`` and
        ``split_row_groups`` are consumed here.
    c : optional SQL context
        When given, every frame is also registered as a non-persisted
        SQL table.

    Returns
    -------
    tuple
        (store_sales, customer_address, customer_demographics,
        date_dim, store)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    store_sales = reader.read(
        "store_sales",
        relevant_cols=[
            "ss_quantity",
            "ss_sold_date_sk",
            "ss_addr_sk",
            "ss_store_sk",
            "ss_cdemo_sk",
            "ss_sales_price",
            "ss_net_profit",
        ],
    )
    customer_address = reader.read(
        "customer_address",
        relevant_cols=["ca_address_sk", "ca_country", "ca_state"],
    )
    customer_demographics = reader.read(
        "customer_demographics",
        relevant_cols=["cd_demo_sk", "cd_marital_status", "cd_education_status"],
    )
    date_dim = reader.read("date_dim", relevant_cols=["d_year", "d_date_sk"])
    store = reader.read("store", relevant_cols=["s_store_sk"])

    if c:
        for name, frame in (
            ("store_sales", store_sales),
            ("customer_address", customer_address),
            ("customer_demographics", customer_demographics),
            ("date_dim", date_dim),
            ("store", store),
        ):
            c.create_table(name, frame, persist=False)

    return (store_sales, customer_address, customer_demographics, date_dim, store)


# ===== gpu_bdb/bdb_tools/q10_utils.py =====
# (separate module in the original patch; name collision on read_tables
# exists only in this flattened view)

eol_char = "è"


def read_tables(config, c=None):
    """Read product_reviews for query 10.

    Reviews carry long text columns, so the reader splits by row group
    for better parallelism regardless of config["split_row_groups"].
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )

    product_reviews_df = reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )

    if c:
        c.create_table("product_reviews", product_reviews_df, persist=False)

    return product_reviews_df
# ===== gpu_bdb/bdb_tools/q11_utils.py =====

from bdb_tools.readers import build_reader


def read_tables(config, c=None):
    """Read product_reviews, web_sales and date_dim for query 11.

    Returns
    -------
    tuple
        (pr_df, ws_df, date_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    pr_df = reader.read(
        "product_reviews",
        relevant_cols=["pr_review_rating", "pr_item_sk"],
    )
    # Only integer columns are read here, so even at sf-10k this is just
    # ~26M rows and safely fits a single partition.
    pr_df = pr_df.repartition(npartitions=1)

    ws_df = reader.read(
        "web_sales",
        relevant_cols=["ws_sold_date_sk", "ws_net_paid", "ws_item_sk"],
    )
    date_df = reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])

    if c:
        for name, frame in (
            ("web_sales", ws_df),
            ("product_reviews", pr_df),
            ("date_dim", date_df),
        ):
            c.create_table(name, frame, persist=False)

    return (pr_df, ws_df, date_df)


# ===== gpu_bdb/bdb_tools/q12_utils.py =====
# (separate module in the original patch)

q12_i_category_IN = "'Books', 'Electronics'"

# Column selections shared with the q12 query code.
item_cols = ["i_item_sk", "i_category"]
store_sales_cols = ["ss_item_sk", "ss_sold_date_sk", "ss_customer_sk"]
wcs_cols = ["wcs_user_sk", "wcs_click_date_sk", "wcs_item_sk", "wcs_sales_sk"]


def read_tables(config, c=None):
    """Read item, store_sales and web_clickstreams for query 12.

    Returns
    -------
    tuple
        (item_df, store_sales_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    item_df = reader.read("item", relevant_cols=item_cols)
    store_sales_df = reader.read("store_sales", relevant_cols=store_sales_cols)
    wcs_df = reader.read("web_clickstreams", relevant_cols=wcs_cols)

    if c:
        c.create_table("web_clickstreams", wcs_df, persist=False)
        c.create_table("store_sales", store_sales_df, persist=False)
        c.create_table("item", item_df, persist=False)

    # NOTE(review): wcs_df is registered for the SQL path but deliberately
    # not returned -- presumably the dask query reads web_clickstreams
    # itself with its own reader; confirm against the q12 query before
    # changing the return arity.
    return item_df, store_sales_df
# ===== gpu_bdb/bdb_tools/q13_utils.py =====

from bdb_tools.readers import build_reader


def read_tables(config, c=None):
    """Read date_dim, customer, store_sales and web_sales for query 13.

    Returns
    -------
    tuple
        (date_dim_df, customer_df, s_sales_df, web_sales_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    date_dim_df = reader.read("date_dim", relevant_cols=["d_date_sk", "d_year"])
    customer_df = reader.read(
        "customer",
        relevant_cols=["c_customer_sk", "c_customer_id", "c_first_name", "c_last_name"],
    )
    s_sales_df = reader.read(
        "store_sales",
        relevant_cols=["ss_sold_date_sk", "ss_customer_sk", "ss_net_paid"],
    )
    web_sales_df = reader.read(
        "web_sales",
        relevant_cols=["ws_sold_date_sk", "ws_bill_customer_sk", "ws_net_paid"],
    )

    if c:
        for name, frame in (
            ("date_dim", date_dim_df),
            ("customer", customer_df),
            ("store_sales", s_sales_df),
            ("web_sales", web_sales_df),
        ):
            c.create_table(name, frame, persist=False)

    return (date_dim_df, customer_df, s_sales_df, web_sales_df)
# ===== gpu_bdb/bdb_tools/q14_utils.py =====

from bdb_tools.readers import build_reader


def read_tables(config, c=None):
    """Read web_sales, household_demographics, web_page and time_dim
    for query 14.

    Returns
    -------
    tuple
        (web_sales, household_demographics, web_page, time_dim)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    web_sales = reader.read(
        "web_sales",
        relevant_cols=["ws_ship_hdemo_sk", "ws_web_page_sk", "ws_sold_time_sk"],
    )
    household_demographics = reader.read(
        "household_demographics",
        relevant_cols=["hd_demo_sk", "hd_dep_count"],
    )
    web_page = reader.read(
        "web_page", relevant_cols=["wp_web_page_sk", "wp_char_count"]
    )
    time_dim = reader.read("time_dim", relevant_cols=["t_time_sk", "t_hour"])

    if c:
        for name, frame in (
            ("household_demographics", household_demographics),
            ("web_page", web_page),
            ("web_sales", web_sales),
            ("time_dim", time_dim),
        ):
            c.create_table(name, frame, persist=False)

    return (web_sales, household_demographics, web_page, time_dim)
# ===== gpu_bdb/bdb_tools/q15_utils.py =====

from bdb_tools.readers import build_reader

# --store_sales date range
q15_startDate = "2001-09-02"
# --+1year
q15_endDate = "2002-09-02"
q15_store_sk = 10

# Column selections shared with the q15 query code.
store_sales_cols = ["ss_sold_date_sk", "ss_net_paid", "ss_store_sk", "ss_item_sk"]
date_cols = ["d_date", "d_date_sk"]
item_cols = ["i_item_sk", "i_category_id"]


def read_tables(config, c=None):
    """Read store_sales, date_dim and item for query 15.

    Returns
    -------
    tuple
        (store_sales_df, date_dim_df, item_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    store_sales_df = reader.read("store_sales", relevant_cols=store_sales_cols)
    date_dim_df = reader.read("date_dim", relevant_cols=date_cols)
    item_df = reader.read("item", relevant_cols=item_cols)

    if c:
        for name, frame in (
            ("store_sales", store_sales_df),
            ("date_dim", date_dim_df),
            ("item", item_df),
        ):
            c.create_table(name, frame, persist=False)

    return store_sales_df, date_dim_df, item_df
# ===== gpu_bdb/bdb_tools/q16_utils.py =====

from bdb_tools.readers import build_reader

# Column selections shared with the q16 query code.
websale_cols = [
    "ws_order_number",
    "ws_item_sk",
    "ws_warehouse_sk",
    "ws_sold_date_sk",
    "ws_sales_price",
]
web_returns_cols = ["wr_order_number", "wr_item_sk", "wr_refunded_cash"]
date_cols = ["d_date", "d_date_sk"]
item_cols = ["i_item_sk", "i_item_id"]
warehouse_cols = ["w_warehouse_sk", "w_state"]


def read_tables(config, c=None):
    """Read the five tables used by query 16.

    Returns
    -------
    tuple
        (web_sales_df, web_returns_df, date_dim_df, item_df, warehouse_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    web_sales_df = reader.read("web_sales", relevant_cols=websale_cols)
    web_returns_df = reader.read("web_returns", relevant_cols=web_returns_cols)
    date_dim_df = reader.read("date_dim", relevant_cols=date_cols)
    item_df = reader.read("item", relevant_cols=item_cols)
    warehouse_df = reader.read("warehouse", relevant_cols=warehouse_cols)

    if c:
        for name, frame in (
            ("web_sales", web_sales_df),
            ("web_returns", web_returns_df),
            ("date_dim", date_dim_df),
            ("item", item_df),
            ("warehouse", warehouse_df),
        ):
            c.create_table(name, frame, persist=False)

    return web_sales_df, web_returns_df, date_dim_df, item_df, warehouse_df
# ===== gpu_bdb/bdb_tools/q17_utils.py =====

from bdb_tools.readers import build_reader

q17_gmt_offset = -5.0
# --store_sales date
q17_year = 2001
q17_month = 12

# Column selections shared with the q17 query code.
store_sales_cols = [
    "ss_ext_sales_price",
    "ss_sold_date_sk",
    "ss_store_sk",
    "ss_customer_sk",
    "ss_promo_sk",
    "ss_item_sk",
]
item_cols = ["i_category", "i_item_sk"]
customer_cols = ["c_customer_sk", "c_current_addr_sk"]
store_cols = ["s_gmt_offset", "s_store_sk"]
date_cols = ["d_date_sk", "d_year", "d_moy"]
customer_address_cols = ["ca_address_sk", "ca_gmt_offset"]
promotion_cols = ["p_channel_email", "p_channel_dmail", "p_channel_tv", "p_promo_sk"]


def read_tables(config, c=None):
    """Read the seven tables used by query 17.

    Returns
    -------
    tuple
        (store_sales_df, item_df, customer_df, store_df, date_dim_df,
        customer_address_df, promotion_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    store_sales_df = reader.read("store_sales", relevant_cols=store_sales_cols)
    item_df = reader.read("item", relevant_cols=item_cols)
    customer_df = reader.read("customer", relevant_cols=customer_cols)
    store_df = reader.read("store", relevant_cols=store_cols)
    date_dim_df = reader.read("date_dim", relevant_cols=date_cols)
    customer_address_df = reader.read(
        "customer_address", relevant_cols=customer_address_cols
    )
    promotion_df = reader.read("promotion", relevant_cols=promotion_cols)

    if c:
        for name, frame in (
            ("store_sales", store_sales_df),
            ("item", item_df),
            ("customer", customer_df),
            ("store", store_df),
            ("date_dim", date_dim_df),
            ("customer_address", customer_address_df),
            ("promotion", promotion_df),
        ):
            c.create_table(name, frame, persist=False)

    return (
        store_sales_df,
        item_df,
        customer_df,
        store_df,
        date_dim_df,
        customer_address_df,
        promotion_df,
    )
# ===== gpu_bdb/bdb_tools/q18_utils.py =====

import cupy as cp
import cudf
# NOTE(review): cudf._lib.strings is a private cuDF namespace; this
# import can break across cuDF releases -- confirm a public equivalent.
from cudf._lib.strings import find_multiple

from bdb_tools.readers import build_reader

q18_startDate = "2001-05-02"
# --+90days
q18_endDate = "2001-09-02"

EOL_CHAR = "è"


def read_tables(config, c=None):
    """Read q18 inputs; product_reviews is split by row group.

    Returns
    -------
    tuple
        (store_sales, date_dim, store, product_reviews)
    """
    base_reader = build_reader(
        data_format=config["file_format"], basepath=config["data_dir"],
    )

    store_sales = base_reader.read(
        "store_sales",
        relevant_cols=["ss_store_sk", "ss_sold_date_sk", "ss_net_paid"],
    )
    date_dim = base_reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])
    store = base_reader.read("store", relevant_cols=["s_store_sk", "s_store_name"])

    # Reviews carry long text: split by row groups for better parallelism.
    review_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    product_reviews = review_reader.read(
        "product_reviews",
        relevant_cols=["pr_review_date", "pr_review_content", "pr_review_sk"],
    )

    if c:
        for name, frame in (
            ("store", store),
            ("store_sales", store_sales),
            ("date_dim", date_dim),
            ("product_reviews", product_reviews),
        ):
            c.create_table(name, frame, persist=False)

    return store_sales, date_dim, store, product_reviews


def create_found_reshaped_with_global_pos(found, targets):
    """Explode the hit matrix from find_targets_in_reviews_helper.

    Each nonzero (row, column) cell becomes its own output row holding
    the matched word and the pr_review_sk of the review it came from.
    Kept separate from the matching helper to simplify dask metadata
    handling.
    """
    # Map each target word to its column position in the hit matrix.
    word_index = cudf.DataFrame({"word": targets}).reset_index(drop=False)
    word_index.columns = ["word_mapping", "word"]

    hits_only = found.drop(["pr_review_sk"], axis=1)
    row_idxs, col_idxs = hits_only.values.nonzero()

    exploded = cudf.DataFrame(
        {"word_mapping": col_idxs, "pr_review_sk": found["pr_review_sk"].iloc[row_idxs]}
    )
    return exploded.merge(word_index, on="word_mapping", how="inner")[
        ["word", "pr_review_sk"]
    ]


def find_targets_in_reviews_helper(ddf, targets, str_col_name="pr_review_content"):
    """Return matched (word, pr_review_sk) pairs for one partition.

    Builds an N x K matrix (N rows in ddf, K target words) whose cells
    are nonzero where the target occurs in the review text; rows with
    at least one hit are kept and exploded.
    """
    content_lower = ddf[str_col_name].str.lower()

    ## TODO: Do the replace/any in cupy land before going to cuDF
    hit_matrix = cudf.DataFrame(
        cp.asarray(
            find_multiple.find_multiple(content_lower._column, targets._column)
        ).reshape(-1, len(targets))
    )

    # Convert raw match values to a 0/1-style indicator: 0 -> 1, -1 -> 0.
    # NOTE(review): presumably find_multiple returns character offsets
    # with -1 meaning "no match"; other positive offsets stay truthy --
    # confirm against the cuDF version in use.
    hit_matrix = hit_matrix.replace([0, -1], [1, 0])
    any_hit = hit_matrix.any(axis=1)
    hit_matrix["pr_review_sk"] = ddf["pr_review_sk"]
    matched_rows = hit_matrix.loc[any_hit]
    return create_found_reshaped_with_global_pos(matched_rows, targets)


def find_relevant_reviews(df, targets, str_col_name="pr_review_content"):
    """Find the reviews containing the target stores and return them
    joined with their review dates."""
    targets = cudf.Series(targets)
    targets_lower = targets.str.lower()
    reviews_found = find_targets_in_reviews_helper(df, targets_lower)[
        ["word", "pr_review_sk"]
    ]

    return reviews_found.merge(
        df[["pr_review_date", "pr_review_sk"]], how="inner", on=["pr_review_sk"]
    )
# ===== gpu_bdb/bdb_tools/q19_utils.py =====

from bdb_tools.readers import build_reader

q19_returns_dates_IN = ["2004-03-08", "2004-08-02", "2004-11-15", "2004-12-20"]

eol_char = "è"


def read_tables(config, c=None):
    """Read q19 inputs; product_reviews is split by row group.

    Returns
    -------
    tuple
        (date_dim_df, store_returns_df, web_returns_df, product_reviews_df)
    """
    reader = build_reader(
        data_format=config["file_format"], basepath=config["data_dir"],
    )

    date_dim_df = reader.read(
        "date_dim", relevant_cols=["d_week_seq", "d_date_sk", "d_date"]
    )
    store_returns_df = reader.read(
        "store_returns",
        relevant_cols=["sr_returned_date_sk", "sr_item_sk", "sr_return_quantity"],
    )
    web_returns_df = reader.read(
        "web_returns",
        relevant_cols=["wr_returned_date_sk", "wr_item_sk", "wr_return_quantity"],
    )

    # Reviews carry long text: split by row groups for better parallelism.
    review_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    product_reviews_df = review_reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )

    if c:
        for name, frame in (
            ("web_returns", web_returns_df),
            ("date_dim", date_dim_df),
            ("product_reviews", product_reviews_df),
            ("store_returns", store_returns_df),
        ):
            c.create_table(name, frame, persist=False)

    return date_dim_df, store_returns_df, web_returns_df, product_reviews_df


# ===== gpu_bdb/bdb_tools/q20_utils.py =====
# (separate module in the original patch)

import dask_cudf

from dask import delayed

from bdb_tools.utils import train_clustering_model

# q20 parameters
N_CLUSTERS = 8
CLUSTER_ITERATIONS = 20
N_ITER = 5


def read_tables(config, c=None):
    """Read store_sales and store_returns for query 20.

    Returns
    -------
    tuple
        (store_sales_df, store_returns_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    store_sales_df = reader.read(
        "store_sales",
        relevant_cols=[
            "ss_customer_sk",
            "ss_ticket_number",
            "ss_item_sk",
            "ss_net_paid",
        ],
    )
    store_returns_df = reader.read(
        "store_returns",
        relevant_cols=[
            "sr_item_sk",
            "sr_customer_sk",
            "sr_ticket_number",
            "sr_return_amt",
        ],
    )

    if c:
        c.create_table("store_sales", store_sales_df, persist=False)
        c.create_table("store_returns", store_returns_df, persist=False)

    return store_sales_df, store_returns_df


def get_clusters(client, ml_input_df, feature_cols):
    """Cluster ml_input_df[feature_cols] and return the q20 result dict.

    Parameters
    ----------
    client
        Dask distributed client used to run the clustering tasks.
    ml_input_df
        Input frame; must carry a ``user_sk`` column.
    feature_cols
        Names of the feature columns fed to the clustering model.

    Returns
    -------
    dict
        train_clustering_model's result dict with ``cid_labels``
        replaced by a (user_sk, label) frame.
    """
    clustering_tasks = [
        delayed(train_clustering_model)(frame, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER)
        for frame in ml_input_df[feature_cols].to_delayed()
    ]

    # NOTE(review): star-unpacking assumes a single delayed task (one
    # partition); with more partitions the extra tasks would bind to
    # Client.compute's other parameters -- confirm the caller
    # repartitions to one partition.
    results_dict = client.compute(*clustering_tasks, sync=True)

    labels = results_dict["cid_labels"]

    labels_final = dask_cudf.from_cudf(labels, npartitions=ml_input_df.npartitions)
    ml_input_df["label"] = labels_final.reset_index()[0]

    results_dict["cid_labels"] = ml_input_df[["user_sk", "label"]]
    return results_dict
# ===== gpu_bdb/bdb_tools/q21_utils.py =====

from bdb_tools.readers import build_reader

# Column selections shared with the q21 query code.
store_sales_cols = [
    "ss_item_sk",
    "ss_store_sk",
    "ss_customer_sk",
    "ss_ticket_number",
    "ss_quantity",
    "ss_sold_date_sk",
]
date_cols = ["d_date_sk", "d_year", "d_moy"]
websale_cols = ["ws_item_sk", "ws_bill_customer_sk", "ws_quantity", "ws_sold_date_sk"]
sr_cols = [
    "sr_item_sk",
    "sr_customer_sk",
    "sr_ticket_number",
    "sr_return_quantity",
    "sr_returned_date_sk",
]
store_cols = ["s_store_name", "s_store_id", "s_store_sk"]
item_cols = ["i_item_id", "i_item_desc", "i_item_sk"]


def read_tables(config, c=None):
    """Read the six tables used by query 21.

    Returns
    -------
    tuple
        (store_sales_df, date_dim_df, web_sales_df, store_returns_df,
        store_table_df, item_table_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    store_sales_df = reader.read("store_sales", relevant_cols=store_sales_cols)
    date_dim_df = reader.read("date_dim", relevant_cols=date_cols)
    web_sales_df = reader.read("web_sales", relevant_cols=websale_cols)
    store_returns_df = reader.read("store_returns", relevant_cols=sr_cols)
    store_table_df = reader.read("store", relevant_cols=store_cols)
    item_table_df = reader.read("item", relevant_cols=item_cols)

    if c:
        for name, frame in (
            ("store_sales", store_sales_df),
            ("date_dim", date_dim_df),
            ("item", item_table_df),
            ("web_sales", web_sales_df),
            ("store_returns", store_returns_df),
            ("store", store_table_df),
        ):
            c.create_table(name, frame, persist=False)

    return (
        store_sales_df,
        date_dim_df,
        web_sales_df,
        store_returns_df,
        store_table_df,
        item_table_df,
    )
# ===== gpu_bdb/bdb_tools/q22_utils.py =====

from bdb_tools.readers import build_reader
from bdb_tools.utils import convert_datestring_to_days

q22_date = "2001-05-08"
q22_i_current_price_min = 0.98
q22_i_current_price_max = 1.5


def read_tables(config, c=None):
    """Read inventory, item, warehouse and date_dim for query 22.

    date_dim's date strings are converted to day numbers per partition
    before being returned/registered.

    Returns
    -------
    tuple
        (inventory, item, warehouse, date_dim)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    inventory = reader.read(
        "inventory",
        relevant_cols=[
            "inv_item_sk",
            "inv_warehouse_sk",
            "inv_date_sk",
            "inv_quantity_on_hand",
        ],
    )
    item = reader.read(
        "item", relevant_cols=["i_item_id", "i_current_price", "i_item_sk"]
    )
    warehouse = reader.read(
        "warehouse", relevant_cols=["w_warehouse_sk", "w_warehouse_name"]
    )

    date_dim = reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])
    date_dim = date_dim.map_partitions(convert_datestring_to_days)

    if c:
        for name, frame in (
            ("inventory", inventory),
            ("item", item),
            ("warehouse", warehouse),
            ("date_dim", date_dim),
        ):
            c.create_table(name, frame, persist=False)

    return inventory, item, warehouse, date_dim
# ===== gpu_bdb/bdb_tools/q23_utils.py =====

from bdb_tools.readers import build_reader

q23_year = 2001
q23_month = 1
q23_coefficient = 1.3


def read_tables(config, c=None):
    """Read date_dim and inventory for query 23.

    Returns
    -------
    tuple
        (date_df, inv_df)
    """
    reader = build_reader(
        data_format=config["file_format"], basepath=config["data_dir"],
    )

    date_df = reader.read("date_dim", relevant_cols=["d_date_sk", "d_year", "d_moy"])

    inv_df = reader.read(
        "inventory",
        relevant_cols=[
            "inv_warehouse_sk",
            "inv_item_sk",
            "inv_date_sk",
            "inv_quantity_on_hand",
        ],
    )

    if c:
        c.create_table("inventory", inv_df, persist=False)
        c.create_table("date_dim", date_df, persist=False)

    return date_df, inv_df
# ===== gpu_bdb/bdb_tools/q24_utils.py =====

from bdb_tools.readers import build_reader

# Column selections shared with the q24 query code.
ws_cols = ["ws_item_sk", "ws_sold_date_sk", "ws_quantity"]
item_cols = ["i_item_sk", "i_current_price"]
imp_cols = [
    "imp_item_sk",
    "imp_competitor_price",
    "imp_start_date",
    "imp_end_date",
    "imp_sk",
]
ss_cols = ["ss_item_sk", "ss_sold_date_sk", "ss_quantity"]


def read_tables(config, c=None):
    """Read web_sales, item, item_marketprices and store_sales for
    query 24.

    Returns
    -------
    tuple
        (ws_df, item_df, imp_df, ss_df)
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    ### read tables
    ws_df = reader.read("web_sales", relevant_cols=ws_cols)
    item_df = reader.read("item", relevant_cols=item_cols)
    imp_df = reader.read("item_marketprices", relevant_cols=imp_cols)
    ss_df = reader.read("store_sales", relevant_cols=ss_cols)

    if c:
        for name, frame in (
            ("web_sales", ws_df),
            ("item", item_df),
            ("item_marketprices", imp_df),
            ("store_sales", ss_df),
        ):
            c.create_table(name, frame, persist=False)

    return ws_df, item_df, imp_df, ss_df
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ---------------------------------------------------------------------------
# bdb_tools/q25_utils.py
# ---------------------------------------------------------------------------

import dask_cudf

from bdb_tools.utils import train_clustering_model
from bdb_tools.readers import build_reader

from dask import delayed

# q25 reference date and the KMeans hyper-parameters used for segmentation.
q25_date = "2002-01-02"

N_CLUSTERS = 8
CLUSTER_ITERATIONS = 20
N_ITER = 5


def read_tables(config, c=None):
    """Load the tables required by query 25.

    Returns ``(ss_ddf, ws_ddf, datedim_ddf)``; when a SQL context ``c`` is
    given, the tables are also registered with it.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    ss_ddf = reader.read(
        "store_sales",
        relevant_cols=[
            "ss_customer_sk",
            "ss_sold_date_sk",
            "ss_ticket_number",
            "ss_net_paid",
        ],
        index=False,
    )
    ws_ddf = reader.read(
        "web_sales",
        relevant_cols=[
            "ws_bill_customer_sk",
            "ws_sold_date_sk",
            "ws_order_number",
            "ws_net_paid",
        ],
        index=False,
    )
    datedim_ddf = reader.read(
        "date_dim",
        relevant_cols=["d_date_sk", "d_date"],
        index=False,
    )

    if c:
        c.create_table("web_sales", ws_ddf, persist=False)
        c.create_table("store_sales", ss_ddf, persist=False)
        c.create_table("date_dim", datedim_ddf, persist=False)

    return ss_ddf, ws_ddf, datedim_ddf


def get_clusters(client, ml_input_df):
    """Cluster customers and attach cluster labels to their ids.

    Parameters
    ----------
    client : distributed.Client
        Dask client used to run the training task(s).
    ml_input_df : dask_cudf.DataFrame
        Feature frame indexed by customer id (``cid``).

    Returns
    -------
    dict
        The dict produced by ``train_clustering_model`` with its
        ``"cid_labels"`` entry replaced by a (cid, label) dataframe sorted
        by ``cid``.
    """
    fit_tasks = [
        delayed(train_clustering_model)(frame, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER)
        for frame in ml_input_df.to_delayed()
    ]
    # NOTE(review): unpacking with ``*fit_tasks`` yields a single dict result
    # only when ml_input_df has exactly one partition -- presumably callers
    # repartition before invoking this; confirm.
    results_dict = client.compute(*fit_tasks, sync=True)

    cid_frame = ml_input_df.index.to_frame().reset_index(drop=True)

    labels = dask_cudf.from_cudf(
        results_dict["cid_labels"], npartitions=cid_frame.npartitions
    )
    cid_frame["label"] = labels.reset_index()[0]

    # Sort based on CDH6.1 q25-result formatting.
    results_dict["cid_labels"] = cid_frame.sort_values(["cid"])
    return results_dict


# ---------------------------------------------------------------------------
# bdb_tools/q26_utils.py
# ---------------------------------------------------------------------------

# q26 filter parameters plus the same KMeans hyper-parameters as q25.
Q26_CATEGORY = "Books"
Q26_ITEM_COUNT = 5
N_CLUSTERS = 8
CLUSTER_ITERATIONS = 20
N_ITER = 5


def read_tables(config, c=None):
    """Load the tables required by query 26.

    Returns ``(ss_ddf, items_ddf)``; when a SQL context ``c`` is given,
    both tables are registered with it.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    ss_ddf = reader.read(
        "store_sales",
        relevant_cols=["ss_customer_sk", "ss_item_sk"],
        index=False,
    )
    items_ddf = reader.read(
        "item",
        relevant_cols=["i_item_sk", "i_category", "i_class_id"],
        index=False,
    )

    if c:
        c.create_table("store_sales", ss_ddf, persist=False)
        c.create_table("item", items_ddf, persist=False)

    return ss_ddf, items_ddf
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import spacy

from bdb_tools.readers import build_reader

# q27 parameters: the reviewed item of interest and the sentence terminator.
q27_pr_item_sk = 10002
EOL_CHAR = "."


def read_tables(config, c=None):
    """Load the product_reviews table required by query 27.

    Row-group splitting is forced on here (rather than taken from config)
    for better parallelism over the large review text column.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )

    product_reviews_df = reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )

    if c:
        c.create_table("product_reviews", product_reviews_df, persist=False)

    return product_reviews_df


def ner_parser(df, col_string, batch_size=256):
    """Extract ORG-entity names from a text column via spaCy NER.

    Parameters
    ----------
    df : DataFrame
        Frame holding the text column; mutated in place.
    col_string : str
        Name of the column containing review text.
    batch_size : int, optional
        spaCy pipe batch size.

    Returns
    -------
    DataFrame
        ``df`` with a new ``company_name_list`` column: a comma-separated
        string of ORG entities per row.
    """
    spacy.require_gpu()
    nlp = spacy.load("en_core_web_sm")
    # Tagger and parser are not needed for NER; disabling them speeds the pipe.
    docs = nlp.pipe(
        df[col_string], disable=["tagger", "parser"], batch_size=batch_size
    )

    company_lists = []
    for doc in docs:
        orgs = ", ".join(ent.text for ent in doc.ents if ent.label_ == "ORG")
        company_lists.append(orgs)

    df["company_name_list"] = company_lists
    return df
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import cupy as cp
import cupy

import cudf

import dask

from cuml.feature_extraction.text import HashingVectorizer
from cuml.dask.naive_bayes import MultinomialNB as DistMNB
from cuml.dask.common import to_dask_cudf
from cuml.dask.common.input_utils import DistributedDataHandler

from distributed import wait

from uuid import uuid1

from bdb_tools.readers import build_reader

# Hashing-vectorizer configuration shared by all partitions.
N_FEATURES = 2 ** 23  # Spark is doing 2^20
ngram_range = (1, 2)
preprocessor = lambda s: s.str.lower()
norm = None
alternate_sign = False


def read_tables(config, c=None):
    """Load the product_reviews columns required by query 28.

    Row-group splitting is forced on for better parallelism over the large
    review-text column. When a SQL context ``c`` is given, the table is
    also registered with it.
    """
    table_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )

    columns = [
        "pr_review_content",
        "pr_review_rating",
        "pr_review_sk",
    ]
    pr_df = table_reader.read("product_reviews", relevant_cols=columns)

    if c:
        c.create_table("product_reviews", pr_df, persist=False)

    return pr_df


def gpu_hashing_vectorizer(x):
    """Hash one partition of text into a sparse feature matrix on GPU."""
    vec = HashingVectorizer(
        n_features=N_FEATURES,
        alternate_sign=alternate_sign,
        ngram_range=ngram_range,
        norm=norm,
        preprocessor=preprocessor,
    )
    return vec.fit_transform(x)


def map_labels(ser):
    """Collapse 1-5 star ratings into sentiment classes.

    Ratings 1-2 -> 0 (negative), 3 -> 1 (neutral), everything else
    (ratings 4-5) keeps the default fill value 2 (positive).
    """
    # NOTE(review): cudf.core.column.full is a private cudf API -- confirm it
    # still exists in the pinned cudf version.
    output_ser = cudf.Series(
        cudf.core.column.full(size=len(ser), fill_value=2, dtype=np.int32)
    )
    zero_flag = (ser == 1) | (ser == 2)
    output_ser.loc[zero_flag] = 0

    three_flag = ser == 3
    output_ser.loc[three_flag] = 1

    return output_ser


def build_features(t):
    """Vectorize the review text column into a persisted dask sparse array."""
    X = t["pr_review_content"]
    # The meta below tells dask the partitions are cupy CSR matrices.
    X = X.map_partitions(
        gpu_hashing_vectorizer,
        meta=dask.array.from_array(
            cupy.sparse.csr_matrix(cupy.zeros(1, dtype=cp.float32))
        ),
    )

    X = X.astype(np.float32).persist()
    X.compute_chunk_sizes()

    return X


def build_labels(reviews_df):
    """Map ratings to sentiment labels and persist them as cupy int32 chunks."""
    y = reviews_df["pr_review_rating"].map_partitions(map_labels)
    y = y.map_partitions(lambda x: cupy.asarray(x, cupy.int32)).persist()
    y.compute_chunk_sizes()

    return y


def categoricalize(num_sr):
    """Map numeric sentiment classes 0/1/2 to NEG/NEUT/POS strings."""
    return num_sr.astype("str").str.replace(["0", "1", "2"], ["NEG", "NEUT", "POS"])


def sum_tp_fp(y_y_pred, nclasses):
    """Per-class true/false positive counts for one partition.

    Parameters
    ----------
    y_y_pred : tuple
        ``(y, y_pred)`` cupy arrays for this partition.
    nclasses : int
        Number of classes; labels are assumed to be 0..nclasses-1.

    Returns
    -------
    cupy.ndarray
        ``(nclasses, 2)`` array of (tp, fp) per class.
    """
    y, y_pred = y_y_pred
    res = cp.zeros((nclasses, 2), order="F")

    for i in range(nclasses):
        pos_pred_ix = cp.where(y_pred == i)[0]

        # No predictions for this class: its tp/fp stay zero.
        # BUG FIX: this was `break`, which also zeroed every *later*
        # class's counts whenever one class had no predictions.
        if len(pos_pred_ix) == 0:
            continue

        tp_sum = (y_pred[pos_pred_ix] == y[pos_pred_ix]).sum()
        fp_sum = (y_pred[pos_pred_ix] != y[pos_pred_ix]).sum()
        res[i][0] = tp_sum
        res[i][1] = fp_sum
    return res


def precision_score(client, y, y_pred, average="binary"):
    """Distributed multi-class precision.

    Parameters
    ----------
    client : distributed.Client
    y, y_pred : dask arrays of labels / predictions.
    average : {"binary", "macro", anything-else}
        "binary" returns the positive-class precision (requires <= 2
        classes), "macro" the unweighted mean over classes, anything else
        the micro (global tp / (tp + fp)) precision.

    Raises
    ------
    ValueError
        If ``average="binary"`` with more than two classes, or fewer than
        two classes are present.
    """
    nclasses = len(cp.unique(y.map_blocks(lambda x: cp.unique(x)).compute()))

    if average == "binary" and nclasses > 2:
        raise ValueError

    if nclasses < 2:
        raise ValueError("Single class precision is not yet supported")

    ddh = DistributedDataHandler.create([y, y_pred])

    precision_scores = client.compute(
        [
            client.submit(sum_tp_fp, part, nclasses, workers=[worker])
            for worker, part in ddh.gpu_futures
        ],
        sync=True,
    )

    # Reduce the per-partition (tp, fp) counts.
    res = cp.zeros((nclasses, 2), order="F")
    for i in precision_scores:
        res += i

    if average == "binary" or average == "macro":

        prec = cp.zeros(nclasses)
        for i in range(nclasses):
            tp_sum, fp_sum = res[i]
            prec[i] = (tp_sum / (tp_sum + fp_sum)).item()

        if average == "binary":
            return prec[nclasses - 1].item()
        else:
            return prec.mean().item()
    else:
        global_tp = cp.sum(res[:, 0])
        global_fp = cp.sum(res[:, 1])

        # BUG FIX: parenthesized so .item() applies to the ratio and a
        # Python float is returned; previously .item() bound only to the
        # denominator sum, yielding a cupy scalar.
        return (global_tp / (global_tp + global_fp)).item()


def local_cm(y_y_pred, unique_labels, sample_weight):
    """Confusion matrix for one partition.

    Returns an ``(n_labels, n_labels)`` dense cupy array with rows indexed
    by true label and columns by prediction.
    """
    y_true, y_pred = y_y_pred
    labels = unique_labels

    n_labels = labels.size

    # Assume labels are monotonically increasing for now.

    # Intersect y_pred, y_true with labels, eliminate items not in labels.
    ind = cp.logical_and(y_pred < n_labels, y_true < n_labels)
    y_pred = y_pred[ind]
    y_true = y_true[ind]

    if sample_weight is None:
        sample_weight = cp.ones(y_true.shape[0], dtype=np.int64)
    else:
        sample_weight = cp.asarray(sample_weight)

    sample_weight = sample_weight[ind]

    # Scatter the weights into the matrix via a COO construction.
    cm = cp.sparse.coo_matrix(
        (sample_weight, (y_true, y_pred)),
        shape=(n_labels, n_labels),
        dtype=cp.float32,
    ).toarray()

    return cp.nan_to_num(cm)


def confusion_matrix(client, y_true, y_pred, normalize=None, sample_weight=None):
    """Distributed confusion matrix.

    ``normalize`` may be "true" (row-normalized), "pred" (column-normalized),
    "all" (grand-total normalized) or None (raw counts).
    """
    unique_classes = cp.unique(y_true.map_blocks(lambda x: cp.unique(x)).compute())
    nclasses = len(unique_classes)

    ddh = DistributedDataHandler.create([y_true, y_pred])

    cms = client.compute(
        [
            client.submit(
                local_cm, part, unique_classes, sample_weight, workers=[worker]
            )
            for worker, part in ddh.gpu_futures
        ],
        sync=True,
    )

    cm = cp.zeros((nclasses, nclasses))
    for i in cms:
        cm += i

    with np.errstate(all="ignore"):
        if normalize == "true":
            cm = cm / cm.sum(axis=1, keepdims=True)
        elif normalize == "pred":
            cm = cm / cm.sum(axis=0, keepdims=True)
        elif normalize == "all":
            cm = cm / cm.sum()
        cm = cp.nan_to_num(cm)

    return cm


def accuracy_score(client, y, y_hat):
    """Distributed accuracy: fraction of matching predictions."""
    ddh = DistributedDataHandler.create([y_hat, y])

    def _count_accurate_predictions(y_hat_y):
        y_hat, y = y_hat_y
        y_hat = cp.asarray(y_hat, dtype=y_hat.dtype)
        y = cp.asarray(y, dtype=y.dtype)
        return y.shape[0] - cp.count_nonzero(y - y_hat)

    # Unique key prefix so repeated calls don't collide in the scheduler.
    key = uuid1()

    futures = client.compute(
        [
            client.submit(
                _count_accurate_predictions,
                worker_future[1],
                workers=[worker_future[0]],
                key="%s-%s" % (key, idx),
            )
            for idx, worker_future in enumerate(ddh.gpu_futures)
        ],
        sync=True,
    )

    return sum(futures) / y.shape[0]


def post_etl_processing(client, train_data, test_data):
    """Train/evaluate the sentiment classifier and format q28's results.

    Returns ``(final_data, acc, prec, cmat)`` where ``final_data`` is the
    test frame (review id, rating, NEG/NEUT/POS prediction) sorted by
    ``pr_review_sk``.
    """
    # Feature engineering
    X_train = build_features(train_data)
    X_test = build_features(test_data)

    y_train = build_labels(train_data)
    y_test = build_labels(test_data)

    # Perform ML
    model = DistMNB(client=client, alpha=0.001)
    model.fit(X_train, y_train)

    ### this regression seems to be coming from here
    y_hat = model.predict(X_test).persist()

    # Compute distributed performance metrics
    acc = accuracy_score(client, y_test, y_hat)

    print("Accuracy: " + str(acc))
    prec = precision_score(client, y_test, y_hat, average="macro")

    print("Precision: " + str(prec))
    cmat = confusion_matrix(client, y_test, y_hat)

    print("Confusion Matrix: " + str(cmat))

    # Place results back in original Dataframe
    ddh = DistributedDataHandler.create(y_hat)
    test_preds = to_dask_cudf(
        [client.submit(cudf.Series, part) for w, part in ddh.gpu_futures]
    )

    test_preds = test_preds.map_partitions(categoricalize)

    test_data["prediction"] = test_preds

    final_data = test_data[["pr_review_sk", "pr_review_rating", "prediction"]].persist()

    final_data = final_data.sort_values("pr_review_sk").reset_index(drop=True)
    wait(final_data)
    return final_data, acc, prec, cmat
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from bdb_tools.readers import build_reader

# q29 result-set size limit.
q29_limit = 100


def read_tables(config, c=None):
    """Load the tables required by query 29.

    Parameters
    ----------
    config : dict
        Benchmark configuration; supplies ``file_format`` and ``data_dir``.
    c : optional
        SQL context. When provided, both tables are registered with it.

    Returns
    -------
    tuple
        ``(item_df, ws_df)`` dask dataframes for item and web_sales.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )

    item_df = reader.read(
        "item", relevant_cols=["i_item_sk", "i_category_id"]
    )
    ws_df = reader.read(
        "web_sales", relevant_cols=["ws_order_number", "ws_item_sk"]
    )

    if c:
        c.create_table("item", item_df, persist=False)
        c.create_table("web_sales", ws_df, persist=False)

    return item_df, ws_df
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ---------------------------------------------------------------------------
# bdb_tools/q30_utils.py
# ---------------------------------------------------------------------------

from bdb_tools.readers import build_reader

# session timeout in secs
q30_session_timeout_inSec = 3600
# query output limit
q30_limit = 40


def read_tables(config, c=None):
    """Load the tables required by query 30.

    When a SQL context ``c`` is given, both tables are registered with it.

    Returns
    -------
    item_df : dask dataframe for the item table.

    NOTE(review): ``wcs_df`` is read (and registered with the SQL context)
    but not returned -- every sibling ``qXX_utils.read_tables`` returns all
    the tables it reads. Confirm the pure-dask q30 query does not need
    ``wcs_df`` from here; if it does, this should return
    ``item_df, wcs_df`` (a caller-visible change, so not made here).
    """
    table_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    item_cols = ["i_category_id", "i_item_sk"]
    item_df = table_reader.read("item", relevant_cols=item_cols)

    wcs_cols = ["wcs_user_sk", "wcs_item_sk", "wcs_click_date_sk", "wcs_click_time_sk"]
    wcs_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols)

    if c:
        c.create_table('web_clickstreams', wcs_df, persist=False)
        c.create_table('item', item_df, persist=False)

    return item_df


# ---------------------------------------------------------------------------
# bdb_tools/rmm_monitor.py
# ---------------------------------------------------------------------------

import os
import csv
import rmm
import tempfile
import asyncio

from dask.distributed import Client, Worker, WorkerPlugin

from typing import List


class DependencyInstaller(WorkerPlugin):
    """Worker plugin that conda-installs extra packages on each dask worker."""

    def __init__(self, dependencies: List[str]):
        # Fixed typo: attribute was previously spelled `_depencendies`.
        self._dependencies = " ".join(f"'{dep}'" for dep in dependencies)

    def setup(self, _worker: Worker):
        # Runs once per worker when the plugin is registered.
        os.system(
            "conda install -c rapidsai-nightly -c rapidsai -c nvidia "
            f"-c conda-forge -c defaults {self._dependencies}"
        )


# Wrap this in a method used to initialize the module + pass in the client instance
dependency_installer = DependencyInstaller(["pynvml"])

# client = Client()
# client.register_worker_plugin(dependency_installer)


class RMMResourceMonitor:
    """
    Distributed monitor for RMM resource allocations.

    Runs small callables on every dask worker via ``Client.run``; when no
    client is available it falls back to executing them in-process.
    """

    def __init__(self, client, outputdir='/tmp'):
        # Anything that is not a distributed Client triggers local execution.
        self._client = client if isinstance(client, Client) else None
        self._outputdir = outputdir

    def __dispatch__(self, method, **kwargs):
        """Run ``method(**kwargs)`` on all workers, or locally without a client.

        Returns the per-worker result mapping from ``Client.run`` (or the
        direct result in local mode).
        """
        if self._client:
            return self._client.run(method, **kwargs)
        # BUG FIX: previously `return method(*args, **kwargs)` referenced an
        # undefined name `args` and raised NameError on the local path.
        return method(**kwargs)

    def get_remote_output_dir(self):
        """Directory on the workers where RMM log files are written."""
        return self._outputdir

    def begin_logging(self, prefix="rmmlog"):
        """
        Enable RMM logging on every worker, one log file per worker process.
        """

        def _rmmlogstart(basedir, prefix):
            # Executed on the worker: import locally so the closure is
            # self-contained when shipped by dask.
            import os
            fname = f"{prefix}_{os.getpid()}.log"
            rmm.enable_logging(log_file_name=os.path.join(basedir, fname))
            return fname

        self.__dispatch__(
            _rmmlogstart, prefix=prefix, basedir=self.get_remote_output_dir()
        )

    def stop_logging(self):
        """
        Disable RMM logging on every worker and flush the log files.
        """

        def _rmmlogstop():
            rmm.disable_logging()

        self.__dispatch__(_rmmlogstop)

    def collect(self):
        """
        Retrieve the RMM log file names from every worker.

        Returns the per-worker mapping of log file names as reported by
        ``rmm.get_log_filenames``.

        NOTE(review): the original implementation referenced undefined names
        (``DaskDataframe``, ``localfile``) and could never run. Loading the
        log *contents* into a dataframe is still TODO.
        """

        def _collect():
            # Presumably maps GPU device id -> log file path; confirm
            # against the pinned rmm version.
            return rmm.get_log_filenames()

        return self.__dispatch__(_collect)
): """ Common utility to perform all steps needed to execute a dask-cudf version @@ -320,7 +320,7 @@ def run_bsql_query( dask_profile=config.get("dask_profile"), data_dir=data_dir, client=client, - bc=blazing_context, + c=sql_context, config=config, ) @@ -382,7 +382,7 @@ def gpubdb_argparser(): "sheet": os.environ.get("GOOGLE_SPREADSHEET_NAME"), "tab": os.environ.get("GOOGLE_SPREADSHEET_TAB"), "scheduler_file_path": os.environ.get("SCHEDULER_FILE"), - "benchmark_runner_include_bsql": os.environ.get("RUNNER_INCLUDE_BSQL"), + "benchmark_runner_include_sql": os.environ.get("RUNNER_INCLUDE_SQL"), } for key in args.keys(): @@ -789,7 +789,7 @@ def build_benchmark_googlesheet_payload(config): "Protocol": "UCX" if data.get("nvlink") == True else "TCP", "NVLINK": data.get("nvlink", "NA"), "Infiniband": data.get("infiniband", "NA"), - "Query Type": "blazing" if is_blazing_query() else "dask", + "Query Type": "sql" if is_sql_query() else "dask", "File Format": data.get("file_format"), "Time (seconds)": query_time + writing_time if query_time and writing_time @@ -810,7 +810,7 @@ def build_benchmark_googlesheet_payload(config): "Data Location": data.get("data_dir"), "Current Time": current_time, "cuDF Version": data.get("cudf"), - "BlazingSQL Version": data.get("blazingsql"), + "Dask SQL Version": data.get("sql"), "Dask Version": data.get("dask"), "Distributed Version": data.get("distributed"), "Dask-CUDA Version": data.get("dask-cuda"), @@ -827,15 +827,15 @@ def build_benchmark_googlesheet_payload(config): return payload -def is_blazing_query(): +def is_sql_query(): """ - Method that returns true if caller of the utility is a blazing query, returns false otherwise + Method that returns true if caller of the utility is a SQL query, returns false otherwise Assumes that caller is 3 levels above the stack - query_of_interest -> utils.push_to_google_sheet -> utils.build_payload -> utils.is_blazing_query + query_of_interest -> utils.push_to_google_sheet -> utils.build_payload -> 
utils.is_sql_query - Another potential solution is checking sys.modules.get("blazing") to check blazing is imported + Another potential solution is checking sys.modules.get("dask_sql") to check Dask-SQL is imported """ - return "bsql" in inspect.stack()[-3].function + return "sql" in inspect.stack()[-3].function def _get_benchmarked_method_time( @@ -866,7 +866,7 @@ def generate_library_information(): "dask-cuda", "rmm", "cupy", - "blazingsql", + "dask-sql", ] conda_list_command = ( @@ -904,7 +904,7 @@ def push_payload_to_googlesheet(config): payload = build_benchmark_googlesheet_payload(config) s = gc.open(config["sheet"]) tab = s.worksheet(config["tab"]) - tab.append_row(payload, value_input_option='USER_ENTERED') + tab.append_row(payload, value_input_option='USER_ENTERED', table_range='A2') ################################# diff --git a/gpu_bdb/benchmark_runner.py b/gpu_bdb/benchmark_runner.py index 6fdef1df..6c383ab5 100755 --- a/gpu_bdb/benchmark_runner.py +++ b/gpu_bdb/benchmark_runner.py @@ -21,9 +21,11 @@ def load_query(qnum, fn): return mod.main -dask_qnums = [str(i).zfill(2) for i in range(1, 31)] -bsql_qnums = [str(i).zfill(2) for i in range(1, 31)] +dask_qnums = [str(i).zfill(2) for i in map(int,os.getenv("DASK_QNUMS"," ".join(map(str,range(1, 31)))).split())] +sql_qnums = [str(i).zfill(2) for i in map(int,os.getenv("BSQL_QNUMS"," ".join(map(str,range(1, 31)))).split())] +from random import shuffle +shuffle(dask_qnums) if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster, import_query_libs @@ -32,66 +34,94 @@ def load_query(qnum, fn): import_query_libs() config = gpubdb_argparser() config["run_id"] = uuid.uuid4().hex - include_blazing = config.get("benchmark_runner_include_bsql") + + include_sql = config.get("benchmark_runner_include_sql") dask_queries = { qnum: load_query(qnum, f"queries/q{qnum}/gpu_bdb_query_{qnum}.py") for qnum in dask_qnums } - if include_blazing: - bsql_queries = { - qnum: load_query(qnum, 
f"queries/q{qnum}/gpu_bdb_query_{qnum}_sql.py") - for qnum in bsql_qnums + if include_sql: + sql_queries = { + qnum: load_query(qnum, f"queries/q{qnum}/gpu_bdb_query_{qnum}_dask_sql.py") + for qnum in sql_qnums + } + else: + dask_queries = { + qnum: load_query(qnum, f"queries/q{qnum}/gpu_bdb_query_{qnum}.py") + for qnum in dask_qnums } - client, bc = attach_to_cluster(config, create_blazing_context=include_blazing) + client, c = attach_to_cluster(config, create_sql_context=include_sql) # Preload required libraries for queries on all workers client.run(import_query_libs) base_path = os.getcwd() - # Run BSQL Queries - if include_blazing and len(bsql_qnums) > 0: - print("Blazing Queries") - for qnum, q_func in bsql_queries.items(): - print(qnum) - - qpath = f"{base_path}/queries/q{qnum}/" - os.chdir(qpath) - if os.path.exists("current_query_num.txt"): - os.remove("current_query_num.txt") - with open("current_query_num.txt", "w") as fp: - fp.write(qnum) - - for r in range(N_REPEATS): - run_query( - config=config, - client=client, - query_func=q_func, - blazing_context=bc, - ) - client.run(gc.collect) - client.run_on_scheduler(gc.collect) - gc.collect() - time.sleep(3) + if config.get('benchmark_runner_log_rmm', False) or config.get('benchmark_runner_log_tasks', False): + + from bdb_tools import RMMResourceMonitor + from bdb_tools import DaskTaskLogger + + rmm_analyzer=RMMResourceMonitor(client=client, + outputdir=os.getenv('OUTPUT_DIR', '/tmp')) + dasktasklog=DaskTaskLogger( client=client, + outputdir=os.getenv('OUTPUT_DIR', '/tmp')) + + orig_run_query=run_query + def logged_run_query( *args, **kwargs ): + rmm_analyzer.begin_logging( prefix=f"rmmlog{qnum}") + dasktasklog.mark_begin() + orig_run_query( *args, **kwargs ) + rmm_analyzer.stop_logging() + dasktasklog.save_tasks( prefix=f"dasktasklog{qnum}") + + run_query=logged_run_query + + # Run Dask SQL Queries + if include_sql and len(sql_qnums) > 0: + print("Dask SQL Queries") + for r in range(N_REPEATS): + for qnum, 
q_func in sql_queries.items(): + print(f"run {r+1}: q{qnum}") + + qpath = f"{base_path}/queries/q{qnum}/" + os.chdir(qpath) + if os.path.exists("current_query_num.txt"): + os.remove("current_query_num.txt") + with open("current_query_num.txt", "w") as fp: + fp.write(qnum) + + run_query( + config=config, + client=client, + query_func=q_func, + sql_context=c, + ) + client.run(gc.collect) + client.run_on_scheduler(gc.collect) + gc.collect() + time.sleep(3) # Run Pure Dask Queries if len(dask_qnums) > 0: print("Pure Dask Queries") - for qnum, q_func in dask_queries.items(): - print(qnum) - - qpath = f"{base_path}/queries/q{qnum}/" - os.chdir(qpath) - if os.path.exists("current_query_num.txt"): - os.remove("current_query_num.txt") - with open("current_query_num.txt", "w") as fp: - fp.write(qnum) - - for r in range(N_REPEATS): - run_query(config=config, client=client, query_func=q_func) - client.run(gc.collect) - client.run_on_scheduler(gc.collect) - gc.collect() - time.sleep(3) + for r in range(N_REPEATS): + for qnum, q_func in dask_queries.items(): + print(f"run {r+1}: q{qnum}") + + qpath = f"{base_path}/queries/q{qnum}/" + os.chdir(qpath) + if os.path.exists("current_query_num.txt"): + os.remove("current_query_num.txt") + with open("current_query_num.txt", "w") as fp: + fp.write(qnum) + + run_query(config=config, client=client, query_func=q_func) + client.run(gc.collect) + client.run_on_scheduler(gc.collect) + gc.collect() + time.sleep(3) + + diff --git a/gpu_bdb/benchmark_runner/benchmark_config.yaml b/gpu_bdb/benchmark_runner/benchmark_config.yaml index 54d6bfb8..95af4169 100755 --- a/gpu_bdb/benchmark_runner/benchmark_config.yaml +++ b/gpu_bdb/benchmark_runner/benchmark_config.yaml @@ -1,19 +1,21 @@ # benchmark config yaml ### Please fill these accordingly -data_dir: +data_dir: /raid/gpu-bdb/sf1000/parquet_2gb output_dir: file_format: parquet output_filetype: parquet split_row_groups: False repartition_small_table: True -benchmark_runner_include_bsql: 
+benchmark_runner_include_sql: +benchmark_runner_log_rmm: False +benchmark_runner_log_tasks: False -scheduler_file_path: +scheduler_file_path: /raid/adattagupta/dask-sql-work/dask-local-directory/scheduler.json dask_profile: False verify_results: False verify_dir: -sheet: -tab: +sheet: GPU-BDB Dask-SQL +tab: SF1K Dask-SQL get_read_time: False diff --git a/gpu_bdb/queries/load_test/gpu_bdb_load_test.py b/gpu_bdb/queries/load_test/gpu_bdb_load_test.py index ab57193f..830e9977 100755 --- a/gpu_bdb/queries/load_test/gpu_bdb_load_test.py +++ b/gpu_bdb/queries/load_test/gpu_bdb_load_test.py @@ -24,7 +24,7 @@ tables = [table.split(".")[0] for table in os.listdir(spark_schema_dir)] scale = [x for x in config["data_dir"].split("/") if "sf" in x][0] -part_size = 3 +part_size = 2 chunksize = "128 MiB" # Spark uses different names for column types, and RAPIDS doesn't yet support Decimal types. @@ -127,7 +127,7 @@ def repartition(table, outdir, npartitions=None, chunksize=None, compression="sn ) read_csv_table(table, chunksize).repartition( npartitions=npartitions - ).to_parquet(outdir + table, compression=compression) + ).to_parquet(outdir + table, compression=compression, index=False) def main(client, config): diff --git a/gpu_bdb/queries/q01/gpu_bdb_query_01.py b/gpu_bdb/queries/q01/gpu_bdb_query_01.py index 041f674c..6cd6bbd7 100755 --- a/gpu_bdb/queries/q01/gpu_bdb_query_01.py +++ b/gpu_bdb/queries/q01/gpu_bdb_query_01.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,8 +15,14 @@ # from bdb_tools.utils import benchmark, gpubdb_argparser, run_query -from bdb_tools.readers import build_reader +from bdb_tools.q01_utils import ( + q01_i_category_id_IN, + q01_ss_store_sk_IN, + q01_viewed_together_count, + q01_limit, + read_tables +) ### Implementation Notes: # `drop_duplicates` and `groupby` by default brings result to single partition @@ -27,30 +33,6 @@ # Settinng index + merge using map_parition can be a work-around if dask native merge is slow -# -------- Q1 ----------- -q01_i_category_id_IN = [1, 2, 3] -# -- sf1 -> 11 stores, 90k sales in 820k lines -q01_ss_store_sk_IN = [10, 20, 33, 40, 50] -q01_viewed_together_count = 50 -q01_limit = 100 - - -item_cols = ["i_item_sk", "i_category_id"] -ss_cols = ["ss_item_sk", "ss_store_sk", "ss_ticket_number"] - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - item_df = table_reader.read("item", relevant_cols=item_cols) - ss_df = table_reader.read("store_sales", relevant_cols=ss_cols) - return item_df, ss_df - - ### Inner Self join to get pairs # Select t1.ss_item_sk as item_sk_1 , t2.ss_item_sk as item_sk_2 # FROM ( @@ -163,8 +145,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q01/gpu_bdb_query_01_dask_sql.py b/gpu_bdb/queries/q01/gpu_bdb_query_01_dask_sql.py new file mode 100755 index 00000000..9d0f21ad --- /dev/null +++ b/gpu_bdb/queries/q01/gpu_bdb_query_01_dask_sql.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q01_utils import ( + q01_i_category_id_IN, + q01_ss_store_sk_IN, + q01_viewed_together_count, + q01_limit, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_distinct = f""" + SELECT DISTINCT ss_item_sk, ss_ticket_number + FROM store_sales s, item i + WHERE s.ss_item_sk = i.i_item_sk + AND i.i_category_id IN {q01_i_category_id_IN} + AND s.ss_store_sk IN {q01_ss_store_sk_IN} + """ + result_distinct = c.sql(query_distinct) + + result_distinct = result_distinct.persist() + wait(result_distinct) + c.create_table("distinct_table", result_distinct, persist=False) + + query = f""" + SELECT item_sk_1, item_sk_2, COUNT(*) AS cnt + FROM + ( + SELECT CAST(t1.ss_item_sk as BIGINT) AS item_sk_1, + CAST(t2.ss_item_sk AS BIGINT) AS item_sk_2 + FROM distinct_table t1 + INNER JOIN distinct_table t2 + ON t1.ss_ticket_number = t2.ss_ticket_number + WHERE t1.ss_item_sk < t2.ss_item_sk + ) + GROUP BY item_sk_1, item_sk_2 + HAVING COUNT(*) > {q01_viewed_together_count} + ORDER BY cnt DESC, CAST(item_sk_1 AS VARCHAR), + CAST(item_sk_2 AS VARCHAR) + LIMIT {q01_limit} + """ + result = c.sql(query) + + c.drop_table("distinct_table") + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + 
run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q02/gpu_bdb_query_02.py b/gpu_bdb/queries/q02/gpu_bdb_query_02.py index cc7cb5a5..c6c11e40 100755 --- a/gpu_bdb/queries/q02/gpu_bdb_query_02.py +++ b/gpu_bdb/queries/q02/gpu_bdb_query_02.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,8 +19,13 @@ gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader from bdb_tools.sessionization import get_distinct_sessions +from bdb_tools.q02_utils import ( + q02_item_sk, + q02_limit, + q02_session_timeout_inSec, + read_tables +) ### Implementation Notes: @@ -28,13 +33,6 @@ # The bottleneck of current implimenation is `set-index`, once ucx is working correctly # it should go away -# -------- Q2 ----------- -q02_item_sk = 10001 -q02_MAX_ITEMS_PER_BASKET = 5000000 -q02_limit = 30 -q02_session_timeout_inSec = 3600 - - def get_relevant_item_series(df, q02_item_sk): """ Returns relevant items directly @@ -65,17 +63,6 @@ def reduction_function(df, q02_session_timeout_inSec): return grouped_df -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - wcs_cols = ["wcs_user_sk", "wcs_item_sk", "wcs_click_date_sk", "wcs_click_time_sk"] - wcs_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols) - return wcs_df - - def pre_repartition_task(wcs_df): f_wcs_df = wcs_df[ @@ -149,8 +136,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q02/gpu_bdb_query_02_dask_sql.py 
b/gpu_bdb/queries/q02/gpu_bdb_query_02_dask_sql.py new file mode 100755 index 00000000..38c1668f --- /dev/null +++ b/gpu_bdb/queries/q02/gpu_bdb_query_02_dask_sql.py @@ -0,0 +1,88 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.sessionization import get_distinct_sessions + +from bdb_tools.q02_utils import ( + q02_item_sk, + q02_limit, + q02_session_timeout_inSec, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = """ + SELECT + CAST(wcs_user_sk AS INTEGER) AS wcs_user_sk, + CAST(wcs_item_sk AS INTEGER) AS wcs_item_sk, + (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec + FROM web_clickstreams + WHERE wcs_item_sk IS NOT NULL + AND wcs_user_sk IS NOT NULL + DISTRIBUTE BY wcs_user_sk + """ + wcs_result = c.sql(query_1) + + session_df = wcs_result.map_partitions( + get_distinct_sessions, + keep_cols=["wcs_user_sk", "wcs_item_sk"], + time_out=q02_session_timeout_inSec, + ) + del wcs_result + + c.create_table('session_df', session_df, persist=False) + + last_query = f""" + WITH item_df AS ( + SELECT wcs_user_sk, session_id + FROM session_df + WHERE wcs_item_sk = {q02_item_sk} + ) + SELECT sd.wcs_item_sk as item_sk_1, + count(sd.wcs_item_sk) as cnt + FROM 
session_df sd + INNER JOIN item_df id + ON sd.wcs_user_sk = id.wcs_user_sk + AND sd.session_id = id.session_id + AND sd.wcs_item_sk <> {q02_item_sk} + GROUP BY sd.wcs_item_sk + ORDER BY cnt desc + LIMIT {q02_limit} + """ + result = c.sql(last_query) + result["item_sk_2"] = q02_item_sk + result_order = ["item_sk_1", "item_sk_2", "cnt"] + result = result[result_order] + + del session_df + c.drop_table("session_df") + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q03/gpu_bdb_query_03.py b/gpu_bdb/queries/q03/gpu_bdb_query_03.py index 68a04af4..a563ff83 100755 --- a/gpu_bdb/queries/q03/gpu_bdb_query_03.py +++ b/gpu_bdb/queries/q03/gpu_bdb_query_03.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,34 +14,32 @@ # limitations under the License. 
# -import sys import os +import cudf +import dask_cudf from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader + +from bdb_tools.q03_utils import ( + apply_find_items_viewed, + q03_purchased_item_IN, + q03_purchased_item_category_IN, + q03_limit, + read_tables +) from distributed import wait import numpy as np -from numba import cuda import glob from dask import delayed - -q03_days_in_sec_before_purchase = 864000 -q03_views_before_purchase = 5 -q03_purchased_item_IN = 10001 -q03_purchased_item_category_IN = [2, 3] -q03_limit = 100 - - def get_wcs_minima(config): - import dask_cudf wcs_df = dask_cudf.read_parquet( os.path.join(config["data_dir"], "web_clickstreams/*.parquet"), @@ -55,7 +53,6 @@ def get_wcs_minima(config): def pre_repartition_task(wcs_fn, item_df, wcs_tstamp_min): - import cudf wcs_cols = [ "wcs_user_sk", @@ -65,7 +62,7 @@ def pre_repartition_task(wcs_fn, item_df, wcs_tstamp_min): "wcs_click_time_sk", ] wcs_df = cudf.read_parquet(wcs_fn, columns=wcs_cols) - wcs_df = wcs_df._drop_na_rows(subset=["wcs_user_sk", "wcs_item_sk"]) + wcs_df = wcs_df.dropna(axis=0, subset=["wcs_user_sk", "wcs_item_sk"]) wcs_df["tstamp"] = wcs_df["wcs_click_date_sk"] * 86400 + wcs_df["wcs_click_time_sk"] wcs_df["tstamp"] = wcs_df["tstamp"] - wcs_tstamp_min @@ -108,108 +105,7 @@ def reduction_function(df, item_df_filtered): return grouped_df -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - item_cols = ["i_category_id", "i_item_sk"] - item_df = table_reader.read("item", relevant_cols=item_cols) - return item_df - - -@cuda.jit -def find_items_viewed_before_purchase_kernel( - relevant_idx_col, user_col, timestamp_col, item_col, out_col, N -): - """ - Find the past N items viewed after a relevant purchase was made, - as defined by the configuration of this query. 
- """ - i = cuda.grid(1) - relevant_item = q03_purchased_item_IN - - if i < (relevant_idx_col.size): # boundary guard - # every relevant row gets N rows in the output, so we need to map the indexes - # back into their position in the original array - orig_idx = relevant_idx_col[i] - current_user = user_col[orig_idx] - - # look at the previous N clicks (assume sorted descending) - rows_to_check = N - remaining_rows = user_col.size - orig_idx - - if remaining_rows <= rows_to_check: - rows_to_check = remaining_rows - 1 - - for k in range(1, rows_to_check + 1): - if current_user != user_col[orig_idx + k]: - out_col[i * N + k - 1] = 0 - - # only checking relevant purchases via the relevant_idx_col - elif (timestamp_col[orig_idx + k] <= timestamp_col[orig_idx]) & ( - timestamp_col[orig_idx + k] - >= (timestamp_col[orig_idx] - q03_days_in_sec_before_purchase) - ): - out_col[i * N + k - 1] = item_col[orig_idx + k] - else: - out_col[i * N + k - 1] = 0 - - -def apply_find_items_viewed(df, item_mappings): - import cudf - - # need to sort descending to ensure that the - # next N rows are the previous N clicks - df = df.sort_values( - by=["wcs_user_sk", "tstamp", "wcs_sales_sk", "wcs_item_sk"], - ascending=[False, False, False, False], - ) - df.reset_index(drop=True, inplace=True) - df["relevant_flag"] = (df.wcs_sales_sk != 0) & ( - df.wcs_item_sk == q03_purchased_item_IN - ) - df["relevant_idx_pos"] = df.index.to_series() - df.reset_index(drop=True, inplace=True) - # only allocate output for the relevant rows - sample = df.loc[df.relevant_flag == True] - sample.reset_index(drop=True, inplace=True) - - N = q03_views_before_purchase - size = len(sample) - - # we know this can be int32, since it's going to contain item_sks - out_arr = cuda.device_array(size * N, dtype=df["wcs_item_sk"].dtype) - - find_items_viewed_before_purchase_kernel.forall(size)( - sample["relevant_idx_pos"], - df["wcs_user_sk"], - df["tstamp"], - df["wcs_item_sk"], - out_arr, - N, - ) - - result = 
cudf.DataFrame({"prior_item_viewed": out_arr}) - - del out_arr - del df - del sample - - filtered = result.merge( - item_mappings, - how="inner", - left_on=["prior_item_viewed"], - right_on=["i_item_sk"], - ) - return filtered - - def main(client, config): - import dask_cudf - import cudf item_df = benchmark( read_tables, @@ -289,8 +185,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q03/gpu_bdb_query_03_dask_sql.py b/gpu_bdb/queries/q03/gpu_bdb_query_03_dask_sql.py new file mode 100755 index 00000000..031dfe0f --- /dev/null +++ b/gpu_bdb/queries/q03/gpu_bdb_query_03_dask_sql.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q03_utils import ( + apply_find_items_viewed, + q03_purchased_item_IN, + q03_purchased_item_category_IN, + q03_limit, + read_tables +) + +from dask.distributed import wait + + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = """ + SELECT i_item_sk, + CAST(i_category_id AS TINYINT) AS i_category_id + FROM item + """ + item_df = c.sql(query_1) + + item_df = item_df.persist() + wait(item_df) + c.create_table("item_df", item_df, persist=False) + + query_2 = """ + SELECT CAST(w.wcs_user_sk AS INTEGER) as wcs_user_sk, + wcs_click_date_sk * 86400 + wcs_click_time_sk AS tstamp, + CAST(w.wcs_item_sk AS INTEGER) as wcs_item_sk, + CAST(COALESCE(w.wcs_sales_sk, 0) AS INTEGER) as wcs_sales_sk + FROM web_clickstreams AS w + INNER JOIN item_df AS i ON w.wcs_item_sk = i.i_item_sk + WHERE w.wcs_user_sk IS NOT NULL + AND w.wcs_item_sk IS NOT NULL + DISTRIBUTE BY wcs_user_sk + """ + merged_df = c.sql(query_2) + + query_3 = f""" + SELECT i_item_sk, i_category_id + FROM item_df + WHERE i_category_id IN {q03_purchased_item_category_IN} + """ + item_df_filtered = c.sql(query_3) + + product_view_results = merged_df.map_partitions( + apply_find_items_viewed, item_mappings=item_df_filtered + ) + + + c.drop_table("item_df") + del item_df + del merged_df + del item_df_filtered + + c.create_table('product_result', product_view_results, persist=False) + + last_query = f""" + SELECT CAST({q03_purchased_item_IN} AS BIGINT) AS purchased_item, + i_item_sk AS lastviewed_item, + COUNT(i_item_sk) AS cnt + FROM product_result + GROUP BY i_item_sk + ORDER BY purchased_item, cnt desc, lastviewed_item + LIMIT {q03_limit} + """ + result = c.sql(last_query) + + c.drop_table("product_result") + del product_view_results + return result + + +if __name__ == "__main__": + 
config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q04/gpu_bdb_query_04.py b/gpu_bdb/queries/q04/gpu_bdb_query_04.py index 9c4cb5a5..98fba61e 100755 --- a/gpu_bdb/queries/q04/gpu_bdb_query_04.py +++ b/gpu_bdb/queries/q04/gpu_bdb_query_04.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,100 +14,20 @@ # limitations under the License. # -import sys - +import cudf from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader -from bdb_tools.sessionization import get_sessions - - -# parameters -q04_session_timeout_inSec = 3600 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - wp_cols = ["wp_type", "wp_web_page_sk"] - wp_df = table_reader.read("web_page", relevant_cols=wp_cols) - - wcs_cols = [ - "wcs_user_sk", - "wcs_click_date_sk", - "wcs_click_time_sk", - "wcs_web_page_sk", - "wcs_sales_sk", - ] - web_clicksteams_df = table_reader.read("web_clickstreams", relevant_cols=wcs_cols) - - return wp_df, web_clicksteams_df - - -def abandonedShoppingCarts(df, DYNAMIC_CAT_CODE, ORDER_CAT_CODE): - import cudf - - # TODO: test without reset index - df.reset_index(drop=True, inplace=True) - - # Select groups where last dynamic row comes after last order row - filtered_df = df[ - (df["wp_type_codes"] == ORDER_CAT_CODE) - | (df["wp_type_codes"] == DYNAMIC_CAT_CODE) - ] - # TODO: test without reset index - filtered_df.reset_index(drop=True, inplace=True) - # Create a new column that is the concatenation of timestamp and wp_type_codes - # (eg:123456:3, 
234567:5) - filtered_df["wp_type_codes"] = ( - filtered_df["tstamp_inSec"] - .astype("str") - .str.cat(filtered_df["wp_type_codes"].astype("str"), sep=":") - ) - # This gives the last occurrence (by timestamp) within the "order", "dynamic" wp_types - filtered_df = filtered_df.groupby( - ["wcs_user_sk", "session_id"], as_index=False, sort=False - ).agg({"wp_type_codes": "max"}) - # If the max contains dynamic, keep the row else discard. - last_dynamic_df = filtered_df[ - filtered_df["wp_type_codes"].str.contains( - ":" + str(DYNAMIC_CAT_CODE), regex=False - ) - ] - del filtered_df - - # Find counts for each group - grouped_count_df = df.groupby( - ["wcs_user_sk", "session_id"], as_index=False, sort=False - ).agg({"tstamp_inSec": "count"}) - # Merge counts with the "dynamic" shopping cart groups - result = last_dynamic_df.merge( - grouped_count_df, on=["wcs_user_sk", "session_id"], how="inner" - ) - del (last_dynamic_df, grouped_count_df) - return cudf.DataFrame( - {"pagecount": result.tstamp_inSec.sum(), "count": len(result)} - ) - - -def reduction_function(df, keep_cols, DYNAMIC_CAT_CODE, ORDER_CAT_CODE): - df = get_sessions(df, keep_cols=keep_cols) - df = abandonedShoppingCarts( - df, DYNAMIC_CAT_CODE=DYNAMIC_CAT_CODE, ORDER_CAT_CODE=ORDER_CAT_CODE - ) - return df +from bdb_tools.q04_utils import ( + reduction_function, + read_tables +) def main(client, config): - import cudf wp, wcs_df = benchmark( read_tables, @@ -166,8 +86,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q04/gpu_bdb_query_04_dask_sql.py b/gpu_bdb/queries/q04/gpu_bdb_query_04_dask_sql.py new file mode 100755 index 00000000..3af8ef10 --- /dev/null +++ b/gpu_bdb/queries/q04/gpu_bdb_query_04_dask_sql.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import cudf + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q04_utils import ( + reduction_function, + read_tables +) + +from dask.distributed import wait + + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_web_page = """ + SELECT wp_type, wp_web_page_sk + FROM web_page_wo_categorical + """ + wp = c.sql(query_web_page) + + # Convert wp_type to categorical and get cat_id of review and dynamic type + wp["wp_type"] = wp["wp_type"].map_partitions( + lambda ser: ser.astype("category")) + + cpu_categories = wp["wp_type"].compute().cat.categories.to_pandas() + + DYNAMIC_CAT_CODE = cpu_categories.get_loc("dynamic") + ORDER_CAT_CODE = cpu_categories.get_loc("order") + + # ### cast to minimum viable dtype + wp["wp_type_codes"] = wp["wp_type"].cat.codes + cols_2_keep = ["wp_web_page_sk", "wp_type_codes"] + wp = wp[cols_2_keep] + + wp = wp.persist() + wait(wp) + c.create_table('web_page', wp, persist=False) + + query = """ + SELECT + c.wcs_user_sk, + w.wp_type_codes, + (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec + FROM web_clickstreams c, web_page w + WHERE c.wcs_web_page_sk = w.wp_web_page_sk + AND c.wcs_web_page_sk IS NOT NULL + AND c.wcs_user_sk IS NOT NULL + AND c.wcs_sales_sk IS NULL --abandoned implies: no sale + 
DISTRIBUTE BY wcs_user_sk + """ + merged_df = c.sql(query) + + keep_cols = ["wcs_user_sk", "wp_type_codes", "tstamp_inSec"] + result_df = merged_df.map_partitions( + reduction_function, keep_cols, DYNAMIC_CAT_CODE, ORDER_CAT_CODE + ) + + result = result_df["pagecount"].sum() / result_df["count"].sum() + # Persist before computing to ensure scalar transfer only on compute + result = result.persist() + + result = result.compute() + result_df = cudf.DataFrame({"sum(pagecount)/count(*)": [result]}) + c.drop_table("web_page") + return result_df + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q05/gpu_bdb_query_05.py b/gpu_bdb/queries/q05/gpu_bdb_query_05.py index 290cf127..5e99a10f 100755 --- a/gpu_bdb/queries/q05/gpu_bdb_query_05.py +++ b/gpu_bdb/queries/q05/gpu_bdb_query_05.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,25 +14,27 @@ # limitations under the License. 
# -import sys import os import glob +import cudf +import dask_cudf + from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader -from bdb_tools.cupy_metrics import cupy_precision_score +from bdb_tools.q05_utils import ( + build_and_predict_model, + wcs_columns, + read_tables +) -import cupy as cp import numpy as np from dask import delayed -import dask import pandas as pd -from sklearn.metrics import roc_auc_score # # Query Configuration @@ -40,84 +42,10 @@ COLLEGE_ED_STRS = ["Advanced Degree", "College", "4 yr Degree", "2 yr Degree"] Q05_I_CATEGORY = "Books" -wcs_columns = ["wcs_item_sk", "wcs_user_sk"] -items_columns = ["i_item_sk", "i_category", "i_category_id"] -customer_columns = ["c_customer_sk", "c_current_cdemo_sk"] -customer_dem_columns = ["cd_demo_sk", "cd_gender", "cd_education_status"] - -# Logistic Regression params -# solver = "LBFGS" Used by passing `penalty=None` or "l2" -# step_size = 1 Not used -# numCorrections = 10 Not used -iterations = 100 -C = 10_000 # reg_lambda = 0 hence C for model is a large value -convergence_tol = 1e-9 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - item_ddf = table_reader.read("item", relevant_cols=items_columns, index=False) - customer_ddf = table_reader.read( - "customer", relevant_cols=customer_columns, index=False - ) - customer_dem_ddf = table_reader.read( - "customer_demographics", relevant_cols=customer_dem_columns, index=False - ) - - return (item_ddf, customer_ddf, customer_dem_ddf) - - -def build_and_predict_model(ml_input_df): - """ - Create a standardized feature matrix X and target array y. 
- Returns the model and accuracy statistics - """ - import cuml - from cuml.metrics import confusion_matrix - - feature_names = ["college_education", "male"] + [ - "clicks_in_%d" % i for i in range(1, 8) - ] - X = ml_input_df[feature_names] - # Standardize input matrix - X = (X - X.mean()) / X.std() - y = ml_input_df["clicks_in_category"] - - model = cuml.LogisticRegression( - tol=convergence_tol, - penalty="none", - solver="qn", - fit_intercept=True, - max_iter=iterations, - C=C, - ) - model.fit(X, y) - # - # Predict and evaluate accuracy - # (Should be 1.0) at SF-1 - # - results_dict = {} - y_pred = model.predict(X) - - results_dict["auc"] = roc_auc_score(y.to_array(), y_pred.to_array()) - results_dict["precision"] = cupy_precision_score(cp.asarray(y), cp.asarray(y_pred)) - results_dict["confusion_matrix"] = confusion_matrix( - cp.asarray(y, dtype="int32"), cp.asarray(y_pred, dtype="int32") - ) - results_dict["output_type"] = "supervised" - return results_dict - - def get_groupby_results(file_list, item_df): """ Functionial approach for better scaling """ - import cudf sum_by_cat_ddf = None for fn in file_list: @@ -129,12 +57,12 @@ def get_groupby_results(file_list, item_df): keep_cols = ["wcs_user_sk", "i_category_id", "clicks_in_category"] wcs_ddf = wcs_ddf[keep_cols] - wcs_ddf = cudf.DataFrame.one_hot_encoding( + wcs_ddf = cudf.get_dummies( wcs_ddf, - column="i_category_id", + columns=["i_category_id"], prefix="clicks_in", prefix_sep="_", - cats=[i for i in range(1, 8)], + cats={"i_category_id":np.arange(1, 8, dtype="int32")}, dtype=np.int8, ) keep_cols = ["wcs_user_sk", "clicks_in_category"] + [ @@ -162,8 +90,6 @@ def get_groupby_results(file_list, item_df): def main(client, config): - import cudf - import dask_cudf item_ddf, customer_ddf, customer_dem_ddf = benchmark( read_tables, @@ -268,9 +194,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf - import cuml 
config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q05/gpu_bdb_query_05_dask_sql.py b/gpu_bdb/queries/q05/gpu_bdb_query_05_dask_sql.py new file mode 100755 index 00000000..e0a628ca --- /dev/null +++ b/gpu_bdb/queries/q05/gpu_bdb_query_05_dask_sql.py @@ -0,0 +1,97 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster +from dask.distributed import wait +from dask import delayed + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) +from bdb_tools.q05_utils import ( + build_and_predict_model, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + SELECT + --wcs_user_sk, + clicks_in_category, + CASE WHEN cd_education_status IN ('Advanced Degree', 'College', '4 yr Degree', '2 yr Degree') + THEN 1 ELSE 0 END AS college_education, + CASE WHEN cd_gender = 'M' THEN 1 ELSE 0 END AS male, + clicks_in_1, + clicks_in_2, + clicks_in_3, + clicks_in_4, + clicks_in_5, + clicks_in_6, + clicks_in_7 + FROM + ( + SELECT + wcs_user_sk, + SUM( CASE WHEN i_category = 'Books' THEN 1 ELSE 0 END) AS clicks_in_category, + SUM( CASE WHEN i_category_id = 1 THEN 1 ELSE 0 END) AS clicks_in_1, + SUM( CASE WHEN i_category_id = 2 THEN 1 ELSE 0 END) AS clicks_in_2, + SUM( CASE WHEN i_category_id = 3 THEN 1 ELSE 0 END) AS 
clicks_in_3, + SUM( CASE WHEN i_category_id = 4 THEN 1 ELSE 0 END) AS clicks_in_4, + SUM( CASE WHEN i_category_id = 5 THEN 1 ELSE 0 END) AS clicks_in_5, + SUM( CASE WHEN i_category_id = 6 THEN 1 ELSE 0 END) AS clicks_in_6, + SUM( CASE WHEN i_category_id = 7 THEN 1 ELSE 0 END) AS clicks_in_7 + FROM web_clickstreams + INNER JOIN item it ON + ( + wcs_item_sk = i_item_sk + AND wcs_user_sk IS NOT NULL + ) + GROUP BY wcs_user_sk + ) q05_user_clicks_in_cat + INNER JOIN customer ct ON wcs_user_sk = c_customer_sk + INNER JOIN customer_demographics ON c_current_cdemo_sk = cd_demo_sk + """ + + cust_and_clicks_ddf = c.sql(query) + + cust_and_clicks_ddf = cust_and_clicks_ddf.repartition(npartitions=1) + + # Convert clicks_in_category to a binary label + cust_and_clicks_ddf["clicks_in_category"] = ( + cust_and_clicks_ddf["clicks_in_category"] + > cust_and_clicks_ddf["clicks_in_category"].mean() + ).astype("int64") + + # Converting the dataframe to float64 as cuml logistic reg requires this + ml_input_df = cust_and_clicks_ddf.astype("float64") + + ml_input_df = ml_input_df.persist() + wait(ml_input_df) + + ml_tasks = [delayed(build_and_predict_model)(df) for df in ml_input_df.to_delayed()] + results_dict = client.compute(*ml_tasks, sync=True) + + return results_dict + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q06/gpu_bdb_query_06.py b/gpu_bdb/queries/q06/gpu_bdb_query_06.py index b7326ab3..9e3e9ff7 100755 --- a/gpu_bdb/queries/q06/gpu_bdb_query_06.py +++ b/gpu_bdb/queries/q06/gpu_bdb_query_06.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,64 +14,17 @@ # limitations under the License. 
# -import sys - - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader -from distributed import wait - - -q06_YEAR = 2001 -q6_limit_rows = 100 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - web_sales_cols = [ - "ws_bill_customer_sk", - "ws_sold_date_sk", - "ws_ext_list_price", - "ws_ext_wholesale_cost", - "ws_ext_discount_amt", - "ws_ext_sales_price", - ] - store_sales_cols = [ - "ss_customer_sk", - "ss_sold_date_sk", - "ss_ext_list_price", - "ss_ext_wholesale_cost", - "ss_ext_discount_amt", - "ss_ext_sales_price", - ] - date_cols = ["d_date_sk", "d_year", "d_moy"] - customer_cols = [ - "c_customer_sk", - "c_customer_id", - "c_email_address", - "c_first_name", - "c_last_name", - "c_preferred_cust_flag", - "c_birth_country", - "c_login", - ] - - ws_df = table_reader.read("web_sales", relevant_cols=web_sales_cols) - ss_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - date_df = table_reader.read("date_dim", relevant_cols=date_cols) - customer_df = table_reader.read("customer", relevant_cols=customer_cols) - - return (ws_df, ss_df, date_df, customer_df) +from bdb_tools.q06_utils import ( + q06_YEAR, + q06_LIMIT, + read_tables +) def get_sales_ratio(df, table="store_sales"): assert table in ("store_sales", "web_sales") @@ -247,13 +200,11 @@ def main(client, config): ) ) - return result_df.head(q6_limit_rows) + return result_df.head(q06_LIMIT) if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q06/gpu_bdb_query_06_dask_sql.py b/gpu_bdb/queries/q06/gpu_bdb_query_06_dask_sql.py new file mode 100755 index 00000000..736319c4 --- /dev/null +++ b/gpu_bdb/queries/q06/gpu_bdb_query_06_dask_sql.py @@ 
-0,0 +1,103 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q06_utils import ( + q06_LIMIT, + q06_YEAR, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = f""" + WITH temp_table_1 as + ( + SELECT ss_customer_sk AS customer_sk, + sum( case when (d_year = {q06_YEAR}) THEN (((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2.0) ELSE 0.0 END) + AS first_year_total, + sum( case when (d_year = {q06_YEAR + 1}) THEN (((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2.0) ELSE 0.0 END) + AS second_year_total + FROM store_sales, + date_dim + WHERE ss_sold_date_sk = d_date_sk + AND d_year BETWEEN {q06_YEAR} AND {q06_YEAR + 1} + GROUP BY ss_customer_sk + -- first_year_total is an aggregation, rewrite all sum () statement + HAVING sum( case when (d_year = {q06_YEAR}) THEN (((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2.0) ELSE 0.0 END) > 0.0 + ), + temp_table_2 AS + ( + SELECT ws_bill_customer_sk AS customer_sk , + sum( case when (d_year = {q06_YEAR}) THEN (((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2.0) ELSE 0.0 END) + AS 
first_year_total, + sum( case when (d_year = {q06_YEAR + 1}) THEN (((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2.0) ELSE 0.0 END) + AS second_year_total + FROM web_sales, + date_dim + WHERE ws_sold_date_sk = d_date_sk + AND d_year BETWEEN {q06_YEAR} AND {q06_YEAR + 1} + GROUP BY ws_bill_customer_sk + -- required to avoid division by 0, because later we will divide by this value + HAVING sum( case when (d_year = {q06_YEAR}) THEN (((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2.0)ELSE 0.0 END) > 0.0 + ) + -- MAIN QUERY + SELECT + CAST( (web.second_year_total / web.first_year_total) AS DOUBLE) AS web_sales_increase_ratio, + c_customer_sk, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address + FROM temp_table_1 store, + temp_table_2 web, + customer c + WHERE store.customer_sk = web.customer_sk + AND web.customer_sk = c_customer_sk + -- if customer has sales in first year for both store and websales, + -- select him only if web second_year_total/first_year_total + -- ratio is bigger then his store second_year_total/first_year_total ratio. + AND (web.second_year_total / web.first_year_total) > + (store.second_year_total / store.first_year_total) + ORDER BY + web_sales_increase_ratio DESC, + c_customer_sk, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login + LIMIT {q06_LIMIT} + """ + result = c.sql(query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) + diff --git a/gpu_bdb/queries/q07/gpu_bdb_query_07.py b/gpu_bdb/queries/q07/gpu_bdb_query_07.py index a14cbcfd..89e8903a 100755 --- a/gpu_bdb/queries/q07/gpu_bdb_query_07.py +++ b/gpu_bdb/queries/q07/gpu_bdb_query_07.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,16 +14,12 @@ # limitations under the License. # -import sys - - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader - +from bdb_tools.q07_utils import read_tables q07_HIGHER_PRICE_RATIO = 1.2 # --store_sales date @@ -51,44 +47,10 @@ def create_high_price_items_df(item_df): return high_price_items_df -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - item_cols = ["i_item_sk", "i_current_price", "i_category"] - store_sales_cols = ["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"] - store_cols = ["s_store_sk"] - date_cols = ["d_date_sk", "d_year", "d_moy"] - customer_cols = ["c_customer_sk", "c_current_addr_sk"] - customer_address_cols = ["ca_address_sk", "ca_state"] - - item_df = table_reader.read("item", relevant_cols=item_cols) - store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - store_df = table_reader.read("store", relevant_cols=store_cols) - date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) - customer_df = table_reader.read("customer", relevant_cols=customer_cols) - customer_address_df = table_reader.read( - "customer_address", relevant_cols=customer_address_cols - ) - - return ( - item_df, - store_sales_df, - store_df, - date_dim_df, - customer_df, - customer_address_df, - ) - - def main(client, config): ( item_df, store_sales_df, - store_df, date_dim_df, customer_df, customer_address_df, @@ -155,8 +117,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git 
a/gpu_bdb/queries/q07/gpu_bdb_query_07_dask_sql.py b/gpu_bdb/queries/q07/gpu_bdb_query_07_dask_sql.py new file mode 100755 index 00000000..8ff1e73f --- /dev/null +++ b/gpu_bdb/queries/q07/gpu_bdb_query_07_dask_sql.py @@ -0,0 +1,74 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q07_utils import read_tables + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + WITH temp_table as + ( + SELECT k.i_item_sk + FROM item k, + ( + SELECT i_category, + SUM(j.i_current_price) / COUNT(j.i_current_price) * 1.2 AS avg_price + FROM item j + GROUP BY j.i_category + ) avgCategoryPrice + WHERE avgCategoryPrice.i_category = k.i_category + AND k.i_current_price > avgCategoryPrice.avg_price + ) + SELECT ca_state, COUNT(*) AS cnt + FROM + customer_address a, + customer c, + store_sales s, + temp_table highPriceItems + WHERE a.ca_address_sk = c.c_current_addr_sk + AND c.c_customer_sk = s.ss_customer_sk + AND ca_state IS NOT NULL + AND ss_item_sk = highPriceItems.i_item_sk + AND s.ss_sold_date_sk IN + ( + SELECT d_date_sk + FROM date_dim + WHERE d_year = 2004 + AND d_moy = 7 + ) + GROUP BY ca_state + HAVING COUNT(*) >= 10 + ORDER BY cnt DESC, ca_state + LIMIT 10 + """ + + result = c.sql(query) + return 
result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q08/gpu_bdb_query_08.py b/gpu_bdb/queries/q08/gpu_bdb_query_08.py index 686ea05b..451cbe9e 100755 --- a/gpu_bdb/queries/q08/gpu_bdb_query_08.py +++ b/gpu_bdb/queries/q08/gpu_bdb_query_08.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,34 +14,33 @@ # limitations under the License. # -import sys import os import glob +import cudf +import dask_cudf + from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, - convert_datestring_to_days, + convert_datestring_to_days ) -from bdb_tools.readers import build_reader from bdb_tools.merge_util import hash_merge +from bdb_tools.q08_utils import ( + get_sessions, + get_unique_sales_keys_from_sessions, + prep_for_sessionization, + q08_STARTDATE, + q08_ENDDATE, + read_tables +) import numpy as np from distributed import wait -import cupy as cp -import rmm from dask import delayed - -q08_STARTDATE = "2001-09-02" -q08_ENDDATE = "2002-09-02" -q08_SECONDS_BEFORE_PURCHASE = 259200 -NA_FLAG = 0 - - def etl_wcs(wcs_fn, filtered_date_df, web_page_df): - import cudf filtered_date_df = filtered_date_df web_page_df = web_page_df @@ -81,125 +80,6 @@ def etl_wcs(wcs_fn, filtered_date_df, web_page_df): return merged_df[cols_to_keep] -def get_session_id_from_session_boundary(session_change_df, last_session_len): - """ - This function returns session starts given a session change df - """ - import cudf - - user_session_ids = session_change_df.tstamp_inSec - - ### up shift the session length df - session_len = session_change_df["t_index"].diff().reset_index(drop=True) - session_len = 
session_len.shift(-1) - - try: - session_len.iloc[-1] = last_session_len - except (AssertionError, IndexError) as e: # IndexError in numba >= 0.48 - session_len = cudf.Series([]) - - session_id_final_series = ( - cudf.Series(user_session_ids).repeat(session_len).reset_index(drop=True) - ) - return session_id_final_series - - -def get_session_id(df): - """ - This function creates a session id column for each click - The session id grows in incremeant for each user's susbequent session - Session boundry is defined by the time_out - """ - - df["user_change_flag"] = df["wcs_user_sk"].diff(periods=1) != 0 - df["user_change_flag"] = df["user_change_flag"].fillna(True) - df["session_change_flag"] = df["review_flag"] | df["user_change_flag"] - - df = df.reset_index(drop=True) - df["t_index"] = cp.arange(start=0, stop=len(df), dtype=np.int32) - - session_change_df = df[df["session_change_flag"]].reset_index(drop=True) - try: - last_session_len = len(df) - session_change_df["t_index"].iloc[-1] - except (AssertionError, IndexError) as e: # IndexError in numba >= 0.48 - last_session_len = 0 - - session_ids = get_session_id_from_session_boundary( - session_change_df, last_session_len - ) - - assert len(session_ids) == len(df) - return session_ids - - -def get_sessions(df): - df = df.sort_values( - by=["wcs_user_sk", "tstamp_inSec", "wcs_sales_sk", "wp_type_codes"] - ).reset_index(drop=True) - df["session_id"] = get_session_id(df) - return df - - -def get_unique_sales_keys_from_sessions(sessionized, review_cat_code): - sessionized["relevant"] = ( - (sessionized.tstamp_inSec - sessionized.session_id) - <= q08_SECONDS_BEFORE_PURCHASE - ) & (sessionized.wcs_sales_sk != NA_FLAG) - unique_sales_sk = ( - sessionized.query(f"wcs_sales_sk != {NA_FLAG}") - .query("relevant == True") - .query(f"wp_type_codes != {review_cat_code}") - .wcs_sales_sk.unique() - ) - - return unique_sales_sk - - -def prep_for_sessionization(df, review_cat_code): - df = df.fillna(NA_FLAG) - df = df.sort_values( 
- by=["wcs_user_sk", "tstamp_inSec", "wcs_sales_sk", "wp_type_codes"] - ).reset_index(drop=True) - - review_df = df.loc[df["wp_type_codes"] == review_cat_code] - # per user, the index of the first review - # need this to decide if a review was "recent enough" - every_users_first_review = ( - review_df[["wcs_user_sk", "tstamp_inSec"]] - .drop_duplicates() - .reset_index() - .groupby("wcs_user_sk")["index"] - .min() - .reset_index() - ) - every_users_first_review.columns = ["wcs_user_sk", "first_review_index"] - - # then reset the index to keep the old index before parallel join - df_merged = df.reset_index().merge( - every_users_first_review, how="left", on="wcs_user_sk" - ) - df_filtered = df_merged.query("index >= first_review_index") - return df_filtered - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - date_dim_cols = ["d_date_sk", "d_date"] - web_page_cols = ["wp_web_page_sk", "wp_type"] - web_sales_cols = ["ws_net_paid", "ws_order_number", "ws_sold_date_sk"] - - date_dim_df = table_reader.read("date_dim", relevant_cols=date_dim_cols) - web_page_df = table_reader.read("web_page", relevant_cols=web_page_cols) - web_sales_df = table_reader.read("web_sales", relevant_cols=web_sales_cols) - - return (date_dim_df, web_page_df, web_sales_df) - - def reduction_function(df, REVIEW_CAT_CODE): # category code of review records @@ -213,8 +93,6 @@ def reduction_function(df, REVIEW_CAT_CODE): def main(client, config): - import cudf - import dask_cudf (date_dim_df, web_page_df, web_sales_df) = benchmark( read_tables, @@ -327,8 +205,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q08/gpu_bdb_query_08_dask_sql.py 
b/gpu_bdb/queries/q08/gpu_bdb_query_08_dask_sql.py new file mode 100755 index 00000000..6a85bc1c --- /dev/null +++ b/gpu_bdb/queries/q08/gpu_bdb_query_08_dask_sql.py @@ -0,0 +1,137 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q08_utils import ( + get_sessions, + get_unique_sales_keys_from_sessions, + prep_for_sessionization, + q08_STARTDATE, + q08_ENDDATE, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = f""" + SELECT d_date_sk + FROM date_dim + WHERE CAST(d_date as date) IN (date '{q08_STARTDATE}', + date '{q08_ENDDATE}') + ORDER BY CAST(d_date as date) asc + """ + result_dates_sk_filter = c.sql(query_1).compute() + + # because `result_dates_sk_filter` has repetitive index + result_dates_sk_filter.index = list(range(0, result_dates_sk_filter.shape[0])) + q08_start_dt = result_dates_sk_filter['d_date_sk'][0] + q08_end_dt = result_dates_sk_filter['d_date_sk'][1] + + query_aux = """ + SELECT + wp_web_page_sk, + wp_type + FROM web_page + """ + web_page_df = c.sql(query_aux) + + # cast to minimum viable dtype + web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions( + lambda ser: ser.astype("category") + ) + + 
cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas() + REVIEW_CAT_CODE = cpu_categories.get_loc("review") + + web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes + + web_page_newcols = ["wp_web_page_sk", "wp_type_codes"] + web_page_df = web_page_df[web_page_newcols] + + web_page_df = web_page_df.persist() + wait(web_page_df) + c.create_table('web_page_2', web_page_df, persist=False) + + query_2 = f""" + SELECT + CAST(wcs_user_sk AS INTEGER) AS wcs_user_sk, + (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec, + wcs_sales_sk, + wp_type_codes + FROM web_clickstreams + INNER JOIN web_page_2 ON wcs_web_page_sk = wp_web_page_sk + WHERE wcs_user_sk IS NOT NULL + AND wcs_click_date_sk BETWEEN {q08_start_dt} AND {q08_end_dt} + --in the future we want to remove this ORDER BY + DISTRIBUTE BY wcs_user_sk + """ + merged_df = c.sql(query_2) + + c.drop_table("web_page_2") + del web_page_df + + merged_df = merged_df.shuffle(on=["wcs_user_sk"]) + merged_df["review_flag"] = merged_df.wp_type_codes == REVIEW_CAT_CODE + + prepped = merged_df.map_partitions( + prep_for_sessionization, review_cat_code=REVIEW_CAT_CODE + ) + + sessionized = prepped.map_partitions(get_sessions) + + unique_review_sales = sessionized.map_partitions( + get_unique_sales_keys_from_sessions, review_cat_code=REVIEW_CAT_CODE + ) + + unique_review_sales = unique_review_sales.to_frame() + + unique_review_sales = unique_review_sales.persist() + wait(unique_review_sales) + c.create_table("reviews", unique_review_sales, persist=False) + last_query = f""" + SELECT + CAST(review_total AS BIGINT) AS q08_review_sales_amount, + CAST(total - review_total AS BIGINT) AS no_q08_review_sales_amount + FROM + ( + SELECT + SUM(ws_net_paid) AS total, + SUM(CASE when wcs_sales_sk IS NULL THEN 0 ELSE 1 END * ws_net_paid) AS review_total + FROM web_sales + LEFT OUTER JOIN reviews ON ws_order_number = wcs_sales_sk + WHERE ws_sold_date_sk between {q08_start_dt} AND {q08_end_dt} + ) + """ 
+ result = c.sql(last_query) + + c.drop_table("reviews") + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q09/gpu_bdb_query_09.py b/gpu_bdb/queries/q09/gpu_bdb_query_09.py index 8c4bc9d8..0d44b61a 100755 --- a/gpu_bdb/queries/q09/gpu_bdb_query_09.py +++ b/gpu_bdb/queries/q09/gpu_bdb_query_09.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,86 +14,44 @@ # limitations under the License. # -from dask.distributed import Client -import sys +import cudf from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - ss_columns = [ - "ss_quantity", - "ss_sold_date_sk", - "ss_addr_sk", - "ss_store_sk", - "ss_cdemo_sk", - "ss_sales_price", - "ss_net_profit", - ] - - store_sales = table_reader.read("store_sales", relevant_cols=ss_columns) - - ca_columns = ["ca_address_sk", "ca_country", "ca_state"] - customer_address = table_reader.read("customer_address", relevant_cols=ca_columns) - - cd_columns = ["cd_demo_sk", "cd_marital_status", "cd_education_status"] - customer_demographics = table_reader.read( - "customer_demographics", relevant_cols=cd_columns - ) - - dd_columns = ["d_year", "d_date_sk"] - date_dim = table_reader.read("date_dim", relevant_cols=dd_columns) - - s_columns = ["s_store_sk"] - store = table_reader.read("store", relevant_cols=s_columns) - - return store_sales, customer_address, customer_demographics, date_dim, store +from 
bdb_tools.q09_utils import ( + q09_year, + q09_part1_ca_country, + q09_part1_ca_state_IN, + q09_part1_net_profit_min, + q09_part1_net_profit_max, + q09_part1_education_status, + q09_part1_marital_status, + q09_part1_sales_price_min, + q09_part1_sales_price_max, + q09_part2_ca_country, + q09_part2_ca_state_IN, + q09_part2_net_profit_min, + q09_part2_net_profit_max, + q09_part2_education_status, + q09_part2_marital_status, + q09_part2_sales_price_min, + q09_part2_sales_price_max, + q09_part3_ca_country, + q09_part3_ca_state_IN, + q09_part3_net_profit_min, + q09_part3_net_profit_max, + q09_part3_education_status, + q09_part3_marital_status, + q09_part3_sales_price_min, + q09_part3_sales_price_max, + read_tables +) def main(client, config): - import cudf - - # Conf variables - - q09_year = 2001 - - q09_part1_ca_country = "United States" - q09_part1_ca_state_IN = "KY", "GA", "NM" - q09_part1_net_profit_min = 0 - q09_part1_net_profit_max = 2000 - q09_part1_education_status = "4 yr Degree" - q09_part1_marital_status = "M" - q09_part1_sales_price_min = 100 - q09_part1_sales_price_max = 150 - - q09_part2_ca_country = "United States" - q09_part2_ca_state_IN = "MT", "OR", "IN" - q09_part2_net_profit_min = 150 - q09_part2_net_profit_max = 3000 - q09_part2_education_status = "4 yr Degree" - q09_part2_marital_status = "M" - q09_part2_sales_price_min = 50 - q09_part2_sales_price_max = 200 - - q09_part3_ca_country = "United States" - q09_part3_ca_state_IN = "WI", "MO", "WV" - q09_part3_net_profit_min = 50 - q09_part3_net_profit_max = 25000 - q09_part3_education_status = "4 yr Degree" - q09_part3_marital_status = "M" - q09_part3_sales_price_min = 150 - q09_part3_sales_price_max = 200 ( store_sales, @@ -208,8 +166,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git 
a/gpu_bdb/queries/q09/gpu_bdb_query_09_dask_sql.py b/gpu_bdb/queries/q09/gpu_bdb_query_09_dask_sql.py new file mode 100755 index 00000000..16e71c7c --- /dev/null +++ b/gpu_bdb/queries/q09/gpu_bdb_query_09_dask_sql.py @@ -0,0 +1,125 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q09_utils import ( + q09_year, + q09_part1_ca_country, + q09_part1_ca_state_IN, + q09_part1_net_profit_min, + q09_part1_net_profit_max, + q09_part1_education_status, + q09_part1_marital_status, + q09_part1_sales_price_min, + q09_part1_sales_price_max, + q09_part2_ca_country, + q09_part2_ca_state_IN, + q09_part2_net_profit_min, + q09_part2_net_profit_max, + q09_part2_education_status, + q09_part2_marital_status, + q09_part2_sales_price_min, + q09_part2_sales_price_max, + q09_part3_ca_country, + q09_part3_ca_state_IN, + q09_part3_net_profit_min, + q09_part3_net_profit_max, + q09_part3_education_status, + q09_part3_marital_status, + q09_part3_sales_price_min, + q09_part3_sales_price_max, + read_tables +) + + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = f""" + SELECT SUM(ss1.ss_quantity) + FROM store_sales ss1, + date_dim dd,customer_address ca1, + store s, + customer_demographics cd + -- select date range + 
WHERE ss1.ss_sold_date_sk = dd.d_date_sk + AND dd.d_year = {q09_year} + AND ss1.ss_addr_sk = ca1.ca_address_sk + AND s.s_store_sk = ss1.ss_store_sk + AND cd.cd_demo_sk = ss1.ss_cdemo_sk + AND + ( + ( + cd.cd_marital_status = '{q09_part1_marital_status}' + AND cd.cd_education_status = '{q09_part1_education_status}' + AND {q09_part1_sales_price_min} <= ss1.ss_sales_price + AND ss1.ss_sales_price <= {q09_part1_sales_price_max} + ) + OR + ( + cd.cd_marital_status = '{q09_part2_marital_status}' + AND cd.cd_education_status = '{q09_part2_education_status}' + AND {q09_part2_sales_price_min} <= ss1.ss_sales_price + AND ss1.ss_sales_price <= {q09_part2_sales_price_max} + ) + OR + ( + cd.cd_marital_status = '{q09_part3_marital_status}' + AND cd.cd_education_status = '{q09_part3_education_status}' + AND {q09_part3_sales_price_min} <= ss1.ss_sales_price + AND ss1.ss_sales_price <= {q09_part3_sales_price_max} + ) + ) + AND + ( + ( + ca1.ca_country = '{q09_part1_ca_country}' + AND ca1.ca_state IN {q09_part1_ca_state_IN} + AND {q09_part1_net_profit_min} <= ss1.ss_net_profit + AND ss1.ss_net_profit <= {q09_part1_net_profit_max} + ) + OR + ( + ca1.ca_country = '{q09_part2_ca_country}' + AND ca1.ca_state IN {q09_part2_ca_state_IN} + AND {q09_part2_net_profit_min} <= ss1.ss_net_profit + AND ss1.ss_net_profit <= {q09_part2_net_profit_max} + ) + OR + ( + ca1.ca_country = '{q09_part3_ca_country}' + AND ca1.ca_state IN {q09_part3_ca_state_IN} + AND {q09_part3_net_profit_min} <= ss1.ss_net_profit + AND ss1.ss_net_profit <= {q09_part3_net_profit_max} + ) + ) + """ + result = c.sql(query) + result.columns = ["sum(ss_quantity)"] + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q10/gpu_bdb_query_10.py b/gpu_bdb/queries/q10/gpu_bdb_query_10.py index cb24ef88..4cc3e833 100755 --- 
a/gpu_bdb/queries/q10/gpu_bdb_query_10.py +++ b/gpu_bdb/queries/q10/gpu_bdb_query_10.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,47 +14,25 @@ # limitations under the License. # -import sys import os +import cudf +import dask_cudf + from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) from bdb_tools.text import create_sentences_from_reviews, create_words_from_sentences +from bdb_tools.q10_utils import ( + eol_char, + read_tables +) - -import rmm -import cupy as cp -import distributed - -from bdb_tools.readers import build_reader -from dask.distributed import Client, wait - - -# -------- Q10 ----------- -eol_char = "è" - - -def read_tables(config): - - ### splitting by row groups for better parallelism - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=True, - ) - product_reviews_cols = ["pr_item_sk", "pr_review_content", "pr_review_sk"] - - product_reviews_df = table_reader.read( - "product_reviews", relevant_cols=product_reviews_cols, - ) - return product_reviews_df - +from dask.distributed import wait def load_sentiment_words(filename, sentiment): - import cudf with open(filename) as fh: sentiment_words = list(map(str.strip, fh.readlines())) @@ -67,8 +45,6 @@ def load_sentiment_words(filename, sentiment): def main(client, config): - import cudf - import dask_cudf product_reviews_df = benchmark( read_tables, @@ -150,8 +126,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q10/gpu_bdb_query_10_dask_sql.py b/gpu_bdb/queries/q10/gpu_bdb_query_10_dask_sql.py new file mode 100755 index 
00000000..64dba763 --- /dev/null +++ b/gpu_bdb/queries/q10/gpu_bdb_query_10_dask_sql.py @@ -0,0 +1,146 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os + +import dask_cudf + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.text import ( + create_sentences_from_reviews, + create_words_from_sentences +) + +from bdb_tools.q10_utils import ( + eol_char, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = """ + SELECT pr_item_sk, + pr_review_content, + pr_review_sk + FROM product_reviews + where pr_review_content IS NOT NULL + ORDER BY pr_item_sk, pr_review_content, pr_review_sk + """ + product_reviews_df = c.sql(query_1) + + product_reviews_df[ + "pr_review_content" + ] = product_reviews_df.pr_review_content.str.lower() + product_reviews_df[ + "pr_review_content" + ] = product_reviews_df.pr_review_content.str.replace( + [".", "?", "!"], [eol_char], regex=False + ) + + sentences = product_reviews_df.map_partitions(create_sentences_from_reviews) + + product_reviews_df = product_reviews_df[["pr_item_sk", "pr_review_sk"]] + product_reviews_df["pr_review_sk"] = product_reviews_df["pr_review_sk"].astype("int32") + + # need the global position in the sentence tokenized df + 
sentences["x"] = 1 + sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum() + del sentences["x"] + + word_df = sentences.map_partitions( + create_words_from_sentences, + global_position_column="sentence_tokenized_global_pos", + ) + + product_reviews_df = product_reviews_df.persist() + wait(product_reviews_df) + c.create_table('product_reviews_df', product_reviews_df, persist=False) + + sentences = sentences.persist() + wait(sentences) + c.create_table('sentences', sentences, persist=False) + + # These files come from the official TPCx-BB kit + # We extracted them from bigbenchqueriesmr.jar + # Need to pass the absolute path for these txt files + sentiment_dir = os.path.join(config["data_dir"], "sentiment_files") + ns_df = dask_cudf.read_csv(os.path.join(sentiment_dir, "negativeSentiment.txt"), names=["sentiment_word"], persist=False) + c.create_table('negative_sentiment', ns_df, persist=False) + ps_df = dask_cudf.read_csv(os.path.join(sentiment_dir, "positiveSentiment.txt"), names=["sentiment_word"], persist=False) + c.create_table('positive_sentiment', ps_df, persist=False) + + word_df = word_df.persist() + wait(word_df) + c.create_table('word_df', word_df, persist=False) + + query = ''' + SELECT pr_item_sk as item_sk, + sentence as review_sentence, + sentiment, + sentiment_word FROM + ( + SELECT review_idx_global_pos, + sentiment_word, + sentiment, + sentence FROM + ( + WITH sent_df AS + ( + (SELECT sentiment_word, 'POS' as sentiment + FROM positive_sentiment + GROUP BY sentiment_word) + UNION ALL + (SELECT sentiment_word, 'NEG' as sentiment + FROM negative_sentiment + GROUP BY sentiment_word) + ) + SELECT * FROM word_df + INNER JOIN sent_df + ON word_df.word = sent_df.sentiment_word + ) word_sentence_sentiment + LEFT JOIN sentences + ON word_sentence_sentiment.sentence_idx_global_pos = sentences.sentence_tokenized_global_pos + ) temp + INNER JOIN product_reviews_df + ON temp.review_idx_global_pos = product_reviews_df.pr_review_sk + ORDER BY item_sk, 
review_sentence, sentiment, sentiment_word + ''' + result = c.sql(query) + + c.drop_table("product_reviews_df") + del product_reviews_df + c.drop_table("sentences") + del sentences + c.drop_table("word_df") + del word_df + + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q11/gpu_bdb_query_11.py b/gpu_bdb/queries/q11/gpu_bdb_query_11.py index 6ff0b5b3..224daf40 100755 --- a/gpu_bdb/queries/q11/gpu_bdb_query_11.py +++ b/gpu_bdb/queries/q11/gpu_bdb_query_11.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ # limitations under the License. # -import sys - +import cudf from bdb_tools.utils import ( benchmark, @@ -23,46 +22,15 @@ run_query, convert_datestring_to_days, ) -from bdb_tools.readers import build_reader -from numba import cuda -import numpy as np +from bdb_tools.q11_utils import read_tables +import numpy as np q11_start_date = "2003-01-02" q11_end_date = "2003-02-02" - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - product_review_cols = [ - "pr_review_rating", - "pr_item_sk", - ] - web_sales_cols = [ - "ws_sold_date_sk", - "ws_net_paid", - "ws_item_sk", - ] - date_cols = ["d_date_sk", "d_date"] - - pr_df = table_reader.read("product_reviews", relevant_cols=product_review_cols) - # we only read int columns here so it should scale up to sf-10k as just 26M rows - pr_df = pr_df.repartition(npartitions=1) - - ws_df = table_reader.read("web_sales", relevant_cols=web_sales_cols) - date_df = table_reader.read("date_dim", 
relevant_cols=date_cols) - - return pr_df, ws_df, date_df - - def main(client, config): - import cudf pr_df, ws_df, date_df = benchmark( read_tables, @@ -125,8 +93,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q11/gpu_bdb_query_11_dask_sql.py b/gpu_bdb/queries/q11/gpu_bdb_query_11_dask_sql.py new file mode 100755 index 00000000..b5d41715 --- /dev/null +++ b/gpu_bdb/queries/q11/gpu_bdb_query_11_dask_sql.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster +import cudf + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q11_utils import read_tables + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + WITH p AS + ( + SELECT + pr_item_sk, + count(pr_item_sk) AS r_count, + AVG( CAST(pr_review_rating AS DOUBLE) ) avg_rating + FROM product_reviews + WHERE pr_item_sk IS NOT NULL + GROUP BY pr_item_sk + ), s AS + ( + SELECT + ws_item_sk + FROM web_sales ws + INNER JOIN date_dim d ON ws.ws_sold_date_sk = d.d_date_sk + WHERE ws_item_sk IS NOT null + AND CAST(d.d_date AS DATE) >= DATE '2003-01-02' + AND CAST(d.d_date AS DATE) <= DATE '2003-02-02' + GROUP BY ws_item_sk + ) + SELECT p.r_count AS x, + p.avg_rating AS y + FROM s INNER JOIN p ON p.pr_item_sk = s.ws_item_sk + """ + + result = c.sql(query) + sales_corr = result["x"].corr(result["y"]).compute() + result_df = cudf.DataFrame([sales_corr]) + result_df.columns = ["corr(CAST(reviews_count AS DOUBLE), avg_rating)"] + return result_df + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q12/gpu_bdb_query_12.py b/gpu_bdb/queries/q12/gpu_bdb_query_12.py index e912c6f3..39d05a42 100755 --- a/gpu_bdb/queries/q12/gpu_bdb_query_12.py +++ b/gpu_bdb/queries/q12/gpu_bdb_query_12.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,16 +14,18 @@ # limitations under the License. 
# -import sys import os import glob +import cudf +import dask_cudf + from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader +from bdb_tools.q12_utils import read_tables from distributed import wait import numpy as np @@ -36,17 +38,11 @@ ### These parameters are not used -# q12_startDate='2001-09-02' -# q12_endDate1='2001-10-02' -# q12_endDate2='2001-12-02' q12_i_category_IN = ["Books", "Electronics"] ### below was hard coded in the orignal query q12_store_sale_sk_start_date = 37134 -item_cols = ["i_item_sk", "i_category"] -store_sales_cols = ["ss_item_sk", "ss_sold_date_sk", "ss_customer_sk"] - ### Util Functions def string_filter(df, col_name, col_values): """ @@ -63,19 +59,6 @@ def string_filter(df, col_name, col_values): return df[bool_flag].reset_index(drop=True) -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - item_df = table_reader.read("item", relevant_cols=item_cols) - store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - - return item_df, store_sales_df - - def filter_wcs_table(web_clickstreams_fn, filtered_item_df): """ Filter web clickstreams table @@ -90,7 +73,6 @@ def filter_wcs_table(web_clickstreams_fn, filtered_item_df): ## AND wcs_user_sk IS NOT NULL ### AND wcs_sales_sk IS NULL --only views, not purchases """ - import cudf web_clickstreams_cols = [ "wcs_user_sk", @@ -150,7 +132,6 @@ def filter_ss_table(store_sales_df, filtered_item_df): def main(client, config): - import cudf, dask_cudf item_df, store_sales_df = benchmark( read_tables, @@ -242,8 +223,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q12/gpu_bdb_query_12_dask_sql.py 
b/gpu_bdb/queries/q12/gpu_bdb_query_12_dask_sql.py new file mode 100755 index 00000000..2656553a --- /dev/null +++ b/gpu_bdb/queries/q12/gpu_bdb_query_12_dask_sql.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q12_utils import read_tables + +q12_i_category_IN = "'Books', 'Electronics'" + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = f""" + SELECT DISTINCT wcs_user_sk + FROM + ( + SELECT DISTINCT + wcs_user_sk, + wcs_click_date_sk + FROM web_clickstreams, item + WHERE wcs_click_date_sk BETWEEN 37134 AND 37164 + AND i_category IN ({q12_i_category_IN}) + AND wcs_item_sk = i_item_sk + AND wcs_user_sk IS NOT NULL + AND wcs_sales_sk IS NULL + ) webInRange, + ( + SELECT DISTINCT + ss_customer_sk, + ss_sold_date_sk + FROM store_sales, item + WHERE ss_sold_date_sk BETWEEN 37134 AND 37224 + AND i_category IN ({q12_i_category_IN}) -- filter given category + AND ss_item_sk = i_item_sk + AND ss_customer_sk IS NOT NULL + ) storeInRange + WHERE wcs_user_sk = ss_customer_sk + AND wcs_click_date_sk < ss_sold_date_sk + ORDER BY wcs_user_sk + """ + result = c.sql(query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = 
attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q13/gpu_bdb_query_13.py b/gpu_bdb/queries/q13/gpu_bdb_query_13.py index a61dbfec..79afefe9 100755 --- a/gpu_bdb/queries/q13/gpu_bdb_query_13.py +++ b/gpu_bdb/queries/q13/gpu_bdb_query_13.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,14 +14,13 @@ # limitations under the License. # -import sys - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader +from bdb_tools.q13_utils import read_tables + from distributed import wait @@ -46,28 +45,6 @@ def get_sales_ratio(df): return df -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - date_cols = ["d_date_sk", "d_year"] - date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) - - customer_cols = ["c_customer_sk", "c_customer_id", "c_first_name", "c_last_name"] - customer_df = table_reader.read("customer", relevant_cols=customer_cols) - - s_sales_cols = ["ss_sold_date_sk", "ss_customer_sk", "ss_net_paid"] - s_sales_df = table_reader.read("store_sales", relevant_cols=s_sales_cols) - - w_sales_cols = ["ws_sold_date_sk", "ws_bill_customer_sk", "ws_net_paid"] - web_sales_df = table_reader.read("web_sales", relevant_cols=w_sales_cols) - - return date_dim_df, customer_df, s_sales_df, web_sales_df - - def main(client, config): date_dim_df, customer_df, s_sales_df, web_sales_df = benchmark( read_tables, @@ -212,8 +189,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = 
gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q13/gpu_bdb_query_13_dask_sql.py b/gpu_bdb/queries/q13/gpu_bdb_query_13_dask_sql.py new file mode 100644 index 00000000..19c501f9 --- /dev/null +++ b/gpu_bdb/queries/q13/gpu_bdb_query_13_dask_sql.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q13_utils import read_tables + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = """ + SELECT + ss.ss_customer_sk AS customer_sk, + sum( case when (d_year = 2001) THEN ss_net_paid ELSE 0.0 END) first_year_total, + sum( case when (d_year = 2002) THEN ss_net_paid ELSE 0.0 END) second_year_total + FROM store_sales ss + JOIN + ( + SELECT d_date_sk, d_year + FROM date_dim d + WHERE d.d_year in (2001, 2002) + ) dd on ( ss.ss_sold_date_sk = dd.d_date_sk ) + GROUP BY ss.ss_customer_sk + HAVING sum( case when (d_year = 2001) THEN ss_net_paid ELSE 0.0 END) > 0.0 + """ + temp_table1 = c.sql(query_1) + + temp_table1 = temp_table1.persist() + wait(temp_table1) + c.create_table("temp_table1", temp_table1, persist=False) + query_2 = """ + SELECT + ws.ws_bill_customer_sk AS customer_sk, + sum( case when (d_year = 2001) 
THEN ws_net_paid ELSE 0.0 END) first_year_total, + sum( case when (d_year = 2002) THEN ws_net_paid ELSE 0.0 END) second_year_total + FROM web_sales ws + JOIN + ( + SELECT d_date_sk, d_year + FROM date_dim d + WHERE d.d_year in (2001, 2002) + ) dd ON ( ws.ws_sold_date_sk = dd.d_date_sk ) + GROUP BY ws.ws_bill_customer_sk + HAVING sum( case when (d_year = 2001) THEN ws_net_paid ELSE 0.0 END) > 0.0 + """ + temp_table2 = c.sql(query_2) + + temp_table2 = temp_table2.persist() + wait(temp_table2) + c.create_table("temp_table2", temp_table2, persist=False) + query = """ + SELECT + CAST(c_customer_sk AS BIGINT) as c_customer_sk, + c_first_name, + c_last_name, + (store.second_year_total / store.first_year_total) AS storeSalesIncreaseRatio, + (web.second_year_total / web.first_year_total) AS webSalesIncreaseRatio + FROM temp_table1 store, + temp_table2 web, + customer c + WHERE store.customer_sk = web.customer_sk + AND web.customer_sk = c_customer_sk + AND (web.second_year_total / web.first_year_total) > (store.second_year_total / store.first_year_total) + ORDER BY webSalesIncreaseRatio DESC, + c_customer_sk, + c_first_name, + c_last_name + LIMIT 100 + """ + result = c.sql(query) + + c.drop_table("temp_table1") + c.drop_table("temp_table2") + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q14/gpu_bdb_query_14.py b/gpu_bdb/queries/q14/gpu_bdb_query_14.py index 52cbc09f..2549910b 100755 --- a/gpu_bdb/queries/q14/gpu_bdb_query_14.py +++ b/gpu_bdb/queries/q14/gpu_bdb_query_14.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,46 +14,18 @@ # limitations under the License. 
# -from dask.distributed import Client - import numpy as np -import sys +import cudf from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - ws_columns = ["ws_ship_hdemo_sk", "ws_web_page_sk", "ws_sold_time_sk"] - web_sales = table_reader.read("web_sales", relevant_cols=ws_columns) - - hd_columns = ["hd_demo_sk", "hd_dep_count"] - household_demographics = table_reader.read( - "household_demographics", relevant_cols=hd_columns - ) - - wp_columns = ["wp_web_page_sk", "wp_char_count"] - web_page = table_reader.read("web_page", relevant_cols=wp_columns) - - td_columns = ["t_time_sk", "t_hour"] - time_dim = table_reader.read("time_dim", relevant_cols=td_columns) - - return web_sales, household_demographics, web_page, time_dim - +from bdb_tools.q14_utils import read_tables def main(client, config): - import cudf q14_dependents = 5 q14_morning_startHour = 7 @@ -140,8 +112,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q14/gpu_bdb_query_14_dask_sql.py b/gpu_bdb/queries/q14/gpu_bdb_query_14_dask_sql.py new file mode 100755 index 00000000..ca6850f2 --- /dev/null +++ b/gpu_bdb/queries/q14/gpu_bdb_query_14_dask_sql.py @@ -0,0 +1,56 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q14_utils import read_tables + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio + FROM + ( + SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc + FROM + ( + SELECT + CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1, + CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1 + FROM web_sales ws + JOIN household_demographics hd ON (hd.hd_demo_sk = ws.ws_ship_hdemo_sk and hd.hd_dep_count = 5) + JOIN web_page wp ON (wp.wp_web_page_sk = ws.ws_web_page_sk and wp.wp_char_count BETWEEN 5000 AND 6000) + JOIN time_dim td ON (td.t_time_sk = ws.ws_sold_time_sk and td.t_hour IN (7,8,19,20)) + GROUP BY t_hour + ) cnt_am_pm + ) sum_am_pm + """ + + result = c.sql(query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q15/gpu_bdb_query_15.py b/gpu_bdb/queries/q15/gpu_bdb_query_15.py index 7699d087..e8e24cd4 100755 --- a/gpu_bdb/queries/q15/gpu_bdb_query_15.py +++ b/gpu_bdb/queries/q15/gpu_bdb_query_15.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,44 +14,21 @@ # limitations under the License. # -import sys -from collections import OrderedDict - - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, convert_datestring_to_days, ) -from bdb_tools.readers import build_reader +from bdb_tools.q15_utils import ( + q15_startDate, + q15_endDate, + q15_store_sk, + store_sales_cols, + read_tables +) import datetime -import numpy as np - - -q15_startDate = "2001-09-02" -q15_endDate = "2002-09-02" -q15_store_sk = "10" - -store_sales_cols = ["ss_sold_date_sk", "ss_net_paid", "ss_store_sk", "ss_item_sk"] -date_cols = ["d_date", "d_date_sk"] -item_cols = ["i_item_sk", "i_category_id"] - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) - item_df = table_reader.read("item", relevant_cols=item_cols) - - return store_sales_df, date_dim_df, item_df - def main(client, config): @@ -166,8 +143,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q15/gpu_bdb_query_15_dask_sql.py b/gpu_bdb/queries/q15/gpu_bdb_query_15_dask_sql.py new file mode 100755 index 00000000..c04ea8c2 --- /dev/null +++ b/gpu_bdb/queries/q15/gpu_bdb_query_15_dask_sql.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q15_utils import ( + q15_startDate, + q15_endDate, + q15_store_sk, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = f""" + SELECT * + FROM + ( + SELECT + cat, + ( (count(x) * SUM(xy) - SUM(x) * SUM(y)) / (count(x) * SUM(xx) - SUM(x) * SUM(x)) ) AS slope, + (SUM(y) - ((count(x) * SUM(xy) - SUM(x) * SUM(y)) / (count(x) * SUM(xx) - SUM(x)*SUM(x)) ) * SUM(x)) / count(x) AS intercept + FROM + ( + SELECT + i.i_category_id AS cat, + s.ss_sold_date_sk AS x, + CAST(SUM(s.ss_net_paid) AS DOUBLE) AS y, + CAST(s.ss_sold_date_sk * SUM(s.ss_net_paid) AS DOUBLE) AS xy, + CAST(s.ss_sold_date_sk * s.ss_sold_date_sk AS DOUBLE) AS xx + FROM store_sales s + INNER JOIN item i ON s.ss_item_sk = i.i_item_sk + INNER JOIN date_dim d ON s.ss_sold_date_sk = d.d_date_sk + WHERE s.ss_store_sk = {q15_store_sk} + AND i.i_category_id IS NOT NULL + AND CAST(d.d_date AS DATE) >= DATE '{q15_startDate}' + AND CAST(d.d_date AS DATE) <= DATE '{q15_endDate}' + GROUP BY i.i_category_id, s.ss_sold_date_sk + ) temp + GROUP BY cat + ) regression + WHERE slope <= 0.0 + ORDER BY cat + """ + result = c.sql(query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git 
a/gpu_bdb/queries/q16/gpu_bdb_query_16.py b/gpu_bdb/queries/q16/gpu_bdb_query_16.py index e093427d..7f2747b4 100755 --- a/gpu_bdb/queries/q16/gpu_bdb_query_16.py +++ b/gpu_bdb/queries/q16/gpu_bdb_query_16.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ # limitations under the License. # -import sys - +import cudf from bdb_tools.utils import ( benchmark, @@ -24,7 +23,8 @@ convert_datestring_to_days, ) from bdb_tools.merge_util import hash_merge -from bdb_tools.readers import build_reader +from bdb_tools.q16_utils import read_tables + from dask.distributed import wait import numpy as np @@ -33,19 +33,6 @@ ### conf q16_date = "2001-03-16" -websale_cols = [ - "ws_order_number", - "ws_item_sk", - "ws_warehouse_sk", - "ws_sold_date_sk", - "ws_sales_price", -] -web_returns_cols = ["wr_order_number", "wr_item_sk", "wr_refunded_cash"] -date_cols = ["d_date", "d_date_sk"] -item_cols = ["i_item_sk", "i_item_id"] -warehouse_cols = ["w_warehouse_sk", "w_state"] - - # INSERT INTO TABLE ${hiveconf:RESULT_TABLE} # SELECT w_state, i_item_id, # SUM( @@ -72,23 +59,7 @@ def get_before_after_sales(df, q16_timestamp): return df -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - web_sales_df = table_reader.read("web_sales", relevant_cols=websale_cols) - web_returns_df = table_reader.read("web_returns", relevant_cols=web_returns_cols) - date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) - item_df = table_reader.read("item", relevant_cols=item_cols) - warehouse_df = table_reader.read("warehouse", relevant_cols=warehouse_cols) - return web_sales_df, web_returns_df, date_dim_df, item_df, warehouse_df - - def main(client, config): 
- import cudf web_sales_df, web_returns_df, date_dim_df, item_df, warehouse_df = benchmark( read_tables, @@ -264,8 +235,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q16/gpu_bdb_query_16_dask_sql.py b/gpu_bdb/queries/q16/gpu_bdb_query_16_dask_sql.py new file mode 100755 index 00000000..8ddb145e --- /dev/null +++ b/gpu_bdb/queries/q16/gpu_bdb_query_16_dask_sql.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +import datetime +from datetime import timedelta +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q16_utils import read_tables + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + date = datetime.datetime(2001, 3, 16) + start = (date + timedelta(days=-30)).strftime("%Y-%m-%d") + end = (date + timedelta(days=30)).strftime("%Y-%m-%d") + mid = date.strftime("%Y-%m-%d") + + date_query = f""" + SELECT d_date_sk + FROM date_dim + WHERE CAST(d_date as DATE) IN (DATE '{start}', DATE '{mid}', DATE '{end}') + ORDER BY CAST(d_date as date) ASC + """ + + dates = c.sql(date_query) + + cpu_dates = dates["d_date_sk"].compute().to_pandas() + cpu_dates.index = list(range(0, cpu_dates.shape[0])) + + last_query = f""" + SELECT w_state, i_item_id, + SUM + ( + CASE WHEN ws_sold_date_sk < {str(cpu_dates[1])} + THEN ws_sales_price - COALESCE(wr_refunded_cash,0) + ELSE 0.0 END + ) AS sales_before, + SUM + ( + CASE WHEN ws_sold_date_sk >= {str(cpu_dates[1])} + THEN ws_sales_price - COALESCE(wr_refunded_cash,0) + ELSE 0.0 END + ) AS sales_after + FROM + ( + SELECT ws_item_sk, + ws_warehouse_sk, + ws_sold_date_sk, + ws_sales_price, + wr_refunded_cash + FROM web_sales ws + LEFT OUTER JOIN web_returns wr ON + ( + ws.ws_order_number = wr.wr_order_number + AND ws.ws_item_sk = wr.wr_item_sk + ) + WHERE ws_sold_date_sk BETWEEN {str(cpu_dates[0])} + AND {str(cpu_dates[2])} + ) a1 + JOIN item i ON a1.ws_item_sk = i.i_item_sk + JOIN warehouse w ON a1.ws_warehouse_sk = w.w_warehouse_sk + GROUP BY w_state,i_item_id + ORDER BY w_state,i_item_id + LIMIT 100 + """ + + result = c.sql(last_query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git 
a/gpu_bdb/queries/q17/gpu_bdb_query_17.py b/gpu_bdb/queries/q17/gpu_bdb_query_17.py index 8f36a11e..c7113e48 100755 --- a/gpu_bdb/queries/q17/gpu_bdb_query_17.py +++ b/gpu_bdb/queries/q17/gpu_bdb_query_17.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,72 +14,25 @@ # limitations under the License. # -import sys -from collections import OrderedDict +import cudf from bdb_tools.utils import ( benchmark, gpubdb_argparser, left_semi_join, run_query, + ) +from bdb_tools.q17_utils import ( + q17_gmt_offset, + q17_year, + q17_month, + store_sales_cols, + read_tables ) -from bdb_tools.readers import build_reader - -### conf -q17_gmt_offset = -5 -# --store_sales date -q17_year = 2001 -q17_month = 12 q17_i_category_IN = "Books", "Music" - -store_sales_cols = [ - "ss_ext_sales_price", - "ss_sold_date_sk", - "ss_store_sk", - "ss_customer_sk", - "ss_promo_sk", - "ss_item_sk", -] -item_cols = ["i_category", "i_item_sk"] -customer_cols = ["c_customer_sk", "c_current_addr_sk"] -store_cols = ["s_gmt_offset", "s_store_sk"] -date_cols = ["d_date_sk", "d_year", "d_moy"] -customer_address_cols = ["ca_address_sk", "ca_gmt_offset"] -promotion_cols = ["p_channel_email", "p_channel_dmail", "p_channel_tv", "p_promo_sk"] - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - item_df = table_reader.read("item", relevant_cols=item_cols) - customer_df = table_reader.read("customer", relevant_cols=customer_cols) - store_df = table_reader.read("store", relevant_cols=store_cols) - date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) - customer_address_df = 
table_reader.read( - "customer_address", relevant_cols=customer_address_cols - ) - promotion_df = table_reader.read("promotion", relevant_cols=promotion_cols) - - return ( - store_sales_df, - item_df, - customer_df, - store_df, - date_dim_df, - customer_address_df, - promotion_df, - ) - - def main(client, config): - import cudf ( store_sales_df, @@ -214,8 +167,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q17/gpu_bdb_query_17_dask_sql.py b/gpu_bdb/queries/q17/gpu_bdb_query_17_dask_sql.py new file mode 100755 index 00000000..d65181e7 --- /dev/null +++ b/gpu_bdb/queries/q17/gpu_bdb_query_17_dask_sql.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q17_utils import ( + q17_gmt_offset, + q17_year, + q17_month, + read_tables +) + +q17_i_category_IN = "'Books', 'Music'" + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_date = f""" + select min(d_date_sk) as min_d_date_sk, + max(d_date_sk) as max_d_date_sk + from date_dim + where d_year = {q17_year} + and d_moy = {q17_month} + """ + dates_result = c.sql(query_date).compute() + + min_date_sk_val = dates_result["min_d_date_sk"][0] + max_date_sk_val = dates_result["max_d_date_sk"][0] + + query = f""" + SELECT sum(promotional) as promotional, + sum(total) as total, + CASE WHEN sum(total) > 0.0 THEN (100.0 * sum(promotional)) / sum(total) + ELSE 0.0 END as promo_percent + FROM + ( + SELECT p_channel_email, + p_channel_dmail, + p_channel_tv, + SUM( CAST(ss_ext_sales_price AS DOUBLE) ) total, + CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y') + THEN SUM(CAST(ss_ext_sales_price AS DOUBLE)) ELSE 0 END as promotional + FROM store_sales ss + INNER JOIN promotion p ON ss.ss_promo_sk = p.p_promo_sk + inner join item i on ss.ss_item_sk = i.i_item_sk + inner join store s on ss.ss_store_sk = s.s_store_sk + inner join customer c on c.c_customer_sk = ss.ss_customer_sk + inner join customer_address ca + on c.c_current_addr_sk = ca.ca_address_sk + WHERE i.i_category IN ({q17_i_category_IN}) + AND s.s_gmt_offset = {q17_gmt_offset} + AND ca.ca_gmt_offset = {q17_gmt_offset} + AND ss.ss_sold_date_sk >= {min_date_sk_val} + AND ss.ss_sold_date_sk <= {max_date_sk_val} + GROUP BY p_channel_email, p_channel_dmail, p_channel_tv + ) sum_promotional + -- we don't need a 'ON' join condition. result is just two numbers. 
+ """ + result = c.sql(query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q18/gpu_bdb_query_18.py b/gpu_bdb/queries/q18/gpu_bdb_query_18.py index 899d1c86..f8c260a1 100755 --- a/gpu_bdb/queries/q18/gpu_bdb_query_18.py +++ b/gpu_bdb/queries/q18/gpu_bdb_query_18.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ # limitations under the License. # -import sys import os -from collections import OrderedDict +import cudf +import dask_cudf from bdb_tools.utils import ( benchmark, @@ -25,136 +25,24 @@ left_semi_join, run_query, ) - -from bdb_tools.readers import build_reader from bdb_tools.text import ( create_sentences_from_reviews, create_words_from_sentences, ) +from bdb_tools.q18_utils import ( + find_relevant_reviews, + q18_startDate, + q18_endDate, + EOL_CHAR, + read_tables +) + import numpy as np -import cupy as cp from distributed import wait - -# -------- Q18 ----------- -# -- store_sales date range -q18_startDate = "2001-05-02" -# --+90days -q18_endDate = "2001-09-02" TEMP_TABLE1 = "TEMP_TABLE1" -EOL_CHAR = "è" - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], basepath=config["data_dir"], - ) - - store_sales_cols = [ - "ss_store_sk", - "ss_sold_date_sk", - "ss_net_paid", - ] - date_cols = ["d_date_sk", "d_date"] - store_cols = ["s_store_sk", "s_store_name"] - - store_sales = table_reader.read("store_sales", relevant_cols=store_sales_cols) - date_dim = table_reader.read("date_dim", relevant_cols=date_cols) - store = table_reader.read("store", relevant_cols=store_cols) - - ### splitting by row groups 
for better parallelism - pr_table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=True, - ) - - product_reviews_cols = ["pr_review_date", "pr_review_content", "pr_review_sk"] - product_reviews = pr_table_reader.read( - "product_reviews", relevant_cols=product_reviews_cols, - ) - - return store_sales, date_dim, store, product_reviews - - -def create_found_reshaped_with_global_pos(found, targets): - """Given the dataframe created by mapping find_targets_in_reviews, - create a new dataframe in which the nonzero values in each row are exploded - to get their own row. Each row will contain the word, its mapping in the column order, - and the pr_review_sk for the review from which it came. - - Having these as two separate functions makes managing dask metadata easier. - """ - import cudf - - target_df = cudf.DataFrame({"word": targets}).reset_index(drop=False) - target_df.columns = ["word_mapping", "word"] - - df_clean = found.drop(["pr_review_sk"], axis=1) - - row_idxs, col_idxs = df_clean.values.nonzero() - - found_reshaped = cudf.DataFrame( - {"word_mapping": col_idxs, "pr_review_sk": found["pr_review_sk"].iloc[row_idxs]} - ) - found_reshaped = found_reshaped.merge(target_df, on="word_mapping", how="inner")[ - ["word", "pr_review_sk"] - ] - return found_reshaped - - -def find_targets_in_reviews_helper(ddf, targets, str_col_name="pr_review_content"): - """returns a N x K matrix, where N is the number of rows in ddf that - contain one of the target words and K is the number of words in targets. - - If a target word is found in a review, the value in that row, column - is non-zero. - - At the end, any row with non-zero values is returned. 
- - """ - import cudf - from cudf._lib.strings import find_multiple - - lowered = ddf[str_col_name].str.lower() - - ## TODO: Do the replace/any in cupy land before going to cuDF - resdf = cudf.DataFrame( - cp.asarray( - find_multiple.find_multiple(lowered._column, targets._column) - ).reshape(-1, len(targets)) - ) - - resdf = resdf.replace([0, -1], [1, 0]) - found_mask = resdf.any(axis=1) - resdf["pr_review_sk"] = ddf["pr_review_sk"] - found = resdf.loc[found_mask] - return create_found_reshaped_with_global_pos(found, targets) - - -def find_relevant_reviews(df, targets, str_col_name="pr_review_content"): - """ - This function finds the reviews containg target stores and returns the - relevant reviews - """ - import cudf - - targets = cudf.Series(targets) - targets_lower = targets.str.lower() - reviews_found = find_targets_in_reviews_helper(df, targets_lower)[ - ["word", "pr_review_sk"] - ] - - combined = reviews_found.merge( - df[["pr_review_date", "pr_review_sk"]], how="inner", on=["pr_review_sk"] - ) - - return combined - def main(client, config): - import cudf - import dask_cudf store_sales, date_dim, store, product_reviews = benchmark( read_tables, @@ -236,7 +124,6 @@ def main(client, config): .to_arrow() .to_pylist() ) - n_targets = len(targets) no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True) no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32") @@ -337,8 +224,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q18/gpu_bdb_query_18_dask_sql.py b/gpu_bdb/queries/q18/gpu_bdb_query_18_dask_sql.py new file mode 100755 index 00000000..4a1eba70 --- /dev/null +++ b/gpu_bdb/queries/q18/gpu_bdb_query_18_dask_sql.py @@ -0,0 +1,239 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os + +from bdb_tools.cluster_startup import attach_to_cluster +import numpy as np + +import dask_cudf + +from bdb_tools.text import create_sentences_from_reviews, create_words_from_sentences + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q18_utils import ( + find_relevant_reviews, + q18_startDate, + q18_endDate, + EOL_CHAR, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = f""" + WITH temp_table1 AS + ( + SELECT CAST(s.s_store_sk AS INTEGER) AS s_store_sk, + s.s_store_name , + CAST(s.s_store_sk AS VARCHAR) || '_' || s.s_store_name + AS store_ID + FROM store s, + ( + SELECT temp.ss_store_sk, + ((count(temp.x) * SUM(temp.xy) - SUM(temp.x) * SUM(temp.y)) + / (count(temp.x) * SUM(temp.xx) - SUM(temp.x) * SUM(temp.x)) + ) AS slope + FROM + ( + SELECT + s.ss_store_sk, + s.ss_sold_date_sk AS x, + CAST( SUM(s.ss_net_paid) AS DOUBLE) AS y, + s.ss_sold_date_sk * SUM(s.ss_net_paid) AS xy, + s.ss_sold_date_sk * s.ss_sold_date_sk AS xx + FROM store_sales s + WHERE EXISTS + ( + SELECT * -- d_date_sk + FROM date_dim d + WHERE s.ss_sold_date_sk = d.d_date_sk + AND CAST(d.d_date AS DATE) >= DATE '{q18_startDate}' + AND CAST(d.d_date AS DATE) <= DATE '{q18_endDate}' + ) + GROUP BY s.ss_store_sk, s.ss_sold_date_sk + ) temp + GROUP BY 
temp.ss_store_sk + ) regression_analysis + WHERE slope <= 0 --flat or declining sales + AND s.s_store_sk = regression_analysis.ss_store_sk + ) + SELECT * FROM temp_table1 + """ + stores_with_regression = c.sql(query_1) + + query_2 = """ + SELECT pr_review_date, + pr_review_content, + CAST(pr_review_sk AS INTEGER) AS pr_review_sk + FROM product_reviews + WHERE pr_review_content IS NOT NULL + ORDER BY pr_review_date, pr_review_content, pr_review_sk + """ + no_nulls = c.sql(query_2) + + targets = ( + stores_with_regression.s_store_name.str.lower() + .unique() + .compute() + .to_arrow() + .to_pylist() + ) + + # perssiting because no_nulls is used twice + no_nulls = no_nulls.persist() + + import cudf + + temp_table2_meta_empty_df = cudf.DataFrame( + { + "word": ["a"], + "pr_review_sk": np.ones(1, dtype=np.int64), + "pr_review_date": ["a"], + } + ).head(0) + + # get relevant reviews + combined = no_nulls.map_partitions( + find_relevant_reviews, targets, meta=temp_table2_meta_empty_df, + ) + + no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace( + [". ", "? ", "! 
"], [EOL_CHAR], regex=False + ) + + stores_with_regression["store_ID"] = stores_with_regression.s_store_sk.astype( + "str" + ).str.cat(stores_with_regression.s_store_name, sep="_") + + stores_with_regression[ + "s_store_name" + ] = stores_with_regression.s_store_name.str.lower() + + stores_with_regression = stores_with_regression.persist() + wait(stores_with_regression) + c.create_table("stores_with_regression", stores_with_regression, persist=False) + + combined = combined.persist() + wait(combined) + c.create_table("combined", combined, persist=False) + + query_3 = """ + SELECT store_ID, + pr_review_date, + CAST(pr_review_sk AS INTEGER) AS pr_review_sk + FROM stores_with_regression + INNER JOIN combined ON s_store_name = word + """ + temp_table2 = c.sql(query_3) + + c.drop_table("stores_with_regression") + del stores_with_regression + + c.drop_table("combined") + del combined + + # REAL QUERY + sentences = no_nulls.map_partitions(create_sentences_from_reviews) + + # need the global position in the sentence tokenized df + sentences["x"] = 1 + sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum() + del sentences["x"] + + word_df = sentences.map_partitions( + create_words_from_sentences, + global_position_column="sentence_tokenized_global_pos", + ) + + # This txt file comes from the official TPCx-BB kit + # We extracted it from bigbenchqueriesmr.jar + # Need to pass the absolute path for this txt file + sentiment_dir = os.path.join(config["data_dir"], "sentiment_files") + ns_df = dask_cudf.read_csv(os.path.join(sentiment_dir, "negativeSentiment.txt"), names=["sentiment_word"]) + c.create_table('sent_df', ns_df, persist=False) + + word_df = word_df.persist() + wait(word_df) + c.create_table("word_df", word_df, persist=False) + + sentences = sentences.persist() + wait(sentences) + c.create_table("sentences", sentences, persist=False) + + temp_table2 = temp_table2.persist() + wait(temp_table2) + c.create_table("temp_table2", temp_table2, persist=False) + + 
query_4 = """ + WITH sentences_table AS + ( + select sentence, + review_idx_global_pos, + CAST(sentence_tokenized_global_pos AS BIGINT) AS + sentence_tokenized_global_pos + from sentences + ), negativeSentiment AS + ( + SELECT DISTINCT sentiment_word AS word + FROM sent_df + ), word_sentence_sentiment AS + ( + SELECT n.word, + CAST(wd.sentence_idx_global_pos AS BIGINT) AS + sentence_idx_global_pos, + 'NEG' AS sentiment + FROM word_df wd + INNER JOIN negativeSentiment n ON wd.word = n.word + ), word_sentence_sentiment_with_sentence_info AS + ( + SELECT * FROM word_sentence_sentiment + LEFT JOIN sentences_table + ON sentence_idx_global_pos = sentence_tokenized_global_pos + ) + SELECT tt2.store_ID AS s_name, + tt2.pr_review_date AS r_date, + wsswsi.sentence AS r_sentence, + wsswsi.sentiment AS sentiment, + wsswsi.word AS sentiment_word + FROM word_sentence_sentiment_with_sentence_info wsswsi + INNER JOIN temp_table2 tt2 + ON wsswsi.review_idx_global_pos = tt2.pr_review_sk + ORDER BY s_name, r_date, r_sentence, sentiment_word + """ + result = c.sql(query_4) + + c.drop_table("word_df") + del word_df + c.drop_table("sentences") + del sentences + c.drop_table("temp_table2") + del temp_table2 + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q19/gpu_bdb_query_19.py b/gpu_bdb/queries/q19/gpu_bdb_query_19.py index 8d4e29a2..06b51580 100755 --- a/gpu_bdb/queries/q19/gpu_bdb_query_19.py +++ b/gpu_bdb/queries/q19/gpu_bdb_query_19.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,58 +14,26 @@ # limitations under the License. 
# -import sys import os +import cudf +import dask_cudf + from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) from bdb_tools.text import create_sentences_from_reviews, create_words_from_sentences +from bdb_tools.q19_utils import ( + q19_returns_dates_IN, + eol_char, + read_tables +) - -from bdb_tools.readers import build_reader -from dask.distributed import Client, wait -import distributed - - -# -------- Q19 ----------- -q19_returns_dates = ["2004-03-08", "2004-08-02", "2004-11-15", "2004-12-20"] -eol_char = "è" - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], basepath=config["data_dir"], - ) - date_dim_cols = ["d_week_seq", "d_date_sk", "d_date"] - date_dim_df = table_reader.read("date_dim", relevant_cols=date_dim_cols) - store_returns_cols = ["sr_returned_date_sk", "sr_item_sk", "sr_return_quantity"] - store_returns_df = table_reader.read( - "store_returns", relevant_cols=store_returns_cols - ) - web_returns_cols = ["wr_returned_date_sk", "wr_item_sk", "wr_return_quantity"] - web_returns_df = table_reader.read("web_returns", relevant_cols=web_returns_cols) - - ### splitting by row groups for better parallelism - pr_table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=True, - ) - - product_reviews_cols = ["pr_item_sk", "pr_review_content", "pr_review_sk"] - product_reviews = pr_table_reader.read( - "product_reviews", relevant_cols=product_reviews_cols - ) - - return date_dim_df, store_returns_df, web_returns_df, product_reviews - +from dask.distributed import wait def main(client, config): - import cudf - import dask_cudf date_dim_df, store_returns_df, web_returns_df, product_reviews_df = benchmark( read_tables, @@ -78,7 +46,7 @@ def main(client, config): date_dim_df = date_dim_df.merge( date_dim_df, on=["d_week_seq"], how="outer", suffixes=("", "_r") ) - date_dim_df = 
date_dim_df[date_dim_df.d_date_r.isin(q19_returns_dates)].reset_index( + date_dim_df = date_dim_df[date_dim_df.d_date_r.isin(q19_returns_dates_IN)].reset_index( drop=True ) @@ -207,8 +175,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q19/gpu_bdb_query_19_dask_sql.py b/gpu_bdb/queries/q19/gpu_bdb_query_19_dask_sql.py new file mode 100755 index 00000000..fc2b6183 --- /dev/null +++ b/gpu_bdb/queries/q19/gpu_bdb_query_19_dask_sql.py @@ -0,0 +1,171 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os + +import dask_cudf + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.text import ( + create_sentences_from_reviews, + create_words_from_sentences +) + +from bdb_tools.q19_utils import ( + q19_returns_dates_IN, + eol_char, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = f""" + WITH dateFilter AS + ( + -- within the week ending a given date + SELECT d1.d_date_sk + FROM date_dim d1, date_dim d2 + WHERE d1.d_week_seq = d2.d_week_seq + AND CAST(d2.d_date AS DATE) IN (DATE '{q19_returns_dates_IN[0]}', + DATE '{q19_returns_dates_IN[1]}', + DATE '{q19_returns_dates_IN[2]}', + DATE '{q19_returns_dates_IN[3]}') + ), fsr AS + ( + --store returns in week ending given date + SELECT sr_item_sk, SUM(sr_return_quantity) sr_item_qty + FROM store_returns sr + INNER JOIN dateFilter d + ON sr.sr_returned_date_sk = d.d_date_sk + GROUP BY sr_item_sk --across all store and web channels + HAVING SUM(sr_return_quantity) > 0 + ), fwr AS + ( + --web returns in week ending given date + SELECT wr_item_sk, SUM(wr_return_quantity) wr_item_qty + FROM web_returns wr + INNER JOIN dateFilter d + ON wr.wr_returned_date_sk = d_date_sk + GROUP BY wr_item_sk --across all store and web channels + HAVING SUM(wr_return_quantity) > 0 + ), extract_sentiment AS + ( + SELECT pr.pr_item_sk, pr.pr_review_content, pr.pr_review_sk + FROM product_reviews pr + INNER JOIN fsr + ON pr.pr_item_sk = fsr.sr_item_sk + INNER JOIN fwr + ON fsr.sr_item_sk = fwr.wr_item_sk + WHERE pr.pr_review_content IS NOT NULL ---- add as rapids + AND abs( CAST((sr_item_qty-wr_item_qty) AS DOUBLE) / + ((sr_item_qty + wr_item_qty)/2) ) <= 0.1 + ) + SELECT * FROM extract_sentiment + ORDER BY pr_item_sk, pr_review_content, pr_review_sk + """ + merged_df = c.sql(query) + + # second step -- 
Sentiment Word Extraction + merged_df["pr_review_sk"] = merged_df["pr_review_sk"].astype("int32") + merged_df["pr_review_content"] = merged_df.pr_review_content.str.lower() + merged_df["pr_review_content"] = merged_df.pr_review_content.str.replace( + [".", "?", "!"], [eol_char], regex=False + ) + + sentences = merged_df.map_partitions(create_sentences_from_reviews) + # need the global position in the sentence tokenized df + sentences["x"] = 1 + sentences['sentence_tokenized_global_pos'] = sentences['x'].cumsum() + del sentences["x"] + + word_df = sentences.map_partitions( + create_words_from_sentences, + global_position_column="sentence_tokenized_global_pos", + ) + + # This txt file comes from the official TPCx-BB kit + # We extracted it from bigbenchqueriesmr.jar + # Need to pass the absolute path for this txt file + sentiment_dir = os.path.join(config["data_dir"], "sentiment_files") + ns_df = dask_cudf.read_csv(os.path.join(sentiment_dir, "negativeSentiment.txt"), names=["sentiment_word"]) + c.create_table('sent_df', ns_df, persist=False) + + sentences = sentences.persist() + wait(sentences) + c.create_table('sentences_df', sentences, persist=False) + + word_df = word_df.persist() + wait(word_df) + c.create_table('word_df', word_df, persist=False) + + merged_df = merged_df.persist() + wait(merged_df) + c.create_table('merged_df', merged_df, persist=False) + + query = """ + WITH negativesent AS + ( + SELECT distinct sentiment_word + FROM sent_df + ), word_sentence_sentiment AS + ( + SELECT sd.sentiment_word, + wd.sentence_idx_global_pos + FROM word_df wd + INNER JOIN negativesent sd ON wd.word = sd.sentiment_word + ), temp AS + ( + SELECT s.review_idx_global_pos, + w.sentiment_word, + s.sentence + FROM word_sentence_sentiment w + LEFT JOIN sentences_df s + ON w.sentence_idx_global_pos = s.sentence_tokenized_global_pos + ) + SELECT pr_item_sk AS item_sk, + sentence AS review_sentence, + 'NEG' AS sentiment, + sentiment_word + FROM temp + INNER JOIN merged_df ON 
pr_review_sk = review_idx_global_pos + ORDER BY pr_item_sk, review_sentence, sentiment_word + """ + result = c.sql(query) + + c.drop_table("sentences_df") + del sentences + c.drop_table("word_df") + del word_df + c.drop_table("merged_df") + del merged_df + + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q20/gpu_bdb_query_20.py b/gpu_bdb/queries/q20/gpu_bdb_query_20.py index 8db6d19d..85c30cfb 100755 --- a/gpu_bdb/queries/q20/gpu_bdb_query_20.py +++ b/gpu_bdb/queries/q20/gpu_bdb_query_20.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,81 +14,19 @@ # limitations under the License. # -import sys -import cupy as cp -import rmm import numpy as np - from bdb_tools.utils import ( benchmark, gpubdb_argparser, - train_clustering_model, run_query, ) -from bdb_tools.readers import build_reader -from dask import delayed +from bdb_tools.q20_utils import ( + get_clusters, + read_tables +) from dask.distributed import wait - -# q20 parameters -N_CLUSTERS = 8 -CLUSTER_ITERATIONS = 20 -N_ITER = 5 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - store_sales_cols = [ - "ss_customer_sk", - "ss_ticket_number", - "ss_item_sk", - "ss_net_paid", - ] - store_returns_cols = [ - "sr_item_sk", - "sr_customer_sk", - "sr_ticket_number", - "sr_return_amt", - ] - - store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - store_returns_df = table_reader.read( - "store_returns", relevant_cols=store_returns_cols - ) - return 
store_sales_df, store_returns_df - - -def get_clusters(client, ml_input_df, feature_cols): - """ - Takes the dask client, kmeans_input_df and feature columns. - Returns a dictionary matching the output required for q20 - """ - import dask_cudf - - ml_tasks = [ - delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER) - for df in ml_input_df[feature_cols].to_delayed() - ] - - results_dict = client.compute(*ml_tasks, sync=True) - - labels = results_dict["cid_labels"] - - labels_final = dask_cudf.from_cudf(labels, npartitions=ml_input_df.npartitions) - ml_input_df["label"] = labels_final.reset_index()[0] - - output = ml_input_df[["user_sk", "label"]] - - results_dict["cid_labels"] = output - return results_dict - - def remove_inf_and_nulls(df, column_names, value=0.0): """ Replace all nulls, inf, -inf with value column_name from df @@ -224,8 +162,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q20/gpu_bdb_query_20_dask_sql.py b/gpu_bdb/queries/q20/gpu_bdb_query_20_dask_sql.py new file mode 100755 index 00000000..4715177d --- /dev/null +++ b/gpu_bdb/queries/q20/gpu_bdb_query_20_dask_sql.py @@ -0,0 +1,99 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster +from dask.distributed import wait + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q20_utils import ( + get_clusters, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + SELECT + ss_customer_sk AS user_sk, + round(CASE WHEN ((returns_count IS NULL) OR (orders_count IS NULL) + OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 + ELSE (returns_count / orders_count) END, 7) AS orderRatio, + round(CASE WHEN ((returns_items IS NULL) OR (orders_items IS NULL) + OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 + ELSE (returns_items / orders_items) END, 7) AS itemsRatio, + round(CASE WHEN ((returns_money IS NULL) OR (orders_money IS NULL) + OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 + ELSE (returns_money / orders_money) END, 7) AS monetaryRatio, + round(CASE WHEN ( returns_count IS NULL) THEN 0.0 + ELSE returns_count END, 0) AS frequency + FROM + ( + SELECT + ss_customer_sk, + -- return order ratio + CAST (COUNT(distinct(ss_ticket_number)) AS DOUBLE) + AS orders_count, + -- return ss_item_sk ratio + CAST (COUNT(ss_item_sk) AS DOUBLE) AS orders_items, + -- return monetary amount ratio + CAST(SUM( ss_net_paid ) AS DOUBLE) AS orders_money + FROM store_sales s + GROUP BY ss_customer_sk + ) orders + LEFT OUTER JOIN + ( + SELECT + sr_customer_sk, + -- return order ratio + CAST(count(distinct(sr_ticket_number)) AS DOUBLE) + AS returns_count, + -- return ss_item_sk ratio + CAST (COUNT(sr_item_sk) AS DOUBLE) AS returns_items, + -- return monetary amount ratio + CAST( SUM( sr_return_amt ) AS DOUBLE) AS returns_money + FROM store_returns + GROUP BY sr_customer_sk + ) returned ON ss_customer_sk=sr_customer_sk + """ + final_df = c.sql(query) + + final_df = final_df.fillna(0) + final_df = final_df.repartition(npartitions=1).persist() + wait(final_df) + + 
final_df = final_df.sort_values(["user_sk"]).reset_index(drop=True) + final_df = final_df.persist() + wait(final_df) + + feature_cols = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] + + results_dict = get_clusters( + client=client, ml_input_df=final_df, feature_cols=feature_cols + ) + + return results_dict + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q21/gpu_bdb_query_21.py b/gpu_bdb/queries/q21/gpu_bdb_query_21.py index 4d1e1217..084e0392 100755 --- a/gpu_bdb/queries/q21/gpu_bdb_query_21.py +++ b/gpu_bdb/queries/q21/gpu_bdb_query_21.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ # limitations under the License. 
# -import sys - from bdb_tools.utils import ( benchmark, gpubdb_argparser, @@ -23,67 +21,20 @@ ) from bdb_tools.merge_util import hash_merge -from bdb_tools.readers import build_reader -from dask.distributed import Client, wait +from bdb_tools.q21_utils import read_tables + +from dask.distributed import wait q21_year = 2003 q21_month = 1 q21_limit = 100 - -store_sales_cols = [ - "ss_item_sk", - "ss_store_sk", - "ss_customer_sk", - "ss_ticket_number", - "ss_quantity", - "ss_sold_date_sk", -] -date_cols = ["d_date_sk", "d_year", "d_moy"] -websale_cols = ["ws_item_sk", "ws_bill_customer_sk", "ws_quantity", "ws_sold_date_sk"] -sr_cols = [ - "sr_item_sk", - "sr_customer_sk", - "sr_ticket_number", - "sr_return_quantity", - "sr_returned_date_sk", -] -store_cols = ["s_store_name", "s_store_id", "s_store_sk"] -item_cols = ["i_item_id", "i_item_desc", "i_item_sk"] - -# todo: See if persisting the date table improves performence as its used all over - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - store_sales_df = table_reader.read("store_sales", relevant_cols=store_sales_cols) - date_dim_df = table_reader.read("date_dim", relevant_cols=date_cols) - web_sales_df = table_reader.read("web_sales", relevant_cols=websale_cols) - store_retuns_df = table_reader.read("store_returns", relevant_cols=sr_cols) - store_table_df = table_reader.read("store", relevant_cols=store_cols) - item_table_df = table_reader.read("item", relevant_cols=item_cols) - - return ( - store_sales_df, - date_dim_df, - web_sales_df, - store_retuns_df, - store_table_df, - item_table_df, - ) - - def main(client, config): ( store_sales_df, date_dim_df, web_sales_df, - store_retuns_df, + store_returns_df, store_table_df, item_table_df, ) = benchmark( @@ -105,7 +56,7 @@ def main(client, config): meta=date_dim_df._meta, ).reset_index(drop=True) - part_sr = store_retuns_df.merge( + 
part_sr = store_returns_df.merge( d2, left_on="sr_returned_date_sk", right_on="d_date_sk", how="inner" ) @@ -289,8 +240,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q21/gpu_bdb_query_21_dask_sql.py b/gpu_bdb/queries/q21/gpu_bdb_query_21_dask_sql.py new file mode 100755 index 00000000..eca3dd03 --- /dev/null +++ b/gpu_bdb/queries/q21/gpu_bdb_query_21_dask_sql.py @@ -0,0 +1,117 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q21_utils import read_tables + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + SELECT + part_i.i_item_id AS i_item_id, + part_i.i_item_desc AS i_item_desc, + part_s.s_store_id AS s_store_id, + part_s.s_store_name AS s_store_name, + CAST(SUM(part_ss.ss_quantity) AS BIGINT) AS store_sales_quantity, + CAST(SUM(part_sr.sr_return_quantity) AS BIGINT) AS store_returns_quantity, + CAST(SUM(part_ws.ws_quantity) AS BIGINT) AS web_sales_quantity + FROM + ( + SELECT + sr_item_sk, + sr_customer_sk, + sr_ticket_number, + sr_return_quantity + FROM + store_returns sr, + date_dim d2 + WHERE d2.d_year = 2003 + AND d2.d_moy BETWEEN 1 AND 7 --which were returned in the next six months + AND sr.sr_returned_date_sk = d2.d_date_sk + ) part_sr + INNER JOIN + ( + SELECT + ws_item_sk, + ws_bill_customer_sk, + ws_quantity + FROM + web_sales ws, + date_dim d3 + -- in the following three years (re-purchased by the returning customer afterwards through the web sales channel) + WHERE d3.d_year BETWEEN 2003 AND 2005 + AND ws.ws_sold_date_sk = d3.d_date_sk + ) part_ws ON + ( + part_sr.sr_item_sk = part_ws.ws_item_sk + AND part_sr.sr_customer_sk = part_ws.ws_bill_customer_sk + ) INNER JOIN + ( + SELECT + ss_item_sk, + ss_store_sk, + ss_customer_sk, + ss_ticket_number, + ss_quantity + FROM + store_sales ss, + date_dim d1 + WHERE d1.d_year = 2003 + AND d1.d_moy = 1 + AND ss.ss_sold_date_sk = d1.d_date_sk + ) part_ss ON + ( + part_ss.ss_ticket_number = part_sr.sr_ticket_number + AND part_ss.ss_item_sk = part_sr.sr_item_sk + AND part_ss.ss_customer_sk = part_sr.sr_customer_sk + ) + INNER JOIN store part_s ON + ( + part_s.s_store_sk = part_ss.ss_store_sk + ) + INNER JOIN item part_i ON + ( + part_i.i_item_sk = part_ss.ss_item_sk + ) + GROUP BY + 
part_i.i_item_id, + part_i.i_item_desc, + part_s.s_store_id, + part_s.s_store_name + ORDER BY + part_i.i_item_id, + part_i.i_item_desc, + part_s.s_store_id, + part_s.s_store_name + LIMIT 100 + """ + result = c.sql(query) + result['i_item_desc'] = result['i_item_desc'].str.strip() + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q22/gpu_bdb_query_22.py b/gpu_bdb/queries/q22/gpu_bdb_query_22.py index 2dfebb1f..3a56d3fb 100755 --- a/gpu_bdb/queries/q22/gpu_bdb_query_22.py +++ b/gpu_bdb/queries/q22/gpu_bdb_query_22.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,19 +14,19 @@ # limitations under the License. 
# -from numba import cuda import numpy as np -import sys - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, - convert_datestring_to_days, ) -from bdb_tools.readers import build_reader - +from bdb_tools.q22_utils import ( + q22_date, + q22_i_current_price_min, + q22_i_current_price_max, + read_tables +) def inventory_before_after(df, date): df["inv_before"] = df["inv_quantity_on_hand"].copy() @@ -36,38 +36,7 @@ def inventory_before_after(df, date): return df -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - inv_columns = [ - "inv_item_sk", - "inv_warehouse_sk", - "inv_date_sk", - "inv_quantity_on_hand", - ] - inventory = table_reader.read("inventory", relevant_cols=inv_columns) - - item_columns = ["i_item_id", "i_current_price", "i_item_sk"] - item = table_reader.read("item", relevant_cols=item_columns) - - warehouse_columns = ["w_warehouse_sk", "w_warehouse_name"] - warehouse = table_reader.read("warehouse", relevant_cols=warehouse_columns) - - dd_columns = ["d_date_sk", "d_date"] - date_dim = table_reader.read("date_dim", relevant_cols=dd_columns) - - return inventory, item, warehouse, date_dim - - def main(client, config): - - q22_date = "2001-05-08" - q22_i_current_price_min = 0.98 - q22_i_current_price_max = 1.5 - inventory, item, warehouse, date_dim = benchmark( read_tables, config=config, @@ -99,7 +68,6 @@ def main(client, config): output_table = output_table[keep_columns] - date_dim = date_dim.map_partitions(convert_datestring_to_days) # Filter limit in days min_date = np.datetime64(q22_date, "D").astype(int) - 30 @@ -161,8 +129,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q22/gpu_bdb_query_22_dask_sql.py 
b/gpu_bdb/queries/q22/gpu_bdb_query_22_dask_sql.py new file mode 100755 index 00000000..6393842c --- /dev/null +++ b/gpu_bdb/queries/q22/gpu_bdb_query_22_dask_sql.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query +) + +from bdb_tools.q22_utils import ( + q22_date, + q22_i_current_price_min, + q22_i_current_price_max, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + # Filter limit in days + min_date = np.datetime64(q22_date, "D").astype(int) - 30 + max_date = np.datetime64(q22_date, "D").astype(int) + 30 + d_date_int = np.datetime64(q22_date, "D").astype(int) + ratio_min = 2.0 / 3.0 + ratio_max = 3.0 / 2.0 + query = f""" + SELECT + w_warehouse_name, + i_item_id, + SUM(CASE WHEN d_date - {d_date_int} < 0 THEN inv_quantity_on_hand ELSE 0 END) AS inv_before, + SUM(CASE WHEN d_date - {d_date_int} >= 0 THEN inv_quantity_on_hand ELSE 0 END) AS inv_after + FROM + inventory inv, + item i, + warehouse w, + date_dim d + WHERE i_current_price BETWEEN {q22_i_current_price_min} AND {q22_i_current_price_max} + AND i_item_sk = inv_item_sk + AND inv_warehouse_sk = w_warehouse_sk + AND inv_date_sk = d_date_sk + AND d_date >= {min_date} + AND d_date <= {max_date} + GROUP BY 
w_warehouse_name, i_item_id + """ + intermediate = c.sql(query) + c.create_table("intermediate", intermediate ,persist=False) + + query_2 = f""" + SELECT + w_warehouse_name, + i_item_id, + inv_before, + inv_after + FROM intermediate + WHERE inv_before > 0 + AND CAST(inv_after AS DOUBLE) / CAST(inv_before AS DOUBLE) >= {ratio_min} + AND CAST(inv_after AS DOUBLE) / CAST(inv_before AS DOUBLE) <= {ratio_max} + ORDER BY w_warehouse_name, i_item_id + LIMIT 100 + """ + result = c.sql(query_2) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q23/gpu_bdb_query_23.py b/gpu_bdb/queries/q23/gpu_bdb_query_23.py index 08e1b09d..0dcb558a 100755 --- a/gpu_bdb/queries/q23/gpu_bdb_query_23.py +++ b/gpu_bdb/queries/q23/gpu_bdb_query_23.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,46 +14,20 @@ # limitations under the License. 
# -import cupy as cp -import sys -import rmm - - -from bdb_tools.readers import build_reader from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) +from bdb_tools.q23_utils import ( + q23_year, + q23_month, + q23_coefficient, + read_tables +) from distributed import wait - -### inventory date -q23_year = 2001 -q23_month = 1 -q23_coefficient = 1.3 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], basepath=config["data_dir"], - ) - - date_cols = ["d_date_sk", "d_year", "d_moy"] - date_df = table_reader.read("date_dim", relevant_cols=date_cols) - - inv_cols = [ - "inv_warehouse_sk", - "inv_item_sk", - "inv_date_sk", - "inv_quantity_on_hand", - ] - inv_df = table_reader.read("inventory", relevant_cols=inv_cols) - - return date_df, inv_df - - def get_iteration1(merged_inv_dates, n_workers): grouped_df = merged_inv_dates.groupby(["inv_warehouse_sk", "inv_item_sk", "d_moy"]) q23_tmp_inv_part = grouped_df.agg( @@ -129,8 +103,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q23/gpu_bdb_query_23_dask_sql.py b/gpu_bdb/queries/q23/gpu_bdb_query_23_dask_sql.py new file mode 100755 index 00000000..ef3debd8 --- /dev/null +++ b/gpu_bdb/queries/q23/gpu_bdb_query_23_dask_sql.py @@ -0,0 +1,99 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q23_utils import ( + q23_year, + q23_month, + q23_coefficient, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = f""" + SELECT inv_warehouse_sk, + inv_item_sk, + inv_quantity_on_hand, + d_moy + FROM inventory inv + INNER JOIN date_dim d ON inv.inv_date_sk = d.d_date_sk + AND d.d_year = {q23_year} + AND d_moy between {q23_month} AND {q23_month + 1} + """ + inv_dates_result = c.sql(query_1) + + c.create_table('inv_dates', inv_dates_result, persist=False) + query_2 = """ + SELECT inv_warehouse_sk, + inv_item_sk, + d_moy, + AVG(CAST(inv_quantity_on_hand AS DOUBLE)) AS q_mean, + stddev_samp(CAST(inv_quantity_on_hand as DOUBLE)) AS q_std + FROM inv_dates + GROUP BY inv_warehouse_sk, inv_item_sk, d_moy + """ + iteration_1 = c.sql(query_2) + + c.create_table('iteration_1', iteration_1, persist=False) + query_3 = f""" + SELECT inv_warehouse_sk, + inv_item_sk, + d_moy, + q_std / q_mean AS qty_cov + FROM iteration_1 + WHERE (q_std / q_mean) >= {q23_coefficient} + """ + + iteration_2 = c.sql(query_3) + + c.create_table('temp_table', iteration_2, persist=False) + query = f""" + SELECT inv1.inv_warehouse_sk, + inv1.inv_item_sk, + inv1.d_moy, + inv1.qty_cov AS cov, + inv2.d_moy AS inv2_d_moy, + inv2.qty_cov AS inv2_cov + FROM temp_table inv1 + INNER JOIN temp_table inv2 ON inv1.inv_warehouse_sk = inv2.inv_warehouse_sk + AND inv1.inv_item_sk = inv2.inv_item_sk + AND inv1.d_moy = {q23_month} + AND inv2.d_moy = {q23_month + 1} + ORDER BY inv1.inv_warehouse_sk, + inv1.inv_item_sk + """ + result = c.sql(query) + result = result.persist() + wait(result) + c.drop_table("temp_table") + return 
result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q24/gpu_bdb_query_24.py b/gpu_bdb/queries/q24/gpu_bdb_query_24.py index 5b7b2a07..90f2bf3b 100755 --- a/gpu_bdb/queries/q24/gpu_bdb_query_24.py +++ b/gpu_bdb/queries/q24/gpu_bdb_query_24.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,51 +14,20 @@ # limitations under the License. # -import sys - - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader +from bdb_tools.q24_utils import read_tables from distributed import wait ### Current Implimenation Assumption ### Grouped Store sales and web sales of 1 item grouped by `date_sk` should fit in memory as number of dates is limited - ## query parameter q24_i_item_sk = 10000 -ws_cols = ["ws_item_sk", "ws_sold_date_sk", "ws_quantity"] -item_cols = ["i_item_sk", "i_current_price"] -imp_cols = [ - "imp_item_sk", - "imp_competitor_price", - "imp_start_date", - "imp_end_date", - "imp_sk", -] -ss_cols = ["ss_item_sk", "ss_sold_date_sk", "ss_quantity"] - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - ### read tables - ws_df = table_reader.read("web_sales", relevant_cols=ws_cols) - item_df = table_reader.read("item", relevant_cols=item_cols) - imp_df = table_reader.read("item_marketprices", relevant_cols=imp_cols) - ss_df = table_reader.read("store_sales", relevant_cols=ss_cols) - - return ws_df, item_df, imp_df, ss_df - - def get_helper_query_table(imp_df, item_df): f_imp_df = ( 
imp_df.query(f"imp_item_sk == {q24_i_item_sk}", meta=imp_df._meta) @@ -254,8 +223,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q24/gpu_bdb_query_24_dask_sql.py b/gpu_bdb/queries/q24/gpu_bdb_query_24_dask_sql.py new file mode 100755 index 00000000..f418beb2 --- /dev/null +++ b/gpu_bdb/queries/q24/gpu_bdb_query_24_dask_sql.py @@ -0,0 +1,82 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q24_utils import read_tables + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = """ + WITH temp_table as + ( + SELECT + i_item_sk, + imp_sk, + (imp_competitor_price - i_current_price) / i_current_price AS price_change, + imp_start_date, + (imp_end_date - imp_start_date) AS no_days_comp_price + FROM item i ,item_marketprices imp + WHERE i.i_item_sk = imp.imp_item_sk + AND i.i_item_sk = 10000 + ORDER BY i_item_sk, imp_sk, imp_start_date + ) + SELECT ws_item_sk, + -- avg ( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant) / ((prev_ss_quant + prev_ws_quant) * ws.price_change) ) -- single node + sum( (current_ss_quant+current_ws_quant-prev_ss_quant-prev_ws_quant) / (prev_ss_quant*ws.price_change+prev_ws_quant*ws.price_change) ) + / count( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant) / ((prev_ss_quant + prev_ws_quant) * ws.price_change) ) AS cross_price_elasticity + FROM + ( + SELECT + ws_item_sk, + imp_sk, + price_change, + SUM( CASE WHEN ( (ws_sold_date_sk >= c.imp_start_date) AND (ws_sold_date_sk < (c.imp_start_date + c.no_days_comp_price))) THEN ws_quantity ELSE 0 END ) AS current_ws_quant, + SUM( CASE WHEN ( (ws_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price)) AND (ws_sold_date_sk < c.imp_start_date)) THEN ws_quantity ELSE 0 END ) AS prev_ws_quant + FROM web_sales ws + JOIN temp_table c ON ws.ws_item_sk = c.i_item_sk + GROUP BY ws_item_sk, imp_sk, price_change + ) ws JOIN + ( + SELECT + ss_item_sk, + imp_sk, + price_change, + SUM( CASE WHEN ((ss_sold_date_sk >= c.imp_start_date) AND (ss_sold_date_sk < (c.imp_start_date + c.no_days_comp_price))) THEN ss_quantity ELSE 0 END) AS current_ss_quant, + SUM( CASE WHEN ((ss_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price)) AND 
(ss_sold_date_sk < c.imp_start_date)) THEN ss_quantity ELSE 0 END) AS prev_ss_quant + FROM store_sales ss + JOIN temp_table c ON c.i_item_sk = ss.ss_item_sk + GROUP BY ss_item_sk, imp_sk, price_change + ) ss + ON (ws.ws_item_sk = ss.ss_item_sk and ws.imp_sk = ss.imp_sk) + GROUP BY ws.ws_item_sk + """ + + result = c.sql(query) + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q25/gpu_bdb_query_25.py b/gpu_bdb/queries/q25/gpu_bdb_query_25.py index 5d88b643..2e0fde62 100755 --- a/gpu_bdb/queries/q25/gpu_bdb_query_25.py +++ b/gpu_bdb/queries/q25/gpu_bdb_query_25.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ # limitations under the License. 
# -import sys - import numpy as np -from numba import cuda + +import dask_cudf from bdb_tools.utils import ( benchmark, @@ -26,40 +25,15 @@ run_query, convert_datestring_to_days, ) -from bdb_tools.readers import build_reader +from bdb_tools.q25_utils import ( + q25_date, + N_CLUSTERS, + CLUSTER_ITERATIONS, + N_ITER, + read_tables +) from dask import delayed - -# q25 parameters -Q25_DATE = "2002-01-02" -N_CLUSTERS = 8 -CLUSTER_ITERATIONS = 20 -N_ITER = 5 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - ss_cols = ["ss_customer_sk", "ss_sold_date_sk", "ss_ticket_number", "ss_net_paid"] - ws_cols = [ - "ws_bill_customer_sk", - "ws_sold_date_sk", - "ws_order_number", - "ws_net_paid", - ] - datedim_cols = ["d_date_sk", "d_date"] - - ss_ddf = table_reader.read("store_sales", relevant_cols=ss_cols, index=False) - ws_ddf = table_reader.read("web_sales", relevant_cols=ws_cols, index=False) - datedim_ddf = table_reader.read("date_dim", relevant_cols=datedim_cols, index=False) - - return (ss_ddf, ws_ddf, datedim_ddf) - - def agg_count_distinct(df, group_key, counted_key, client): """Returns a Series that is the result of counting distinct instances of 'counted_key' within each 'group_key'. The series' index will have one entry per unique 'group_key' value. 
@@ -77,7 +51,6 @@ def agg_count_distinct(df, group_key, counted_key, client): def get_clusters(client, ml_input_df): - import dask_cudf ml_tasks = [ delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER) @@ -100,7 +73,6 @@ def get_clusters(client, ml_input_df): def main(client, config): - import dask_cudf ss_ddf, ws_ddf, datedim_ddf = benchmark( read_tables, @@ -109,7 +81,7 @@ def main(client, config): dask_profile=config["dask_profile"], ) datedim_ddf = datedim_ddf.map_partitions(convert_datestring_to_days) - min_date = np.datetime64(Q25_DATE, "D").astype(int) + min_date = np.datetime64(q25_date, "D").astype(int) # Filter by date valid_dates_ddf = datedim_ddf[datedim_ddf["d_date"] > min_date].reset_index( drop=True @@ -173,8 +145,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q25/gpu_bdb_query_25_dask_sql.py b/gpu_bdb/queries/q25/gpu_bdb_query_25_dask_sql.py new file mode 100755 index 00000000..3ae0afad --- /dev/null +++ b/gpu_bdb/queries/q25/gpu_bdb_query_25_dask_sql.py @@ -0,0 +1,185 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, + train_clustering_model +) + +from bdb_tools.q25_utils import ( + q25_date, + N_CLUSTERS, + CLUSTER_ITERATIONS, + N_ITER, + read_tables +) + +from dask import delayed + +def get_clusters(client, ml_input_df): + import dask_cudf + + ml_tasks = [ + delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER) + for df in ml_input_df.to_delayed() + ] + results_dict = client.compute(*ml_tasks, sync=True) + + output = ml_input_df.index.to_frame().reset_index(drop=True) + + labels_final = dask_cudf.from_cudf( + results_dict["cid_labels"], npartitions=output.npartitions + ) + output["label"] = labels_final.reset_index()[0] + + # Based on CDH6.1 q25-result formatting + results_dict["cid_labels"] = output + return results_dict + + +def agg_count_distinct(df, group_key, counted_key): + """Returns a Series that is the result of counting distinct instances of 'counted_key' within each 'group_key'. + The series' index will have one entry per unique 'group_key' value. + Workaround for lack of nunique aggregate function on Dask df. 
+ """ + + ### going via repartition for split_out drop duplicates + unique_df = df[[group_key, counted_key]].map_partitions( + lambda df: df.drop_duplicates() + ) + unique_df = unique_df.shuffle(on=[group_key]) + unique_df = unique_df.map_partitions(lambda df: df.drop_duplicates()) + + unique_df = unique_df.groupby(group_key)[counted_key].count() + return unique_df.reset_index(drop=False) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + ss_join_query= f""" + SELECT + ss_customer_sk, + ss_sold_date_sk, + ss_net_paid, + ss_ticket_number + FROM + store_sales ss + JOIN + date_dim d ON ss.ss_sold_date_sk = d.d_date_sk + WHERE + CAST(d.d_date AS DATE) > DATE '{q25_date}' + AND + ss_customer_sk IS NOT NULL + """ + + + ws_join_query = f""" + SELECT + ws_bill_customer_sk, + ws_order_number, + ws_sold_date_sk, + ws_net_paid + FROM + web_sales ws + JOIN + date_dim d ON ws.ws_sold_date_sk = d.d_date_sk + WHERE + CAST(d.d_date AS DATE) > DATE '{q25_date}' + AND + ws_bill_customer_sk IS NOT NULL + """ + + ss_merged_df = c.sql(ss_join_query) + ws_merged_df = c.sql(ws_join_query) + + c.create_table('ss_merged_table', ss_merged_df, persist=False) + c.create_table('ws_merged_table', ws_merged_df, persist=False) + + ss_agg_query = """ + SELECT + ss_customer_sk AS cid, + -- count(distinct ss_ticket_number) AS frequency, # distinct count groupby OOMS with dask-sql + max(ss_sold_date_sk) AS most_recent_date, + CAST( SUM(ss_net_paid) AS DOUBLE) AS amount + FROM ss_merged_table + GROUP BY ss_customer_sk + """ + ws_agg_query= """ + SELECT + ws_bill_customer_sk AS cid, + -- count(distinct ws_order_number) AS frequency, # distinct count groupby OOMS with dask-sql + max(ws_sold_date_sk) AS most_recent_date, + CAST( SUM(ws_net_paid) AS DOUBLE) AS amount + FROM ws_merged_table + GROUP BY ws_bill_customer_sk + """ + + ss_distinct_count_agg = agg_count_distinct(ss_merged_df,'ss_customer_sk','ss_ticket_number') + 
ss_distinct_count_agg = ss_distinct_count_agg.rename(columns={'ss_customer_sk':'cid', + 'ss_ticket_number':'frequency'}) + ss_agg_df = c.sql(ss_agg_query) + ### add distinct count + ss_agg_df = ss_agg_df.merge(ss_distinct_count_agg) + + ws_distinct_count_agg = agg_count_distinct(ws_merged_df,'ws_bill_customer_sk','ws_order_number') + ws_distinct_count_agg = ws_distinct_count_agg.rename(columns={'ws_bill_customer_sk':'cid', + 'ws_order_number':'frequency'}) + ws_agg_df = c.sql(ws_agg_query) + ### add distinct count + ws_agg_df = ws_agg_df.merge(ws_distinct_count_agg) + + c.create_table('ss_agg_df', ss_agg_df, persist=False) + c.create_table('ws_agg_df', ws_agg_df, persist=False) + + + result_query = ''' + WITH concat_table AS + ( + SELECT * FROM ss_agg_df + UNION ALL + SELECT * FROM ws_agg_df + ) + SELECT + cid AS cid, + CASE WHEN 37621 - max(most_recent_date) < 60 THEN 1.0 + ELSE 0.0 END AS recency, -- 37621 == 2003-01-02 + CAST( SUM(frequency) AS BIGINT) AS frequency, --total frequency + CAST( SUM(amount) AS DOUBLE) AS amount --total amount + FROM concat_table + GROUP BY cid + ORDER BY cid + ''' + cluster_input_ddf = c.sql(result_query) + + # Prepare df for KMeans clustering + cluster_input_ddf["recency"] = cluster_input_ddf["recency"].astype("int64") + + cluster_input_ddf = cluster_input_ddf.repartition(npartitions=1) + cluster_input_ddf = cluster_input_ddf.persist() + cluster_input_ddf = cluster_input_ddf.set_index('cid') + results_dict = get_clusters(client=client, ml_input_df=cluster_input_ddf) + + return results_dict + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q26/gpu_bdb_query_26.py b/gpu_bdb/queries/q26/gpu_bdb_query_26.py index 82596f0f..fa4b81b3 100755 --- a/gpu_bdb/queries/q26/gpu_bdb_query_26.py +++ b/gpu_bdb/queries/q26/gpu_bdb_query_26.py @@ -1,5 +1,5 @@ # -# Copyright 
(c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,45 +14,23 @@ # limitations under the License. # -import sys - -import numpy as np -from numba import cuda - from bdb_tools.utils import ( benchmark, gpubdb_argparser, train_clustering_model, run_query, ) -from bdb_tools.readers import build_reader +from bdb_tools.q26_utils import ( + Q26_CATEGORY, + Q26_ITEM_COUNT, + N_CLUSTERS, + CLUSTER_ITERATIONS, + N_ITER, + read_tables +) +import numpy as np from dask import delayed - -# q26 parameters -Q26_CATEGORY = "Books" -Q26_ITEM_COUNT = 5 -N_CLUSTERS = 8 -CLUSTER_ITERATIONS = 20 -N_ITER = 5 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - ss_cols = ["ss_customer_sk", "ss_item_sk"] - items_cols = ["i_item_sk", "i_category", "i_class_id"] - - ss_ddf = table_reader.read("store_sales", relevant_cols=ss_cols, index=False) - items_ddf = table_reader.read("item", relevant_cols=items_cols, index=False) - - return (ss_ddf, items_ddf) - - def agg_count_distinct(df, group_key, counted_key): """Returns a Series that is the result of counting distinct instances of 'counted_key' within each 'group_key'. The series' index will have one entry per unique 'group_key' value. 
@@ -113,10 +91,10 @@ def main(client, config): # One-Hot-Encode i_class_id merged_ddf = merged_ddf.map_partitions( - cudf.DataFrame.one_hot_encoding, - column="i_class_id", + cudf.get_dummies, + columns=["i_class_id"], prefix="id", - cats=[i for i in range(1, 16)], + cats={"i_class_id": np.arange(1, 16, dtype="int32")}, prefix_sep="", dtype="float32", ) @@ -139,8 +117,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q26/gpu_bdb_query_26_dask_sql.py b/gpu_bdb/queries/q26/gpu_bdb_query_26_dask_sql.py new file mode 100755 index 00000000..95458d56 --- /dev/null +++ b/gpu_bdb/queries/q26/gpu_bdb_query_26_dask_sql.py @@ -0,0 +1,102 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, + train_clustering_model +) + +from bdb_tools.q26_utils import ( + Q26_CATEGORY, + Q26_ITEM_COUNT, + N_CLUSTERS, + CLUSTER_ITERATIONS, + N_ITER, + read_tables +) + +from dask import delayed + +def get_clusters(client, kmeans_input_df): + import dask_cudf + + ml_tasks = [ + delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS, N_ITER) + for df in kmeans_input_df.to_delayed() + ] + + results_dict = client.compute(*ml_tasks, sync=True) + + output = kmeans_input_df.index.to_frame().reset_index(drop=True) + + labels_final = dask_cudf.from_cudf( + results_dict["cid_labels"], npartitions=output.npartitions + ) + output["label"] = labels_final.reset_index()[0] + + # Based on CDH6.1 q26-result formatting + results_dict["cid_labels"] = output + return results_dict + + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query = f""" + SELECT + ss.ss_customer_sk AS cid, + CAST( count(CASE WHEN i.i_class_id=1 THEN 1 ELSE NULL END) AS DOUBLE ) AS id1, + CAST( count(CASE WHEN i.i_class_id=2 THEN 1 ELSE NULL END) AS DOUBLE ) AS id2, + CAST( count(CASE WHEN i.i_class_id=3 THEN 1 ELSE NULL END) AS DOUBLE ) AS id3, + CAST( count(CASE WHEN i.i_class_id=4 THEN 1 ELSE NULL END) AS DOUBLE ) AS id4, + CAST( count(CASE WHEN i.i_class_id=5 THEN 1 ELSE NULL END) AS DOUBLE ) AS id5, + CAST( count(CASE WHEN i.i_class_id=6 THEN 1 ELSE NULL END) AS DOUBLE ) AS id6, + CAST( count(CASE WHEN i.i_class_id=7 THEN 1 ELSE NULL END) AS DOUBLE ) AS id7, + CAST( count(CASE WHEN i.i_class_id=8 THEN 1 ELSE NULL END) AS DOUBLE ) AS id8, + CAST( count(CASE WHEN i.i_class_id=9 THEN 1 ELSE NULL END) AS DOUBLE ) AS id9, + CAST( count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS DOUBLE ) AS id10, + CAST( count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS DOUBLE ) AS id11, + CAST( 
count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS DOUBLE ) AS id12, + CAST( count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS DOUBLE ) AS id13, + CAST( count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS DOUBLE ) AS id14, + CAST( count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS DOUBLE ) AS id15 + FROM store_sales ss + INNER JOIN item i + ON + ( + ss.ss_item_sk = i.i_item_sk + AND i.i_category IN ('{Q26_CATEGORY}') + AND ss.ss_customer_sk IS NOT NULL + ) + GROUP BY ss.ss_customer_sk + HAVING count(ss.ss_item_sk) > {Q26_ITEM_COUNT} + ORDER BY cid + """ + result = c.sql(query) + result = result.repartition(npartitions=1) + result_ml = result.set_index('cid') + ml_result_dict = get_clusters(client=client, kmeans_input_df=result_ml) + return ml_result_dict + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q27/gpu_bdb_query_27.py b/gpu_bdb/queries/q27/gpu_bdb_query_27.py index 25eb247e..0634d2a8 100755 --- a/gpu_bdb/queries/q27/gpu_bdb_query_27.py +++ b/gpu_bdb/queries/q27/gpu_bdb_query_27.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,63 +14,29 @@ # limitations under the License. 
# -import sys -import time -import argparse - -import spacy -import rmm -import cupy as cp -import distributed +import dask_cudf from bdb_tools.utils import ( benchmark, gpubdb_argparser, - left_semi_join, - run_query, + run_query ) -from bdb_tools.text import create_sentences_from_reviews, create_words_from_sentences -from bdb_tools.readers import build_reader -from dask_cuda import LocalCUDACluster -from dask.distributed import Client, wait - - -# -------- Q27 ----------- -q27_pr_item_sk = 10002 -EOL_CHAR = "." - - -def read_tables(config): - ### splitting by row groups for better parallelism - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=True, - ) - product_reviews_cols = ["pr_item_sk", "pr_review_content", "pr_review_sk"] - product_reviews_df = table_reader.read( - "product_reviews", relevant_cols=product_reviews_cols - ) - return product_reviews_df - +from bdb_tools.text import ( + create_sentences_from_reviews, + create_words_from_sentences +) -def ner_parser(df, col_string, batch_size=256): - spacy.require_gpu() - nlp = spacy.load("en_core_web_sm") - docs = nlp.pipe(df[col_string], disable=["tagger", "parser"], batch_size=batch_size) - out = [] - for doc in docs: - l = [ent.text for ent in doc.ents if ent.label_ == "ORG"] - val = ", " - l = val.join(l) - out.append(l) - df["company_name_list"] = out - return df +from bdb_tools.q27_utils import ( + ner_parser, + q27_pr_item_sk, + EOL_CHAR, + read_tables +) +from dask.distributed import wait def main(client, config): - import dask_cudf product_reviews_df = benchmark( read_tables, @@ -138,8 +104,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q27/gpu_bdb_query_27_dask_sql.py b/gpu_bdb/queries/q27/gpu_bdb_query_27_dask_sql.py new file mode 100755 
index 00000000..50e27a5a --- /dev/null +++ b/gpu_bdb/queries/q27/gpu_bdb_query_27_dask_sql.py @@ -0,0 +1,111 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.text import ( + create_sentences_from_reviews, + create_words_from_sentences +) + +from bdb_tools.cluster_startup import attach_to_cluster +from dask.distributed import wait + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q27_utils import ( + ner_parser, + q27_pr_item_sk, + EOL_CHAR, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + import dask_cudf + + query = f""" + SELECT pr_review_sk, pr_item_sk, pr_review_content + FROM product_reviews + WHERE pr_item_sk = {q27_pr_item_sk} + """ + product_reviews_df = c.sql(query) + + sentences = product_reviews_df.map_partitions( + create_sentences_from_reviews, + review_column="pr_review_content", + end_of_line_char=EOL_CHAR, + ) + + # need the global position in the sentence tokenized df + sentences["x"] = 1 + sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum() + del sentences["x"] + del product_reviews_df + + # Do the NER + sentences = sentences.to_dask_dataframe() + ner_parsed = sentences.map_partitions(ner_parser, "sentence") + ner_parsed = dask_cudf.from_dask_dataframe(ner_parsed) + ner_parsed = ner_parsed.persist() + wait(ner_parsed) + + 
ner_parsed = ner_parsed[ner_parsed.company_name_list != ""] + + # separate NER results into one row per found company + repeated_names = ner_parsed.map_partitions( + create_words_from_sentences, + sentence_column="company_name_list", + global_position_column="sentence_tokenized_global_pos", + delimiter="é", + ) + del sentences + + # recombine + repeated_names = repeated_names.persist() + wait(repeated_names) + c.create_table('repeated_names', repeated_names, persist=False) + + ner_parsed = ner_parsed.persist() + wait(ner_parsed) + c.create_table('ner_parsed', ner_parsed, persist=False) + + query = f""" + SELECT review_idx_global_pos as review_sk, + CAST({q27_pr_item_sk} AS BIGINT) as item_sk, + word as company_name, + sentence as review_sentence + FROM repeated_names left join ner_parsed + ON sentence_idx_global_pos = sentence_tokenized_global_pos + ORDER BY review_idx_global_pos, item_sk, word, sentence + """ + recombined = c.sql(query) + + c.drop_table("repeated_names") + c.drop_table("ner_parsed") + del ner_parsed + del repeated_names + return recombined + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) + diff --git a/gpu_bdb/queries/q28/gpu_bdb_query_28.py b/gpu_bdb/queries/q28/gpu_bdb_query_28.py index 8ecdf712..281c84ac 100755 --- a/gpu_bdb/queries/q28/gpu_bdb_query_28.py +++ b/gpu_bdb/queries/q28/gpu_bdb_query_28.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,311 +15,22 @@ # import cupy -import dask -import distributed -import numpy as np -import time import cupy as cp import copyreg -import sys, os -import traceback - -from distributed import wait -from cuml.feature_extraction.text import HashingVectorizer from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader - - -QUERY_NUM = os.getcwd().split("/")[-1][1:] - -N_FEATURES = 2 ** 23 # Spark is doing 2^20 -ngram_range = (1, 2) -preprocessor = lambda s:s.str.lower() -norm = None -alternate_sign = False - - -def gpu_hashing_vectorizer(x): - vec = HashingVectorizer(n_features=N_FEATURES, - alternate_sign=alternate_sign, - ngram_range=ngram_range, - norm=norm, - preprocessor=preprocessor - ) - return vec.fit_transform(x) - - -def map_labels(ser): - import cudf - output_ser = cudf.Series(cudf.core.column.full(size=len(ser), fill_value=2, dtype=np.int32)) - zero_flag = (ser==1) | (ser==2) - output_ser.loc[zero_flag]=0 - - three_flag = (ser==3) - output_ser.loc[three_flag]=1 - - return output_ser - - -def build_features(t): - X = t["pr_review_content"] - X = X.map_partitions( - gpu_hashing_vectorizer, - meta=dask.array.from_array( - cupy.sparse.csr_matrix(cupy.zeros(1, dtype=cp.float32)) - ), - ) - - X = X.astype(np.float32).persist() - X.compute_chunk_sizes() - - return X - - -def build_labels(reviews_df): - y = reviews_df["pr_review_rating"].map_partitions(map_labels) - y = y.map_partitions(lambda x: cupy.asarray(x, cupy.int32)).persist() - y.compute_chunk_sizes() - - return y - - -def read_tables(config): - ### splitting by row groups for better parallelism - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=True, - ) - - columns = [ - "pr_review_content", - "pr_review_rating", - "pr_review_sk", - ] - ret = table_reader.read("product_reviews", relevant_cols=columns) - return ret - - -def categoricalize(num_sr): - return 
num_sr.astype("str").str.replace(["0", "1", "2"], ["NEG", "NEUT", "POS"]) - - -def sum_tp_fp(y_y_pred, nclasses): - - y, y_pred = y_y_pred - res = cp.zeros((nclasses, 2), order="F") - - for i in range(nclasses): - pos_pred_ix = cp.where(y_pred == i)[0] - - # short circuit - if len(pos_pred_ix) == 0: - res[i] = 0 - break - - tp_sum = (y_pred[pos_pred_ix] == y[pos_pred_ix]).sum() - fp_sum = (y_pred[pos_pred_ix] != y[pos_pred_ix]).sum() - res[i][0] = tp_sum - res[i][1] = fp_sum - return res - - -def precision_score(client, y, y_pred, average="binary"): - from cuml.dask.common.input_utils import DistributedDataHandler - - nclasses = len(cp.unique(y.map_blocks(lambda x: cp.unique(x)).compute())) - - if average == "binary" and nclasses > 2: - raise ValueError - - if nclasses < 2: - raise ValueError("Single class precision is not yet supported") - - ddh = DistributedDataHandler.create([y, y_pred]) - - precision_scores = client.compute( - [ - client.submit(sum_tp_fp, part, nclasses, workers=[worker]) - for worker, part in ddh.gpu_futures - ], - sync=True, - ) - - res = cp.zeros((nclasses, 2), order="F") - - for i in precision_scores: - res += i - - if average == "binary" or average == "macro": - - prec = cp.zeros(nclasses) - for i in range(nclasses): - tp_sum, fp_sum = res[i] - prec[i] = (tp_sum / (tp_sum + fp_sum)).item() - - if average == "binary": - return prec[nclasses - 1].item() - else: - return prec.mean().item() - else: - global_tp = cp.sum(res[:, 0]) - global_fp = cp.sum(res[:, 1]) - - return global_tp / (global_tp + global_fp).item() - - -def local_cm(y_y_pred, unique_labels, sample_weight): - - y_true, y_pred = y_y_pred - labels = unique_labels - - n_labels = labels.size - - # Assume labels are monotonically increasing for now. 
- - # intersect y_pred, y_true with labels, eliminate items not in labels - ind = cp.logical_and(y_pred < n_labels, y_true < n_labels) - y_pred = y_pred[ind] - y_true = y_true[ind] - - if sample_weight is None: - sample_weight = cp.ones(y_true.shape[0], dtype=np.int64) - else: - sample_weight = cp.asarray(sample_weight) - - sample_weight = sample_weight[ind] - - cm = cp.sparse.coo_matrix( - (sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels), dtype=cp.float32, - ).toarray() - - return cp.nan_to_num(cm) - - -def confusion_matrix(client, y_true, y_pred, normalize=None, sample_weight=None): - from cuml.dask.common.input_utils import DistributedDataHandler - - unique_classes = cp.unique(y_true.map_blocks(lambda x: cp.unique(x)).compute()) - nclasses = len(unique_classes) - - ddh = DistributedDataHandler.create([y_true, y_pred]) - - cms = client.compute( - [ - client.submit( - local_cm, part, unique_classes, sample_weight, workers=[worker] - ) - for worker, part in ddh.gpu_futures - ], - sync=True, - ) - - cm = cp.zeros((nclasses, nclasses)) - for i in cms: - cm += i - - with np.errstate(all="ignore"): - if normalize == "true": - cm = cm / cm.sum(axis=1, keepdims=True) - elif normalize == "pred": - cm = cm / cm.sum(axis=0, keepdims=True) - elif normalize == "all": - cm = cm / cm.sum() - cm = cp.nan_to_num(cm) - - return cm - - -def accuracy_score(client, y, y_hat): - from uuid import uuid1 - from cuml.dask.common.input_utils import DistributedDataHandler - - ddh = DistributedDataHandler.create([y_hat, y]) - - def _count_accurate_predictions(y_hat_y): - y_hat, y = y_hat_y - y_hat = cp.asarray(y_hat, dtype=y_hat.dtype) - y = cp.asarray(y, dtype=y.dtype) - return y.shape[0] - cp.count_nonzero(y - y_hat) - - key = uuid1() - - futures = client.compute( - [ - client.submit( - _count_accurate_predictions, - worker_future[1], - workers=[worker_future[0]], - key="%s-%s" % (key, idx), - ) - for idx, worker_future in enumerate(ddh.gpu_futures) - ], - sync=True, - ) - - 
return sum(futures) / y.shape[0] - - -def post_etl_processing(client, train_data, test_data): - import cudf - from cuml.dask.naive_bayes import MultinomialNB as DistMNB - from cuml.dask.common import to_dask_cudf - from cuml.dask.common.input_utils import DistributedDataHandler - - # Feature engineering - X_train = build_features(train_data) - X_test = build_features(test_data) - - y_train = build_labels(train_data) - y_test = build_labels(test_data) - - # Perform ML - model = DistMNB(client=client, alpha=0.001) - model.fit(X_train, y_train) - - ### this regression seems to be coming from here - test_pred_st = time.time() - y_hat = model.predict(X_test).persist() - - # Compute distributed performance metrics - acc = accuracy_score(client, y_test, y_hat) - - print("Accuracy: " + str(acc)) - prec = precision_score(client, y_test, y_hat, average="macro") - - print("Precision: " + str(prec)) - cmat = confusion_matrix(client, y_test, y_hat) - - print("Confusion Matrix: " + str(cmat)) - metric_et = time.time() - - # Place results back in original Dataframe - - ddh = DistributedDataHandler.create(y_hat) - test_preds = to_dask_cudf( - [client.submit(cudf.Series, part) for w, part in ddh.gpu_futures] - ) - - test_preds = test_preds.map_partitions(categoricalize) - - test_data["prediction"] = test_preds - - final_data = test_data[["pr_review_sk", "pr_review_rating", "prediction"]].persist() - - final_data = final_data.sort_values("pr_review_sk").reset_index(drop=True) - wait(final_data) - return final_data, acc, prec, cmat +from bdb_tools.q28_utils import ( + post_etl_processing, + read_tables +) def main(client, config): - q_st = time.time() product_reviews_df = benchmark( read_tables, config=config, @@ -360,11 +71,6 @@ def serialize_mat_descriptor(m): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - from cuml.dask.naive_bayes import MultinomialNB as DistMNB - from cuml.dask.common.input_utils import DistributedDataHandler - 
from cuml.dask.common import to_dask_cudf - config = gpubdb_argparser() client, bc = attach_to_cluster(config) run_query(config=config, client=client, query_func=main) diff --git a/gpu_bdb/queries/q28/gpu_bdb_query_28_dask_sql.py b/gpu_bdb/queries/q28/gpu_bdb_query_28_dask_sql.py new file mode 100755 index 00000000..aa6c5e76 --- /dev/null +++ b/gpu_bdb/queries/q28/gpu_bdb_query_28_dask_sql.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q28_utils import ( + post_etl_processing, + read_tables +) + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + # 10 % of data + query1 = """ + SELECT + pr_review_sk, + pr_review_rating, + pr_review_content + FROM product_reviews + WHERE mod(pr_review_sk, 10) IN (0) + AND pr_review_content IS NOT NULL + ORDER BY pr_review_sk + """ + test_data = c.sql(query1) + + # 90 % of data + query2 = """ + SELECT + pr_review_sk, + pr_review_rating, + pr_review_content + FROM product_reviews + WHERE mod(pr_review_sk, 10) IN (1,2,3,4,5,6,7,8,9) + AND pr_review_content IS NOT NULL + ORDER BY pr_review_sk + """ + train_data = c.sql(query2) + + final_data, acc, prec, cmat = post_etl_processing( + client=client, train_data=train_data, test_data=test_data + ) + + payload = { + "df": final_data, + "acc": acc, + "prec": prec, + "cmat": cmat, + "output_type": "supervised", + } + + return payload + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q29/gpu_bdb_query_29.py b/gpu_bdb/queries/q29/gpu_bdb_query_29.py index 88ead76f..7dc8c29c 100755 --- a/gpu_bdb/queries/q29/gpu_bdb_query_29.py +++ b/gpu_bdb/queries/q29/gpu_bdb_query_29.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,16 +14,15 @@ # limitations under the License. 
# -import sys - from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader -from bdb_tools.utils import benchmark -from distributed import wait +from bdb_tools.q29_utils import ( + q29_limit, + read_tables +) ### Implementation Notes: # * `drop_duplicates` and `groupby` by default brings result to single partition @@ -39,25 +38,8 @@ ### Scalabilty problems # * The ws_item_join table after distincts has `48M` rows, can cause problems on bigger scale factors - -# -------- Q29 ----------- -q29_limit = 100 q29_session_timeout_inSec = 3600 - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], basepath=config["data_dir"], - ) - item_cols = ["i_item_sk", "i_category_id"] - item_df = table_reader.read("item", relevant_cols=item_cols) - - ws_cols = ["ws_order_number", "ws_item_sk"] - ws_df = table_reader.read("web_sales", relevant_cols=ws_cols) - - return item_df, ws_df - - ### # Select t1.i_category_id AS category_id_1 , t2.i_category_id AS category_id_2 # FROM ( @@ -149,8 +131,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q29/gpu_bdb_query_29_dask_sql.py b/gpu_bdb/queries/q29/gpu_bdb_query_29_dask_sql.py new file mode 100755 index 00000000..cb34a5e9 --- /dev/null +++ b/gpu_bdb/queries/q29/gpu_bdb_query_29_dask_sql.py @@ -0,0 +1,84 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.q29_utils import ( + q29_limit, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + n_workers = len(client.scheduler_info()["workers"]) + + join_query = """ + -- Removed distinct as we do it in + -- dask_cudf based drop_duplicates with split_out + -- 553 M rows dont fit on single GPU (int32,int64 column) + -- TODO: Remove when we support Split Out + -- https://github.com/dask-contrib/dask-sql/issues/241 + + SELECT i_category_id, ws_order_number + FROM web_sales ws, item i + WHERE ws.ws_item_sk = i.i_item_sk + AND i.i_category_id IS NOT NULL + """ + result = c.sql(join_query) + + # Distinct Calculation + result_distinct = result.drop_duplicates(split_out=n_workers,ignore_index=True) + ## Remove the int64 index that was created + ## TODO Raise an issue for this + result_distinct = result_distinct.reset_index(drop=True) + ### Persisting because ORDER BY causes execution + c.create_table('distinct_table', result_distinct, persist=True) + + query = f""" + SELECT category_id_1, category_id_2, COUNT (*) AS cnt + FROM + ( + SELECT CAST(t1.i_category_id as BIGINT) AS category_id_1, + CAST(t2.i_category_id as BIGINT) AS category_id_2 + FROM distinct_table t1 + INNER JOIN distinct_table t2 + ON t1.ws_order_number = t2.ws_order_number + WHERE t1.i_category_id < t2.i_category_id + ) + GROUP BY 
category_id_1, category_id_2 + ORDER BY cnt DESC, category_id_1, category_id_2 + LIMIT {q29_limit} + """ + result = c.sql(query) + result = result.persist() + wait(result); + + c.drop_table("distinct_table") + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, query_func=main, sql_context=c) diff --git a/gpu_bdb/queries/q30/gpu_bdb_query_30.py b/gpu_bdb/queries/q30/gpu_bdb_query_30.py index 5f9eaac5..1f37f718 100755 --- a/gpu_bdb/queries/q30/gpu_bdb_query_30.py +++ b/gpu_bdb/queries/q30/gpu_bdb_query_30.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,23 @@ # limitations under the License. # -import sys import glob import os +import cudf +import dask_cudf + from bdb_tools.utils import ( benchmark, gpubdb_argparser, run_query, ) -from bdb_tools.readers import build_reader -from bdb_tools.sessionization import get_session_id, get_distinct_sessions, get_pairs +from bdb_tools.q30_utils import ( + q30_session_timeout_inSec, + q30_limit, + read_tables +) +from bdb_tools.sessionization import get_distinct_sessions, get_pairs from dask import delayed import numpy as np @@ -35,30 +41,10 @@ # The bottleneck of current implementation is `set-index`, once ucx is working correctly # it should go away - -### session timeout in secs -q30_session_timeout_inSec = 3600 -### query output limit -q30_limit = 40 - - -def read_tables(config): - table_reader = build_reader( - data_format=config["file_format"], - basepath=config["data_dir"], - split_row_groups=config["split_row_groups"], - ) - - item_cols = ["i_category_id", "i_item_sk"] - item_df = table_reader.read("item", relevant_cols=item_cols) - return item_df - - def 
pre_repartition_task(wcs_fn, f_item_df): """ Runs the pre-repartition task """ - import cudf wcs_cols = ["wcs_user_sk", "wcs_item_sk", "wcs_click_date_sk", "wcs_click_time_sk"] wcs_df = cudf.read_parquet(wcs_fn, columns=wcs_cols) @@ -80,8 +66,6 @@ def pre_repartition_task(wcs_fn, f_item_df): def main(client, config): - import dask_cudf - import cudf item_df = benchmark( read_tables, @@ -163,8 +147,6 @@ def main(client, config): if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster - import cudf - import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) diff --git a/gpu_bdb/queries/q30/gpu_bdb_query_30_dask_sql.py b/gpu_bdb/queries/q30/gpu_bdb_query_30_dask_sql.py new file mode 100755 index 00000000..d7ca3868 --- /dev/null +++ b/gpu_bdb/queries/q30/gpu_bdb_query_30_dask_sql.py @@ -0,0 +1,99 @@ +# +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from bdb_tools.cluster_startup import attach_to_cluster + +from bdb_tools.utils import ( + benchmark, + gpubdb_argparser, + run_query, +) + +from bdb_tools.sessionization import ( + get_distinct_sessions, + get_pairs +) + +from bdb_tools.q30_utils import ( + q30_session_timeout_inSec, + q30_limit, + read_tables +) + +from dask.distributed import wait + +def main(data_dir, client, c, config): + benchmark(read_tables, config, c, dask_profile=config["dask_profile"]) + + query_1 = """ + SELECT i_item_sk, + CAST(i_category_id AS TINYINT) AS i_category_id + FROM item + """ + item_df = c.sql(query_1) + + item_df = item_df.persist() + wait(item_df) + c.create_table("item_df", item_df, persist=False) + + query_2 = """ + SELECT wcs_user_sk, + (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec, + i_category_id + FROM web_clickstreams wcs, item_df i + WHERE wcs.wcs_item_sk = i.i_item_sk + AND i.i_category_id IS NOT NULL + AND wcs.wcs_user_sk IS NOT NULL + DISTRIBUTE BY wcs_user_sk + """ + merged_df = c.sql(query_2) + + c.drop_table("item_df") + del item_df + + distinct_session_df = merged_df.map_partitions(get_distinct_sessions, + keep_cols=["wcs_user_sk", "i_category_id"], + time_out=q30_session_timeout_inSec) + + del merged_df + pair_df = distinct_session_df.map_partitions( + get_pairs, + pair_col="i_category_id", + output_col_1="category_id_1", + output_col_2="category_id_2") + del distinct_session_df + + c.create_table('pair_df', pair_df, persist=False) + + last_query = f""" + SELECT CAST(category_id_1 AS BIGINT) AS category_id_1, + CAST(category_id_2 AS BIGINT) AS category_id_2, + COUNT(category_id_2) AS cnt + FROM pair_df + GROUP BY category_id_1, category_id_2 + ORDER BY cnt desc + LIMIT {q30_limit} + """ + result = c.sql(last_query) + + c.drop_table("pair_df") + return result + + +if __name__ == "__main__": + config = gpubdb_argparser() + client, c = attach_to_cluster(config, create_sql_context=True) + run_query(config=config, client=client, 
query_func=main, sql_context=c)