From dbd5cc9453cae088fe5e48949336867edae17608 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 11:38:04 +0100
Subject: [PATCH 1/9] query_foundry_sql(return_type='polars')

---
 docs/dev/contribute.md                        |   4 +-
 docs/examples/dataset.md                      |   4 +-
 .../foundry_dev_tools/clients/data_proxy.py   |  31 +++-
 .../clients/foundry_sql_server.py             | 144 ++++++++++--------
 .../foundry_dev_tools/foundry_api_client.py   |  31 +++-
 .../foundry_dev_tools/resources/dataset.py    |  30 ++--
 .../src/foundry_dev_tools/utils/api_types.py  |   3 +-
 .../clients/test_foundry_sql_server.py        |  11 ++
 tests/integration/resources/test_dataset.py   |   4 +
 9 files changed, 173 insertions(+), 89 deletions(-)

diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md
index 648e2fa6..469752d3 100644
--- a/docs/dev/contribute.md
+++ b/docs/dev/contribute.md
@@ -19,9 +19,11 @@ cd foundry-dev-tools
 ````{tab} Conda/Mamba
 First create an environment specifically for foundry-dev-tools with pdm and activate it
 ```shell
-mamba create -n foundry-dev-tools pdm openjdk===17
+mamba create -n foundry-dev-tools python=3.12 pdm openjdk=17
 mamba activate foundry-dev-tools
 ```
+Note that python>3.12 throws an error when running `pdm install`: `configured Python interpreter version (3.14) is newer than PyO3's maximum supported version (3.12)`.
+
 ````
 ````{tab} Without Conda/Mamba
 If you don't want to use conda or mamba, you'll need to install pdm through other means.
diff --git a/docs/examples/dataset.md b/docs/examples/dataset.md
index 1d037e77..f8687b8f 100644
--- a/docs/examples/dataset.md
+++ b/docs/examples/dataset.md
@@ -260,9 +260,7 @@ import polars as pl
 
 ctx = FoundryContext()
 ds = ctx.get_dataset_by_path("/path/to/test_dataset")
-arrow_table = ds.query_foundry_sql("SELECT *",return_type="arrow")
-
-df = pl.from_arrow(arrow_table)
+df = ds.query_foundry_sql("SELECT *", return_type="polars")
 print(df)
 ```
 ````
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
index 77ce84b0..a8544485 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
@@ -27,6 +27,7 @@
     from pathlib import Path
 
     import pandas as pd
+    import polars as pl
     import pyarrow as pa
     import pyspark
     import requests
@@ -111,7 +112,7 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> pd.core.frame.DataFrame: ...
+    ) -> pd.DataFrame: ...
 
     @overload
     def query_foundry_sql_legacy(
@@ -133,6 +134,16 @@ def query_foundry_sql_legacy(
         timeout: int = ...,
     ) -> pa.Table: ...
 
+    @overload
+    def query_foundry_sql_legacy(
+        self,
+        query: str,
+        return_type: Literal["polars"],
+        branch: Ref = ...,
+        sql_dialect: SqlDialect = ...,
+        timeout: int = ...,
+    ) -> pl.DataFrame: ...
+
     @overload
     def query_foundry_sql_legacy(
         self,
@@ -151,7 +162,7 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -160,7 +171,7 @@ def query_foundry_sql_legacy(
         branch: Ref = "master",
         sql_dialect: SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:
@@ -213,13 +224,23 @@ def query_foundry_sql_legacy(
             data=response_json["rows"],
             columns=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
         )
-        if return_type == "arrow":
+        if return_type in {"arrow", "polars"}:
             from foundry_dev_tools._optional.pyarrow import pa
 
-            return pa.table(
+            arrow_table = pa.table(
                 data=response_json["rows"],
                 names=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
             )
+            if return_type == "arrow":
+                return arrow_table
+
+            from foundry_dev_tools._optional.polars import pl
+
+            if getattr(pl, "__fake__", False):
+                msg = "The optional 'polars' package is not installed. Please install it to request polars results."
+                raise ImportError(msg)
+
+            return pl.from_arrow(arrow_table)
         if return_type == "spark":
             from foundry_dev_tools.utils.converter.foundry_spark import (
                 foundry_schema_to_spark_schema,
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
index 5a1cb706..e18bd490 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
@@ -17,6 +17,7 @@
 
 if TYPE_CHECKING:
     import pandas as pd
+    import polars as pl
     import pyarrow as pa
     import pyspark
     import requests
@@ -35,7 +36,7 @@ def query_foundry_sql(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> pd.core.frame.DataFrame: ...
+    ) -> pd.DataFrame: ...
 
     @overload
     def query_foundry_sql(
@@ -57,6 +58,16 @@ def query_foundry_sql(
         timeout: int = ...,
     ) -> pa.Table: ...
 
+    @overload
+    def query_foundry_sql(
+        self,
+        query: str,
+        return_type: Literal["polars"],
+        branch: Ref = ...,
+        sql_dialect: SqlDialect = ...,
+        timeout: int = ...,
+    ) -> pl.DataFrame: ...
+
     @overload
     def query_foundry_sql(
         self,
@@ -75,16 +86,16 @@ def query_foundry_sql(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
-    def query_foundry_sql(
+    def query_foundry_sql(  # noqa: C901
         self,
         query: str,
         return_type: SQLReturnType = "pandas",
         branch: Ref = "master",
         sql_dialect: SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the Foundry SQL server with spark SQL dialect.
 
         Uses Arrow IPC to communicate with the Foundry SQL Server Endpoint.
@@ -105,75 +116,80 @@ def query_foundry_sql(  # noqa: C901
             timeout: Query Timeout, default value is 600 seconds
 
         Returns:
-            :external+pandas:py:class:`~pandas.DataFrame` | :external+pyarrow:py:class:`~pyarrow.Table` | :external+spark:py:class:`~pyspark.sql.DataFrame`:
+            :external+pandas:py:class:`~pandas.DataFrame` | :external+polars:py:class:`~polars.DataFrame` | :external+pyarrow:py:class:`~pyarrow.Table` | :external+spark:py:class:`~pyspark.sql.DataFrame`:
 
-            A pandas DataFrame, Spark DataFrame or pyarrow.Table with the result.
+            A pandas, polars, Spark DataFrame or pyarrow.Table with the result.
 
         Raises:
             ValueError: Only direct read eligible queries can be returned as arrow Table.
 
         """  # noqa: E501
-        if return_type != "raw":
-            try:
-                response_json = self.api_queries_execute(
-                    query,
-                    branch=branch,
-                    dialect=sql_dialect,
-                    timeout=timeout,
-                ).json()
+        if return_type == "raw":
+            warnings.warn("Falling back to query_foundry_sql_legacy!")
+            return self.context.data_proxy.query_foundry_sql_legacy(
+                query=query,
+                return_type=return_type,
+                branch=branch,
+                sql_dialect=sql_dialect,
+                timeout=timeout,
+            )
+
+        try:
+            response_json = self.api_queries_execute(
+                query,
+                branch=branch,
+                dialect=sql_dialect,
+                timeout=timeout,
+            ).json()
+            query_id = response_json["queryId"]
+            status = response_json["status"]
+
+            if status != {"ready": {}, "type": "ready"}:
+                start_time = time.time()
                 query_id = response_json["queryId"]
-                status = response_json["status"]
-
-                if status != {"ready": {}, "type": "ready"}:
-                    start_time = time.time()
-                    query_id = response_json["queryId"]
-                    while response_json["status"]["type"] == "running":
-                        response = self.api_queries_status(query_id)
-                        response_json = response.json()
-                        if response_json["status"]["type"] == "failed":
-                            raise FoundrySqlQueryFailedError(response)
-                        if time.time() > start_time + timeout:
-                            raise FoundrySqlQueryClientTimedOutError(response, timeout=timeout)
-                        time.sleep(0.2)
-
-                arrow_stream_reader = self.read_fsql_query_results_arrow(query_id=query_id)
-                if return_type == "pandas":
-                    return arrow_stream_reader.read_pandas()
-
-                if return_type == "spark":
-                    from foundry_dev_tools.utils.converter.foundry_spark import (
-                        arrow_stream_to_spark_dataframe,
-                    )
-
-                    return arrow_stream_to_spark_dataframe(arrow_stream_reader)
-                return arrow_stream_reader.read_all()
+                while response_json["status"]["type"] == "running":
+                    response = self.api_queries_status(query_id)
+                    response_json = response.json()
+                    if response_json["status"]["type"] == "failed":
+                        raise FoundrySqlQueryFailedError(response)
+                    if time.time() > start_time + timeout:
+                        raise FoundrySqlQueryClientTimedOutError(response, timeout=timeout)
+                    time.sleep(0.2)
+
+            arrow_stream_reader = self.read_fsql_query_results_arrow(query_id=query_id)
+            if return_type == "pandas":
+                return arrow_stream_reader.read_pandas()
+            if return_type == "spark":
+                from foundry_dev_tools.utils.converter.foundry_spark import (
+                    arrow_stream_to_spark_dataframe,
+                )
+
+                return arrow_stream_to_spark_dataframe(arrow_stream_reader)
+            if return_type == "polars":
+                from foundry_dev_tools._optional.polars import pl
 
-            return self._query_fsql(
-                query=query,
-                branch=branch,
-                return_type=return_type,
+                if getattr(pl, "__fake__", False):
+                    msg = "The optional 'polars' package is not installed. Please install it to request polars results."
+                    raise ImportError(msg)  # noqa: TRY301
+
+                arrow_table = arrow_stream_reader.read_all()
+                return pl.from_arrow(arrow_table)
+            if return_type == "arrow":
+                return arrow_stream_reader.read_all()
+            raise ValueError(f"Unsupported return_type: {return_type}")  # noqa: EM102, TRY003
+        except (
+            FoundrySqlSerializationFormatNotImplementedError,
+            ImportError,
+        ) as exc:
+            if return_type == "arrow":
+                msg = (
+                    "Only direct read eligible queries can be returned as arrow Table. Consider using setting"
+                    " return_type to 'pandas'."
                 )
-        except (
-            FoundrySqlSerializationFormatNotImplementedError,
-            ImportError,
-        ) as exc:
-            if return_type == "arrow":
-                msg = (
-                    "Only direct read eligible queries can be returned as arrow Table. Consider using setting"
-                    " return_type to 'pandas'."
-                )
-                raise ValueError(
-                    msg,
-                ) from exc
-
-            warnings.warn("Falling back to query_foundry_sql_legacy!")
-            return self.context.data_proxy.query_foundry_sql_legacy(
-                query=query,
-                return_type=return_type,
-                branch=branch,
-                sql_dialect=sql_dialect,
-                timeout=timeout,
-            )
+            raise ValueError(
+                msg,
+            ) from exc
+        raise
 
     def read_fsql_query_results_arrow(self, query_id: str) -> pa.ipc.RecordBatchStreamReader:
         """Create a bytes io reader if query returned arrow."""
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
index 337591e2..1c54276c 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
@@ -2,7 +2,7 @@
 
 One of the gaols of this module is to be self-contained so that it can be
 dropped into any python installation with minimal dependency to 'requests'
-Optional dependencies for the SQL functionality to work are pandas and pyarrow.
+Optional dependencies for the SQL functionality to work are pandas, polars and pyarrow.
 
 """
@@ -41,6 +41,7 @@
     from collections.abc import Iterator
 
     import pandas as pd
+    import polars as pl
     import pyarrow as pa
     import pyspark
     import requests
@@ -924,6 +925,16 @@ def query_foundry_sql_legacy(
         timeout: int = ...,
     ) -> pa.Table: ...
 
+    @overload
+    def query_foundry_sql_legacy(
+        self,
+        query: str,
+        return_type: Literal["polars"],
+        branch: api_types.Ref = ...,
+        sql_dialect: api_types.SqlDialect = ...,
+        timeout: int = ...,
+    ) -> pl.DataFrame: ...
+
     @overload
     def query_foundry_sql_legacy(
         self,
@@ -942,7 +953,7 @@ def query_foundry_sql_legacy(
         branch: api_types.Ref = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -951,7 +962,7 @@ def query_foundry_sql_legacy(
         branch: api_types.Ref = "master",
         sql_dialect: api_types.SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:
@@ -1022,6 +1033,16 @@ def query_foundry_sql(
         timeout: int = ...,
     ) -> pa.Table: ...
 
+    @overload
+    def query_foundry_sql(
+        self,
+        query: str,
+        return_type: Literal["polars"],
+        branch: api_types.Ref = ...,
+        sql_dialect: api_types.SqlDialect = ...,
+        timeout: int = ...,
+    ) -> pl.DataFrame: ...
+
     @overload
     def query_foundry_sql(
         self,
@@ -1040,7 +1061,7 @@ def query_foundry_sql(
         branch: api_types.Ref = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql(
         self,
@@ -1049,7 +1070,7 @@ def query_foundry_sql(
         branch: api_types.Ref = "master",
         sql_dialect: api_types.SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the Foundry SQL server with spark SQL dialect.
 
         Uses Arrow IPC to communicate with the Foundry SQL Server Endpoint.
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py
index f78d2b25..9d0e5d60 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py
@@ -31,7 +31,7 @@
 
     import pandas as pd
     import pandas.core.frame
-    import polars.dataframe.frame
+    import polars as pl
     import pyarrow as pa
     import pyspark.sql
 
@@ -539,7 +539,7 @@ def download_files_temporary(
 
     def save_dataframe(
         self,
-        df: pandas.core.frame.DataFrame | polars.dataframe.frame.DataFrame | pyspark.sql.DataFrame,
+        df: pandas.core.frame.DataFrame | pl.DataFrame | pyspark.sql.DataFrame,
         transaction_type: api_types.FoundryTransaction = "SNAPSHOT",
         foundry_schema: api_types.FoundrySchema | None = None,
     ) -> Self:
@@ -717,6 +717,15 @@ def query_foundry_sql(
         timeout: int = ...,
     ) -> pa.Table: ...
 
+    @overload
+    def query_foundry_sql(
+        self,
+        query: str,
+        return_type: Literal["polars"],
+        sql_dialect: api_types.SqlDialect = ...,
+        timeout: int = ...,
+    ) -> pl.DataFrame: ...
+
     @overload
     def query_foundry_sql(
         self,
@@ -733,7 +742,7 @@ def query_foundry_sql(
         return_type: api_types.SQLReturnType = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql(
         self,
@@ -741,7 +750,7 @@ def query_foundry_sql(
         return_type: api_types.SQLReturnType = "pandas",
         sql_dialect: api_types.SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Wrapper around :py:meth:`foundry_dev_tools.clients.foundry_sql_server.FoundrySqlServerClient.query_foundry_sql`.
 
         But it automatically prepends the dataset location, so instead of:
@@ -783,17 +792,18 @@ def to_pandas(self) -> pandas.core.frame.DataFrame:
         """
         return self.query_foundry_sql("SELECT *", return_type="pandas")
 
-    def to_polars(self) -> polars.dataframe.frame.DataFrame:
+    def to_polars(self) -> pl.DataFrame:
         """Get dataset as a :py:class:`polars.DataFrame`.
 
         Via :py:meth:`foundry_dev_tools.resources.dataset.Dataset.query_foundry_sql`
 
         """
-        try:
-            import polars as pl
-        except ImportError as e:
+        from foundry_dev_tools._optional.polars import pl
+
+        if getattr(pl, "__fake__", False):
             msg = "The optional 'polars' package is not installed. Please install it to use the 'to_polars' method"
-            raise ImportError(msg) from e
-        return pl.from_arrow(self.to_arrow())
+            raise ImportError(msg)
+
+        return self.query_foundry_sql("SELECT *", return_type="polars")
 
     @contextmanager
     def transaction_context(
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/utils/api_types.py b/libs/foundry-dev-tools/src/foundry_dev_tools/utils/api_types.py
index f262e97a..0eb3a9c6 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/utils/api_types.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/utils/api_types.py
@@ -95,10 +95,11 @@ def assert_in_literal(option, literal, variable_name) -> None:  # noqa: ANN001
 SqlDialect = Literal["ANSI", "SPARK"]
 """The SQL Dialect for Foundry SQL queries."""
 
-SQLReturnType = Literal["pandas", "spark", "arrow", "raw"]
+SQLReturnType = Literal["pandas", "polars", "spark", "arrow", "raw"]
 """The return_types for sql queries.
 
 pandas: :external+pandas:py:class:`pandas.DataFrame`
+polars: :external+polars:py:class:`polars.DataFrame`
 arrow: :external+pyarrow:py:class:`pyarrow.Table`
 spark: :external+spark:py:class:`~pyspark.sql.DataFrame`
 raw: Tuple of (foundry_schema, data) (can only be used in legacy)
diff --git a/tests/integration/clients/test_foundry_sql_server.py b/tests/integration/clients/test_foundry_sql_server.py
index 7dd079dc..929cddde 100644
--- a/tests/integration/clients/test_foundry_sql_server.py
+++ b/tests/integration/clients/test_foundry_sql_server.py
@@ -1,3 +1,4 @@
+import polars as pl
 import pytest
 
 from foundry_dev_tools.errors.dataset import BranchNotFoundError, DatasetHasNoSchemaError, DatasetNotFoundError
@@ -12,6 +13,16 @@ def test_smoke():
     assert one_row_one_column.shape == (1, 1)
 
 
+def test_polars_return_type():
+    polars_df = TEST_SINGLETON.ctx.foundry_sql_server.query_foundry_sql(
+        f"SELECT sepal_length FROM `{TEST_SINGLETON.iris_new.rid}` LIMIT 2",
+        return_type="polars",
+    )
+    assert isinstance(polars_df, pl.DataFrame)
+    assert polars_df.height == 2
+    assert polars_df.width == 1
+
+
 def test_exceptions():
     with pytest.raises(BranchNotFoundError) as exc:
         TEST_SINGLETON.ctx.foundry_sql_server.query_foundry_sql(
diff --git a/tests/integration/resources/test_dataset.py b/tests/integration/resources/test_dataset.py
index a764d0d8..6e202395 100644
--- a/tests/integration/resources/test_dataset.py
+++ b/tests/integration/resources/test_dataset.py
@@ -97,6 +97,10 @@ def test_crud_dataset(spark_session, tmp_path):  # noqa: PLR0915
     ds_polars_branch = TEST_SINGLETON.ctx.get_dataset(ds.rid, branch="polars")
     ds_polars_branch.save_dataframe(polars_df)
     pl_assert_frame_equal(polars_df, ds_polars_branch.to_polars())
+    pl_assert_frame_equal(
+        polars_df,
+        ds_polars_branch.query_foundry_sql("SELECT *", return_type="polars"),
+    )
 
     ds_spark_branch = TEST_SINGLETON.ctx.get_dataset(ds.rid, branch="spark")
     ds_spark_branch.save_dataframe(spark_df)

From 18d0657d74bd04e7052a6e741fd502245b54ff84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 11:43:05 +0100
Subject: [PATCH 2/9] docs update

---
 docs/dev/contribute.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md
index 469752d3..e21dae67 100644
--- a/docs/dev/contribute.md
+++ b/docs/dev/contribute.md
@@ -22,9 +22,9 @@ First create an environment specifically for foundry-dev-tools with pdm and acti
 mamba create -n foundry-dev-tools python=3.12 pdm openjdk=17
 mamba activate foundry-dev-tools
 ```
+````
 Note that python>3.12 throws an error when running `pdm install`: `configured Python interpreter version (3.14) is newer than PyO3's maximum supported version (3.12)`.
 
-````
 ````{tab} Without Conda/Mamba
 If you don't want to use conda or mamba, you'll need to install pdm through other means.
 It is available in most Linux package managers, Homebrew, Scoop and also via pip.

From 06b0f45a8483a3f3cf756955d1496950808397e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 11:51:50 +0100
Subject: [PATCH 3/9] use assert_in_literal

---
 .../src/foundry_dev_tools/clients/foundry_sql_server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
index e18bd490..56fc3b5b 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
@@ -124,6 +124,8 @@ def query_foundry_sql(  # noqa: C901
         ValueError: Only direct read eligible queries can be returned as arrow Table.
 
         """  # noqa: E501
+        assert_in_literal(return_type, SQLReturnType, "return_type")
+
         if return_type == "raw":
             warnings.warn("Falling back to query_foundry_sql_legacy!")
             return self.context.data_proxy.query_foundry_sql_legacy(
@@ -176,7 +178,6 @@ def query_foundry_sql(  # noqa: C901
                 return pl.from_arrow(arrow_table)
             if return_type == "arrow":
                 return arrow_stream_reader.read_all()
-            raise ValueError(f"Unsupported return_type: {return_type}")  # noqa: EM102, TRY003
         except (
             FoundrySqlSerializationFormatNotImplementedError,
             ImportError,

From 9b0f911ef00be319a54f542147085384faab5288 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 12:03:10 +0100
Subject: [PATCH 4/9] do not extend query_foundry_sql_legacy

---
 .../foundry_dev_tools/clients/data_proxy.py   | 28 +++----------------
 .../foundry_dev_tools/foundry_api_client.py   | 14 ++--------
 .../foundry_dev_tools/resources/dataset.py    |  8 +-----
 3 files changed, 7 insertions(+), 43 deletions(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
index a8544485..7acdb62c 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
@@ -27,7 +27,6 @@
     from pathlib import Path
 
     import pandas as pd
-    import polars as pl
     import pyarrow as pa
     import pyspark
     import requests
@@ -134,16 +133,6 @@ def query_foundry_sql_legacy(
         timeout: int = ...,
     ) -> pa.Table: ...
 
-    @overload
-    def query_foundry_sql_legacy(
-        self,
-        query: str,
-        return_type: Literal["polars"],
-        branch: Ref = ...,
-        sql_dialect: SqlDialect = ...,
-        timeout: int = ...,
-    ) -> pl.DataFrame: ...
-
     @overload
     def query_foundry_sql_legacy(
         self,
@@ -162,7 +151,7 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -171,7 +160,7 @@ def query_foundry_sql_legacy(
         branch: Ref = "master",
         sql_dialect: SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:
@@ -224,23 +213,13 @@ def query_foundry_sql_legacy(
             data=response_json["rows"],
             columns=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
         )
-        if return_type in {"arrow", "polars"}:
+        if return_type == "arrow":
             from foundry_dev_tools._optional.pyarrow import pa
 
-            arrow_table = pa.table(
+            return pa.table(
                 data=response_json["rows"],
                 names=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
             )
-            if return_type == "arrow":
-                return arrow_table
-
-            from foundry_dev_tools._optional.polars import pl
-
-            if getattr(pl, "__fake__", False):
-                msg = "The optional 'polars' package is not installed. Please install it to request polars results."
-                raise ImportError(msg)
-
-            return pl.from_arrow(arrow_table)
         if return_type == "spark":
             from foundry_dev_tools.utils.converter.foundry_spark import (
                 foundry_schema_to_spark_schema,
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
index 1c54276c..c38686a3 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
@@ -925,16 +925,6 @@ def query_foundry_sql_legacy(
         timeout: int = ...,
     ) -> pa.Table: ...
 
-    @overload
-    def query_foundry_sql_legacy(
-        self,
-        query: str,
-        return_type: Literal["polars"],
-        branch: api_types.Ref = ...,
-        sql_dialect: api_types.SqlDialect = ...,
-        timeout: int = ...,
-    ) -> pl.DataFrame: ...
-
     @overload
     def query_foundry_sql_legacy(
         self,
@@ -953,7 +943,7 @@ def query_foundry_sql_legacy(
         branch: api_types.Ref = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -962,7 +952,7 @@ def query_foundry_sql_legacy(
         branch: api_types.Ref = "master",
         sql_dialect: api_types.SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py
index 9d0e5d60..8a731878 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/resources/dataset.py
@@ -697,7 +697,7 @@ def query_foundry_sql(
         return_type: Literal["pandas"],
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> pd.core.frame.DataFrame: ...
+    ) -> pd.DataFrame: ...
 
     @overload
     def query_foundry_sql(
@@ -797,12 +797,6 @@ def to_polars(self) -> pl.DataFrame:
 
         Via :py:meth:`foundry_dev_tools.resources.dataset.Dataset.query_foundry_sql`
 
         """
-        from foundry_dev_tools._optional.polars import pl
-
-        if getattr(pl, "__fake__", False):
-            msg = "The optional 'polars' package is not installed. Please install it to use the 'to_polars' method"
-            raise ImportError(msg)
-
         return self.query_foundry_sql("SELECT *", return_type="polars")
 
     @contextmanager

From 65bb9f8dc010e030cd71c07bc5e0b7676ad85049 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 12:06:12 +0100
Subject: [PATCH 5/9] reduce diff

---
 .../src/foundry_dev_tools/clients/data_proxy.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
index 7acdb62c..77ce84b0 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
@@ -111,7 +111,7 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> pd.DataFrame: ...
+    ) -> pd.core.frame.DataFrame: ...
 
     @overload
     def query_foundry_sql_legacy(
@@ -151,7 +151,7 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -160,7 +160,7 @@ def query_foundry_sql_legacy(
         branch: Ref = "master",
         sql_dialect: SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:
@@ -220,7 +220,6 @@ def query_foundry_sql_legacy(
                 data=response_json["rows"],
                 names=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
             )
-
         if return_type == "spark":
             from foundry_dev_tools.utils.converter.foundry_spark import (
                 foundry_schema_to_spark_schema,

From 883e5ac7599ff5ad9960e23e90ad42c1a0ab8c76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 12:09:40 +0100
Subject: [PATCH 6/9] reduce diff v2

---
 .../src/foundry_dev_tools/foundry_api_client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
index c38686a3..c4b91d0a 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
@@ -943,7 +943,7 @@ def query_foundry_sql_legacy(
         branch: api_types.Ref = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -952,7 +952,7 @@ def query_foundry_sql_legacy(
         branch: api_types.Ref = "master",
         sql_dialect: api_types.SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:

From 403181a810a95d73a6de3fd8bf9dc28681531a29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Sat, 17 Jan 2026 12:11:45 +0100
Subject: [PATCH 7/9] v3

---
 .../src/foundry_dev_tools/foundry_api_client.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
index c4b91d0a..cf0064ba 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/foundry_api_client.py
@@ -1001,7 +1001,7 @@ def query_foundry_sql(
         branch: api_types.Ref = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> pd.core.frame.DataFrame: ...
+    ) -> pd.DataFrame: ...
 
     @overload
     def query_foundry_sql(
@@ -1051,7 +1051,7 @@ def query_foundry_sql(
         branch: api_types.Ref = ...,
         sql_dialect: api_types.SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql(
         self,
@@ -1060,7 +1060,7 @@ def query_foundry_sql(
         branch: api_types.Ref = "master",
         sql_dialect: api_types.SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the Foundry SQL server with spark SQL dialect.
 
         Uses Arrow IPC to communicate with the Foundry SQL Server Endpoint.

From 6ca205eacb6476d75241124c3413b9aa4154fe30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bernhard=20Sch=C3=A4fer?=
Date: Wed, 21 Jan 2026 11:11:07 +0100
Subject: [PATCH 8/9] add legacy query polars to follow same workflow as pandas

---
 .../foundry_dev_tools/clients/data_proxy.py   |  30 +++-
 .../clients/foundry_sql_server.py             | 131 +++++++++---------
 2 files changed, 91 insertions(+), 70 deletions(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
index 77ce84b0..687d5948 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
@@ -27,6 +27,7 @@
     from pathlib import Path
 
     import pandas as pd
+    import polars as pl
     import pyarrow as pa
     import pyspark
     import requests
@@ -112,7 +112,17 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> pd.core.frame.DataFrame: ...
+    ) -> pd.DataFrame: ...
+
+    @overload
+    def query_foundry_sql_legacy(
+        self,
+        query: str,
+        return_type: Literal["polars"],
+        branch: Ref = ...,
+        sql_dialect: SqlDialect = ...,
+        timeout: int = ...,
+    ) -> pl.DataFrame: ...
 
     @overload
     def query_foundry_sql_legacy(
@@ -151,7 +162,7 @@ def query_foundry_sql_legacy(
         branch: Ref = ...,
         sql_dialect: SqlDialect = ...,
         timeout: int = ...,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame: ...
 
     def query_foundry_sql_legacy(
         self,
@@ -160,7 +171,7 @@ def query_foundry_sql_legacy(
         branch: Ref = "master",
         sql_dialect: SqlDialect = "SPARK",
         timeout: int = 600,
-    ) -> tuple[dict, list[list]] | pd.core.frame.DataFrame | pa.Table | pyspark.sql.DataFrame:
+    ) -> tuple[dict, list[list]] | pd.DataFrame | pl.DataFrame | pa.Table | pyspark.sql.DataFrame:
         """Queries the dataproxy query API with spark SQL.
 
         Example:
@@ -206,6 +217,9 @@ def query_foundry_sql_legacy(
         response_json = response.json()
         if return_type == "raw":
             return response_json["foundrySchema"], response_json["rows"]
+        # return_type arrows, pandas and polars use the FakeModule implementation in
+        # their _optional packages. The FakeModule throws an ImportError when trying
+        # to access attributes of the module, so no need to explicitly catch ImportError.
         if return_type == "pandas":
             from foundry_dev_tools._optional.pandas import pd
 
@@ -213,13 +227,19 @@ def query_foundry_sql_legacy(
             data=response_json["rows"],
             columns=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
         )
-        if return_type == "arrow":
+        if return_type in {"arrow", "polars"}:
             from foundry_dev_tools._optional.pyarrow import pa
 
-            return pa.table(
+            table = pa.table(
                 data=response_json["rows"],
                 names=[e["name"] for e in response_json["foundrySchema"]["fieldSchemaList"]],
             )
+            if return_type == "arrow":
+                return table
+
+            from foundry_dev_tools._optional.polars import pl
+
+            return pl.from_arrow(table)
         if return_type == "spark":
             from foundry_dev_tools.utils.converter.foundry_spark import (
                 foundry_schema_to_spark_schema,
diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
index 56fc3b5b..26a6d853 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/foundry_sql_server.py
@@ -124,73 +124,74 @@ def query_foundry_sql(  # noqa: C901
         ValueError: Only direct read eligible queries can be returned as arrow Table.
""" # noqa: E501 - assert_in_literal(return_type, SQLReturnType, "return_type") - - if return_type == "raw": - warnings.warn("Falling back to query_foundry_sql_legacy!") - return self.context.data_proxy.query_foundry_sql_legacy( - query=query, - return_type=return_type, - branch=branch, - sql_dialect=sql_dialect, - timeout=timeout, - ) - - try: - response_json = self.api_queries_execute( - query, - branch=branch, - dialect=sql_dialect, - timeout=timeout, - ).json() - query_id = response_json["queryId"] - status = response_json["status"] - - if status != {"ready": {}, "type": "ready"}: - start_time = time.time() + if return_type != "raw": + try: + response_json = self.api_queries_execute( + query, + branch=branch, + dialect=sql_dialect, + timeout=timeout, + ).json() query_id = response_json["queryId"] - while response_json["status"]["type"] == "running": - response = self.api_queries_status(query_id) - response_json = response.json() - if response_json["status"]["type"] == "failed": - raise FoundrySqlQueryFailedError(response) - if time.time() > start_time + timeout: - raise FoundrySqlQueryClientTimedOutError(response, timeout=timeout) - time.sleep(0.2) - - arrow_stream_reader = self.read_fsql_query_results_arrow(query_id=query_id) - if return_type == "pandas": - return arrow_stream_reader.read_pandas() - if return_type == "spark": - from foundry_dev_tools.utils.converter.foundry_spark import ( - arrow_stream_to_spark_dataframe, - ) - - return arrow_stream_to_spark_dataframe(arrow_stream_reader) - if return_type == "polars": - from foundry_dev_tools._optional.polars import pl - - if getattr(pl, "__fake__", False): - msg = "The optional 'polars' package is not installed. Please install it to request polars results." - raise ImportError(msg) # noqa: TRY301 - - arrow_table = arrow_stream_reader.read_all() - return pl.from_arrow(arrow_table) - if return_type == "arrow": + status = response_json["status"] + + if status != {"ready": {}, "type": "ready"}: + start_time = time.time() + query_id = response_json["queryId"] + while response_json["status"]["type"] == "running": + response = self.api_queries_status(query_id) + response_json = response.json() + if response_json["status"]["type"] == "failed": + raise FoundrySqlQueryFailedError(response) + if time.time() > start_time + timeout: + raise FoundrySqlQueryClientTimedOutError(response, timeout=timeout) + time.sleep(0.2) + + arrow_stream_reader = self.read_fsql_query_results_arrow(query_id=query_id) + if return_type == "pandas": + return arrow_stream_reader.read_pandas() + if return_type == "polars": + # The FakeModule implementation used in the _optional packages + # throws an ImportError when trying to access attributes of the module. + # This ImportError is caught below to fall back to query_foundry_sql_legacy + # which will again raise an ImportError when polars is not installed. + from foundry_dev_tools._optional.polars import pl + + arrow_table = arrow_stream_reader.read_all() + return pl.from_arrow(arrow_table) + + if return_type == "spark": + from foundry_dev_tools.utils.converter.foundry_spark import ( + arrow_stream_to_spark_dataframe, + ) + + return arrow_stream_to_spark_dataframe(arrow_stream_reader) return arrow_stream_reader.read_all() - except ( - FoundrySqlSerializationFormatNotImplementedError, - ImportError, - ) as exc: - if return_type == "arrow": - msg = ( - "Only direct read eligible queries can be returned as arrow Table. Consider using setting" - " return_type to 'pandas'." 
-                )
-                raise ValueError(
-                    msg,
-                ) from exc
-            raise
+            except (
+                FoundrySqlSerializationFormatNotImplementedError,
+                ImportError,
+            ) as exc:
+                # Swallow exception when return_type != 'arrow'
+                # to fall back to query_foundry_sql_legacy
+                if return_type == "arrow":
+                    msg = (
+                        "Only direct read eligible queries can be returned as arrow Table. Consider using setting"
+                        " return_type to 'pandas'."
+                    )
+                    raise ValueError(
+                        msg,
+                    ) from exc
+
+        # this fallback is not only used if return_type is 'raw', but also when one of
+        # the above exceptions is caught and return_type != 'arrow'
+        warnings.warn("Falling back to query_foundry_sql_legacy!")
+        return self.context.data_proxy.query_foundry_sql_legacy(
+            query=query,
+            return_type=return_type,
+            branch=branch,
+            sql_dialect=sql_dialect,
+            timeout=timeout,
+        )
 
     def read_fsql_query_results_arrow(self, query_id: str) -> pa.ipc.RecordBatchStreamReader:
         """Create a bytes io reader if query returned arrow."""

From dee9e35cde0aa8bfba5bac32e688d4361935a960 Mon Sep 17 00:00:00 2001
From: Nicolas Renkamp
Date: Fri, 23 Jan 2026 14:56:48 +0100
Subject: [PATCH 9/9] Update libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py

---
 .../src/foundry_dev_tools/clients/data_proxy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
index 687d5948..c682ada9 100644
--- a/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
+++ b/libs/foundry-dev-tools/src/foundry_dev_tools/clients/data_proxy.py
@@ -217,7 +217,7 @@ def query_foundry_sql_legacy(
         response_json = response.json()
         if return_type == "raw":
             return response_json["foundrySchema"], response_json["rows"]
-        # return_type arrows, pandas and polars use the FakeModule implementation in
+        # return_type arrow, pandas and polars use the FakeModule implementation in
         # their _optional packages. The FakeModule throws an ImportError when trying
         # to access attributes of the module, so no need to explicitly catch ImportError.
         if return_type == "pandas":
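Taken together, the series changes how polars results are obtained: instead of fetching a pyarrow.Table and converting it by hand, callers pass `return_type="polars"`, and `Dataset.to_polars()` is reduced to the same call. A minimal usage sketch, mirroring the docs/examples/dataset.md snippet updated in PATCH 1/9 — it assumes a configured Foundry stack with the optional polars dependency installed, and the dataset path is a placeholder:

```python
# Sketch of the API surface after this series; the dataset path is a placeholder.
import polars as pl

from foundry_dev_tools import FoundryContext

ctx = FoundryContext()
ds = ctx.get_dataset_by_path("/path/to/test_dataset")

# One call replaces query_foundry_sql(..., return_type="arrow") + pl.from_arrow(...).
df = ds.query_foundry_sql("SELECT *", return_type="polars")
assert isinstance(df, pl.DataFrame)

# The convenience wrapper now delegates to the same code path (PATCH 1/9 and 4/9).
df2 = ds.to_polars()
print(df.head())
```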
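The comments added in PATCH 8/9 lean on the FakeModule pattern behind `from foundry_dev_tools._optional.polars import pl`: importing a missing optional package yields a stand-in module whose attribute access raises ImportError, so `pl.from_arrow(...)` fails exactly when polars is actually needed and the caller can catch ImportError to fall back. The shim itself is not part of this diff; the sketch below is an assumption-laden reconstruction — only the FakeModule name, the `__fake__` flag probed in earlier revisions of the series, and the raise-on-attribute-access behaviour come from the patches:

```python
# Hedged sketch of an optional-import shim; the real foundry_dev_tools._optional
# implementation may differ. optional_import() is a hypothetical helper name.
from __future__ import annotations

import importlib
from types import ModuleType
from typing import Any


class FakeModule(ModuleType):
    """Stand-in for a missing optional package."""

    __fake__ = True  # probed via getattr(mod, "__fake__", False) in earlier patches

    def __getattr__(self, item: str) -> Any:
        # Any attribute access on the stand-in fails loudly with ImportError,
        # so code paths that never touch the optional dependency keep working.
        msg = f"The optional '{self.__name__}' package is not installed."
        raise ImportError(msg)


def optional_import(name: str) -> ModuleType:
    """Return the real module when available, otherwise the FakeModule shim."""
    try:
        return importlib.import_module(name)
    except ImportError:
        return FakeModule(name)


pl = optional_import("polars")
# pl.from_arrow(...) raises ImportError when polars is missing, which callers
# such as query_foundry_sql() can catch to fall back to the legacy code path.
```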
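The repeated `@overload` blocks added throughout the series are what let `return_type` drive the static return type: each `Literal[...]` value maps to one concrete frame type, while the implementation keeps the wide union. A self-contained toy version of the pattern, simplified from the diff (the real signatures also carry `branch`, `sql_dialect` and `timeout`, and this toy ignores the SQL text; it assumes pandas and polars are installed when run):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, overload

if TYPE_CHECKING:
    import pandas as pd
    import polars as pl

SQLReturnType = Literal["pandas", "polars"]


@overload
def query(sql: str, return_type: Literal["pandas"] = ...) -> pd.DataFrame: ...
@overload
def query(sql: str, return_type: Literal["polars"]) -> pl.DataFrame: ...


def query(sql: str, return_type: SQLReturnType = "pandas") -> pd.DataFrame | pl.DataFrame:
    # Toy implementation: returns a tiny frame of the requested flavour,
    # importing the heavy dependency lazily, like the patched client code.
    if return_type == "pandas":
        import pandas as pd

        return pd.DataFrame({"col": [1, 2]})
    import polars as pl

    return pl.DataFrame({"col": [1, 2]})


df = query("SELECT 1", return_type="polars")  # checkers infer polars.DataFrame
```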