Skip to content

Commit 723e2f0

Browse files
authored
Fix: convert pyspark/snowpark dataframes into pandas when testing (#2737)
1 parent 6706a1a commit 723e2f0

File tree

3 files changed: +45 −7 lines changed

sqlmesh/core/test/definition.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -618,10 +618,9 @@ def _execute_model(self) -> pd.DataFrame:
618618
time_ctx = freeze_time(self._execution_time) if self._execution_time else nullcontext()
619619
with patch.dict(self._test_adapter_dialect.generator_class.TRANSFORMS, self._transforms):
620620
with t.cast(AbstractContextManager, time_ctx):
621-
return t.cast(
622-
pd.DataFrame,
623-
next(self.model.render(context=self.context, **self.body.get("vars", {}))),
624-
)
621+
df = next(self.model.render(context=self.context, **self.body.get("vars", {})))
622+
assert not isinstance(df, exp.Expression)
623+
return df if isinstance(df, pd.DataFrame) else df.toPandas()
625624

626625

627626
def generate_test(

tests/core/test_test.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from sqlmesh.core.config import (
1616
Config,
1717
DuckDBConnectionConfig,
18+
SparkConnectionConfig,
1819
GatewayConfig,
1920
ModelDefaultsConfig,
2021
)
@@ -1410,6 +1411,44 @@ def test_generate_input_data_using_sql(mocker: MockerFixture, tmp_path: Path) ->
14101411
)
14111412

14121413

1414+
def test_pyspark_python_model() -> None:
1415+
spark_connection_config = SparkConnectionConfig(
1416+
config={
1417+
"spark.master": "local",
1418+
"spark.sql.warehouse.dir": "/tmp/data_dir",
1419+
"spark.driver.extraJavaOptions": "-Dderby.system.home=/tmp/derby_dir",
1420+
},
1421+
)
1422+
config = Config(
1423+
gateways=GatewayConfig(test_connection=spark_connection_config),
1424+
model_defaults=ModelDefaultsConfig(dialect="spark"),
1425+
)
1426+
context = Context(config=config)
1427+
1428+
@model("pyspark_model", columns={"col": "int"})
1429+
def execute(context, start, end, execution_time, **kwargs):
1430+
return context.spark.sql("SELECT 1 AS col")
1431+
1432+
_check_successful_or_raise(
1433+
_create_test(
1434+
body=load_yaml(
1435+
"""
1436+
test_pyspark_model:
1437+
model: pyspark_model
1438+
outputs:
1439+
query:
1440+
- col: 1
1441+
"""
1442+
),
1443+
test_name="test_pyspark_model",
1444+
model=model.get_registry()["pyspark_model"].model(
1445+
module_path=Path("."), path=Path(".")
1446+
),
1447+
context=context,
1448+
).run()
1449+
)
1450+
1451+
14131452
def test_test_generation(tmp_path: Path) -> None:
14141453
init_example_project(tmp_path, dialect="duckdb")
14151454

tests/integrations/jupyter/test_magics.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ def test_evaluate(notebook, loaded_sushi_context):
201201
def test_format(notebook, sushi_context):
202202
with capture_output():
203203
test_model_path = sushi_context.path / "models" / "test_model.sql"
204-
test_model_path.write_text("MODEL(name db.test); SELECT 1 AS foo FROM table")
204+
test_model_path.write_text("MODEL(name db.test); SELECT 1 AS foo FROM t")
205205
sushi_context.load()
206-
assert test_model_path.read_text() == "MODEL(name db.test); SELECT 1 AS foo FROM table"
206+
assert test_model_path.read_text() == "MODEL(name db.test); SELECT 1 AS foo FROM t"
207207
with capture_output() as output:
208208
notebook.run_line_magic(magic_name="format", line="")
209209

@@ -218,7 +218,7 @@ def test_format(notebook, sushi_context):
218218
219219
SELECT
220220
1 AS foo
221-
FROM table"""
221+
FROM t"""
222222
)
223223

224224

0 commit comments

Comments (0)