Skip to content

Commit 51b7ef9

Browse files
authored
Fix: Concat Dataframes if insert overwrite (#890)
* concat dataframes if insert overwrite
* add comment
* restructure
1 parent 2fc0949 commit 51b7ef9

File tree

4 files changed

+129
-12
lines changed

4 files changed

+129
-12
lines changed

sqlmesh/core/engine_adapter/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class EngineAdapter:
6464
DEFAULT_SQL_GEN_KWARGS: t.Dict[str, str | bool | int] = {}
6565
ESCAPE_JSON = False
6666
SUPPORTS_INDEXES = False
67+
SUPPORTS_INSERT_OVERWRITE = False
6768
SCHEMA_DIFFER = SchemaDiffer()
6869

6970
def __init__(

sqlmesh/core/engine_adapter/spark.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
class SparkEngineAdapter(EngineAdapter):
3131
DIALECT = "spark"
3232
ESCAPE_JSON = True
33+
SUPPORTS_INSERT_OVERWRITE = True
3334

3435
@property
3536
def spark(self) -> PySparkSession:

sqlmesh/core/snapshot/evaluator.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
import logging
2525
import typing as t
2626
from contextlib import contextmanager
27+
from functools import reduce
2728

29+
import pandas as pd
2830
from sqlglot import exp, select
2931
from sqlglot.executor import execute
3032

@@ -165,18 +167,34 @@ def apply(query_or_df: QueryOrDF, index: int = 0) -> None:
165167
if model.kind.is_view or model.kind.is_full
166168
else TransactionType.DML
167169
):
168-
for index, query_or_df in enumerate(queries_or_dfs):
169-
if limit and limit > 0:
170-
if isinstance(query_or_df, exp.Select):
171-
existing_limit = query_or_df.args.get("limit")
172-
if existing_limit:
173-
limit = min(
174-
limit,
175-
execute(exp.select(existing_limit.expression)).rows[0][0],
176-
)
177-
return query_or_df.head(limit) if hasattr(query_or_df, "head") else self.adapter._fetch_native_df(query_or_df.limit(limit)) # type: ignore
178-
179-
apply(query_or_df, index)
170+
if limit and limit > 0:
171+
query_or_df = next(queries_or_dfs)
172+
if isinstance(query_or_df, exp.Select):
173+
existing_limit = query_or_df.args.get("limit")
174+
if existing_limit:
175+
limit = min(
176+
limit,
177+
execute(exp.select(existing_limit.expression)).rows[0][0],
178+
)
179+
return query_or_df.head(limit) if hasattr(query_or_df, "head") else self.adapter._fetch_native_df(query_or_df.limit(limit)) # type: ignore
180+
# DataFrames, unlike SQL expressions, can provide partial results by yielding dataframes. As a result,
181+
# if the engine supports INSERT OVERWRITE and the snapshot is incremental by time range, we risk
182+
# having a partial result since each dataframe write can re-truncate partitions. To avoid this, we
183+
# union all the dataframes together before writing. For pandas this could result in OOM and a potential
184+
# workaround for that would be to serialize pandas to disk and then read it back with Spark.
185+
# Note: We assume that if multiple things are yielded from `queries_or_dfs` that they are dataframes
186+
# and not SQL expressions.
187+
elif self.adapter.SUPPORTS_INSERT_OVERWRITE and snapshot.is_incremental_by_time_range:
188+
query_or_df = reduce(
189+
lambda a, b: a.union_all(b) # type: ignore
190+
if self.adapter.is_pyspark_df(a)
191+
else pd.concat([a, b], ignore_index=True), # type: ignore
192+
queries_or_dfs,
193+
)
194+
apply(query_or_df, index=0)
195+
else:
196+
for index, query_or_df in enumerate(queries_or_dfs):
197+
apply(query_or_df, index)
180198

181199
model.run_post_hooks(
182200
context=context,

tests/core/test_snapshot_evaluator.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
IncrementalByTimeRangeKind,
1515
ModelKind,
1616
ModelKindName,
17+
PythonModel,
1718
SqlModel,
19+
TimeColumn,
1820
load_model,
1921
)
2022
from sqlmesh.core.model.meta import IntervalUnit
@@ -26,6 +28,7 @@
2628
SnapshotTableInfo,
2729
)
2830
from sqlmesh.utils.errors import ConfigError, SQLMeshError
31+
from sqlmesh.utils.metaprogramming import Executable
2932

3033

3134
@pytest.fixture
@@ -337,3 +340,97 @@ def test_audit_unversioned(mocker: MockerFixture, adapter_mock, make_snapshot):
337340
match="Cannot audit 'db.model' because it has not been versioned yet. Apply a plan first.",
338341
):
339342
evaluator.audit(snapshot=snapshot, snapshots={})
343+
344+
345+
@pytest.mark.parametrize(
346+
"input_dfs, output_dict",
347+
[
348+
(
349+
"""pd.DataFrame({"a": [1, 2, 3], "ds": ["2023-01-01", "2023-01-02", "2023-01-03"]}),
350+
pd.DataFrame({"a": [4, 5, 6], "ds": ["2023-01-04", "2023-01-05", "2023-01-06"]}),
351+
pd.DataFrame({"a": [7, 8, 9], "ds": ["2023-01-07", "2023-01-08", "2023-01-09"]})""",
352+
{
353+
"a": {
354+
0: 1,
355+
1: 2,
356+
2: 3,
357+
3: 4,
358+
4: 5,
359+
5: 6,
360+
6: 7,
361+
7: 8,
362+
8: 9,
363+
},
364+
"ds": {
365+
0: "2023-01-01",
366+
1: "2023-01-02",
367+
2: "2023-01-03",
368+
3: "2023-01-04",
369+
4: "2023-01-05",
370+
5: "2023-01-06",
371+
6: "2023-01-07",
372+
7: "2023-01-08",
373+
8: "2023-01-09",
374+
},
375+
},
376+
),
377+
(
378+
"""pd.DataFrame({"a": [1, 2, 3], "ds": ["2023-01-01", "2023-01-02", "2023-01-03"]})""",
379+
{
380+
"a": {
381+
0: 1,
382+
1: 2,
383+
2: 3,
384+
},
385+
"ds": {
386+
0: "2023-01-01",
387+
1: "2023-01-02",
388+
2: "2023-01-03",
389+
},
390+
},
391+
),
392+
],
393+
)
394+
def test_snapshot_evaluator_yield_pd(adapter_mock, make_snapshot, input_dfs, output_dict):
395+
adapter_mock.is_pyspark_df.return_value = False
396+
adapter_mock.SUPPORTS_INSERT_OVERWRITE = True
397+
adapter_mock.try_get_df = lambda x: x
398+
evaluator = SnapshotEvaluator(adapter_mock)
399+
400+
snapshot = make_snapshot(
401+
PythonModel(
402+
name="db.model",
403+
entrypoint="python_func",
404+
kind=IncrementalByTimeRangeKind(time_column=TimeColumn(column="ds", format="%Y-%m-%d")),
405+
columns={
406+
"a": "INT",
407+
"ds": "STRING",
408+
},
409+
python_env={
410+
"python_func": Executable(
411+
name="python_func",
412+
alias="python_func",
413+
path="test_snapshot_evaluator.py",
414+
payload=f"""import pandas as pd
415+
def python_func(**kwargs):
416+
for df in [
417+
{input_dfs}
418+
]:
419+
yield df""",
420+
)
421+
},
422+
)
423+
)
424+
425+
snapshot.categorize_as(SnapshotChangeCategory.BREAKING)
426+
evaluator.create([snapshot], {})
427+
428+
evaluator.evaluate(
429+
snapshot,
430+
"2023-01-01",
431+
"2023-01-09",
432+
"2023-01-09",
433+
snapshots={},
434+
)
435+
436+
assert adapter_mock.insert_overwrite_by_time_partition.call_args[0][1].to_dict() == output_dict

0 commit comments

Comments (0)