Skip to content

Commit b7a6937

Browse files
committed
Add databricks support
1 parent 27a8074 commit b7a6937

File tree

3 files changed

+60
-8
lines changed

3 files changed

+60
-8
lines changed

sqlmesh/core/engine_adapter/databricks.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import typing as t
55
from functools import partial
66

7-
from sqlglot import exp
7+
from sqlglot import exp, parse_one
88
from sqlmesh.core.dialect import to_schema
99
from sqlmesh.core.engine_adapter.shared import (
1010
CatalogSupport,
@@ -16,6 +16,7 @@
1616
from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter
1717
from sqlmesh.core.node import IntervalUnit
1818
from sqlmesh.core.schema_diff import SchemaDiffer
19+
from sqlmesh.core.snapshot.execution_tracker import QueryExecutionTracker
1920
from sqlmesh.engines.spark.db_api.spark_session import connection, SparkSessionConnection
2021
from sqlmesh.utils.errors import SQLMeshError, MissingDefaultCatalogError
2122

@@ -34,6 +35,7 @@ class DatabricksEngineAdapter(SparkEngineAdapter):
3435
SUPPORTS_CLONING = True
3536
SUPPORTS_MATERIALIZED_VIEWS = True
3637
SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True
38+
SUPPORTS_QUERY_EXECUTION_TRACKING = True
3739
SCHEMA_DIFFER = SchemaDiffer(
3840
support_positional_add=True,
3941
support_nested_operations=True,
@@ -364,3 +366,52 @@ def _build_table_properties_exp(
364366
expressions.append(clustered_by_exp)
365367
properties = exp.Properties(expressions=expressions)
366368
return properties
369+
370+
def _record_execution_stats(
    self, sql: str, rowcount: t.Optional[int] = None, bytes_processed: t.Optional[int] = None
) -> None:
    """Record row/byte counts for ``sql`` with the ``QueryExecutionTracker``.

    Cursor-reported counts are unreliable for Delta writes, so when the statement
    targets a table we read the most recent ``WRITE`` entry from
    ``DESCRIBE HISTORY`` and prefer its ``numOutputRows`` / ``numOutputBytes``
    operation metrics over the values passed in.

    Args:
        sql: The SQL text that was executed.
        rowcount: Fallback row count to record if no history metrics are found.
        bytes_processed: Fallback byte count to record if no history metrics are found.
    """
    parsed = parse_one(sql, dialect=self.dialect)
    table = parsed.find(exp.Table)
    table_name = table.sql(dialect=self.dialect) if table else None

    if table_name:
        try:
            self.cursor.execute(f"DESCRIBE HISTORY {table_name}")
        except Exception:
            # Best-effort: the target may not be a Delta table (e.g. a view),
            # in which case history is unavailable and we record nothing.
            # NOTE: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit.
            return

        history = self.cursor.fetchall_arrow()
        if history.num_rows:
            history_df = history.to_pandas()
            write_df = history_df[history_df["operation"] == "WRITE"]
            write_df = write_df[write_df["timestamp"] == write_df["timestamp"].max()]
            if not write_df.empty:
                # Positional indexing is required here: after the filters above,
                # label 0 may no longer exist in the index, so
                # `write_df["operationMetrics"][0]` can raise a KeyError.
                metrics = write_df["operationMetrics"].iloc[0]
                if metrics:

                    def _metric_as_int(key: str) -> t.Optional[int]:
                        # `metrics` arrives as an iterable of (key, value) pairs
                        # (arrow map converted via pandas) — TODO confirm shape.
                        values = [metric[1] for metric in metrics if metric[0] == key]
                        if values:
                            try:
                                return int(values[0])
                            except (TypeError, ValueError):
                                pass
                        return None

                    rowcount = _metric_as_int("numOutputRows")
                    bytes_processed = _metric_as_int("numOutputBytes")

    if rowcount is not None or bytes_processed is not None:
        # if no rows were written, df contains 0 for bytes but no value for rows
        rowcount = 0 if rowcount is None and bytes_processed is not None else rowcount

        QueryExecutionTracker.record_execution(sql, rowcount, bytes_processed)

sqlmesh/core/snapshot/execution_tracker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __post_init__(self) -> None:
4141
def add_execution(
4242
self, sql: str, row_count: t.Optional[int], bytes_processed: t.Optional[int]
4343
) -> None:
44-
if row_count is not None:
44+
if row_count is not None and row_count >= 0:
4545
if self.stats.total_rows_processed is None:
4646
self.stats.total_rows_processed = row_count
4747
else:

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2414,11 +2414,12 @@ def capture_execution_stats(
24142414
# seed rows aren't tracked
24152415
assert actual_execution_stats["seed_model"].total_rows_processed is None
24162416

2417-
if ctx.mark.startswith("bigquery"):
2418-
assert actual_execution_stats["incremental_model"].total_bytes_processed
2419-
assert actual_execution_stats["full_model"].total_bytes_processed
2417+
if ctx.mark.startswith("bigquery") or ctx.mark.startswith("databricks"):
2418+
assert actual_execution_stats["incremental_model"].total_bytes_processed is not None
2419+
assert actual_execution_stats["full_model"].total_bytes_processed is not None
24202420

24212421
# run that loads 0 rows in incremental model
2422+
actual_execution_stats = {}
24222423
with patch.object(
24232424
context.console, "update_snapshot_evaluation_progress", capture_execution_stats
24242425
):
@@ -2432,9 +2433,9 @@ def capture_execution_stats(
24322433
None if ctx.mark.startswith("snowflake") else 3
24332434
)
24342435

2435-
if ctx.mark.startswith("bigquery"):
2436-
assert actual_execution_stats["incremental_model"].total_bytes_processed
2437-
assert actual_execution_stats["full_model"].total_bytes_processed
2436+
if ctx.mark.startswith("bigquery") or ctx.mark.startswith("databricks"):
2437+
assert actual_execution_stats["incremental_model"].total_bytes_processed is not None
2438+
assert actual_execution_stats["full_model"].total_bytes_processed is not None
24382439

24392440
# make and validate unmodified dev environment
24402441
no_change_plan: Plan = context.plan_builder(

0 commit comments

Comments
 (0)