Remove databricks, snowflake metadata calls

treysp · treysp · commit 7dca8daa582c · 2025-08-22T16:05:38.000-05:00
diff --git a/sqlmesh/core/engine_adapter/databricks.py b/sqlmesh/core/engine_adapter/databricks.py
@@ -4,7 +4,7 @@
 import typing as t
 from functools import partial
 
-from sqlglot import exp, parse_one
+from sqlglot import exp
 from sqlmesh.core.dialect import to_schema
 from sqlmesh.core.engine_adapter.shared import (
     CatalogSupport,
@@ -14,10 +14,8 @@
     SourceQuery,
 )
 from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter
-from sqlmesh.engines.spark.db_api.spark_session import SparkSessionCursor
 from sqlmesh.core.node import IntervalUnit
 from sqlmesh.core.schema_diff import NestedSupport
-from sqlmesh.core.snapshot.execution_tracker import QueryExecutionTracker
 from sqlmesh.engines.spark.db_api.spark_session import connection, SparkSessionConnection
 from sqlmesh.utils.errors import SQLMeshError, MissingDefaultCatalogError
 
@@ -36,7 +34,6 @@ class DatabricksEngineAdapter(SparkEngineAdapter):
     SUPPORTS_CLONING = True
     SUPPORTS_MATERIALIZED_VIEWS = True
     SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True
-    SUPPORTS_QUERY_EXECUTION_TRACKING = True
     SCHEMA_DIFFER_KWARGS = {
         "support_positional_add": True,
         "nested_support": NestedSupport.ALL,
@@ -366,73 +363,3 @@ def _build_table_properties_exp(
             expressions.append(clustered_by_exp)
             properties = exp.Properties(expressions=expressions)
         return properties
-
-    def _record_execution_stats(
-        self, sql: str, rowcount: t.Optional[int] = None, bytes_processed: t.Optional[int] = None
-    ) -> None:
-        parsed = parse_one(sql, dialect=self.dialect)
-        table = parsed.find(exp.Table)
-        table_name = table.sql(dialect=self.dialect) if table else None
-
-        if table_name:
-            try:
-                self.cursor.execute(f"DESCRIBE HISTORY {table_name}")
-            except:
-                return
-
-            history = (
-                self.cursor.fetchdf()
-                if isinstance(self.cursor, SparkSessionCursor)
-                else self.cursor.fetchall_arrow()
-            )
-            if history is not None:
-                from pandas import DataFrame as PandasDataFrame
-                from pyspark.sql import DataFrame as PySparkDataFrame
-                from pyspark.sql.connect.dataframe import DataFrame as PySparkConnectDataFrame
-
-                history_df = None
-                if isinstance(history, PandasDataFrame):
-                    history_df = history
-                elif isinstance(history, (PySparkDataFrame, PySparkConnectDataFrame)):
-                    history_df = history.toPandas()
-                else:
-                    # arrow table
-                    history_df = history.to_pandas()
-
-                if history_df is not None and not history_df.empty:
-                    write_df = history_df[history_df["operation"] == "WRITE"]
-                    write_df = write_df[write_df["timestamp"] == write_df["timestamp"].max()]
-                    if not write_df.empty and "operationMetrics" in write_df.columns:
-                        metrics = write_df["operationMetrics"].iloc[0]
-                        if metrics:
-                            rowcount = None
-                            rowcount_str = [
-                                metric[1] for metric in metrics if metric[0] == "numOutputRows"
-                            ]
-                            if rowcount_str:
-                                try:
-                                    rowcount = int(rowcount_str[0])
-                                except (TypeError, ValueError):
-                                    pass
-
-                            bytes_processed = None
-                            bytes_str = [
-                                metric[1] for metric in metrics if metric[0] == "numOutputBytes"
-                            ]
-                            if bytes_str:
-                                try:
-                                    bytes_processed = int(bytes_str[0])
-                                except (TypeError, ValueError):
-                                    pass
-
-                            if rowcount is not None or bytes_processed is not None:
-                                # if no rows were written, df contains 0 for bytes but no value for rows
-                                rowcount = (
-                                    0
-                                    if rowcount is None and bytes_processed is not None
-                                    else rowcount
-                                )
-
-                                QueryExecutionTracker.record_execution(
-                                    sql, rowcount, bytes_processed
-                                )
diff --git a/sqlmesh/core/engine_adapter/snowflake.py b/sqlmesh/core/engine_adapter/snowflake.py
@@ -2,9 +2,10 @@
 
 import contextlib
 import logging
+import re
 import typing as t
 
-from sqlglot import exp, parse_one
+from sqlglot import exp
 from sqlglot.helper import ensure_list
 from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
 from sqlglot.optimizer.qualify_columns import quote_identifiers
@@ -672,28 +673,31 @@ def _record_execution_stats(
     ) -> None:
         """Snowflake does not report row counts for CTAS like other DML operations.
 
-        They neither report the sentinel value -1 nor do they report 0 rows. Instead, they return a single data row
-        containing the string "Table <table_name> successfully created." and a row count of 1.
+        They neither report the sentinel value -1 nor do they report 0 rows. Instead, they report a rowcount
+        of 1 and return a single data row containing one of the strings:
+          - "Table <table_name> successfully created."
+          - "<table_name> already exists, statement succeeded."
 
-        We do not want to record the incorrect row count of 1, so we check whether:
-          - There is exactly one row to fetch (in general, DML operations should return no rows to fetch from the cursor)
-          - That row contains the table successfully created string
-
-        If so, we return early and do not record the row count.
+        We do not want to record the incorrect row count of 1, so we check whether that row contains either of
+        those strings. If so, we return early and do not record the row count.
         """
         if rowcount == 1:
-            query_parsed = parse_one(sql, dialect=self.dialect)
-            if isinstance(query_parsed, exp.Create):
-                if query_parsed.expression and isinstance(query_parsed.expression, exp.Select):
-                    table = query_parsed.find(exp.Table)
-                    if table:
-                        row_query = f"SELECT ROW_COUNT as row_count FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{table.db}' AND TABLE_NAME = '{table.name}'"
-                        row_query_results = self.fetchone(row_query, quote_identifiers=True)
-                        if row_query_results:
-                            rowcount = row_query_results[0]
-                        else:
-                            return
-                else:
+            results = self.cursor.fetchone()
+            if results:
+                try:
+                    results_str = str(results[0])
+                except (ValueError, TypeError):
+                    return
+
+                # Snowflake identifiers may be:
+                # - An unquoted contiguous set of [a-zA-Z0-9_$] characters
+                # - A double-quoted string that may contain spaces and nested double-quotes represented by `""`. Example: " my ""table"" name "
+                is_created = re.match(r'Table [a-zA-Z0-9_$"]*? successfully created\.', results_str)
+                is_already_exists = re.match(
+                    r'[a-zA-Z0-9_$"]*? already exists, statement succeeded\.',
+                    results_str,
+                )
+                if is_created or is_already_exists:
                     return
 
         QueryExecutionTracker.record_execution(sql, rowcount, bytes_processed)
diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py
@@ -7,13 +7,12 @@
 import sys
 import typing as t
 import shutil
-from datetime import date, datetime, timedelta
+from datetime import datetime, timedelta
 from unittest.mock import patch
 import numpy as np  # noqa: TID253
 import pandas as pd  # noqa: TID253
 import pytest
 import pytz
-import time_machine
 from sqlglot import exp, parse_one
 from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
 from sqlglot.optimizer.qualify_columns import quote_identifiers
@@ -2457,14 +2456,18 @@ def capture_execution_stats(
     assert len(physical_layer_results.tables) == len(physical_layer_results.non_temp_tables) == 3
 
     if ctx.engine_adapter.SUPPORTS_QUERY_EXECUTION_TRACKING:
-        assert actual_execution_stats["seed_model"].total_rows_processed == 7
         assert actual_execution_stats["incremental_model"].total_rows_processed == 7
         # snowflake doesn't track rows for CTAS
-        assert actual_execution_stats["full_model"].total_rows_processed == 3
+        assert actual_execution_stats["full_model"].total_rows_processed == (
+            None if ctx.mark.startswith("snowflake") else 3
+        )
+        assert actual_execution_stats["seed_model"].total_rows_processed == (
+            None if ctx.mark.startswith("snowflake") else 7
+        )
 
-        if ctx.mark.startswith("bigquery") or ctx.mark.startswith("databricks"):
-            assert actual_execution_stats["incremental_model"].total_bytes_processed is not None
-            assert actual_execution_stats["full_model"].total_bytes_processed is not None
+        if ctx.mark.startswith("bigquery"):
+            assert actual_execution_stats["incremental_model"].total_bytes_processed
+            assert actual_execution_stats["full_model"].total_bytes_processed
 
     # run that loads 0 rows in incremental model
     # - some cloud DBs error because time travel messes up token expiration