Fix: stop treating dbt schema data types as columns_to_types

georgesittas · georgesittas · commit ffe7a0444949 · 2025-08-27T21:21:48.000+03:00
diff --git a/examples/sushi_dbt/models/schema.yml b/examples/sushi_dbt/models/schema.yml
@@ -36,6 +36,8 @@ models:
               field: waiter_id
       - name: revenue
         description: Revenue from orders served by this waiter
+      - name: unused_column
+        data_type: int
   - name: waiters
     columns:
       - name: waiter_id
diff --git a/examples/sushi_dbt/models/top_waiters.sql b/examples/sushi_dbt/models/top_waiters.sql
@@ -6,7 +6,8 @@
 
 SELECT
   waiter_id::INT AS waiter_id,
-  revenue::DOUBLE AS revenue
+  revenue::DOUBLE AS revenue,
+  1 AS unused_column
 FROM {{ ref('waiter_revenue_by_day', version=1) }}
 WHERE
   ds = (
diff --git a/sqlmesh/dbt/basemodel.py b/sqlmesh/dbt/basemodel.py
@@ -328,12 +328,9 @@ def sqlmesh_model_kwargs(
             dependencies.macros, package=self.package_name
         )
         jinja_macros.add_globals(self._model_jinja_context(model_context, dependencies))
-        return {
+
+        model_kwargs = {
             "audits": [(test.name, {}) for test in self.tests],
-            "columns": column_types_to_sqlmesh(
-                column_types_override or self.columns, self.dialect(context)
-            )
-            or None,
             "column_descriptions": column_descriptions_to_sqlmesh(self.columns) or None,
             "depends_on": {
                 model.canonical_name(context) for model in model_context.refs.values()
@@ -349,6 +346,21 @@ def sqlmesh_model_kwargs(
             **self.sqlmesh_config_kwargs,
         }
 
+        # dbt doesn't respect the data_type field for DDL statements; instead, it optionally uses
+        # it to validate the actual data types at runtime through contracts or external plugins.
+        # Only the `column_types` config of seed models is actually respected. We intentionally
+        # don't set the columns attribute to self.columns in all other cases, as that could result
+        # in unfaithful types when models are materialized.
+        #
+        # See:
+        # - https://docs.getdbt.com/reference/resource-properties/columns
+        # - https://docs.getdbt.com/reference/resource-configs/contract
+        # - https://docs.getdbt.com/reference/resource-configs/column_types
+        if column_types_override:
+            model_kwargs["columns"] = column_types_to_sqlmesh(column_types_override)
+
+        return model_kwargs
+
     @abstractmethod
     def to_sqlmesh(
         self,
diff --git a/tests/dbt/test_transformation.py b/tests/dbt/test_transformation.py
@@ -608,7 +608,10 @@ def test_model_columns():
         name="target", schema="test", database="test", account="foo", user="bar", password="baz"
     )
     sqlmesh_model = model.to_sqlmesh(context)
-    assert sqlmesh_model.columns_to_types == expected_column_types
+
+    # Column data types declared in a schema.yml are not respected in DDLs, so SQLMesh doesn't
+    # set the corresponding columns_to_types_ attribute either, to match dbt's behavior
+    assert sqlmesh_model.columns_to_types == None
     assert sqlmesh_model.column_descriptions == expected_column_descriptions
 
 
@@ -623,8 +626,11 @@ def test_seed_columns():
         },
     )
 
+    # dbt doesn't respect the data_type field in the DDLs; instead, it optionally uses it to
+    # validate the actual data types at runtime through contracts or external plugins. Thus,
+    # the actual data type is int, because that is what is inferred from the seed file.
     expected_column_types = {
-        "id": exp.DataType.build("text"),
+        "id": exp.DataType.build("int"),
         "name": exp.DataType.build("text"),
     }
     expected_column_descriptions = {