Skip to content

Commit 3507a19

Browse files
authored
Fix: Small fixes for seed models (#5217)
1 parent 46f20d7 commit 3507a19

File tree

4 files changed

+105
-2
lines changed

4 files changed

+105
-2
lines changed

sqlmesh/core/model/definition.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1586,6 +1586,7 @@ def render_seed(self) -> t.Iterator[QueryOrDF]:
15861586
string_columns = []
15871587

15881588
columns_to_types = self.columns_to_types_ or {}
1589+
column_names_to_check = set(columns_to_types)
15891590
for name, tpe in columns_to_types.items():
15901591
if tpe.this in (exp.DataType.Type.DATE, exp.DataType.Type.DATE32):
15911592
date_columns.append(name)
@@ -1605,6 +1606,14 @@ def render_seed(self) -> t.Iterator[QueryOrDF]:
16051606
rename_dict[normalized_name] = column
16061607
if rename_dict:
16071608
df.rename(columns=rename_dict, inplace=True)
1609+
# These names have already been checked
1610+
column_names_to_check -= set(rename_dict)
1611+
1612+
missing_columns = column_names_to_check - set(df.columns)
1613+
if missing_columns:
1614+
raise_config_error(
1615+
f"Seed model '{self.name}' has missing columns: {missing_columns}", self._path
1616+
)
16081617

16091618
# convert all date/time types to native pandas timestamp
16101619
for column in [*date_columns, *datetime_columns]:
@@ -1614,7 +1623,15 @@ def render_seed(self) -> t.Iterator[QueryOrDF]:
16141623

16151624
# extract datetime.date from pandas timestamp for DATE columns
16161625
for column in date_columns:
1617-
df[column] = df[column].dt.date
1626+
try:
1627+
df[column] = df[column].dt.date
1628+
except Exception as ex:
1629+
logger.error(
1630+
"Failed to convert column '%s' to date in seed model '%s': %s",
1631+
column,
1632+
self.name,
1633+
ex,
1634+
)
16181635

16191636
for column in bool_columns:
16201637
df[column] = df[column].apply(lambda i: str_to_bool(str(i)))

sqlmesh/dbt/seed.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717

1818
from sqlmesh.core.config.common import VirtualEnvironmentMode
1919
from sqlmesh.core.model import Model, SeedKind, create_seed_model
20+
from sqlmesh.core.model.seed import CsvSettings
2021
from sqlmesh.dbt.basemodel import BaseModelConfig
2122
from sqlmesh.dbt.column import ColumnConfig
2223

@@ -80,9 +81,12 @@ def to_sqlmesh(
8081

8182
kwargs["columns"] = new_columns
8283

84+
# dbt treats single whitespace as a null value
85+
csv_settings = CsvSettings(na_values=[" "], keep_default_na=True)
86+
8387
return create_seed_model(
8488
self.canonical_name(context),
85-
SeedKind(path=seed_path),
89+
SeedKind(path=seed_path, csv_settings=csv_settings),
8690
dialect=self.dialect(context),
8791
audit_definitions=audit_definitions,
8892
virtual_environment_mode=virtual_environment_mode,

tests/core/test_model.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9997,6 +9997,61 @@ def test_seed_coerce_datetime(tmp_path):
99979997
assert df["bad_datetime"].iloc[0] == "9999-12-31 23:59:59"
99989998

99999999

10000+
def test_seed_invalid_date_column(tmp_path):
10001+
model_csv_path = (tmp_path / "model.csv").absolute()
10002+
10003+
with open(model_csv_path, "w", encoding="utf-8") as fd:
10004+
fd.write("bad_date\n9999-12-31\n2025-01-01\n1000-01-01")
10005+
10006+
expressions = d.parse(
10007+
f"""
10008+
MODEL (
10009+
name db.seed,
10010+
kind SEED (
10011+
path '{str(model_csv_path)}',
10012+
),
10013+
columns (
10014+
bad_date date,
10015+
),
10016+
);
10017+
"""
10018+
)
10019+
10020+
model = load_sql_based_model(expressions, path=Path("./examples/sushi/models/test_model.sql"))
10021+
df = next(model.render(context=None))
10022+
# The conversion to date should not raise an error
10023+
assert df["bad_date"].to_list() == ["9999-12-31", "2025-01-01", "1000-01-01"]
10024+
10025+
10026+
def test_seed_missing_columns(tmp_path):
10027+
model_csv_path = (tmp_path / "model.csv").absolute()
10028+
10029+
with open(model_csv_path, "w", encoding="utf-8") as fd:
10030+
fd.write("key,value\n1,2\n3,4")
10031+
10032+
expressions = d.parse(
10033+
f"""
10034+
MODEL (
10035+
name db.seed,
10036+
kind SEED (
10037+
path '{str(model_csv_path)}',
10038+
),
10039+
columns (
10040+
key int,
10041+
value int,
10042+
missing_column int,
10043+
),
10044+
);
10045+
"""
10046+
)
10047+
10048+
model = load_sql_based_model(expressions, path=Path("./examples/sushi/models/test_model.sql"))
10049+
with pytest.raises(
10050+
ConfigError, match="Seed model 'db.seed' has missing columns: {'missing_column'}.*"
10051+
):
10052+
next(model.render(context=None))
10053+
10054+
1000010055
def test_missing_column_data_in_columns_key():
1000110056
expressions = d.parse(
1000210057
"""

tests/dbt/test_transformation.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -689,6 +689,33 @@ def test_seed_column_inference(tmp_path):
689689
}
690690

691691

692+
def test_seed_single_whitespace_is_na(tmp_path):
693+
seed_csv = tmp_path / "seed.csv"
694+
with open(seed_csv, "w", encoding="utf-8") as fd:
695+
fd.write("col_a, col_b\n")
696+
fd.write(" ,1\n")
697+
fd.write("2, \n")
698+
699+
seed = SeedConfig(
700+
name="test_model",
701+
package="foo",
702+
path=Path(seed_csv),
703+
)
704+
705+
context = DbtContext()
706+
context.project_name = "foo"
707+
context.target = DuckDbConfig(name="target", schema="test")
708+
sqlmesh_seed = seed.to_sqlmesh(context)
709+
assert sqlmesh_seed.columns_to_types == {
710+
"col_a": exp.DataType.build("int"),
711+
"col_b": exp.DataType.build("int"),
712+
}
713+
714+
df = next(sqlmesh_seed.render_seed())
715+
assert df["col_a"].to_list() == [None, 2]
716+
assert df["col_b"].to_list() == [1, None]
717+
718+
692719
def test_seed_partial_column_inference(tmp_path):
693720
seed_csv = tmp_path / "seed.csv"
694721
with open(seed_csv, "w", encoding="utf-8") as fd:

0 commit comments

Comments
 (0)