Skip to content

Commit 855688d

Browse files
authored
Feat: add support for recursive CTEs in unit tests (#3351)
1 parent 5315e2c commit 855688d

File tree

6 files changed

+188
-33
lines changed

6 files changed

+188
-33
lines changed

docs/reference/cli.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ Usage: sqlmesh create_test [OPTIONS] MODEL
9292
9393
Options:
9494
-q, --query <TEXT TEXT>... Queries that will be used to generate data for
95-
the model's dependencies. [required]
95+
the model's dependencies.
9696
-o, --overwrite When true, the fixture file will be overwritten
9797
in case it already exists.
9898
-v, --var <TEXT TEXT>... Key-value pairs that will define variables

docs/reference/notebook.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,8 +366,9 @@ options:
366366

367367
#### create_test
368368
```
369-
%create_test --query QUERY [QUERY ...] [--overwrite]
369+
%create_test [--query QUERY [QUERY ...]] [--overwrite]
370370
[--var VAR [VAR ...]] [--path PATH] [--name NAME]
371+
[--include-ctes]
371372
model
372373
373374
Generate a unit test fixture for a given model.

sqlmesh/cli/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,7 @@ def dag(ctx: click.Context, file: str, select_model: t.List[str]) -> None:
530530
"queries",
531531
type=(str, str),
532532
multiple=True,
533-
required=True,
533+
default=[],
534534
help="Queries that will be used to generate data for the model's dependencies.",
535535
)
536536
@click.option(

sqlmesh/core/test/definition.py

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import unittest
66
from collections import Counter
77
from contextlib import AbstractContextManager, nullcontext
8+
from itertools import chain
89
from pathlib import Path
910
from unittest.mock import patch
1011

@@ -126,25 +127,25 @@ def setUp(self) -> None:
126127

127128
for name, values in self.body.get("inputs", {}).items():
128129
all_types_are_known = False
129-
known_columns_to_types: t.Dict[str, exp.DataType] = {}
130+
columns_to_known_types: t.Dict[str, exp.DataType] = {}
130131

131132
model = self.models.get(name)
132133
if model:
133134
inferred_columns_to_types = model.columns_to_types or {}
134-
known_columns_to_types = {
135+
columns_to_known_types = {
135136
c: t for c, t in inferred_columns_to_types.items() if type_is_known(t)
136137
}
137138
all_types_are_known = bool(inferred_columns_to_types) and (
138-
len(known_columns_to_types) == len(inferred_columns_to_types)
139+
len(columns_to_known_types) == len(inferred_columns_to_types)
139140
)
140141

141142
# Types specified in the test will override the corresponding inferred ones
142-
known_columns_to_types.update(values.get("columns", {}))
143+
columns_to_known_types.update(values.get("columns", {}))
143144

144145
rows = values.get("rows")
145146
if not all_types_are_known and rows:
146147
for col, value in rows[0].items():
147-
if col not in known_columns_to_types:
148+
if col not in columns_to_known_types:
148149
v_type = annotate_types(exp.convert(value)).type or type(value).__name__
149150
v_type = exp.maybe_parse(
150151
v_type, into=exp.DataType, dialect=self._test_adapter_dialect
@@ -159,21 +160,21 @@ def setUp(self) -> None:
159160
self.path,
160161
)
161162

162-
known_columns_to_types[col] = v_type
163+
columns_to_known_types[col] = v_type
163164

164165
if rows is None:
165166
query_or_df: exp.Query | pd.DataFrame = self._add_missing_columns(
166-
values["query"], known_columns_to_types
167+
values["query"], columns_to_known_types
167168
)
168-
if known_columns_to_types:
169-
known_columns_to_types = {
170-
col: known_columns_to_types[col] for col in query_or_df.named_selects
169+
if columns_to_known_types:
170+
columns_to_known_types = {
171+
col: columns_to_known_types[col] for col in query_or_df.named_selects
171172
}
172173
else:
173-
query_or_df = self._create_df(values, columns=known_columns_to_types)
174+
query_or_df = self._create_df(values, columns=columns_to_known_types)
174175

175176
self.engine_adapter.create_view(
176-
self._test_fixture_table(name), query_or_df, known_columns_to_types
177+
self._test_fixture_table(name), query_or_df, columns_to_known_types
177178
)
178179

179180
def tearDown(self) -> None:
@@ -525,7 +526,7 @@ def _add_missing_columns(
525526

526527

527528
class SqlModelTest(ModelTest):
528-
def test_ctes(self, ctes: t.Dict[str, exp.Expression]) -> None:
529+
def test_ctes(self, ctes: t.Dict[str, exp.Expression], recursive: bool = False) -> None:
529530
"""Run CTE queries and compare output to expected output"""
530531
for cte_name, values in self.body["outputs"].get("ctes", {}).items():
531532
with self.subTest(cte=cte_name):
@@ -535,11 +536,13 @@ def test_ctes(self, ctes: t.Dict[str, exp.Expression]) -> None:
535536
)
536537

537538
cte_query = ctes[cte_name].this
538-
for alias, cte in ctes.items():
539-
cte_query = cte_query.with_(alias, cte.this)
540539

541-
partial = values.get("partial")
542540
sort = cte_query.args.get("order") is None
541+
partial = values.get("partial")
542+
543+
cte_query = exp.select(*_projection_identifiers(cte_query)).from_(cte_name)
544+
for alias, cte in ctes.items():
545+
cte_query = cte_query.with_(alias, cte.this, recursive=recursive)
543546

544547
actual = self._execute(cte_query)
545548
expected = self._create_df(values, columns=cte_query.named_selects, partial=partial)
@@ -548,13 +551,16 @@ def test_ctes(self, ctes: t.Dict[str, exp.Expression]) -> None:
548551

549552
def runTest(self) -> None:
550553
query = self._render_model_query()
551-
552-
self.test_ctes(
553-
{
554-
self._normalize_model_name(cte.alias, with_default_catalog=False): cte
555-
for cte in query.ctes
556-
}
557-
)
554+
with_clause = query.args.get("with")
555+
556+
if with_clause:
557+
self.test_ctes(
558+
{
559+
self._normalize_model_name(cte.alias, with_default_catalog=False): cte
560+
for cte in query.ctes
561+
},
562+
recursive=with_clause.recursive,
563+
)
558564

559565
values = self.body["outputs"].get("query")
560566
if values is not None:
@@ -732,14 +738,23 @@ def generate_test(
732738
if isinstance(model, SqlModel):
733739
assert isinstance(test, SqlModelTest)
734740
model_query = test._render_model_query()
741+
with_clause = model_query.args.get("with")
735742

736-
if include_ctes:
743+
if with_clause and include_ctes:
737744
ctes = {}
745+
recursive = with_clause.recursive
738746
previous_ctes: t.List[exp.CTE] = []
747+
739748
for cte in model_query.ctes:
740749
cte_query = cte.this
741-
for prev in previous_ctes:
742-
cte_query = cte_query.with_(prev.alias, prev.this)
750+
cte_identifier = cte.args["alias"].this
751+
752+
cte_query = exp.select(*_projection_identifiers(cte_query)).from_(cte_identifier)
753+
754+
for prev in chain(previous_ctes, [cte]):
755+
cte_query = cte_query.with_(
756+
prev.args["alias"].this, prev.this, recursive=recursive
757+
)
743758

744759
cte_output = test._execute(cte_query)
745760
ctes[cte.alias] = (
@@ -775,6 +790,19 @@ def generate_test(
775790
yaml.dump({test_name: test_body}, file)
776791

777792

793+
def _projection_identifiers(query: exp.Query) -> t.List[str | exp.Identifier]:
794+
identifiers: t.List[str | exp.Identifier] = []
795+
for select in query.selects:
796+
if isinstance(select, exp.Alias):
797+
identifiers.append(select.args["alias"])
798+
elif isinstance(select, exp.Column):
799+
identifiers.append(select.this)
800+
else:
801+
identifiers.append(select.output_name)
802+
803+
return identifiers
804+
805+
778806
def _raise_if_unexpected_columns(
779807
expected_cols: t.Collection[str], actual_cols: t.Collection[str]
780808
) -> None:

sqlmesh/magics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,7 @@ def janitor(self, context: Context, line: str) -> None:
858858
"-q",
859859
type=str,
860860
nargs="+",
861-
required=True,
861+
default=[],
862862
help="Queries that will be used to generate data for the model's dependencies.",
863863
)
864864
@argument(

tests/core/test_test.py

Lines changed: 129 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,12 +1450,12 @@ def test_generate_input_data_using_sql(mocker: MockerFixture, tmp_path: Path) ->
14501450
)
14511451

14521452

1453-
def test_pyspark_python_model() -> None:
1453+
def test_pyspark_python_model(tmp_path: Path) -> None:
14541454
spark_connection_config = SparkConnectionConfig(
14551455
config={
14561456
"spark.master": "local",
1457-
"spark.sql.warehouse.dir": "/tmp/data_dir",
1458-
"spark.driver.extraJavaOptions": "-Dderby.system.home=/tmp/derby_dir",
1457+
"spark.sql.warehouse.dir": f"{tmp_path}/data_dir",
1458+
"spark.driver.extraJavaOptions": f"-Dderby.system.home={tmp_path}/derby_dir",
14591459
},
14601460
)
14611461
config = Config(
@@ -1572,6 +1572,100 @@ def test_custom_testing_schema(mocker: MockerFixture) -> None:
15721572
)
15731573

15741574

1575+
def test_complicated_recursive_cte() -> None:
1576+
model_sql = """
1577+
WITH
1578+
RECURSIVE
1579+
chained_contacts AS (
1580+
-- Start with the initial set of contacts and their immediate nodes
1581+
SELECT
1582+
id_contact_a,
1583+
id_contact_b
1584+
FROM
1585+
source
1586+
1587+
UNION ALL
1588+
1589+
-- Recursive step to find further connected nodes
1590+
SELECT
1591+
chained_contacts.id_contact_a,
1592+
unfactorized_duplicates.id_contact_b
1593+
FROM
1594+
chained_contacts
1595+
JOIN source AS unfactorized_duplicates
1596+
ON chained_contacts.id_contact_b = unfactorized_duplicates.id_contact_a
1597+
),
1598+
id_contact_a_with_aggregated_id_contact_bs AS (
1599+
SELECT
1600+
id_contact_a,
1601+
ARRAY_AGG(DISTINCT id_contact_b ORDER BY id_contact_b) AS aggregated_id_contact_bs
1602+
FROM
1603+
chained_contacts
1604+
GROUP BY
1605+
id_contact_a
1606+
)
1607+
SELECT
1608+
ARRAY_CONCAT([id_contact_a], aggregated_id_contact_bs) AS aggregated_duplicates
1609+
FROM
1610+
id_contact_a_with_aggregated_id_contact_bs
1611+
WHERE
1612+
id_contact_a NOT IN (
1613+
SELECT DISTINCT
1614+
id_contact_b
1615+
FROM
1616+
source
1617+
)
1618+
ORDER BY
1619+
id_contact_a
1620+
"""
1621+
1622+
_check_successful_or_raise(
1623+
_create_test(
1624+
body=load_yaml(
1625+
"""
1626+
test_recursive_ctes:
1627+
model: test
1628+
inputs:
1629+
source:
1630+
rows:
1631+
- id_contact_a: "a"
1632+
id_contact_b: "b"
1633+
- id_contact_a: "b"
1634+
id_contact_b: "c"
1635+
- id_contact_a: "c"
1636+
id_contact_b: "d"
1637+
- id_contact_a: "a"
1638+
id_contact_b: "g"
1639+
- id_contact_a: "b"
1640+
id_contact_b: "e"
1641+
- id_contact_a: "c"
1642+
id_contact_b: "f"
1643+
- id_contact_a: "x"
1644+
id_contact_b: "y"
1645+
outputs:
1646+
ctes:
1647+
id_contact_a_with_aggregated_id_contact_bs:
1648+
- id_contact_a: a
1649+
aggregated_id_contact_bs: [b, c, d, e, f, g]
1650+
- id_contact_a: x
1651+
aggregated_id_contact_bs: [y]
1652+
- id_contact_a: b
1653+
aggregated_id_contact_bs: [c, d, e, f]
1654+
- id_contact_a: c
1655+
aggregated_id_contact_bs: [d, f]
1656+
query:
1657+
rows:
1658+
- aggregated_duplicates: [a, b, c, d, e, f, g]
1659+
- aggregated_duplicates: [x, y]
1660+
"""
1661+
),
1662+
test_name="test_recursive_ctes",
1663+
model=_create_model(model_sql),
1664+
context=Context(config=Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))),
1665+
).run()
1666+
)
1667+
1668+
15751669
def test_test_generation(tmp_path: Path) -> None:
15761670
init_example_project(tmp_path, dialect="duckdb")
15771671

@@ -1789,3 +1883,35 @@ def test_test_generation_with_decimal(tmp_path: Path, mocker: MockerFixture) ->
17891883
assert "test_foo" in test
17901884
assert test["test_foo"]["inputs"] == {"sqlmesh_example.bar": [{"dec_col": "1.23"}]}
17911885
assert test["test_foo"]["outputs"] == {"query": [{"dec_col": "1.23"}]}
1886+
1887+
1888+
def test_test_generation_with_recursive_ctes(tmp_path: Path) -> None:
1889+
init_example_project(tmp_path, dialect="duckdb")
1890+
1891+
config = Config(
1892+
default_connection=DuckDBConnectionConfig(),
1893+
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
1894+
)
1895+
foo_sql_file = tmp_path / "models" / "foo.sql"
1896+
foo_sql_file.write_text(
1897+
"MODEL (name sqlmesh_example.foo);"
1898+
"WITH RECURSIVE t AS (SELECT 1 AS c UNION ALL SELECT c + 1 FROM t WHERE c < 3) SELECT c FROM t"
1899+
)
1900+
1901+
context = Context(paths=tmp_path, config=config)
1902+
context.plan(auto_apply=True)
1903+
1904+
context.create_test("sqlmesh_example.foo", input_queries={}, overwrite=True, include_ctes=True)
1905+
1906+
test = load_yaml(context.path / c.TESTS / "test_foo.yaml")
1907+
assert len(test) == 1
1908+
assert "test_foo" in test
1909+
assert test["test_foo"]["inputs"] == {}
1910+
assert test["test_foo"]["outputs"] == {
1911+
"query": [{"c": 1}, {"c": 2}, {"c": 3}],
1912+
"ctes": {
1913+
"t": [{"c": 1}, {"c": 2}, {"c": 3}],
1914+
},
1915+
}
1916+
1917+
_check_successful_or_raise(context.test())

0 commit comments

Comments (0)