Skip to content

Commit 6e71e5f

Browse files
deduplicate SQLMesh Native Macro (#2960)
Co-authored-by: Jo <46752250+georgesittas@users.noreply.github.com>
1 parent 1cdda84 commit 6e71e5f

File tree

3 files changed

+182
-0
lines changed

3 files changed

+182
-0
lines changed

docs/concepts/macros/sqlmesh_macros.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,40 @@ FROM rides
901901
GROUP BY 1
902902
```
903903

### @DEDUPLICATE

`@DEDUPLICATE` deduplicates rows in a table, using a window function over the specified partition and order columns.

It supports the following arguments, in this order:

- `relation`: The table or CTE name to deduplicate
- `partition_by`: Column names or expressions that identify a window of rows, out of which one is selected as the deduplicated row
- `order_by`: A list of strings representing the `ORDER BY` clause; nulls ordering is optional: `'<column> <asc|desc> nulls <first|last>'`

For example, the following query:

```sql linenums="1"
with raw_data as (
  @deduplicate(my_table, [id, cast(event_date as date)], ['event_date DESC', 'status ASC'])
)

select * from raw_data
```

would be rendered as:

```sql linenums="1"
WITH "raw_data" AS (
  SELECT
    *
  FROM "my_table" AS "my_table"
  QUALIFY
    ROW_NUMBER() OVER (PARTITION BY "id", CAST("event_date" AS DATE) ORDER BY "event_date" DESC, "status" ASC) = 1
)
SELECT
  *
FROM "raw_data" AS "raw_data"
```
904938
### @AND
905939

906940
`@AND` combines a sequence of operands using the `AND` operator, filtering out any NULL expressions.

sqlmesh/core/macros.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,63 @@ def var(
11001100
return exp.convert(evaluator.var(var_name.this, default))
11011101

11021102

1103+
@macro()
def deduplicate(
    evaluator: MacroEvaluator,
    relation: exp.Expression,
    partition_by: t.List[exp.Expression],
    order_by: t.List[str],
) -> exp.Query:
    """Returns a QUERY to deduplicate rows within a table

    Args:
        relation: table or CTE name to deduplicate
        partition_by: column names, or expressions to use to identify a window of rows out of which to select one as the deduplicated row
        order_by: A list of strings representing the ORDER BY clause

    Example:
        >>> from sqlglot import parse_one
        >>> from sqlglot.schema import MappingSchema
        >>> from sqlmesh.core.macros import MacroEvaluator
        >>> sql = "@deduplicate(demo.table, [user_id, cast(timestamp as date)], ['timestamp desc', 'status asc'])"
        >>> MacroEvaluator().transform(parse_one(sql)).sql()
        'SELECT * FROM demo.table QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id, CAST(timestamp AS DATE) ORDER BY timestamp DESC, status ASC) = 1'
    """
    # Validate the macro arguments up front with actionable error messages.
    if not isinstance(partition_by, list):
        raise SQLMeshError(
            "partition_by must be a list of columns: [<column>, cast(<column> as <type>)]"
        )

    # A non-list and an empty list are both invalid, and both produce the
    # same error message, so a single guard covers both cases.
    if not isinstance(order_by, list) or not order_by:
        raise SQLMeshError(
            "order_by must be a list of strings, optional - nulls ordering: ['<column> <asc|desc> nulls <first|last>']"
        )

    # Each order item is a raw SQL string (e.g. "timestamp desc nulls last");
    # parse it into an Ordered node in the evaluator's dialect.
    ordering = exp.Order(
        expressions=[
            evaluator.transform(parse_one(item, into=exp.Ordered, dialect=evaluator.dialect))
            for item in order_by
        ]
    )

    # ROW_NUMBER() OVER (PARTITION BY ... ORDER BY ...) = 1 keeps exactly one
    # row per partition, chosen by the ordering above.
    row_number = exp.Window(
        this=exp.RowNumber(),
        partition_by=exp.tuple_(*partition_by),
        order=ordering,
    )

    return exp.select("*").from_(relation).qualify(row_number.eq(1))
1158+
1159+
11031160
def normalize_macro_name(name: str) -> str:
    """Prefix macro name with @ and upcase"""
    return "@" + name.upper()

tests/core/test_macros.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,3 +663,94 @@ def test_macro_first_value_ignore_respect_nulls(assert_exp_eq) -> None:
663663
"SELECT FIRST_VALUE(@test(x) RESPECT NULLS) OVER (ORDER BY y) AS column_test"
664664
)
665665
assert_exp_eq(evaluator.transform(actual_expr), expected_sql, dialect="duckdb")
666+
667+
668+
# Shared macro invocation used by every test_deduplicate parametrization.
DEDUPLICATE_SQL = """
@deduplicate(
    my_table,
    [user_id, CAST(timestamp AS DATE)],
    ['timestamp DESC', 'status ASC nulls last']
)
"""

# Dialects that support QUALIFY render the window predicate directly.
_QUALIFY_EXPECTED = """
SELECT *
FROM my_table
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY user_id, CAST(timestamp AS DATE)
    ORDER BY timestamp DESC, status ASC NULLS LAST
) = 1
"""

# Redshift quotes the reserved word "timestamp".
_REDSHIFT_EXPECTED = """
SELECT *
FROM my_table
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY user_id, CAST("timestamp" AS DATE)
    ORDER BY "timestamp" DESC, status ASC NULLS LAST
) = 1
"""

# Dialects without QUALIFY fall back to a filtered subquery.
_SUBQUERY_EXPECTED = """
SELECT *
FROM (
    SELECT *, ROW_NUMBER() OVER (
        PARTITION BY user_id, CAST(timestamp AS DATE)
        ORDER BY timestamp DESC, status ASC NULLS LAST
    ) AS _w
    FROM my_table
) as _t
WHERE _w = 1
"""


@pytest.mark.parametrize(
    "dialect, sql, expected_sql",
    [
        (dialect, DEDUPLICATE_SQL, _QUALIFY_EXPECTED)
        for dialect in ("bigquery", "databricks", "snowflake", "duckdb")
    ]
    + [("redshift", DEDUPLICATE_SQL, _REDSHIFT_EXPECTED)]
    + [
        (dialect, DEDUPLICATE_SQL, _SUBQUERY_EXPECTED)
        for dialect in ("trino", "postgres")
    ],
)
def test_deduplicate(assert_exp_eq, dialect, sql, expected_sql):
    """The @deduplicate macro renders correctly across dialects with and without QUALIFY."""
    evaluator = MacroEvaluator(schema=MappingSchema({}, dialect=dialect), dialect=dialect)
    assert_exp_eq(evaluator.transform(parse_one(sql)), expected_sql, dialect=dialect)
731+
732+
733+
def test_deduplicate_error_handling(macro_evaluator):
    """Invalid @deduplicate arguments raise SQLMeshError with a helpful message."""

    def _assert_raises(sql: str, message: str) -> None:
        # The macro's SQLMeshError surfaces as the __cause__ of the
        # evaluation error, so the message is checked there.
        with pytest.raises(SQLMeshError) as e:
            macro_evaluator.evaluate(parse_one(sql))
        assert str(e.value.__cause__) == message

    order_by_message = (
        "order_by must be a list of strings, optional - nulls ordering: "
        "['<column> <asc|desc> nulls <first|last>']"
    )

    # Non-list partition_by
    _assert_raises(
        "@deduplicate(my_table, user_id, ['timestamp DESC'])",
        "partition_by must be a list of columns: [<column>, cast(<column> as <type>)]",
    )

    # Non-list order_by
    _assert_raises(
        "@deduplicate(my_table, [user_id], 'timestamp DESC')",
        order_by_message,
    )

    # Empty order_by
    _assert_raises(
        "@deduplicate(my_table, [user_id], [])",
        order_by_message,
    )

0 commit comments

Comments
 (0)