22
33from sqlmesh .core .linter .rule import Range , Position
44from sqlmesh .utils .pydantic import PydanticModel
5- from sqlglot import tokenize , TokenType
5+ from sqlglot import tokenize , TokenType , Token
66import typing as t
77
88
@@ -122,57 +122,14 @@ def read_range_from_file(file: Path, text_range: Range) -> str:
122122 return read_range_from_string ("" .join (lines ), text_range )
123123
124124
125- def get_range_of_model_block (
126- sql : str ,
127- dialect : str ,
128- ) -> t .Optional [Range ]:
129- """
130- Get the range of the model block in an SQL file.
131- """
132- tokens = tokenize (sql , dialect = dialect )
133-
134- # Find start of the model block
135- start = next (
136- (t for t in tokens if t .token_type is TokenType .VAR and t .text .upper () == "MODEL" ),
137- None ,
138- )
139- end = next ((t for t in tokens if t .token_type is TokenType .SEMICOLON ), None )
140-
141- if start is None or end is None :
142- return None
143-
144- start_position = TokenPositionDetails (
145- line = start .line ,
146- col = start .col ,
147- start = start .start ,
148- end = start .end ,
149- )
150- end_position = TokenPositionDetails (
151- line = end .line ,
152- col = end .col ,
153- start = end .start ,
154- end = end .end ,
155- )
156-
157- splitlines = sql .splitlines ()
158- return Range (
159- start = start_position .to_range (splitlines ).start ,
160- end = end_position .to_range (splitlines ).end ,
161- )
162-
163-
164- def get_range_of_a_key_in_model_block (
165- sql : str ,
166- dialect : str ,
167- key : str ,
168- ) -> t .Optional [Range ]:
125+ def get_start_and_end_of_model_block (
126+ tokens : t .List [Token ],
127+ ) -> t .Optional [t .Tuple [int , int ]]:
169128 """
170- Get the range of a specific key in the model block of an SQL file.
129+ Returns the start and end tokens of the MODEL block in an SQL file.
130+ The MODEL block is defined as the first occurrence of the keyword "MODEL" followed by
131+ an opening parenthesis and a closing parenthesis that matches the opening one.
171132 """
172- tokens = tokenize (sql , dialect = dialect )
173- if not tokens :
174- return None
175-
176133 # 1) Find the MODEL token
177134 try :
178135 model_idx = next (
@@ -216,6 +173,65 @@ def get_range_of_a_key_in_model_block(
216173 )
217174 except StopIteration :
218175 return None
176+ return (
177+ lparen_idx ,
178+ rparen_idx ,
179+ )
180+
181+
def get_range_of_model_block(
    sql: str,
    dialect: str,
) -> t.Optional[Range]:
    """
    Get the range of the MODEL block in an SQL file.

    The returned range spans from the token immediately before the block's
    opening parenthesis (the MODEL keyword) to the token immediately after
    its closing parenthesis (normally the terminating semicolon).

    Args:
        sql: The raw SQL text to scan.
        dialect: The SQL dialect used to tokenize `sql`.

    Returns:
        The Range covering the MODEL block, or None if no MODEL block is
        found or the surrounding tokens the range relies on do not exist.
    """
    tokens = tokenize(sql, dialect=dialect)

    block = get_start_and_end_of_model_block(tokens)
    if not block:
        return None

    (lparen_idx, rparen_idx) = block

    # Guard against the block sitting at the very start/end of the token
    # stream: the unguarded indexing would raise IndexError when the closing
    # paren is the last token (e.g. no trailing semicolon), and would wrap
    # around to the end of the list if the opening paren were token 0.
    if lparen_idx - 1 < 0 or rparen_idx + 1 >= len(tokens):
        return None

    # NOTE(review): assumes the token just before '(' is the MODEL keyword
    # and the token just after ')' is the terminating semicolon — confirm
    # against get_start_and_end_of_model_block's contract.
    start = tokens[lparen_idx - 1]
    end = tokens[rparen_idx + 1]

    start_position = TokenPositionDetails(
        line=start.line,
        col=start.col,
        start=start.start,
        end=start.end,
    )
    end_position = TokenPositionDetails(
        line=end.line,
        col=end.col,
        start=end.start,
        end=end.end,
    )

    lines = sql.splitlines()
    return Range(
        start=start_position.to_range(lines).start,
        end=end_position.to_range(lines).end,
    )
215+
216+
217+ def get_range_of_a_key_in_model_block (
218+ sql : str ,
219+ dialect : str ,
220+ key : str ,
221+ ) -> t .Optional [t .Tuple [Range , Range ]]:
222+ """
223+ Get the ranges of a specific key and its value in the MODEL block of an SQL file.
224+
225+ Returns a tuple of (key_range, value_range) if found, otherwise None.
226+ """
227+ tokens = tokenize (sql , dialect = dialect )
228+ if not tokens :
229+ return None
230+
231+ block = get_start_and_end_of_model_block (tokens )
232+ if not block :
233+ return None
234+ (lparen_idx , rparen_idx ) = block
219235
220236 # 4) Scan within the MODEL property list for the key at top-level (depth == 1)
221237 # Initialize depth to 1 since we're inside the first parentheses
@@ -237,17 +253,78 @@ def get_range_of_a_key_in_model_block(
237253 if depth == 1 and tt is TokenType .VAR and tok .text .upper () == key .upper ():
238254 # Validate key position: it should immediately follow '(' or ',' at top level
239255 prev_idx = i - 1
240- # Skip over non-significant tokens we don't want to gate on (e.g., comments)
241- while prev_idx >= 0 and tokens [prev_idx ].token_type in (TokenType .COMMENT ,):
242- prev_idx -= 1
243256 prev_tt = tokens [prev_idx ].token_type if prev_idx >= 0 else None
244- if prev_tt in (TokenType .L_PAREN , TokenType .COMMA ):
245- position = TokenPositionDetails (
246- line = tok .line ,
247- col = tok .col ,
248- start = tok .start ,
249- end = tok .end ,
250- )
251- return position .to_range (sql .splitlines ())
257+ if prev_tt not in (TokenType .L_PAREN , TokenType .COMMA ):
258+ continue
259+
260+ # Key range
261+ lines = sql .splitlines ()
262+ key_start = TokenPositionDetails (
263+ line = tok .line , col = tok .col , start = tok .start , end = tok .end
264+ )
265+ key_range = key_start .to_range (lines )
266+
267+ # Find value start: the next non-comment token after the key
268+ value_start_idx = i + 1
269+ if value_start_idx >= rparen_idx :
270+ return None
271+
272+ # Walk to the end of the value expression: until top-level comma or closing paren
273+ # Track internal nesting for (), [], {}
274+ nested = 0
275+ j = value_start_idx
276+ value_end_idx = value_start_idx
277+
278+ def is_open (t : TokenType ) -> bool :
279+ return t in (TokenType .L_PAREN , TokenType .L_BRACE , TokenType .L_BRACKET )
280+
281+ def is_close (t : TokenType ) -> bool :
282+ return t in (TokenType .R_PAREN , TokenType .R_BRACE , TokenType .R_BRACKET )
283+
284+ while j < rparen_idx :
285+ ttype = tokens [j ].token_type
286+ if is_open (ttype ):
287+ nested += 1
288+ elif is_close (ttype ):
289+ nested -= 1
290+
291+ # End of value: at top-level (nested == 0) encountering a comma or the end paren
292+ if nested == 0 and (
293+ ttype is TokenType .COMMA or (ttype is TokenType .R_PAREN and depth == 1 )
294+ ):
295+ # For comma, don't include it in the value range
296+ # For closing paren, include it only if it's part of the value structure
297+ if ttype is TokenType .COMMA :
298+ # Don't include the comma in the value range
299+ break
300+ else :
301+ # Include the closing parenthesis in the value range
302+ value_end_idx = j
303+ break
304+
305+ value_end_idx = j
306+ j += 1
307+
308+ value_start_tok = tokens [value_start_idx ]
309+ value_end_tok = tokens [value_end_idx ]
310+
311+ value_start_pos = TokenPositionDetails (
312+ line = value_start_tok .line ,
313+ col = value_start_tok .col ,
314+ start = value_start_tok .start ,
315+ end = value_start_tok .end ,
316+ )
317+ value_end_pos = TokenPositionDetails (
318+ line = value_end_tok .line ,
319+ col = value_end_tok .col ,
320+ start = value_end_tok .start ,
321+ end = value_end_tok .end ,
322+ )
323+ value_range = Range (
324+ start = value_start_pos .to_range (lines ).start ,
325+ end = value_end_pos .to_range (lines ).end ,
326+ )
327+
328+ return (key_range , value_range )
252329
253330 return None
0 commit comments