From 67b935624fbac7c92f0aead11b92967c0283cf96 Mon Sep 17 00:00:00 2001 From: eakmanrq <6326532+eakmanrq@users.noreply.github.com> Date: Mon, 7 Jul 2025 11:43:49 -0700 Subject: [PATCH 1/2] fix!: make repr deterministic for fingerprinting --- .../migrations/v0085_deterministic_repr.py | 140 +++++++++++++++ sqlmesh/utils/metaprogramming.py | 36 +++- tests/utils/test_metaprogramming.py | 162 +++++++++++++++++- 3 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 sqlmesh/migrations/v0085_deterministic_repr.py diff --git a/sqlmesh/migrations/v0085_deterministic_repr.py b/sqlmesh/migrations/v0085_deterministic_repr.py new file mode 100644 index 0000000000..da151b0667 --- /dev/null +++ b/sqlmesh/migrations/v0085_deterministic_repr.py @@ -0,0 +1,140 @@ +""" +When serializing some objects, like `__sqlmesh__vars__`, the order of keys in the dictionary was not deterministic +and therefore this migration applies deterministic sorting to the keys of the dictionary. +""" + +import ast +import json +import typing as t + +from sqlglot import exp + +from sqlmesh.utils.migration import index_text_type, blob_text_type + + +def _deterministic_repr(obj: t.Any) -> str: + """ + This is a copy of the function from utils.metaprogramming + """ + + def _normalize_for_repr(o: t.Any) -> t.Any: + if isinstance(o, dict): + sorted_items = sorted(o.items(), key=lambda x: str(x[0])) + return {k: _normalize_for_repr(v) for k, v in sorted_items} + if isinstance(o, (list, tuple)): + # Recursively normalize nested structures + normalized = [_normalize_for_repr(item) for item in o] + return type(o)(normalized) + return o + + return repr(_normalize_for_repr(obj)) + + +def migrate(state_sync, **kwargs): # type: ignore + import pandas as pd + + engine_adapter = state_sync.engine_adapter + schema = state_sync.schema + snapshots_table = "_snapshots" + if schema: + snapshots_table = f"{schema}.{snapshots_table}" + + migration_needed = False + new_snapshots = [] + + for ( + name, + identifier, 
+ version, + snapshot, + kind_name, + updated_ts, + unpaused_ts, + ttl_ms, + unrestorable, + ) in engine_adapter.fetchall( + exp.select( + "name", + "identifier", + "version", + "snapshot", + "kind_name", + "updated_ts", + "unpaused_ts", + "ttl_ms", + "unrestorable", + ).from_(snapshots_table), + quote_identifiers=True, + ): + parsed_snapshot = json.loads(snapshot) + python_env = parsed_snapshot["node"].get("python_env") + + if python_env: + for key, executable in python_env.items(): + if isinstance(executable, dict) and executable.get("kind") == "value": + old_payload = executable["payload"] + try: + # Try to parse the old payload and re-serialize it deterministically + parsed_value = ast.literal_eval(old_payload) + new_payload = _deterministic_repr(parsed_value) + + # Only update if the representation changed + if old_payload != new_payload: + executable["payload"] = new_payload + migration_needed = True + except (ValueError, SyntaxError): + # Special handling for dictionaries containing SqlValue objects + # These can't be parsed by ast.literal_eval but we can still make them deterministic + if old_payload.startswith("{") and "SqlValue(" in old_payload: + try: + # Use eval in a safe context to parse SqlValue objects + # This is safe because we're only running this on our own serialized data + from sqlmesh.utils.metaprogramming import SqlValue + + safe_globals = {"SqlValue": SqlValue} + parsed_value = eval(old_payload, safe_globals, {}) + new_payload = _deterministic_repr(parsed_value) + + # Only update if the representation changed + if old_payload != new_payload: + executable["payload"] = new_payload + migration_needed = True + except (ValueError, SyntaxError, NameError): + # If we still can't parse it, leave it as-is + pass + + new_snapshots.append( + { + "name": name, + "identifier": identifier, + "version": version, + "snapshot": json.dumps(parsed_snapshot), + "kind_name": kind_name, + "updated_ts": updated_ts, + "unpaused_ts": unpaused_ts, + "ttl_ms": 
ttl_ms, + "unrestorable": unrestorable, + } + ) + + if migration_needed and new_snapshots: + engine_adapter.delete_from(snapshots_table, "TRUE") + + index_type = index_text_type(engine_adapter.dialect) + blob_type = blob_text_type(engine_adapter.dialect) + + engine_adapter.insert_append( + snapshots_table, + pd.DataFrame(new_snapshots), + columns_to_types={ + "name": exp.DataType.build(index_type), + "identifier": exp.DataType.build(index_type), + "version": exp.DataType.build(index_type), + "snapshot": exp.DataType.build(blob_type), + "kind_name": exp.DataType.build("text"), + "updated_ts": exp.DataType.build("bigint"), + "unpaused_ts": exp.DataType.build("bigint"), + "ttl_ms": exp.DataType.build("bigint"), + "unrestorable": exp.DataType.build("boolean"), + }, + ) diff --git a/sqlmesh/utils/metaprogramming.py b/sqlmesh/utils/metaprogramming.py index 9bfb8efc4e..c4340c0321 100644 --- a/sqlmesh/utils/metaprogramming.py +++ b/sqlmesh/utils/metaprogramming.py @@ -425,7 +425,9 @@ def is_value(self) -> bool: @classmethod def value(cls, v: t.Any, is_metadata: t.Optional[bool] = None) -> Executable: - return Executable(payload=repr(v), kind=ExecutableKind.VALUE, is_metadata=is_metadata) + return Executable( + payload=_deterministic_repr(v), kind=ExecutableKind.VALUE, is_metadata=is_metadata + ) def serialize_env(env: t.Dict[str, t.Any], path: Path) -> t.Dict[str, Executable]: @@ -633,6 +635,38 @@ def print_exception( out.write(tb) +def _deterministic_repr(obj: t.Any) -> str: + """Create a deterministic representation by ensuring consistent ordering before repr(). + + For dictionaries, ensures consistent key ordering to prevent non-deterministic + serialization that affects fingerprinting. Uses Python's native repr() logic + for all formatting to handle edge cases properly. + + Note that this function assumes list/tuple order is significant and therefore does not sort them. + + Args: + obj: The object to represent as a string. 
+ + Returns: + A deterministic string representation of the object. + """ + + def _normalize_for_repr(o: t.Any) -> t.Any: + if isinstance(o, dict): + sorted_items = sorted(o.items(), key=lambda x: str(x[0])) + return {k: _normalize_for_repr(v) for k, v in sorted_items} + if isinstance(o, (list, tuple)): + # Recursively normalize nested structures + normalized = [_normalize_for_repr(item) for item in o] + return type(o)(normalized) + return o + + try: + return repr(_normalize_for_repr(obj)) + except Exception: + return repr(obj) + + def import_python_file(path: Path, relative_base: Path = Path()) -> types.ModuleType: relative_path = path.absolute().relative_to(relative_base.absolute()) module_name = str(relative_path.with_suffix("")).replace(os.path.sep, ".") diff --git a/tests/utils/test_metaprogramming.py b/tests/utils/test_metaprogramming.py index 8cca48ac6e..cb8421fac8 100644 --- a/tests/utils/test_metaprogramming.py +++ b/tests/utils/test_metaprogramming.py @@ -22,6 +22,7 @@ from sqlmesh.utils.metaprogramming import ( Executable, ExecutableKind, + _deterministic_repr, build_env, func_globals, normalize_source, @@ -48,7 +49,7 @@ def test_print_exception(mocker: MockerFixture): except Exception as ex: print_exception(ex, test_env, out_mock) - expected_message = r""" File ".*?.tests.utils.test_metaprogramming\.py", line 47, in test_print_exception + expected_message = r""" File ".*?.tests.utils.test_metaprogramming\.py", line 48, in test_print_exception eval\("test_fun\(\)", env\).* File '/test/path.py' \(or imported file\), line 2, in test_fun @@ -457,3 +458,162 @@ def test_serialize_env_with_enum_import_appearing_in_two_functions() -> None: } assert serialized_env == expected_env + + +def test_deterministic_repr_basic_types(): + """Test _deterministic_repr with basic Python types.""" + # Test basic types that should use standard repr + assert _deterministic_repr(42) == "42" + assert _deterministic_repr("hello") == "'hello'" + assert _deterministic_repr(True) == 
"True" + assert _deterministic_repr(None) == "None" + assert _deterministic_repr(3.14) == "3.14" + + +def test_deterministic_repr_dict_ordering(): + """Test that _deterministic_repr produces consistent output for dicts with different key ordering.""" + # Same dict with different key ordering + dict1 = {"c": 3, "a": 1, "b": 2} + dict2 = {"a": 1, "b": 2, "c": 3} + dict3 = {"b": 2, "c": 3, "a": 1} + + repr1 = _deterministic_repr(dict1) + repr2 = _deterministic_repr(dict2) + repr3 = _deterministic_repr(dict3) + + # All should produce the same representation + assert repr1 == repr2 == repr3 + assert repr1 == "{'a': 1, 'b': 2, 'c': 3}" + + +def test_deterministic_repr_mixed_key_types(): + """Test _deterministic_repr with mixed key types (strings and numbers).""" + dict1 = {42: "number", "string": "text", 1: "one"} + dict2 = {"string": "text", 1: "one", 42: "number"} + + repr1 = _deterministic_repr(dict1) + repr2 = _deterministic_repr(dict2) + + # Should produce consistent ordering despite mixed key types + assert repr1 == repr2 + # Numbers come before strings when sorting by string representation + assert repr1 == "{1: 'one', 42: 'number', 'string': 'text'}" + + +def test_deterministic_repr_nested_structures(): + """Test _deterministic_repr with deeply nested dictionaries.""" + nested1 = {"outer": {"z": 26, "a": 1}, "list": [3, {"y": 2, "x": 1}], "simple": "value"} + + nested2 = {"simple": "value", "list": [3, {"x": 1, "y": 2}], "outer": {"a": 1, "z": 26}} + + repr1 = _deterministic_repr(nested1) + repr2 = _deterministic_repr(nested2) + + assert repr1 == repr2 + # Verify structure is maintained with sorted keys + expected = "{'list': [3, {'x': 1, 'y': 2}], 'outer': {'a': 1, 'z': 26}, 'simple': 'value'}" + assert repr1 == expected + + +def test_deterministic_repr_lists_and_tuples(): + """Test _deterministic_repr preserves order for lists/tuples but sorts nested dicts.""" + # Lists should maintain their order + list_with_dicts = [{"b": 2, "a": 1}, {"d": 4, "c": 3}] + 
list_repr = _deterministic_repr(list_with_dicts) + expected_list = "[{'a': 1, 'b': 2}, {'c': 3, 'd': 4}]" + assert list_repr == expected_list + + # Tuples should maintain their order + tuple_with_dicts = ({"z": 26, "a": 1}, {"y": 25, "b": 2}) + tuple_repr = _deterministic_repr(tuple_with_dicts) + expected_tuple = "({'a': 1, 'z': 26}, {'b': 2, 'y': 25})" + assert tuple_repr == expected_tuple + + +def test_deterministic_repr_empty_containers(): + """Test _deterministic_repr with empty containers.""" + assert _deterministic_repr({}) == "{}" + assert _deterministic_repr([]) == "[]" + assert _deterministic_repr(()) == "()" + + +def test_deterministic_repr_special_characters(): + """Test _deterministic_repr handles special characters correctly.""" + special_dict = { + "quotes": "text with 'single' and \"double\" quotes", + "unicode": "unicode: ñáéíóú", + "newlines": "text\nwith\nnewlines", + "backslashes": "path\\to\\file", + } + + result = _deterministic_repr(special_dict) + + # Should be valid Python that can be evaluated + reconstructed = eval(result) + assert reconstructed == special_dict + + # Should be deterministic - same input produces same output + result2 = _deterministic_repr(special_dict) + assert result == result2 + + +def test_deterministic_repr_executable_integration(): + """Test that _deterministic_repr works correctly with Executable.value().""" + # Test the integration with Executable.value which is the main use case + variables1 = {"env": "dev", "debug": True, "timeout": 30} + variables2 = {"timeout": 30, "debug": True, "env": "dev"} + + exec1 = Executable.value(variables1) + exec2 = Executable.value(variables2) + + # Should produce identical payloads despite different input ordering + assert exec1.payload == exec2.payload + assert exec1.payload == "{'debug': True, 'env': 'dev', 'timeout': 30}" + + # Should be valid Python + reconstructed = eval(exec1.payload) + assert reconstructed == variables1 + + +def test_deterministic_repr_complex_example(): + 
"""Test _deterministic_repr with a complex real-world-like structure.""" + complex_vars = { + "database_config": { + "host": "localhost", + "port": 5432, + "credentials": {"username": "admin", "password": "secret"}, + }, + "feature_flags": ["flag_b", "flag_a"], + "metadata": { + "version": "1.0.0", + "environment": "production", + "tags": {"team": "data", "project": "analytics"}, + }, + 42: "numeric_key", + "arrays": [{"config": {"nested": True, "level": 2}}, {"simple": "value"}], + } + + expected_structure = { + 42: "numeric_key", + "arrays": [{"config": {"level": 2, "nested": True}}, {"simple": "value"}], + "database_config": { + "credentials": {"password": "secret", "username": "admin"}, + "host": "localhost", + "port": 5432, + }, + "feature_flags": ["flag_b", "flag_a"], + "metadata": { + "environment": "production", + "tags": {"project": "analytics", "team": "data"}, + "version": "1.0.0", + }, + } + + actual_repr = _deterministic_repr(complex_vars) + expected_repr = repr(expected_structure) + assert actual_repr == expected_repr + + # Should be valid Python + reconstructed = eval(actual_repr) + assert isinstance(reconstructed, dict) + assert reconstructed == complex_vars From 78cdac788e7de8141438199588156bf81f6b6494 Mon Sep 17 00:00:00 2001 From: eakmanrq <6326532+eakmanrq@users.noreply.github.com> Date: Tue, 8 Jul 2025 08:01:54 -0700 Subject: [PATCH 2/2] feedback --- .../migrations/v0085_deterministic_repr.py | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/sqlmesh/migrations/v0085_deterministic_repr.py b/sqlmesh/migrations/v0085_deterministic_repr.py index da151b0667..9364926068 100644 --- a/sqlmesh/migrations/v0085_deterministic_repr.py +++ b/sqlmesh/migrations/v0085_deterministic_repr.py @@ -3,15 +3,23 @@ and therefore this migration applies deterministic sorting to the keys of the dictionary. 
""" -import ast import json import typing as t +from dataclasses import dataclass from sqlglot import exp from sqlmesh.utils.migration import index_text_type, blob_text_type +# Make sure `SqlValue` is defined so it can be used by `eval` call in the migration +@dataclass +class SqlValue: + """A SQL string representing a generated SQLGlot AST.""" + + sql: str + + def _deterministic_repr(obj: t.Any) -> str: """ This is a copy of the function from utils.metaprogramming @@ -27,7 +35,10 @@ def _normalize_for_repr(o: t.Any) -> t.Any: return type(o)(normalized) return o - return repr(_normalize_for_repr(obj)) + try: + return repr(_normalize_for_repr(obj)) + except Exception: + return repr(obj) def migrate(state_sync, **kwargs): # type: ignore @@ -75,33 +86,16 @@ def migrate(state_sync, **kwargs): # type: ignore old_payload = executable["payload"] try: # Try to parse the old payload and re-serialize it deterministically - parsed_value = ast.literal_eval(old_payload) + parsed_value = eval(old_payload) new_payload = _deterministic_repr(parsed_value) # Only update if the representation changed if old_payload != new_payload: executable["payload"] = new_payload migration_needed = True - except (ValueError, SyntaxError): - # Special handling for dictionaries containing SqlValue objects - # These can't be parsed by ast.literal_eval but we can still make them deterministic - if old_payload.startswith("{") and "SqlValue(" in old_payload: - try: - # Use eval in a safe context to parse SqlValue objects - # This is safe because we're only running this on our own serialized data - from sqlmesh.utils.metaprogramming import SqlValue - - safe_globals = {"SqlValue": SqlValue} - parsed_value = eval(old_payload, safe_globals, {}) - new_payload = _deterministic_repr(parsed_value) - - # Only update if the representation changed - if old_payload != new_payload: - executable["payload"] = new_payload - migration_needed = True - except (ValueError, SyntaxError, NameError): - # If we still can't 
parse it, leave it as-is - pass + except Exception: + # If the payload can't be evaluated, leave it as-is + pass new_snapshots.append( {