Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions sqlmesh/migrations/v0085_deterministic_repr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""
When serializing some objects, like `__sqlmesh__vars__`, the order of keys in the dictionary were not deterministic
and therefore this migration applies deterministic sorting to the keys of the dictionary.
"""

import json
import typing as t
from dataclasses import dataclass

from sqlglot import exp

from sqlmesh.utils.migration import index_text_type, blob_text_type


# Make sure `SqlValue` is defined so it can be used by `eval` call in the migration
@dataclass
class SqlValue:
"""A SQL string representing a generated SQLGlot AST."""

sql: str


def _deterministic_repr(obj: t.Any) -> str:
"""
This is a copy of the function from utils.metaprogramming
"""

def _normalize_for_repr(o: t.Any) -> t.Any:
if isinstance(o, dict):
sorted_items = sorted(o.items(), key=lambda x: str(x[0]))
return {k: _normalize_for_repr(v) for k, v in sorted_items}
if isinstance(o, (list, tuple)):
# Recursively normalize nested structures
normalized = [_normalize_for_repr(item) for item in o]
return type(o)(normalized)
return o

try:
return repr(_normalize_for_repr(obj))
except Exception:
return repr(obj)


def migrate(state_sync, **kwargs): # type: ignore
import pandas as pd

engine_adapter = state_sync.engine_adapter
schema = state_sync.schema
snapshots_table = "_snapshots"
if schema:
snapshots_table = f"{schema}.{snapshots_table}"

migration_needed = False
new_snapshots = []

for (
name,
identifier,
version,
snapshot,
kind_name,
updated_ts,
unpaused_ts,
ttl_ms,
unrestorable,
) in engine_adapter.fetchall(
exp.select(
"name",
"identifier",
"version",
"snapshot",
"kind_name",
"updated_ts",
"unpaused_ts",
"ttl_ms",
"unrestorable",
).from_(snapshots_table),
quote_identifiers=True,
):
parsed_snapshot = json.loads(snapshot)
python_env = parsed_snapshot["node"].get("python_env")

if python_env:
for key, executable in python_env.items():
if isinstance(executable, dict) and executable.get("kind") == "value":
old_payload = executable["payload"]
try:
# Try to parse the old payload and re-serialize it deterministically
parsed_value = eval(old_payload)
new_payload = _deterministic_repr(parsed_value)

# Only update if the representation changed
if old_payload != new_payload:
executable["payload"] = new_payload
migration_needed = True
except Exception:
# If we still can't eval it, leave it as-is
pass

new_snapshots.append(
{
"name": name,
"identifier": identifier,
"version": version,
"snapshot": json.dumps(parsed_snapshot),
"kind_name": kind_name,
"updated_ts": updated_ts,
"unpaused_ts": unpaused_ts,
"ttl_ms": ttl_ms,
"unrestorable": unrestorable,
}
)

if migration_needed and new_snapshots:
engine_adapter.delete_from(snapshots_table, "TRUE")

index_type = index_text_type(engine_adapter.dialect)
blob_type = blob_text_type(engine_adapter.dialect)

engine_adapter.insert_append(
snapshots_table,
pd.DataFrame(new_snapshots),
columns_to_types={
"name": exp.DataType.build(index_type),
"identifier": exp.DataType.build(index_type),
"version": exp.DataType.build(index_type),
"snapshot": exp.DataType.build(blob_type),
"kind_name": exp.DataType.build("text"),
"updated_ts": exp.DataType.build("bigint"),
"unpaused_ts": exp.DataType.build("bigint"),
"ttl_ms": exp.DataType.build("bigint"),
"unrestorable": exp.DataType.build("boolean"),
},
)
36 changes: 35 additions & 1 deletion sqlmesh/utils/metaprogramming.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,9 @@ def is_value(self) -> bool:

@classmethod
def value(cls, v: t.Any, is_metadata: t.Optional[bool] = None) -> Executable:
return Executable(payload=repr(v), kind=ExecutableKind.VALUE, is_metadata=is_metadata)
return Executable(
payload=_deterministic_repr(v), kind=ExecutableKind.VALUE, is_metadata=is_metadata
)


def serialize_env(env: t.Dict[str, t.Any], path: Path) -> t.Dict[str, Executable]:
Expand Down Expand Up @@ -633,6 +635,38 @@ def print_exception(
out.write(tb)


def _deterministic_repr(obj: t.Any) -> str:
"""Create a deterministic representation by ensuring consistent ordering before repr().

For dictionaries, ensures consistent key ordering to prevent non-deterministic
serialization that affects fingerprinting. Uses Python's native repr() logic
for all formatting to handle edge cases properly.

Note that this function assumes list/tuple order is significant and therefore does not sort them.

Args:
obj: The object to represent as a string.

Returns:
A deterministic string representation of the object.
"""

def _normalize_for_repr(o: t.Any) -> t.Any:
if isinstance(o, dict):
sorted_items = sorted(o.items(), key=lambda x: str(x[0]))
return {k: _normalize_for_repr(v) for k, v in sorted_items}
if isinstance(o, (list, tuple)):
# Recursively normalize nested structures
normalized = [_normalize_for_repr(item) for item in o]
return type(o)(normalized)
return o

try:
return repr(_normalize_for_repr(obj))
except Exception:
return repr(obj)


def import_python_file(path: Path, relative_base: Path = Path()) -> types.ModuleType:
relative_path = path.absolute().relative_to(relative_base.absolute())
module_name = str(relative_path.with_suffix("")).replace(os.path.sep, ".")
Expand Down
162 changes: 161 additions & 1 deletion tests/utils/test_metaprogramming.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from sqlmesh.utils.metaprogramming import (
Executable,
ExecutableKind,
_deterministic_repr,
build_env,
func_globals,
normalize_source,
Expand All @@ -48,7 +49,7 @@ def test_print_exception(mocker: MockerFixture):
except Exception as ex:
print_exception(ex, test_env, out_mock)

expected_message = r""" File ".*?.tests.utils.test_metaprogramming\.py", line 47, in test_print_exception
expected_message = r""" File ".*?.tests.utils.test_metaprogramming\.py", line 48, in test_print_exception
eval\("test_fun\(\)", env\).*

File '/test/path.py' \(or imported file\), line 2, in test_fun
Expand Down Expand Up @@ -457,3 +458,162 @@ def test_serialize_env_with_enum_import_appearing_in_two_functions() -> None:
}

assert serialized_env == expected_env


def test_deterministic_repr_basic_types():
"""Test _deterministic_repr with basic Python types."""
# Test basic types that should use standard repr
assert _deterministic_repr(42) == "42"
assert _deterministic_repr("hello") == "'hello'"
assert _deterministic_repr(True) == "True"
assert _deterministic_repr(None) == "None"
assert _deterministic_repr(3.14) == "3.14"


def test_deterministic_repr_dict_ordering():
"""Test that _deterministic_repr produces consistent output for dicts with different key ordering."""
# Same dict with different key ordering
dict1 = {"c": 3, "a": 1, "b": 2}
dict2 = {"a": 1, "b": 2, "c": 3}
dict3 = {"b": 2, "c": 3, "a": 1}

repr1 = _deterministic_repr(dict1)
repr2 = _deterministic_repr(dict2)
repr3 = _deterministic_repr(dict3)

# All should produce the same representation
assert repr1 == repr2 == repr3
assert repr1 == "{'a': 1, 'b': 2, 'c': 3}"


def test_deterministic_repr_mixed_key_types():
"""Test _deterministic_repr with mixed key types (strings and numbers)."""
dict1 = {42: "number", "string": "text", 1: "one"}
dict2 = {"string": "text", 1: "one", 42: "number"}

repr1 = _deterministic_repr(dict1)
repr2 = _deterministic_repr(dict2)

# Should produce consistent ordering despite mixed key types
assert repr1 == repr2
# Numbers come before strings when sorting by string representation
assert repr1 == "{1: 'one', 42: 'number', 'string': 'text'}"


def test_deterministic_repr_nested_structures():
"""Test _deterministic_repr with deeply nested dictionaries."""
nested1 = {"outer": {"z": 26, "a": 1}, "list": [3, {"y": 2, "x": 1}], "simple": "value"}

nested2 = {"simple": "value", "list": [3, {"x": 1, "y": 2}], "outer": {"a": 1, "z": 26}}

repr1 = _deterministic_repr(nested1)
repr2 = _deterministic_repr(nested2)

assert repr1 == repr2
# Verify structure is maintained with sorted keys
expected = "{'list': [3, {'x': 1, 'y': 2}], 'outer': {'a': 1, 'z': 26}, 'simple': 'value'}"
assert repr1 == expected


def test_deterministic_repr_lists_and_tuples():
"""Test _deterministic_repr preserves order for lists/tuples but sorts nested dicts."""
# Lists should maintain their order
list_with_dicts = [{"b": 2, "a": 1}, {"d": 4, "c": 3}]
list_repr = _deterministic_repr(list_with_dicts)
expected_list = "[{'a': 1, 'b': 2}, {'c': 3, 'd': 4}]"
assert list_repr == expected_list

# Tuples should maintain their order
tuple_with_dicts = ({"z": 26, "a": 1}, {"y": 25, "b": 2})
tuple_repr = _deterministic_repr(tuple_with_dicts)
expected_tuple = "({'a': 1, 'z': 26}, {'b': 2, 'y': 25})"
assert tuple_repr == expected_tuple


def test_deterministic_repr_empty_containers():
"""Test _deterministic_repr with empty containers."""
assert _deterministic_repr({}) == "{}"
assert _deterministic_repr([]) == "[]"
assert _deterministic_repr(()) == "()"


def test_deterministic_repr_special_characters():
"""Test _deterministic_repr handles special characters correctly."""
special_dict = {
"quotes": "text with 'single' and \"double\" quotes",
"unicode": "unicode: ñáéíóú",
"newlines": "text\nwith\nnewlines",
"backslashes": "path\\to\\file",
}

result = _deterministic_repr(special_dict)

# Should be valid Python that can be evaluated
reconstructed = eval(result)
assert reconstructed == special_dict

# Should be deterministic - same input produces same output
result2 = _deterministic_repr(special_dict)
assert result == result2


def test_deterministic_repr_executable_integration():
"""Test that _deterministic_repr works correctly with Executable.value()."""
# Test the integration with Executable.value which is the main use case
variables1 = {"env": "dev", "debug": True, "timeout": 30}
variables2 = {"timeout": 30, "debug": True, "env": "dev"}

exec1 = Executable.value(variables1)
exec2 = Executable.value(variables2)

# Should produce identical payloads despite different input ordering
assert exec1.payload == exec2.payload
assert exec1.payload == "{'debug': True, 'env': 'dev', 'timeout': 30}"

# Should be valid Python
reconstructed = eval(exec1.payload)
assert reconstructed == variables1


def test_deterministic_repr_complex_example():
"""Test _deterministic_repr with a complex real-world-like structure."""
complex_vars = {
"database_config": {
"host": "localhost",
"port": 5432,
"credentials": {"username": "admin", "password": "secret"},
},
"feature_flags": ["flag_b", "flag_a"],
"metadata": {
"version": "1.0.0",
"environment": "production",
"tags": {"team": "data", "project": "analytics"},
},
42: "numeric_key",
"arrays": [{"config": {"nested": True, "level": 2}}, {"simple": "value"}],
}

expected_structure = {
42: "numeric_key",
"arrays": [{"config": {"level": 2, "nested": True}}, {"simple": "value"}],
"database_config": {
"credentials": {"password": "secret", "username": "admin"},
"host": "localhost",
"port": 5432,
},
"feature_flags": ["flag_b", "flag_a"],
"metadata": {
"environment": "production",
"tags": {"project": "analytics", "team": "data"},
"version": "1.0.0",
},
}

actual_repr = _deterministic_repr(complex_vars)
expected_repr = repr(expected_structure)
assert actual_repr == expected_repr

# Should be valid Python
reconstructed = eval(actual_repr)
assert isinstance(reconstructed, dict)
assert reconstructed == complex_vars