From 34a58623fc627f49cc656f2f5d0df785f2c0dd1e Mon Sep 17 00:00:00 2001 From: eakmanrq <6326532+eakmanrq@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:37:16 -0700 Subject: [PATCH] feat!: support unicode characters --- .circleci/continue_config.yml | 12 ++++---- sqlmesh/core/engine_adapter/databricks.py | 2 ++ sqlmesh/utils/__init__.py | 7 +++-- .../integration/test_integration.py | 29 +++++++++++++++++++ tests/utils/__init__.py | 22 ++++++++++++++ tests/utils/test_cache.py | 1 + 6 files changed, 65 insertions(+), 8 deletions(-) diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index c549c0ae78..fecfa25ca3 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -300,8 +300,8 @@ workflows: name: cloud_engine_<< matrix.engine >> context: - sqlmesh_cloud_database_integration - requires: - - engine_tests_docker +# requires: +# - engine_tests_docker matrix: parameters: engine: @@ -313,10 +313,10 @@ workflows: - athena - fabric - gcp-postgres - filters: - branches: - only: - - main +# filters: +# branches: +# only: +# - main - ui_style - ui_test - vscode_test diff --git a/sqlmesh/core/engine_adapter/databricks.py b/sqlmesh/core/engine_adapter/databricks.py index 946a7bdf74..173e1b08af 100644 --- a/sqlmesh/core/engine_adapter/databricks.py +++ b/sqlmesh/core/engine_adapter/databricks.py @@ -34,6 +34,8 @@ class DatabricksEngineAdapter(SparkEngineAdapter): SUPPORTS_CLONING = True SUPPORTS_MATERIALIZED_VIEWS = True SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True + # Spark has this set to false for compatibility when mixing with Trino but that isn't a concern with Databricks + QUOTE_IDENTIFIERS_IN_VIEWS = True SCHEMA_DIFFER_KWARGS = { "support_positional_add": True, "nested_support": NestedSupport.ALL, diff --git a/sqlmesh/utils/__init__.py b/sqlmesh/utils/__init__.py index c220de4847..9d7a85d4e1 100644 --- a/sqlmesh/utils/__init__.py +++ b/sqlmesh/utils/__init__.py @@ -13,6 +13,7 @@ import types import typing as t import uuid 
+import unicodedata from dataclasses import dataclass from collections import defaultdict from contextlib import contextmanager @@ -289,11 +290,13 @@ def sqlglot_dialects() -> str: return "'" + "', '".join(Dialects.__members__.values()) + "'" -NON_ALNUM = re.compile(r"[^a-zA-Z0-9_]") +NON_WORD = re.compile(r"\W", flags=re.UNICODE) def sanitize_name(name: str) -> str: - return NON_ALNUM.sub("_", name) + s = unicodedata.normalize("NFC", name) + s = NON_WORD.sub("_", s) + return s def groupby( diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py index 5190d26e98..9164c79016 100644 --- a/tests/core/engine_adapter/integration/test_integration.py +++ b/tests/core/engine_adapter/integration/test_integration.py @@ -3990,3 +3990,32 @@ def _set_config(gateway: str, config: Config) -> None: was_evaluated=True, day_delta=4, ) + + +def test_unicode_characters(ctx: TestContext, tmp_path: Path): + if ctx.dialect in ["spark", "trino"]: + # It is possible that Trino could support this if we changed `QUOTE_IDENTIFIERS_IN_VIEWS` but that would + # break the compatibility it has when mixed with Spark for compute + pytest.skip("Skipping as these engines have issues with unicode characters in model names") + + model_name = "客户数据" + table = ctx.table(model_name).sql(dialect=ctx.dialect) + (tmp_path / "models").mkdir(exist_ok=True) + + model_def = f""" + MODEL ( + name {table}, + kind FULL, + dialect '{ctx.dialect}' + ); + SELECT 1 as id + """ + + (tmp_path / "models" / "客户数据.sql").write_text(model_def) + + context = ctx.create_context(path=tmp_path) + context.plan(auto_apply=True, no_prompts=True) + + results = ctx.get_metadata_results() + assert len(results.views) == 1 + assert results.views[0].lower() == model_name diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index e69de29bb2..b395cefaaa 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -0,0 +1,22 @@ +import pytest + 
+from sqlmesh.utils import sanitize_name + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("simple", "simple"), + ("snake_case", "snake_case"), + ("客户数据", "客户数据"), # pure Chinese kept + ("客户-数据 v2", "客户_数据_v2"), # dash/space -> underscore + ("中文,逗号", "中文_逗号"), # full-width comma -> underscore + ("a/b", "a_b"), # slash -> underscore + ("spaces\tand\nnewlines", "spaces_and_newlines"), + ("data📦2025", "data_2025"), + ("MiXeD123_名字", "MiXeD123_名字"), + ("", ""), + ], +) +def test_sanitize_known_cases(raw, expected): + assert sanitize_name(raw) == expected diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index cd1fdb0115..0b6d335446 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -39,6 +39,7 @@ def test_file_cache(tmp_path: Path, mocker: MockerFixture): loader.assert_called_once() assert "___test_model_" in cache._cache_entry_path('"test_model"').name + assert "客户数据" in cache._cache_entry_path("客户数据").name def test_optimized_query_cache(tmp_path: Path, mocker: MockerFixture):