|
13 | 13 | import pandas as pd # noqa: TID253 |
14 | 14 | import pytest |
15 | 15 | import pytz |
| 16 | +import time_machine |
16 | 17 | from sqlglot import exp, parse_one |
17 | 18 | from sqlglot.optimizer.normalize_identifiers import normalize_identifiers |
18 | 19 |
|
19 | 20 | from sqlmesh import Config, Context |
20 | 21 | from sqlmesh.cli.project_init import init_example_project |
21 | 22 | from sqlmesh.core.config import load_config_from_paths |
22 | | -from sqlmesh.core.config.connection import ConnectionConfig |
| 23 | +from sqlmesh.core.config.connection import ConnectionConfig, DuckDBConnectionConfig |
23 | 24 | import sqlmesh.core.dialect as d |
24 | 25 | from sqlmesh.core.dialect import select_from_values |
25 | 26 | from sqlmesh.core.model import Model, load_sql_based_model |
| 27 | +from sqlmesh.core.engine_adapter import EngineAdapter |
26 | 28 | from sqlmesh.core.engine_adapter.shared import DataObject, DataObjectType |
27 | 29 | from sqlmesh.core.engine_adapter.mixins import RowDiffMixin, LogicalMergeMixin |
28 | 30 | from sqlmesh.core.model.definition import create_sql_model |
29 | 31 | from sqlmesh.core.plan import Plan |
| 32 | +from sqlmesh.core.state_sync.cache import CachingStateSync |
30 | 33 | from sqlmesh.core.state_sync.db import EngineAdapterStateSync |
31 | | -from sqlmesh.core.snapshot import Snapshot, SnapshotChangeCategory |
32 | | -from sqlmesh.utils.date import now, to_date, to_time_column |
| 34 | +from sqlmesh.core.snapshot import Snapshot, SnapshotChangeCategory, SnapshotId |
| 35 | +from sqlmesh.utils.date import now, to_date, to_time_column, to_ds |
33 | 36 | from sqlmesh.core.table_diff import TableDiff |
34 | 37 | from sqlmesh.utils.errors import SQLMeshError |
35 | 38 | from sqlmesh.utils.pydantic import PydanticModel |
@@ -2799,3 +2802,180 @@ def test_identifier_length_limit(ctx: TestContext): |
2799 | 2802 | match=re.escape(match), |
2800 | 2803 | ): |
2801 | 2804 | adapter.create_table(long_table_name, {"col": exp.DataType.build("int")}) |
| 2805 | + |
| 2806 | + |
def test_janitor_drops_downstream_unexpired_hard_dependencies(
    ctx: TestContext, tmp_path: pathlib.Path
):
    """
    Ensure that cleaning up expired table snapshots also cleans up any unexpired view
    snapshots that depend on them.

    Timeline (driven by time_machine):

    - 2020-01-01: create A (table) <- B (view) and plan into prod
    - 2020-01-02: in dev, modify A - triggers a new version of A and a dev preview of B,
      both expiring 7 days later (2020-01-09)
    - 2020-01-05: in dev, modify B - triggers a new version of B that still depends on A
      but expires 3 days after it (2020-01-12)
    - 2020-01-10: A has reached its expiry but B has not; invalidate dev so none of these
      snapshots are promoted and are thus targets for cleanup, then run the janitor

    Expected outcome:
    - All the dev versions of A and B should be dropped
    - We should not get a 'ERROR: cannot drop table x because other objects depend on it'
      on engines that do schema binding
    """

    def _state_sync_engine_adapter(context: Context) -> EngineAdapter:
        # Reach through the caching layer to the engine adapter backing the state
        # database so the `_snapshots` table can be inspected directly.
        assert isinstance(context.state_sync, CachingStateSync)
        assert isinstance(context.state_sync.state_sync, EngineAdapterStateSync)
        return context.state_sync.state_sync.engine_adapter

    models_dir = tmp_path / "models"
    models_dir.mkdir()
    schema = exp.to_table(ctx.schema(TEST_SCHEMA)).this

    (models_dir / "model_a.sql").write_text(f"""
    MODEL (
        name {schema}.model_a,
        kind FULL
    );

    SELECT 1 as a, 2 as b;
    """)

    (models_dir / "model_b.sql").write_text(f"""
    MODEL (
        name {schema}.model_b,
        kind VIEW
    );

    SELECT a from {schema}.model_a;
    """)

    def _mutate_config(gateway: str, config: Config):
        # Use an on-disk DuckDB state database so state survives across the
        # multiple Context instances created below.
        config.gateways[gateway].state_connection = DuckDBConnectionConfig(
            database=str(tmp_path / "state.db")
        )

    with time_machine.travel("2020-01-01 00:00:00"):
        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )
        sqlmesh.plan(auto_apply=True)

        model_a_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_a" in n)
        # expiry is last updated + ttl
        assert timedelta(milliseconds=model_a_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_a_snapshot.updated_ts) == "2020-01-01"
        assert to_ds(model_a_snapshot.expiration_ts) == "2020-01-08"

        model_b_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_b" in n)
        assert timedelta(milliseconds=model_b_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_b_snapshot.updated_ts) == "2020-01-01"
        assert to_ds(model_b_snapshot.expiration_ts) == "2020-01-08"

        model_a_prod_snapshot = model_a_snapshot
        model_b_prod_snapshot = model_b_snapshot

    # move forward 1 day
    # new dev environment - touch models to create new snapshots
    # model a / b expiry in prod should remain unmodified
    # model a / b expiry in dev should be as at today
    with time_machine.travel("2020-01-02 00:00:00"):
        (models_dir / "model_a.sql").write_text(f"""
    MODEL (
        name {schema}.model_a,
        kind FULL
    );

    SELECT 1 as a, 2 as b, 3 as c;
    """)

        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )
        sqlmesh.plan(environment="dev", auto_apply=True)

        # should now have 4 snapshots in state - 2x model a and 2x model b
        # the new model b is a dev preview because its upstream model changed
        assert (
            len(_state_sync_engine_adapter(sqlmesh).fetchall("select * from sqlmesh._snapshots"))
            == 4
        )

        # context just has the two latest
        assert len(sqlmesh.snapshots) == 2

        # these expire 1 day later than what's in prod
        model_a_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_a" in n)
        assert timedelta(milliseconds=model_a_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_a_snapshot.updated_ts) == "2020-01-02"
        assert to_ds(model_a_snapshot.expiration_ts) == "2020-01-09"

        model_b_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_b" in n)
        assert timedelta(milliseconds=model_b_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_b_snapshot.updated_ts) == "2020-01-02"
        assert to_ds(model_b_snapshot.expiration_ts) == "2020-01-09"

    # move forward 3 days
    # touch model b in dev but leave model a
    # this bumps the model b expiry but model a remains unchanged, so will expire before model b even though model b depends on it
    with time_machine.travel("2020-01-05 00:00:00"):
        (models_dir / "model_b.sql").write_text(f"""
    MODEL (
        name {schema}.model_b,
        kind VIEW
    );

    SELECT a, 'b' as b from {schema}.model_a;
    """)

        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )
        sqlmesh.plan(environment="dev", auto_apply=True)

        # should now have 5 snapshots in state - 2x model a and 3x model b
        assert (
            len(_state_sync_engine_adapter(sqlmesh).fetchall("select * from sqlmesh._snapshots"))
            == 5
        )

        # context just has the two latest
        assert len(sqlmesh.snapshots) == 2

        # model a expiry should not have changed
        model_a_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_a" in n)
        assert timedelta(milliseconds=model_a_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_a_snapshot.updated_ts) == "2020-01-02"
        assert to_ds(model_a_snapshot.expiration_ts) == "2020-01-09"

        # model b should now expire well after model a
        model_b_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_b" in n)
        assert timedelta(milliseconds=model_b_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_b_snapshot.updated_ts) == "2020-01-05"
        assert to_ds(model_b_snapshot.expiration_ts) == "2020-01-12"

    # move forward to date where after model a has expired but before model b has expired
    # invalidate dev to trigger cleanups
    # run janitor. model a is expired so will be cleaned up and this will cascade to model b.
    with time_machine.travel("2020-01-10 00:00:00"):
        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )

        before_snapshots = _state_sync_engine_adapter(sqlmesh).fetchall(
            "select name, identifier from sqlmesh._snapshots"
        )
        sqlmesh.invalidate_environment("dev")
        sqlmesh.run_janitor(ignore_ttl=False)
        after_snapshots = _state_sync_engine_adapter(sqlmesh).fetchall(
            "select name, identifier from sqlmesh._snapshots"
        )

        # the janitor must have actually removed something
        assert len(before_snapshots) != len(after_snapshots)

        # all that's left should be the two snapshots that were in prod
        assert {
            SnapshotId(name=name, identifier=identifier) for name, identifier in after_snapshots
        } == {model_a_prod_snapshot.snapshot_id, model_b_prod_snapshot.snapshot_id}
0 commit comments