|
13 | 13 | import pandas as pd # noqa: TID253 |
14 | 14 | import pytest |
15 | 15 | import pytz |
| 16 | +import time_machine |
16 | 17 | from sqlglot import exp, parse_one |
17 | 18 | from sqlglot.optimizer.normalize_identifiers import normalize_identifiers |
18 | 19 |
|
19 | 20 | from sqlmesh import Config, Context |
20 | 21 | from sqlmesh.cli.project_init import init_example_project |
21 | 22 | from sqlmesh.core.config import load_config_from_paths |
22 | | -from sqlmesh.core.config.connection import ConnectionConfig |
| 23 | +from sqlmesh.core.config.connection import ConnectionConfig, DuckDBConnectionConfig |
23 | 24 | import sqlmesh.core.dialect as d |
24 | 25 | from sqlmesh.core.dialect import select_from_values |
25 | 26 | from sqlmesh.core.model import Model, load_sql_based_model |
| 27 | +from sqlmesh.core.engine_adapter import EngineAdapter |
26 | 28 | from sqlmesh.core.engine_adapter.shared import DataObject, DataObjectType |
27 | 29 | from sqlmesh.core.engine_adapter.mixins import RowDiffMixin |
28 | 30 | from sqlmesh.core.model.definition import create_sql_model |
29 | 31 | from sqlmesh.core.plan import Plan |
| 32 | +from sqlmesh.core.state_sync.cache import CachingStateSync |
30 | 33 | from sqlmesh.core.state_sync.db import EngineAdapterStateSync |
31 | | -from sqlmesh.core.snapshot import Snapshot, SnapshotChangeCategory |
32 | | -from sqlmesh.utils.date import now, to_date, to_time_column |
| 34 | +from sqlmesh.core.snapshot import Snapshot, SnapshotChangeCategory, SnapshotId |
| 35 | +from sqlmesh.utils.date import now, to_date, to_time_column, to_ds |
33 | 36 | from sqlmesh.core.table_diff import TableDiff |
34 | 37 | from sqlmesh.utils.errors import SQLMeshError |
35 | 38 | from sqlmesh.utils.pydantic import PydanticModel |
@@ -2672,3 +2675,180 @@ def test_identifier_length_limit(ctx: TestContext): |
2672 | 2675 | match=re.escape(match), |
2673 | 2676 | ): |
2674 | 2677 | adapter.create_table(long_table_name, {"col": exp.DataType.build("int")}) |
| 2678 | + |
| 2679 | + |
def test_janitor_drops_downstream_unexpired_hard_dependencies(
    ctx: TestContext, tmp_path: pathlib.Path
):
    """
    Scenario:

    Ensure that cleaning up expired table snapshots also cleans up any unexpired view snapshots that depend on them

    - We create A (table) <- B (view) and plan them into prod
    - One day later in dev, we modify A - triggers a new version of A and a dev preview of B that both expire in 7 days
    - We advance time by 3 days
    - In dev, we modify B - triggers a new version of B that depends on A but expires 3 days after A
    - We advance time by 5 days so that A has reached its expiry but B has not
    - We expire dev so that none of these snapshots are promoted and are thus targets for cleanup
    - We run the janitor

    Expected outcome:
    - All the dev versions of A and B should be dropped
    - We should not get an 'ERROR: cannot drop table x because other objects depend on it' on engines that do schema binding
    """

    def _state_sync_engine_adapter(context: Context) -> EngineAdapter:
        # Unwrap the caching layer so we can query the state database
        # (sqlmesh._snapshots) directly with raw SQL.
        assert isinstance(context.state_sync, CachingStateSync)
        assert isinstance(context.state_sync.state_sync, EngineAdapterStateSync)
        return context.state_sync.state_sync.engine_adapter

    models_dir = tmp_path / "models"
    models_dir.mkdir()
    schema = exp.to_table(ctx.schema(TEST_SCHEMA)).this

    (models_dir / "model_a.sql").write_text(f"""
    MODEL (
        name {schema}.model_a,
        kind FULL
    );

    SELECT 1 as a, 2 as b;
    """)

    (models_dir / "model_b.sql").write_text(f"""
    MODEL (
        name {schema}.model_b,
        kind VIEW
    );

    SELECT a from {schema}.model_a;
    """)

    def _mutate_config(gateway: str, config: Config):
        # Persist state to a file-backed DuckDB so it survives across the
        # multiple Context instances created at different points in time below.
        config.gateways[gateway].state_connection = DuckDBConnectionConfig(
            database=str(tmp_path / "state.db")
        )

    with time_machine.travel("2020-01-01 00:00:00"):
        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )
        sqlmesh.plan(auto_apply=True)

        model_a_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_a" in n)
        # expiry is last updated + ttl
        assert timedelta(milliseconds=model_a_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_a_snapshot.updated_ts) == "2020-01-01"
        assert to_ds(model_a_snapshot.expiration_ts) == "2020-01-08"

        model_b_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_b" in n)
        assert timedelta(milliseconds=model_b_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_b_snapshot.updated_ts) == "2020-01-01"
        assert to_ds(model_b_snapshot.expiration_ts) == "2020-01-08"

        model_a_prod_snapshot = model_a_snapshot
        model_b_prod_snapshot = model_b_snapshot

    # move forward 1 day
    # new dev environment - touch models to create new snapshots
    # model a / b expiry in prod should remain unmodified
    # model a / b expiry in dev should be as at today
    with time_machine.travel("2020-01-02 00:00:00"):
        (models_dir / "model_a.sql").write_text(f"""
        MODEL (
            name {schema}.model_a,
            kind FULL
        );

        SELECT 1 as a, 2 as b, 3 as c;
        """)

        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )
        sqlmesh.plan(environment="dev", auto_apply=True)

        # should now have 4 snapshots in state - 2x model a and 2x model b
        # the new model b is a dev preview because its upstream model changed
        assert (
            len(_state_sync_engine_adapter(sqlmesh).fetchall("select * from sqlmesh._snapshots"))
            == 4
        )

        # context just has the two latest
        assert len(sqlmesh.snapshots) == 2

        # these expire 1 day later than what's in prod
        model_a_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_a" in n)
        assert timedelta(milliseconds=model_a_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_a_snapshot.updated_ts) == "2020-01-02"
        assert to_ds(model_a_snapshot.expiration_ts) == "2020-01-09"

        model_b_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_b" in n)
        assert timedelta(milliseconds=model_b_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_b_snapshot.updated_ts) == "2020-01-02"
        assert to_ds(model_b_snapshot.expiration_ts) == "2020-01-09"

    # move forward 3 days
    # touch model b in dev but leave model a
    # this bumps the model b expiry but model a remains unchanged, so will expire before model b even though model b depends on it
    with time_machine.travel("2020-01-05 00:00:00"):
        (models_dir / "model_b.sql").write_text(f"""
        MODEL (
            name {schema}.model_b,
            kind VIEW
        );

        SELECT a, 'b' as b from {schema}.model_a;
        """)

        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )
        sqlmesh.plan(environment="dev", auto_apply=True)

        # should now have 5 snapshots in state - 2x model a and 3x model b
        assert (
            len(_state_sync_engine_adapter(sqlmesh).fetchall("select * from sqlmesh._snapshots"))
            == 5
        )

        # context just has the two latest
        assert len(sqlmesh.snapshots) == 2

        # model a expiry should not have changed
        model_a_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_a" in n)
        assert timedelta(milliseconds=model_a_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_a_snapshot.updated_ts) == "2020-01-02"
        assert to_ds(model_a_snapshot.expiration_ts) == "2020-01-09"

        # model b should now expire well after model a
        model_b_snapshot = next(s for n, s in sqlmesh.snapshots.items() if "model_b" in n)
        assert timedelta(milliseconds=model_b_snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(model_b_snapshot.updated_ts) == "2020-01-05"
        assert to_ds(model_b_snapshot.expiration_ts) == "2020-01-12"

    # move forward to a date after model a has expired but before model b has expired
    # invalidate dev to trigger cleanups
    # run janitor. model a is expired so will be cleaned up and this will cascade to model b.
    with time_machine.travel("2020-01-10 00:00:00"):
        sqlmesh = ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )

        before_snapshots = _state_sync_engine_adapter(sqlmesh).fetchall(
            "select name, identifier from sqlmesh._snapshots"
        )
        sqlmesh.invalidate_environment("dev")
        sqlmesh.run_janitor(ignore_ttl=False)
        after_snapshots = _state_sync_engine_adapter(sqlmesh).fetchall(
            "select name, identifier from sqlmesh._snapshots"
        )

        assert len(before_snapshots) != len(after_snapshots)

        # all that's left should be the two snapshots that were in prod
        assert {
            SnapshotId(name=name, identifier=identifier) for name, identifier in after_snapshots
        } == {model_a_prod_snapshot.snapshot_id, model_b_prod_snapshot.snapshot_id}
0 commit comments