|
13 | 13 | import pandas as pd # noqa: TID253 |
14 | 14 | import pytest |
15 | 15 | import pytz |
| 16 | +import time_machine |
16 | 17 | from sqlglot import exp, parse_one |
17 | 18 | from sqlglot.optimizer.normalize_identifiers import normalize_identifiers |
18 | 19 |
|
|
28 | 29 | from sqlmesh.core.model.definition import create_sql_model |
29 | 30 | from sqlmesh.core.plan import Plan |
30 | 31 | from sqlmesh.core.state_sync.db import EngineAdapterStateSync |
31 | | -from sqlmesh.core.snapshot import Snapshot, SnapshotChangeCategory |
32 | | -from sqlmesh.utils.date import now, to_date, to_time_column |
| 32 | +from sqlmesh.core.snapshot import Snapshot, SnapshotChangeCategory, SnapshotId |
| 33 | +from sqlmesh.utils.date import now, to_date, to_time_column, to_ds |
33 | 34 | from sqlmesh.core.table_diff import TableDiff |
34 | 35 | from sqlmesh.utils.errors import SQLMeshError |
35 | 36 | from sqlmesh.utils.pydantic import PydanticModel |
@@ -2672,3 +2673,166 @@ def test_identifier_length_limit(ctx: TestContext): |
2672 | 2673 | match=re.escape(match), |
2673 | 2674 | ): |
2674 | 2675 | adapter.create_table(long_table_name, {"col": exp.DataType.build("int")}) |
| 2676 | + |
| 2677 | + |
def test_janitor_out_of_order_drop(ctx: TestContext, tmp_path: pathlib.Path):
    """
    Scenario:

    Ensure that cleaning up expired table snapshots also cleans up any unexpired view
    snapshots that depend on them.

    - We create A (table) <- B (view)
    - In dev, we modify A - triggers a new version of A and a dev preview of B that both expire in 7 days
    - We advance time by 3 days
    - In dev, we modify B - triggers a new version of B that depends on A but expires 3 days after A
    - We advance time by 5 days so that A has reached its expiry but B has not
    - We expire dev so that none of these snapshots are promoted and are thus targets for cleanup
    - We run the janitor

    Expected outcome:
    - All the dev versions of A and B should be dropped
    - We should not get an 'ERROR: cannot drop table x because other objects depend on it'
      on engines that do schema binding
    """

    models_dir = tmp_path / "models"
    models_dir.mkdir()
    schema = exp.to_table(ctx.schema(TEST_SCHEMA)).this

    def _mutate_config(gateway: str, config: Config):
        # helps with debugging when you can see the SQLMesh state right next to the model tables
        config.gateways[gateway].state_schema = f"{schema}"

    def _new_context():
        # Recreated per time-travel window so the context picks up the current (frozen) clock.
        return ctx.create_context(
            path=tmp_path, config_mutator=_mutate_config, ephemeral_state_connection=False
        )

    def _snapshot_containing(context, name_fragment: str) -> Snapshot:
        # The context only tracks the latest snapshot per model, so a substring match
        # on the model name is unambiguous here.
        return next(s for n, s in context.snapshots.items() if name_fragment in n)

    def _assert_lifecycle(snapshot: Snapshot, updated: str, expires: str) -> None:
        # A snapshot's expiry is its last-updated timestamp plus its TTL
        # (the default TTL of 1 week applies throughout this test).
        assert timedelta(milliseconds=snapshot.ttl_ms) == timedelta(weeks=1)
        assert to_ds(snapshot.updated_ts) == updated
        assert to_ds(snapshot.expiration_ts) == expires

    def _count_state_snapshots(context) -> int:
        # Total snapshot records in the state tables (not just the latest per model).
        return len(context.engine_adapter.fetchall(f"select * from {schema}._snapshots"))

    (models_dir / "model_a.sql").write_text(f"""
    MODEL (
        name {schema}.model_a,
        kind FULL
    );

    SELECT 1 as a, 2 as b;
    """)

    (models_dir / "model_b.sql").write_text(f"""
    MODEL (
        name {schema}.model_b,
        kind VIEW
    );

    SELECT a from {schema}.model_a;
    """)

    with time_machine.travel("2020-01-01 00:00:00"):
        sqlmesh = _new_context()
        sqlmesh.plan(auto_apply=True)

        model_a_prod_snapshot = _snapshot_containing(sqlmesh, "model_a")
        model_b_prod_snapshot = _snapshot_containing(sqlmesh, "model_b")

        # expiry is last updated + ttl
        _assert_lifecycle(model_a_prod_snapshot, "2020-01-01", "2020-01-08")
        _assert_lifecycle(model_b_prod_snapshot, "2020-01-01", "2020-01-08")

    # move forward 1 day
    # new dev environment - touch models to create new snapshots
    # model a / b expiry in prod should remain unmodified
    # model a / b expiry in dev should be as at today
    with time_machine.travel("2020-01-02 00:00:00"):
        (models_dir / "model_a.sql").write_text(f"""
        MODEL (
            name {schema}.model_a,
            kind FULL
        );

        SELECT 1 as a, 2 as b, 3 as c;
        """)

        sqlmesh = _new_context()
        sqlmesh.plan(environment="dev", auto_apply=True)

        # should now have 4 snapshots in state - 2x model a and 2x model b
        # the new model b is a dev preview because its upstream model changed
        assert _count_state_snapshots(sqlmesh) == 4

        # context just has the two latest
        assert len(sqlmesh.snapshots) == 2

        # these expire 1 day later than what's in prod
        _assert_lifecycle(_snapshot_containing(sqlmesh, "model_a"), "2020-01-02", "2020-01-09")
        _assert_lifecycle(_snapshot_containing(sqlmesh, "model_b"), "2020-01-02", "2020-01-09")

    # move forward 3 days
    # touch model b in dev but leave model a
    # this bumps the model b expiry but model a remains unchanged, so will expire before
    # model b even though model b depends on it
    with time_machine.travel("2020-01-05 00:00:00"):
        (models_dir / "model_b.sql").write_text(f"""
        MODEL (
            name {schema}.model_b,
            kind VIEW
        );

        SELECT a, 'b' as b from {schema}.model_a;
        """)

        sqlmesh = _new_context()
        sqlmesh.plan(environment="dev", auto_apply=True)

        # should now have 5 snapshots in state - 2x model a and 3x model b
        assert _count_state_snapshots(sqlmesh) == 5

        # context just has the two latest
        assert len(sqlmesh.snapshots) == 2

        # model a expiry should not have changed
        _assert_lifecycle(_snapshot_containing(sqlmesh, "model_a"), "2020-01-02", "2020-01-09")

        # model b should now expire well after model a
        _assert_lifecycle(_snapshot_containing(sqlmesh, "model_b"), "2020-01-05", "2020-01-12")

    # move forward to a date after model a has expired but before model b has expired
    # invalidate dev to trigger cleanups
    # run janitor. model a is expired so will be cleaned up and this will cascade to model b.
    with time_machine.travel("2020-01-10 00:00:00"):
        sqlmesh = _new_context()

        before_snapshots = sqlmesh.engine_adapter.fetchall(
            f"select name, identifier from {schema}._snapshots"
        )
        sqlmesh.invalidate_environment("dev")
        sqlmesh.run_janitor(ignore_ttl=False)
        after_snapshots = sqlmesh.engine_adapter.fetchall(
            f"select name, identifier from {schema}._snapshots"
        )

        # the janitor only ever deletes, so assert a strict decrease rather than mere inequality
        assert len(after_snapshots) < len(before_snapshots)

        # all that's left should be the two snapshots that were in prod
        assert {
            SnapshotId(name=name, identifier=identifier) for name, identifier in after_snapshots
        } == {model_a_prod_snapshot.snapshot_id, model_b_prod_snapshot.snapshot_id}
0 commit comments