From d99cf964d75715e25ca8dba925dc4f57d06f3f8c Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Mon, 2 Feb 2026 17:22:58 +0000 Subject: [PATCH] Extend the db-maintenance check logic to handle duplicate ongoing builds The weekly build starts on a Friday and typically takes about 5 days, with DB maintenance happening in the last 10-20 hours when the (quick) SwapTables event and the (not quick) CodedEvent_SNOMED rebuild event happen. Occasionally something delays or slows down a build so that it takes longer than a week, and the next Fridays build starts before the previous one had finished. The previous logic for the maintenance mode check only looked at the most recently started overall build; this meant that if a new build started before the previous one had finished, it would have no associated SwapTables/CodedEvent_SNOMED events yet, and we're report that we were out of maintenance mode when we weren't. We now look for the most recent TWO builds, so we can check that they're not both ongoing. If they are both ongoing, we use the earliest of the two to check for associated SwapTables/CodedEvent_SNOMED events. If we determine that we're not in maintenance mode, we now also do a final check to ensure that the CodedEvent_SNOMED table really is available. We return both the maintenance mode status and the build count so that the RAP Agent can include the build count in telemetry, and we can set up alerts in honeycomb if we ever see a build count of 2. --- tests/test_maintenance_mode.py | 109 +++++++++++++++++++++++-- tpp_database_utils/main.py | 7 +- tpp_database_utils/maintenance_mode.py | 53 +++++++++--- 3 files changed, 148 insertions(+), 21 deletions(-) diff --git a/tests/test_maintenance_mode.py b/tests/test_maintenance_mode.py index 0cee1d4..4faf007 100644 --- a/tests/test_maintenance_mode.py +++ b/tests/test_maintenance_mode.py @@ -42,10 +42,38 @@ def _build_progress(values): tpp_connection.cursor().execute("DELETE FROM BuildProgress") +@pytest.fixture +def cleanup_coded_event_snomed_table(tpp_connection): + yield + with tpp_connection.cursor() as cursor: + table = "CodedEvent_SNOMED" + cursor.execute( + f""" + IF OBJECT_ID('{table}', 'U') IS NOT NULL + DROP TABLE {table} + """ + ) + + +def create_coded_event_snomed_table(tpp_connection): + with tpp_connection.cursor() as cursor: + table = "CodedEvent_SNOMED" + cursor.execute( + f""" + IF OBJECT_ID('{table}', 'U') IS NOT NULL + DROP TABLE {table} + + CREATE TABLE {table} ( + CodedEvent_ID INT, + ) + """ + ) + + @pytest.mark.parametrize( - "description,events,is_in_maintenance_mode", + "description,events,is_in_maintenance_mode,expected_build_count", [ - ("No events", [], False), + ("No events", [], False, 0), ( "Historical finished maintenance mode", [ @@ -72,6 +100,7 @@ def _build_progress(values): ), ], False, + 0, ), ( "Historical unfinished maintenance mode", @@ -113,6 +142,7 @@ def _build_progress(values): ), ], False, + 0, ), ( "Inconsistent entries, assume maintenance for safety", @@ -154,6 +184,7 @@ def _build_progress(values): ), ], True, + 1, ), ( "Swapping tables", @@ -174,6 +205,7 @@ def _build_progress(values): ), ], True, + 1, ), ( "Building CodedEvent_SNOMED", @@ -201,18 +233,85 @@ def _build_progress(values): ), ], True, + 1, + ), + ( + "Multiple ongoing builds", + [ + ( + "OpenSAFELY", + "2025-06-12T14:25:10", + "2025-06-12T14:25:10", + "9999-12-31T00:00:00", + None, + ), + ( + "Swap Tables", + "2025-06-12T14:25:10", + "2025-06-12T14:25:10", + "9999-12-31T00:00:00", + None, + ), + ( + "OpenSAFELY", + "2026-02-01T14:25:10", + "2026-02-01T14:25:10", + "9999-12-31T00:00:00", + None, + ), + ], + True, + 2, ), ], ) def test_in_maintenance_mode( - tpp_connection, build_progress_factory, description, events, is_in_maintenance_mode + tpp_connection, + build_progress_factory, + description, + events, + is_in_maintenance_mode, + expected_build_count, ): for event in events: build_progress_factory(event) verify_build_progress_count(tpp_connection, events) - mode = in_maintenance_mode(tpp_connection) + mode, build_count = in_maintenance_mode(tpp_connection) assert mode is is_in_maintenance_mode, description + assert build_count == expected_build_count + + +@pytest.mark.parametrize( + "table_available,is_in_maintenance_mode", [(True, False), (False, True)] +) +def test_in_maintenance_mode_checks_coded_event_snomed_availability( + tpp_connection, + build_progress_factory, + cleanup_coded_event_snomed_table, + table_available, + is_in_maintenance_mode, +): + # Main build has started, SwapTables and CodedEvent_SNOMED events have not started yet + # We are not in maintenance mode according to the BuildProgress table, but the final + # check to ensure availability of the CodedEvent_SNOMED table can override this + events = [ + ( + "OpenSAFELY", + "2025-06-12T14:25:10", + "2025-06-12T14:25:10", + "9999-12-31T00:00:00", + None, + ) + ] + for event in events: + build_progress_factory(event) + verify_build_progress_count(tpp_connection, events) + + if table_available: + create_coded_event_snomed_table(tpp_connection) + + assert in_maintenance_mode(tpp_connection)[0] == is_in_maintenance_mode def test_in_maintenance_mode_custom_event(tpp_connection, build_progress_factory): @@ -239,7 +338,7 @@ def test_in_maintenance_mode_custom_event(tpp_connection, build_progress_factory build_progress_factory(event) verify_build_progress_count(tpp_connection, events) - assert in_maintenance_mode(tpp_connection) is True + assert in_maintenance_mode(tpp_connection)[0] is True def verify_build_progress_count(tpp_connection, events): diff --git a/tpp_database_utils/main.py b/tpp_database_utils/main.py index c822001..993aeb9 100644 --- a/tpp_database_utils/main.py +++ b/tpp_database_utils/main.py @@ -10,12 +10,13 @@ def in_maintenance_mode(tpp_connection): - mode = maintenance_mode.in_maintenance_mode(tpp_connection) - + mode, build_count = maintenance_mode.in_maintenance_mode(tpp_connection) # This should be the only output on stdout, # anything else needs to be on stderr if mode: - print("db-maintenance") + print(f"db-maintenance;{build_count}") + else: + print(f"none;{build_count}") def update_custom_medication_dictionary(tpp_connection, temp_database_name): diff --git a/tpp_database_utils/maintenance_mode.py b/tpp_database_utils/maintenance_mode.py index 1d051ef..81ef8da 100644 --- a/tpp_database_utils/maintenance_mode.py +++ b/tpp_database_utils/maintenance_mode.py @@ -1,5 +1,7 @@ import os +from pymssql.exceptions import ProgrammingError + def in_maintenance_mode(tpp_connection): """TPP Specific maintenance mode query. @@ -26,38 +28,63 @@ def in_maintenance_mode(tpp_connection): The trigger to exit maintenance mode is when the final OpenSAFELY event finishes. - """ + Occasionally the build is slow (usually due to intensive concurrent DB queries), + and the previous week's build does not complete before the next build starts. In + this case we end up with two ongoing builds, so in order to determine DB status + we need to check for ongoing SwapTables or CodedEvent_SNOMED events since the start + of the earlier build. The presence of two ongoing builds is a flag for something + that we want to alert on, so we also return the build count. + """ cursor = tpp_connection.cursor() + # Select the TWO most recently started overall OpenSAFELY build events cursor.execute( """ - SELECT TOP 1 EventStart, EventEnd + SELECT TOP 2 EventStart, EventEnd FROM BuildProgress WHERE Event = 'OpenSAFELY' ORDER BY EventStart DESC """ ) - latest_rebuild = cursor.fetchone() - if not latest_rebuild: + latest_rebuilds = cursor.fetchall() + if not latest_rebuilds: # No events at all, we can't be in maintenance mode - return False + return False, 0 - rebuild_start, rebuild_end = latest_rebuild - if rebuild_end.year < 9999: - # Rebuild has completed therefore we aren't in maintenance mode - return False + # Of the two most recently started builds, iddentify those that are ongoing (i.e. + # those with an end date of 9999-12-31) + _, most_recent_end = latest_rebuilds[0] + if most_recent_end.year < 9999: + # The most recent build is complete, so we're not in maintenance mode, and any + # previous build is a historical one which we can ignore + return False, 0 + ongoing_builds = [rebuild for rebuild in latest_rebuilds if rebuild[1].year == 9999] + build_count = len(ongoing_builds) + + # Check for incomplete events starting after the start of the earliest ongoing build + earliest_build_start_date = min(ongoing_builds)[0] cursor.execute( - "SELECT Event FROM BuildProgress WHERE EventStart >= %s", - rebuild_start, + "SELECT Event, EventEnd FROM BuildProgress WHERE EventStart >= %s", + earliest_build_start_date, ) - current_events = {row[0] for row in cursor.fetchall()} + current_events = {row[0] for row in cursor.fetchall() if row[1].year == 9999} # Env var allows quick change of start event logic if needed start_events = os.environ.get( "TPP_MAINTENANCE_START_EVENT", "Swap Tables,CodedEvent_SNOMED" ).split(",") + in_maintenance_mode = bool(current_events.intersection(start_events)) + + if not in_maintenance_mode: + # According to the events, we're not in maintenance mode. As a final check, + # make sure that the CodedEvent_SNOMED table really is available. + try: + cursor.execute("SELECT TOP 1 CodedEvent_ID FROM CodedEvent_SNOMED") + except ProgrammingError: + in_maintenance_mode = True + # We start maintenance mode as soon as we see any of the "trigger" events # and then don't exit until the entire build is finished - return bool(current_events.intersection(start_events)) + return in_maintenance_mode, build_count