Skip to content

Commit 7c51186

Browse files
committed
unpause after checkpoint rejected by platform
1 parent ad9cf28 commit 7c51186

File tree

2 files changed

+46
-6
lines changed

2 files changed

+46
-6
lines changed

apps/coordinator/src/checkpointer.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,16 @@ export class Checkpointer {
601601
}
602602
}
603603

604+
/**
 * Issues `docker unpause` for the container belonging to a run.
 *
 * The container name is resolved via `#getRunContainerName(runId, attemptNumber)`.
 * Anything thrown while resolving the name or executing the command is caught
 * and logged through `#logger.error` instead of being propagated to the caller.
 *
 * @param runId - Identifier of the run whose container should be unpaused.
 * @param attemptNumber - Optional attempt number, forwarded to the container-name lookup.
 */
async unpause(runId: string, attemptNumber?: number): Promise<void> {
  try {
    // Resolve the name first so a lookup failure is handled by the same catch below.
    const containerName = this.#getRunContainerName(runId, attemptNumber);
    const unpauseArgs = ["unpause", containerName];

    await new Exec({ logger: this.#logger }).x("docker", unpauseArgs);
  } catch (error) {
    // Best effort: report the failure with run context, do not rethrow.
    this.#logger.error("[Docker] Error during unpause", { runId, attemptNumber, error });
  }
}
613+
604614
async #createDockerCheckpoint(
605615
abortSignal: AbortSignal,
606616
runId: string,

apps/coordinator/src/index.ts

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,11 +1015,14 @@ class TaskCoordinator {
10151015
return;
10161016
}
10171017

1018+
const runId = socket.data.runId;
1019+
const attemptNumber = getAttemptNumber();
1020+
10181021
const checkpoint = await this.#checkpointer.checkpointAndPush({
1019-
runId: socket.data.runId,
1022+
runId,
10201023
projectRef: socket.data.projectRef,
10211024
deploymentVersion: socket.data.deploymentVersion,
1022-
attemptNumber: getAttemptNumber(),
1025+
attemptNumber,
10231026
});
10241027

10251028
if (!checkpoint) {
@@ -1045,6 +1048,13 @@ class TaskCoordinator {
10451048

10461049
if (ack?.keepRunAlive) {
10471050
log.log("keeping run alive after duration checkpoint");
1051+
1052+
if (checkpoint.docker && willSimulate) {
1053+
// The container is still paused so we need to unpause it
1054+
log.log("unpausing container after duration checkpoint");
1055+
this.#checkpointer.unpause(runId, attemptNumber);
1056+
}
1057+
10481058
return;
10491059
}
10501060

@@ -1103,12 +1113,15 @@ class TaskCoordinator {
11031113
}
11041114
}
11051115

1116+
const runId = socket.data.runId;
1117+
const attemptNumber = getAttemptNumber();
1118+
11061119
const checkpoint = await this.#checkpointer.checkpointAndPush(
11071120
{
1108-
runId: socket.data.runId,
1121+
runId,
11091122
projectRef: socket.data.projectRef,
11101123
deploymentVersion: socket.data.deploymentVersion,
1111-
attemptNumber: getAttemptNumber(),
1124+
attemptNumber,
11121125
},
11131126
WAIT_FOR_TASK_CHECKPOINT_DELAY_MS
11141127
);
@@ -1141,6 +1154,13 @@ class TaskCoordinator {
11411154
if (ack?.keepRunAlive) {
11421155
socket.data.requiresCheckpointResumeWithMessage = undefined;
11431156
log.log("keeping run alive after task checkpoint");
1157+
1158+
if (checkpoint.docker && willSimulate) {
1159+
// The container is still paused so we need to unpause it
1160+
log.log("unpausing container after duration checkpoint");
1161+
this.#checkpointer.unpause(runId, attemptNumber);
1162+
}
1163+
11441164
return;
11451165
}
11461166

@@ -1199,12 +1219,15 @@ class TaskCoordinator {
11991219
}
12001220
}
12011221

1222+
const runId = socket.data.runId;
1223+
const attemptNumber = getAttemptNumber();
1224+
12021225
const checkpoint = await this.#checkpointer.checkpointAndPush(
12031226
{
1204-
runId: socket.data.runId,
1227+
runId,
12051228
projectRef: socket.data.projectRef,
12061229
deploymentVersion: socket.data.deploymentVersion,
1207-
attemptNumber: getAttemptNumber(),
1230+
attemptNumber,
12081231
},
12091232
WAIT_FOR_BATCH_CHECKPOINT_DELAY_MS
12101233
);
@@ -1238,6 +1261,13 @@ class TaskCoordinator {
12381261
if (ack?.keepRunAlive) {
12391262
socket.data.requiresCheckpointResumeWithMessage = undefined;
12401263
log.log("keeping run alive after batch checkpoint");
1264+
1265+
if (checkpoint.docker && willSimulate) {
1266+
// The container is still paused so we need to unpause it
1267+
log.log("unpausing container after batch checkpoint");
1268+
this.#checkpointer.unpause(runId, attemptNumber);
1269+
}
1270+
12411271
return;
12421272
}
12431273

0 commit comments

Comments
 (0)