Skip to content

Commit 39e2d2f

Browse files
committed
runner failling should imply the runner is dead
this simplifies some restart logic that caused the runners to loop. we only create runners if we failed or the instance has no failed runners (the runners were revived successfully)
1 parent 7b26e19 commit 39e2d2f

3 files changed

Lines changed: 12 additions & 8 deletions

File tree

src/exo/worker/plan.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def plan(
5959
return (
6060
_cancel_tasks(runners, tasks)
6161
or _kill_runner(runners, all_runners, instances)
62-
or _create_runner(node_id, runners, instances, instance_backoff)
62+
or _create_runner(node_id, runners, all_runners, instances, instance_backoff)
6363
or _model_needs_download(
6464
node_id, runners, global_download_status, download_backoff
6565
)
@@ -96,22 +96,26 @@ def _kill_runner(
9696
def _create_runner(
9797
node_id: NodeId,
9898
runners: Mapping[RunnerId, RunnerSupervisor],
99+
all_runners: Mapping[RunnerId, RunnerStatus],
99100
instances: Mapping[InstanceId, Instance],
100101
instance_backoff: KeyedBackoff[InstanceId],
101102
) -> CreateRunner | None:
102103
for instance in instances.values():
103-
if not instance_backoff.should_proceed(instance.instance_id):
104-
continue
105-
106104
runner_id = instance.shard_assignments.node_to_runner.get(node_id, None)
107105
if runner_id is None:
108106
continue
109107

110108
if runner_id in runners:
111109
continue
112110

113-
shard = instance.shard(runner_id)
114-
assert shard is not None
111+
# don't create runners if any other nodes have runners that have failed - wait for them to fix themselves first.
112+
instance_has_failed_runner = any( isinstance(all_runners.get(remote_runner_id), RunnerFailed) for remote_runner_id in instance.shard_assignments.node_to_runner.values() if remote_runner_id != runner_id )
113+
we_have_failed_before = isinstance(all_runners.get(runner_id), RunnerFailed)
114+
if instance_has_failed_runner and not we_have_failed_before:
115+
continue
116+
117+
if not instance_backoff.should_proceed(instance.instance_id):
118+
continue
115119

116120
return CreateRunner(
117121
instance_id=instance.instance_id,

src/exo/worker/runner/image_models/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ def main(self):
332332
def handle_task(self, task: Task):
333333
match task:
334334
case ConnectToGroup() if isinstance(
335-
self.current_status, (RunnerIdle, RunnerFailed)
335+
self.current_status, RunnerIdle
336336
):
337337
logger.info("runner connecting")
338338
self.update_status(RunnerConnecting())

src/exo/worker/runner/llm_inference/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def handle_first_task(self, task: Task):
150150

151151
match task:
152152
case ConnectToGroup() if isinstance(
153-
self.current_status, (RunnerIdle, RunnerFailed)
153+
self.current_status, RunnerIdle
154154
):
155155
assert isinstance(self.generator, Builder)
156156
logger.info("runner connecting")

0 commit comments

Comments
 (0)