@@ -778,16 +778,20 @@ def shard_model(
                 layer.self_attn.k_proj.weight.shape[0] // head_dim
             )
 
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
             mx.eval(layer)
@@ -890,16 +894,20 @@ def shard_heads(w: mx.array, sh: int = sh, eh: int = eh) -> mx.array:
         # Shard the MLP
         if isinstance(layer.mlp, (DeepseekV3MLP, DeepseekV32MLP)):
             intermediate = layer.mlp.gate_proj.weight.shape[0]
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
 
@@ -1037,16 +1045,20 @@ def shard_heads(w: mx.array, sh: int = sh, eh: int = eh) -> mx.array:
 
         if isinstance(layer.mlp, Glm4MoeLiteMLP):
             intermediate = layer.mlp.gate_proj.weight.shape[0]
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
 
@@ -1516,16 +1528,20 @@ def shard_model(
         # Shard the MLP
         else:
             intermediate = layer.mlp.gate_proj.weight.shape[0]
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
 
@@ -1622,16 +1638,20 @@ def shard_model(
 
         else:
             intermediate = layer.mlp.gate_proj.weight.shape[0]
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
 
@@ -1792,16 +1812,20 @@ def shard_model(
 
         if isinstance(layer.mlp, Step35MLP):
             intermediate = layer.mlp.gate_proj.weight.shape[0]
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
         else:
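Every hunk applies the same pattern: read the quantization `group_size` off the (possibly quantized) `gate_proj` and pass it to the sharding helpers as `unit`, with unquantized layers falling back to `unit=1`. The apparent intent is that shard boundaries land on whole quantization groups, so each rank's slice of the weight stays aligned with its quantization scales. A minimal sketch of that constraint follows; `split_rows` is an illustrative helper, not the actual implementation of `all_to_sharded_linear` or `sharded_to_all_linear` in this PR:

```python
def split_rows(n_rows: int, n_shards: int, unit: int = 1) -> list[int]:
    """Split n_rows into contiguous shard sizes that are all multiples
    of `unit` (e.g. a QuantizedLinear group_size), so each shard's
    quantization scales stay aligned with its weight rows."""
    if n_rows % unit != 0:
        raise ValueError(f"n_rows={n_rows} not divisible by unit={unit}")
    groups = n_rows // unit          # distribute whole groups, not raw rows
    base, rem = divmod(groups, n_shards)
    return [(base + (i < rem)) * unit for i in range(n_shards)]


# With unit=1 (the unquantized fallback) this is an ordinary row split;
# with unit=64 every shard size is a multiple of the group size.
print(split_rows(18432, 5, unit=1))    # [3687, 3687, 3686, 3686, 3686]
print(split_rows(18432, 5, unit=64))   # [3712, 3712, 3712, 3648, 3648]
```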