@@ -536,13 +536,70 @@ def _sharded_to_all(path: str, weight: mx.array):
             return None
         return -1, segments
 
-    sharded_to_all_linear_in_place = partial(
+    _base_sharded_to_all_in_place = partial(
         shard_inplace,
         sharding=_sharded_to_all,  # type: ignore
         group=group,
         weights=shard_weights,
     )
 
+    _base_all_to_sharded_in_place = all_to_sharded_linear_in_place
+
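+    # Quantized modules pack their weights into uint32 words, so the generic
+    # in-place sharding helpers cannot split them along the packed axis
+    # without breaking quantization-group boundaries; the helper below
+    # shards those modules group-by-group instead.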
+    def _quantized_moe_shard_inplace(
+        module: nn.Module,
+        sharding: Literal["all-to-sharded", "sharded-to-all"],
+        weights: list[float] | None = None,
+    ) -> None:
+        N = group.size()
+        r = group.rank()
+        gs = module.group_size  # pyright: ignore[reportAttributeAccessIssue]
+        bits = module.bits  # pyright: ignore[reportAttributeAccessIssue]
+        params = module.parameters()
+        scales = params["scales"]
+
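+        # All-to-sharded splits the output dimension (the second-to-last
+        # axis), which is never packed, so weight, scales, and biases can all
+        # be split with the same group-aligned sizes.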
+        if sharding == "all-to-sharded":
+            dim = params["weight"].shape[max(params["weight"].ndim - 2, 0)]
+            sizes = compute_shard_sizes(dim, N, gs, weights)
+            result: dict[str, Any] = {}
+            for key, param in params.items():
+                if not isinstance(param, mx.array):
+                    result[key] = param
+                    continue
+                axis = max(param.ndim - 2, 0)
+                indices = [sum(sizes[:i]) for i in range(1, len(sizes))]
+                result[key] = mx.contiguous(mx.split(param, indices, axis=axis)[r])
+        else:
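+            # Sharded-to-all splits the packed input dimension (last axis),
+            # so split by quantization groups rather than raw columns: a
+            # group of gs values at `bits` bits packs into gs * bits // 32
+            # uint32 words. For example, with gs=64 and bits=4 each group
+            # occupies 64 * 4 // 32 = 8 weight columns, so a shard of k
+            # groups takes k columns of scales/biases and 8 * k of weight.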
+            num_groups = scales.shape[-1]
+            group_counts = compute_shard_sizes(num_groups, N, 1, weights)
+            weight_ppg = gs * bits // 32
+            result = {}
+            for key, param in params.items():
+                if not isinstance(param, mx.array):
+                    result[key] = param
+                    continue
+                if key == "weight":
+                    s = [gc * weight_ppg for gc in group_counts]
+                elif key in ("scales", "biases"):
+                    s = list(group_counts)
+                else:
+                    result[key] = param
+                    continue
+                indices = [sum(s[:i]) for i in range(1, len(s))]
+                result[key] = mx.contiguous(mx.split(param, indices, axis=-1)[r])
+        module.update(result)
+
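+    # Shadow the generic helpers: route quantized modules (group_size and
+    # bits set, with a "scales" parameter) to the quantization-aware path,
+    # and everything else to the original implementations captured above.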
+    def all_to_sharded_linear_in_place(module: nn.Module, **kwargs: Any) -> None:
+        if (
+            getattr(module, "group_size", 0) > 0
+            and getattr(module, "bits", 0) > 0
+            and "scales" in module.parameters()
+        ):
+            _quantized_moe_shard_inplace(
+                module, "all-to-sharded", weights=kwargs.get("weights")
+            )
+        else:
+            _base_all_to_sharded_in_place(module, **kwargs)
+
+    def sharded_to_all_linear_in_place(module: nn.Module, **kwargs: Any) -> None:
+        if (
+            getattr(module, "group_size", 0) > 0
+            and getattr(module, "bits", 0) > 0
+            and "scales" in module.parameters()
+        ):
+            _quantized_moe_shard_inplace(
+                module, "sharded-to-all", weights=kwargs.get("weights")
+            )
+        else:
+            _base_sharded_to_all_in_place(module, **kwargs)
+
     if isinstance(model, (LlamaModel, Ministral3Model)):
         tensor_parallel_sharding_strategy = LlamaShardingStrategy(
             group,
@@ -778,16 +835,20 @@ def shard_model(
                 layer.self_attn.k_proj.weight.shape[0] // head_dim
             )
 
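+            # Pass the quantization group size (1 when unquantized) as the
+            # shard unit so shard boundaries land on whole groups.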
+            mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
             layer.mlp.gate_proj = self.all_to_sharded_linear(
                 layer.mlp.gate_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("gate", intermediate),
             )
             layer.mlp.down_proj = self.sharded_to_all_linear(
                 layer.mlp.down_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("down", intermediate),
             )
             layer.mlp.up_proj = self.all_to_sharded_linear(
                 layer.mlp.up_proj,
+                unit=mlp_unit,
                 weights=self._greedy_weights_for("up", intermediate),
             )
             mx.eval(layer)
@@ -890,16 +951,20 @@ def shard_heads(w: mx.array, sh: int = sh, eh: int = eh) -> mx.array:
             # Shard the MLP
             if isinstance(layer.mlp, (DeepseekV3MLP, DeepseekV32MLP)):
                 intermediate = layer.mlp.gate_proj.weight.shape[0]
+                mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
                 layer.mlp.gate_proj = self.all_to_sharded_linear(
                     layer.mlp.gate_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("gate", intermediate),
                 )
                 layer.mlp.down_proj = self.sharded_to_all_linear(
                     layer.mlp.down_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("down", intermediate),
                 )
                 layer.mlp.up_proj = self.all_to_sharded_linear(
                     layer.mlp.up_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("up", intermediate),
                 )
 
@@ -1037,16 +1102,20 @@ def shard_heads(w: mx.array, sh: int = sh, eh: int = eh) -> mx.array:
 
             if isinstance(layer.mlp, Glm4MoeLiteMLP):
                 intermediate = layer.mlp.gate_proj.weight.shape[0]
+                mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
                 layer.mlp.gate_proj = self.all_to_sharded_linear(
                     layer.mlp.gate_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("gate", intermediate),
                 )
                 layer.mlp.down_proj = self.sharded_to_all_linear(
                     layer.mlp.down_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("down", intermediate),
                 )
                 layer.mlp.up_proj = self.all_to_sharded_linear(
                     layer.mlp.up_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("up", intermediate),
                 )
 
@@ -1516,16 +1585,20 @@ def shard_model(
             # Shard the MLP
             else:
                 intermediate = layer.mlp.gate_proj.weight.shape[0]
+                mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
                 layer.mlp.gate_proj = self.all_to_sharded_linear(
                     layer.mlp.gate_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("gate", intermediate),
                 )
                 layer.mlp.down_proj = self.sharded_to_all_linear(
                     layer.mlp.down_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("down", intermediate),
                 )
                 layer.mlp.up_proj = self.all_to_sharded_linear(
                     layer.mlp.up_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("up", intermediate),
                 )
 
@@ -1622,16 +1695,20 @@ def shard_model(
 
             else:
                 intermediate = layer.mlp.gate_proj.weight.shape[0]
+                mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
                 layer.mlp.gate_proj = self.all_to_sharded_linear(
                     layer.mlp.gate_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("gate", intermediate),
                 )
                 layer.mlp.down_proj = self.sharded_to_all_linear(
                     layer.mlp.down_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("down", intermediate),
                 )
                 layer.mlp.up_proj = self.all_to_sharded_linear(
                     layer.mlp.up_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("up", intermediate),
                 )
 
@@ -1792,16 +1869,20 @@ def shard_model(
 
             if isinstance(layer.mlp, Step35MLP):
                 intermediate = layer.mlp.gate_proj.weight.shape[0]
+                mlp_unit = getattr(layer.mlp.gate_proj, "group_size", 1)
                 layer.mlp.gate_proj = self.all_to_sharded_linear(
                     layer.mlp.gate_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("gate", intermediate),
                 )
                 layer.mlp.up_proj = self.all_to_sharded_linear(
                     layer.mlp.up_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("up", intermediate),
                 )
                 layer.mlp.down_proj = self.sharded_to_all_linear(
                     layer.mlp.down_proj,
+                    unit=mlp_unit,
                     weights=self._greedy_weights_for("down", intermediate),
                 )
             else: