edwinyyyu
diff --git a/packages/client/client_tests/test_format.py b/packages/client/client_tests/test_format.py
Lines changed: 79 additions & 0 deletions
diff --git a/packages/client/src/memmachine_client/format.py b/packages/client/src/memmachine_client/format.py
Lines changed: 3 additions & 3 deletions
diff --git a/packages/server/server_tests/memmachine_server/common/episode_store/test_episode_model.py b/packages/server/server_tests/memmachine_server/common/episode_store/test_episode_model.py
Lines changed: 49 additions & 0 deletions
diff --git a/packages/server/server_tests/memmachine_server/episodic_memory/declarative_memory/test_string_from_episode_context.py b/packages/server/server_tests/memmachine_server/episodic_memory/declarative_memory/test_string_from_episode_context.py
Lines changed: 79 additions & 0 deletions
diff --git a/packages/server/server_tests/memmachine_server/semantic_memory/test_semantic_llm.py b/packages/server/server_tests/memmachine_server/semantic_memory/test_semantic_llm.py
Lines changed: 130 additions & 0 deletions
@@ -99,6 +99,59 @@ def test_content_json_escaped(self):
         result = format_episodes([ep])
         assert json.dumps('She said "hello"') in result
 
+    def test_non_ascii_content_preserved_literally(self):
+        """Non-ASCII characters must appear literally in the LLM context, not
+        as ``\\uXXXX`` escapes — escaping bloats token counts and degrades
+        recall on multilingual content."""
+        ep = EpisodeResponse(
+            uid="1",
+            content="寿司 café 🍕 naïve résumé Привет",
+            producer_id="user_1",
+            producer_role="user",
+            created_at=datetime(2024, 1, 1, 0, 0, tzinfo=timezone.utc),
+        )
+        result = format_episodes([ep])
+        assert "寿司" in result
+        assert "café" in result
+        assert "🍕" in result
+        assert "naïve" in result
+        assert "résumé" in result
+        assert "Привет" in result
+        # Sanity: no escaped CJK / Cyrillic / accented sequences.
+        assert "\\u" not in result
+
+    def test_non_ascii_content_lossless_roundtrip(self):
+        """The JSON-encoded portion must round-trip back to the original
+        string so downstream LLM consumers (and any client-side
+        post-processing) still see correct text."""
+        original = '日本語 — "quoted" + emoji 🎉'
+        ep = EpisodeResponse(
+            uid="1",
+            content=original,
+            producer_id="user_1",
+            producer_role="user",
+            created_at=None,
+        )
+        result = format_episodes([ep])
+        json_part = result.removeprefix("user_1: ").rstrip("\n")
+        assert json.loads(json_part) == original
+
+    def test_output_is_utf8_encodable(self):
+        """The LLM-visible string must be safe to send over UTF-8 transports
+        (HTTP body, logging sinks). ``ensure_ascii=False`` produces
+        unescaped surrogates only for malformed inputs; clean Unicode must
+        encode without error."""
+        ep = EpisodeResponse(
+            uid="1",
+            content="Mixed: ASCII + 中文 + 🚀",
+            producer_id="user_1",
+            producer_role="user",
+            created_at=datetime(2024, 1, 1, 0, 0, tzinfo=timezone.utc),
+        )
+        result = format_episodes([ep])
+        encoded = result.encode("utf-8")
+        assert encoded.decode("utf-8") == result
+
 
 class TestFormatSemanticMemories:
     """Tests for format_semantic_memories."""
@@ -146,6 +199,32 @@ def test_groups_by_tag(self):
             "background": {"role": "engineer"},
         }
 
+    def test_non_ascii_value_preserved_literally(self):
+        feature = SemanticFeature(
+            category="profile",
+            tag="prefs",
+            feature_name="favorite_food",
+            value="寿司 🍣",
+        )
+        result = format_semantic_memories([feature])
+        assert "寿司" in result
+        assert "🍣" in result
+        assert "\\u" not in result
+        # And the JSON is still valid.
+        assert json.loads(result) == {"prefs": {"favorite_food": "寿司 🍣"}}
+
+    def test_non_ascii_tag_and_feature_name_preserved(self):
+        feature = SemanticFeature(
+            category="profile",
+            tag="préférences",
+            feature_name="種類",
+            value="ramen",
+        )
+        result = format_semantic_memories([feature])
+        assert "préférences" in result
+        assert "種類" in result
+        assert json.loads(result) == {"préférences": {"種類": "ramen"}}
+
     def test_metadata_excluded(self):
         feature = SemanticFeature(
             set_id="set_1",
 
@@ -43,9 +43,9 @@ def format_episodes(episodes: Iterable[EpisodeResponse | Episode]) -> str:
         if episode.created_at is not None:
             date_str = episode.created_at.strftime("%A, %B %d, %Y")
             time_str = episode.created_at.strftime("%I:%M %p")
-            result += f"[{date_str} at {time_str}] {episode.producer_id}: {json.dumps(episode.content)}\n"
+            result += f"[{date_str} at {time_str}] {episode.producer_id}: {json.dumps(episode.content, ensure_ascii=False)}\n"
         else:
-            result += f"{episode.producer_id}: {json.dumps(episode.content)}\n"
+            result += f"{episode.producer_id}: {json.dumps(episode.content, ensure_ascii=False)}\n"
     return result
 
 
@@ -66,7 +66,7 @@ def format_semantic_memories(features: Iterable[SemanticFeature]) -> str:
     structured: dict[str, dict[str, str]] = {}
     for feature in features:
         structured.setdefault(feature.tag, {})[feature.feature_name] = feature.value
-    return json.dumps(structured)
+    return json.dumps(structured, ensure_ascii=False)
 
 
 def format_search_result(result: SearchResult) -> str:
 
@@ -1,5 +1,6 @@
 """Test for the Episode models."""
 
+import json
 from datetime import UTC, datetime
 
 import pytest
@@ -60,3 +61,51 @@ def test_episodes_to_string_with_episode_response(base_episode_data):
     assert len(lines) == 1
     line0 = '[Wednesday, January 14, 2026 at 01:30 PM] user_1: "Hello world"'
     assert lines[0] == line0
+
+
+def test_episodes_to_string_message_preserves_non_ascii(base_episode_data):
+    """Non-ASCII content must appear literally in the LLM context, not as
+    ``\\uXXXX`` escapes — escapes inflate the prompt token count and
+    obscure semantic content."""
+    base_episode_data["content"] = "寿司 café 🍕 Привет"
+    ep = Episode(**base_episode_data)
+    result = episodes_to_string([ep])
+
+    assert "寿司" in result
+    assert "café" in result
+    assert "🍕" in result
+    assert "Привет" in result
+    assert "\\u" not in result
+
+    # The JSON-quoted content must round-trip back to the original string.
+    line = result.rstrip("\n")
+    json_part = line.split(": ", 1)[1]
+    assert json.loads(json_part) == "寿司 café 🍕 Привет"
+
+
+def test_episodes_to_string_non_message_preserves_non_ascii(base_episode_data):
+    """The ``case _:`` fallback (e.g. an EpisodeResponse with no episode
+    type) must also preserve Unicode literally."""
+    fallback_data = {k: v for k, v in base_episode_data.items() if k != "session_key"}
+    fallback_data["episode_type"] = None
+    fallback_data["content"] = "要約: ☕ résumé"
+    er = EpisodeResponse(**fallback_data)
+    result = episodes_to_string([er])
+
+    assert "要約" in result
+    assert "☕" in result
+    assert "résumé" in result
+    assert "\\u" not in result
+    assert json.loads(result.rstrip("\n")) == "要約: ☕ résumé"
+
+
+def test_episodes_to_string_output_is_utf8_encodable(base_episode_data):
+    """The formatted string is the exact text fed to LanguageModel prompts;
+    it must be losslessly UTF-8 encodable (no surrogate pairs from broken
+    escaping)."""
+    base_episode_data["content"] = "ASCII + 中文 + 🚀 + emoji modifier 👨‍👩‍👧‍👦"
+    ep = Episode(**base_episode_data)
+    result = episodes_to_string([ep])
+
+    encoded = result.encode("utf-8")
+    assert encoded.decode("utf-8") == result
@@ -0,0 +1,79 @@
+"""Unit tests for ``DeclarativeMemory.string_from_episode_context``.
+
+These tests target the static formatter directly and do not need any of
+the heavyweight integration fixtures (Neo4j, embedders, rerankers) used
+by ``test_declarative_memory.py``.
+"""
+
+import json
+from datetime import UTC, datetime
+
+from memmachine_server.episodic_memory.declarative_memory import (
+    ContentType,
+    DeclarativeMemory,
+    Episode,
+)
+
+
+def _make_episode(content):
+    return Episode(
+        uid="ep_1",
+        timestamp=datetime(2026, 1, 14, 13, 30, tzinfo=UTC),
+        source="user_1",
+        content_type=ContentType.MESSAGE,
+        content=content,
+    )
+
+
+def test_ascii_content_baseline():
+    """Sanity check that ASCII content still formats as before — guards
+    against regressions in the timestamp/source/JSON layout."""
+    result = DeclarativeMemory.string_from_episode_context(
+        [_make_episode("Hello world")]
+    )
+    assert result == '[Wednesday, January 14, 2026 at 01:30 PM] user_1: "Hello world"\n'
+
+
+def test_non_ascii_content_preserved_literally():
+    """Non-ASCII characters must reach the reranker / LLM as-is, not as
+    ``\\uXXXX`` escapes — escapes inflate token counts and obscure
+    semantic content for the reranker."""
+    result = DeclarativeMemory.string_from_episode_context(
+        [_make_episode("寿司 café 🍕 Привет naïve")]
+    )
+
+    assert "寿司" in result
+    assert "café" in result
+    assert "🍕" in result
+    assert "Привет" in result
+    assert "naïve" in result
+    assert "\\u" not in result
+
+
+def test_non_ascii_content_lossless_roundtrip():
+    """The JSON-encoded content portion must round-trip back to the
+    original string — the reranker scoring relies on the literal text
+    matching the query distribution."""
+    original = '日本語 — "quoted" + emoji 🎉'
+    result = DeclarativeMemory.string_from_episode_context([_make_episode(original)])
+    json_part = result.split(": ", 1)[1].rstrip("\n")
+    assert json.loads(json_part) == original
+
+
+def test_output_is_utf8_encodable():
+    """The context string is fed to ``Reranker.score`` and (via siblings)
+    into LLM prompts; it must encode to UTF-8 cleanly."""
+    result = DeclarativeMemory.string_from_episode_context(
+        [_make_episode("Mixed: ASCII + 中文 + 🚀 + 👨‍👩‍👧‍👦")]
+    )
+    assert result.encode("utf-8").decode("utf-8") == result
+
+
+def test_multiple_episodes_each_preserve_unicode():
+    eps = [_make_episode("café"), _make_episode("寿司"), _make_episode("🚀")]
+    result = DeclarativeMemory.string_from_episode_context(eps)
+    lines = result.strip().split("\n")
+    assert len(lines) == 3
+    assert "café" in lines[0]
+    assert "寿司" in lines[1]
+    assert "🚀" in lines[2]
@@ -312,3 +312,133 @@ def test_consolidation_format_preserves_all_fields(self, features_with_ids):
         assert entry["feature"] == "observer_fix"
         assert entry["value"] == "Fixed observer subagent bug"
         assert entry["metadata"] == {"id": "42"}
+
+
+class TestNonAsciiPromptSerialization:
+    """Both ``llm_feature_update`` and ``llm_consolidate_features`` embed
+    the existing feature set into the user prompt via
+    ``json.dumps(..., ensure_ascii=False)``. The non-ASCII payload must
+    survive into the prompt as literal Unicode (so the LLM sees
+    ``"寿司"`` and not ``"\\u5bff\\u53f8"``) and the prompt must remain
+    a valid UTF-8 string."""
+
+    @pytest.fixture
+    def non_ascii_features(self):
+        return [
+            SemanticFeature(
+                category="Profile",
+                tag="食べ物",  # tag itself is non-ASCII
+                feature_name="favorite_dish",
+                value="寿司 🍣",
+                metadata=SemanticFeature.Metadata(id="100"),
+            ),
+            SemanticFeature(
+                category="Profile",
+                tag="préférences",
+                feature_name="café",
+                value="naïve résumé — Привет",
+                metadata=SemanticFeature.Metadata(id="101"),
+            ),
+        ]
+
+    @pytest.mark.asyncio
+    async def test_feature_update_prompt_preserves_non_ascii_literally(
+        self,
+        magic_mock_llm_model: MagicMock,
+        non_ascii_features: list[SemanticFeature],
+    ):
+        magic_mock_llm_model.generate_parsed_response.return_value = {"commands": []}
+
+        await llm_feature_update(
+            features=non_ascii_features,
+            message_content="I had 寿司 for lunch",
+            model=magic_mock_llm_model,
+            update_prompt="Update features",
+        )
+
+        # The prompt is read from the call's 'user_prompt' keyword argument.
+        call_kwargs = magic_mock_llm_model.generate_parsed_response.call_args.kwargs
+        user_prompt = call_kwargs["user_prompt"]
+
+        # Literal Unicode reaches the LLM, no escape sequences.
+        assert "食べ物" in user_prompt
+        assert "寿司 🍣" in user_prompt
+        assert "préférences" in user_prompt
+        assert "café" in user_prompt
+        assert "naïve résumé — Привет" in user_prompt
+        assert "\\u" not in user_prompt
+
+        # The prompt is UTF-8 transport-safe.
+        assert user_prompt.encode("utf-8").decode("utf-8") == user_prompt
+
+    @pytest.mark.asyncio
+    async def test_consolidate_prompt_preserves_non_ascii_literally(
+        self,
+        magic_mock_llm_model: MagicMock,
+        non_ascii_features: list[SemanticFeature],
+    ):
+        magic_mock_llm_model.generate_parsed_response.return_value = {
+            "consolidated_memories": [],
+            "keep_memories": None,
+        }
+
+        await llm_consolidate_features(
+            features=non_ascii_features,
+            model=magic_mock_llm_model,
+            consolidate_prompt="Consolidate features",
+        )
+
+        call_kwargs = magic_mock_llm_model.generate_parsed_response.call_args.kwargs
+        user_prompt = call_kwargs["user_prompt"]
+
+        assert "食べ物" in user_prompt
+        assert "寿司 🍣" in user_prompt
+        assert "préférences" in user_prompt
+        assert "naïve résumé — Привет" in user_prompt
+        assert "\\u" not in user_prompt
+        assert user_prompt.encode("utf-8").decode("utf-8") == user_prompt
+
+        # The consolidation prompt is bare JSON — verify it still parses
+        # and round-trips losslessly.
+        import json
+
+        parsed = json.loads(user_prompt)
+        assert parsed[0]["tag"] == "食べ物"
+        assert parsed[0]["value"] == "寿司 🍣"
+        assert parsed[0]["metadata"] == {"id": "100"}
+        assert parsed[1]["feature"] == "café"
+        assert parsed[1]["value"] == "naïve résumé — Привет"
+
+    @pytest.mark.asyncio
+    async def test_feature_update_prompt_old_profile_block_is_valid_json(
+        self,
+        magic_mock_llm_model: MagicMock,
+        non_ascii_features: list[SemanticFeature],
+    ):
+        """The feature-update prompt wraps the JSON inside ``<OLD_PROFILE>``
+        delimiters; the inner block must still parse as JSON so the LLM
+        is shown structurally valid input."""
+        magic_mock_llm_model.generate_parsed_response.return_value = {"commands": []}
+
+        await llm_feature_update(
+            features=non_ascii_features,
+            message_content="…",
+            model=magic_mock_llm_model,
+            update_prompt="Update features",
+        )
+
+        user_prompt = magic_mock_llm_model.generate_parsed_response.call_args.kwargs[
+            "user_prompt"
+        ]
+
+        start = user_prompt.index("<OLD_PROFILE>\n") + len("<OLD_PROFILE>\n")
+        end = user_prompt.index("\n</OLD_PROFILE>")
+        old_profile_json = user_prompt[start:end]
+
+        import json
+
+        parsed = json.loads(old_profile_json)
+        assert parsed == {
+            "食べ物": {"favorite_dish": "寿司 🍣"},
+            "préférences": {"café": "naïve résumé — Привет"},
+        }